In [3]:
import pandas as pd
import numpy as np

### Downloading the SMS Spam Collection dataset from the UCI Machine Learning Repository.
### This dataset contains labeled SMS messages (spam or ham) and is commonly used for binary text classification tasks.
#### Source: https://archive.ics.uci.edu/ml/datasets/sms+spam+collection

In [4]:
import urllib.request
import os
import zipfile

DATA_URL = "https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip"
DATA_DIR = "../data/raw"
ZIP_PATH = "smsspamcollection.zip"

os.makedirs(DATA_DIR, exist_ok=True)

if not os.path.exists(os.path.join(DATA_DIR, "SMSSpamCollection")):
    print("Downloading dataset...")
    urllib.request.urlretrieve(DATA_URL, ZIP_PATH)
    with zipfile.ZipFile(ZIP_PATH, 'r') as zip_ref:
        zip_ref.extractall(DATA_DIR)
    os.remove(ZIP_PATH)
    print("Dataset downloaded and extracted.")
else:
    print("Dataset already present.")

Dataset already present.


In [5]:
import pandas as pd

data_path = os.path.join(DATA_DIR, "SMSSpamCollection")

df = pd.read_csv(data_path, sep='\t', header=None, names=['label', 'text'])
print(df.head())
print(f"Spam messages: {df.label.value_counts()['spam']}")
print(f"Ham messages: {df.label.value_counts()['ham']}")

  label                                               text
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Spam messages: 747
Ham messages: 4825


In [6]:
df

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


### Creating a holdout test set for final evaluation after all model training and tuning is complete.
### This helps simulate how the model will perform on truly unseen data in production.
### The holdout set should not be touched during training, validation, or hyperparameter tuning.

In [7]:
from sklearn.model_selection import train_test_split
# Split off 10% holdout set
df_train_val, df_holdout = train_test_split(df, test_size=0.1, random_state=42, stratify=df['label'])

# Save holdout set separately
df_holdout.to_csv('../data/raw/spam_holdout.csv', index=False)

# Save training+validation set separately
df_train_val.to_csv('../data/raw/spam_train_val.csv', index=False)

In [8]:
df_train_val

Unnamed: 0,label,text
3398,ham,Heehee that was so funny tho
3325,ham,I don wake since. I checked that stuff and saw...
2498,ham,Dai what this da.. Can i send my resume to thi...
1553,ham,U too...
46,ham,Didn't you get hep b immunisation in nigeria.
...,...,...
1932,ham,What pa tell me.. I went to bath:-)
5316,ham,Jus finish watching tv... U?
5203,ham,Me fine..absolutly fine
564,spam,GENT! We are trying to contact you. Last weeke...


In [9]:
# checking the distribution of training set
print(df_train_val.head())
print(f"Spam messages: {df_train_val.label.value_counts()['spam']}")
print(f"Ham messages: {df_train_val.label.value_counts()['ham']}")

     label                                               text
3398   ham                       Heehee that was so funny tho
3325   ham  I don wake since. I checked that stuff and saw...
2498   ham  Dai what this da.. Can i send my resume to thi...
1553   ham                                           U too...
46     ham      Didn't you get hep b immunisation in nigeria.
Spam messages: 672
Ham messages: 4342


In [10]:
df_holdout

Unnamed: 0,label,text
966,ham,Or better still can you catch her and let ask ...
3009,spam,"Loan for any purpose £500 - £75,000. Homeowner..."
2240,ham,Every day i use to sleep after &lt;#&gt; so ...
297,ham,Unless it's a situation where YOU GO GURL woul...
1221,spam,No. 1 Nokia Tone 4 ur mob every week! Just txt...
...,...,...
505,spam,+123 Congratulations - in this week's competit...
3961,ham,I sent lanre fakeye's Eckankar details to the ...
2961,ham,Sir send to group mail check it.
2899,ham,If you r @ home then come down within 5 min


In [11]:
# checking the distribution of the holdout set
print(df_holdout.head())
print(f"Spam messages: {df_holdout.label.value_counts()['spam']}")
print(f"Ham messages: {df_holdout.label.value_counts()['ham']}")

     label                                               text
966    ham  Or better still can you catch her and let ask ...
3009  spam  Loan for any purpose £500 - £75,000. Homeowner...
2240   ham  Every day i use to sleep after  &lt;#&gt;  so ...
297    ham  Unless it's a situation where YOU GO GURL woul...
1221  spam  No. 1 Nokia Tone 4 ur mob every week! Just txt...
Spam messages: 75
Ham messages: 483


### Preprocessing the training dataset for text classification.
### This includes steps like lowercasing, removing punctuation, tokenization, and vectorization.
### The goal is to convert raw SMS text messages into numerical feature vectors that can be used by ML models.

In [12]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk

# Download NLTK data files if not already downloaded
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load dataset
df_train_val = pd.read_csv('../data/raw/spam_train_val.csv')

# Convert labels to binary
df_train_val['label_num'] = df_train_val.label.map({'ham': 0, 'spam': 1})

# Text preprocessing function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenize
    tokens = word_tokenize(text, language='english')
    # Remove stopwords
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    # Stemming
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]
    # Re-join tokens to return a clean string
    return ' '.join(tokens)

# Apply preprocessing
df_train_val['clean_text'] = df_train_val['text'].apply(preprocess_text)

# Prepare final data
X = df_train_val['clean_text']
y = df_train_val['label_num']

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/vivekbharti/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vivekbharti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000)),
])

pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       869
           1       0.97      0.66      0.79       134

    accuracy                           0.95      1003
   macro avg       0.96      0.83      0.88      1003
weighted avg       0.95      0.95      0.95      1003



### Training and evaluating multiple models to compare performance and select the best one based on performance

#### Choosing the best model isn’t always black and white — while SVM showed marginally better overall accuracy, Logistic Regression had slightly higher recall with comparable performance. Since recall is often critical in spam detection tasks, I leaned toward Logistic Regression — but both are valid choices depending on the priority.

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score

# Models with class_weight where applicable
models = {
    'LogisticRegression': LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42),
    'SVM': SVC(kernel='linear', class_weight='balanced', probability=True, random_state=42),
    'MultinomialNB': MultinomialNB(),
    'RandomForest': RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42),
}

best_model_name = None
best_accuracy = 0
best_pipeline = None

for name, clf in models.items():
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english')),
        ('clf', clf),
    ])
    pipeline.fit(X_train, y_train)
    y_pred = pipeline.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"Model: {name}")
    print(f"Accuracy: {acc:.4f}")
    print(classification_report(y_test, y_pred))
    print('-' * 40)

    if acc > best_accuracy:
        best_accuracy = acc
        best_model_name = name
        best_pipeline = pipeline

print(f"Model with best accuracy: {best_model_name} with accuracy {best_accuracy:.4f}")

Model: LogisticRegression
Accuracy: 0.9791
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       869
           1       0.92      0.92      0.92       134

    accuracy                           0.98      1003
   macro avg       0.96      0.95      0.95      1003
weighted avg       0.98      0.98      0.98      1003

----------------------------------------
Model: SVM
Accuracy: 0.9831
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       869
           1       0.97      0.90      0.93       134

    accuracy                           0.98      1003
   macro avg       0.98      0.95      0.96      1003
weighted avg       0.98      0.98      0.98      1003

----------------------------------------
Model: MultinomialNB
Accuracy: 0.9571
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       869
           1       1.00      0.68      0.81   

### Tuning hyperparameters for Logistic Regression to find the best-performing configuration

In [15]:
### optional: in this example it didn't help

from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer, recall_score

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000, solver='liblinear')),
])

param_grid = {
    'clf__C': [0.01, 0.1, 1, 10, 100],
    'clf__class_weight': [None, 'balanced'],
}

# Use recall on positive class (spam) as scoring metric
recall_scorer = make_scorer(recall_score, pos_label=1)

grid_search = GridSearchCV(pipeline, param_grid, scoring=recall_scorer, cv=5, verbose=2, n_jobs=-1)
grid_search.fit(X_train, y_train)


Fitting 5 folds for each of 10 candidates, totalling 50 fits


[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ................clf__C=0.01, clf__class_weight=None; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END ............clf__C=0.01, clf__class_weight=balanced; total time=   0.0s
[CV] END .................clf__C=0.1, clf__class_weight=None; total time=   0.0s
[CV] END .................clf__C=0.1, clf__class_weight=None; total time=   0.0s
[CV] END .................cl

In [16]:
print("Best params:", grid_search.best_params_)
print("Best recall:", grid_search.best_score_)

# Evaluate on test set
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(classification_report(y_test, y_pred))

Best params: {'clf__C': 10, 'clf__class_weight': 'balanced'}
Best recall: 0.9088785046728972
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       869
           1       0.97      0.91      0.94       134

    accuracy                           0.98      1003
   macro avg       0.98      0.95      0.96      1003
weighted avg       0.98      0.98      0.98      1003



### Training the final model using the entire dataset (train + test) with default parameters
### We proceed with default parameters since hyperparameter tuning did not yield significant improvements

In [17]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Merge train and test sets
X_full = pd.concat([X_train, X_test])
y_full = pd.concat([y_train, y_test])

# Define and train pipeline
logreg_pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(class_weight='balanced', max_iter=1000, random_state=42)),
])

logreg_pipeline.fit(X_full, y_full)

### Saving the final Logistic Regression model as a pickle file for later use in model serving

In [18]:
import pickle

with open('../models/logreg_spam_pipeline.pkl', 'wb') as f:
    pickle.dump(logreg_pipeline, f)

print("✅ Logistic Regression model and TF-IDF vectorizer saved to logreg_spam_pipeline.pkl")

✅ Logistic Regression model and TF-IDF vectorizer saved to logreg_spam_pipeline.pkl


### Getting the best theshold cut for the model based on precision recall tradeoff

In [19]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Get predicted probabilities from the logistic regression pipeline
y_probs = logreg_pipeline.predict_proba(X_test)[:, 1]

# Compute precision-recall curve
precision, recall, thresholds = precision_recall_curve(y_test, y_probs)

# Print thresholds where precision >= 0.9
print("Thresholds with precision ≥ 0.9:")
for p, r, t in zip(precision, recall, thresholds):
    if p >= 0.9:
        print(f"Threshold: {t:.3f}, Precision: {p:.3f}, Recall: {r:.3f}")

Thresholds with precision ≥ 0.9:
Threshold: 0.342, Precision: 0.905, Recall: 1.000
Threshold: 0.355, Precision: 0.912, Recall: 1.000
Threshold: 0.402, Precision: 0.918, Recall: 1.000
Threshold: 0.440, Precision: 0.924, Recall: 1.000
Threshold: 0.463, Precision: 0.931, Recall: 1.000
Threshold: 0.464, Precision: 0.944, Recall: 1.000
Threshold: 0.479, Precision: 0.950, Recall: 1.000
Threshold: 0.480, Precision: 0.957, Recall: 1.000
Threshold: 0.496, Precision: 0.964, Recall: 1.000
Threshold: 0.511, Precision: 0.971, Recall: 1.000
Threshold: 0.533, Precision: 0.971, Recall: 0.993
Threshold: 0.537, Precision: 0.978, Recall: 0.993
Threshold: 0.590, Precision: 0.978, Recall: 0.985
Threshold: 0.595, Precision: 0.978, Recall: 0.978
Threshold: 0.620, Precision: 0.985, Recall: 0.978
Threshold: 0.621, Precision: 0.985, Recall: 0.970
Threshold: 0.626, Precision: 0.985, Recall: 0.963
Threshold: 0.640, Precision: 0.985, Recall: 0.955
Threshold: 0.644, Precision: 0.984, Recall: 0.948
Threshold: 0.655,

best thresh: best_threshold = 0.619

### Testing final model performance on holdout

In [20]:
import pandas as pd
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
import pickle
from sklearn.metrics import classification_report

# Download NLTK resources
nltk.download('punkt')
nltk.download('stopwords')

# === Load saved pipeline ===
with open('../models/logreg_spam_pipeline.pkl', 'rb') as f:
    logreg_pipeline = pickle.load(f)

# === Load holdout dataset ===
df_holdout = pd.read_csv('../data/raw/spam_holdout.csv')
df_holdout['label_num'] = df_holdout.label.map({'ham': 0, 'spam': 1})

# === Preprocessing function ===
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text, language='english')
    tokens = [t for t in tokens if t not in stopwords.words('english')]
    stemmer = PorterStemmer()
    tokens = [stemmer.stem(t) for t in tokens]
    return ' '.join(tokens)

# === Apply preprocessing ===
df_holdout['clean_text'] = df_holdout['text'].apply(preprocess_text)

# === Predict using loaded pipeline ===
X_holdout = df_holdout['clean_text']
y_holdout = df_holdout['label_num']
y_pred = logreg_pipeline.predict(X_holdout)

# === Classification report ===
print(classification_report(y_holdout, y_pred, target_names=['ham', 'spam']))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vivekbharti/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vivekbharti/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


              precision    recall  f1-score   support

         ham       0.98      0.98      0.98       483
        spam       0.88      0.89      0.89        75

    accuracy                           0.97       558
   macro avg       0.93      0.94      0.93       558
weighted avg       0.97      0.97      0.97       558

