
# Task 3: SMS Spam Detection — N-gram and Random Forest Models

This notebook implements SMS spam detection using:
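1. **Pure N-gram Models (Unigram, Bigram, Trigram)** — probability-based (no ML): each message is scored against per-class n-gram counts with add-one smoothing, and the higher-scoring class wins.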
2. **Random Forest Algorithm** — with TF-IDF features.


In [None]:

import csv
import string
from collections import Counter
from math import log

import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split


In [None]:

# Load dataset
# The file is tab-separated with no header row: one "<label>\t<message>" record per line.
# Messages are free text and can contain stray double quotes, so quote handling is
# disabled; otherwise a lone `"` is treated as the start of a quoted field that may
# run across several lines and silently merge records.
df = pd.read_csv(
    "SMSSpamCollection",
    sep="\t",
    names=["label", "message"],
    quoting=csv.QUOTE_NONE,
)

# Preprocess messages
# Translation table that deletes every ASCII punctuation character; built once and reused.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def preprocess(text):
    """Lowercase ``text``, remove ASCII punctuation, and split on whitespace.

    Returns the resulting list of tokens (an empty list if nothing remains).
    """
    return text.lower().translate(_PUNCTUATION_TABLE).split()

# Tokenise every message once so the n-gram models can reuse the result
df["tokens"] = df["message"].map(preprocess)

# Split into train and test (80/20), stratified on the label so both parts
# keep the same spam/ham proportions; the fixed seed makes the split repeatable
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
    stratify=df["label"],
)


In [None]:

def train_ngram_model(n=1, train=None, test=None):
    """Fit per-class n-gram count models and report accuracy on a test set.

    For each class ("spam" and "ham") the n-grams of all training messages are
    counted. A test message is scored under each class as the sum of the logs
    of the add-one-smoothed probabilities of its n-grams, and is labelled
    "spam" only when the spam score is strictly greater than the ham score.
    Ties go to "ham"; this includes messages with fewer than ``n`` tokens,
    which produce no n-grams and score 0 under both classes. Class priors are
    not part of the score.

    Parameters
    ----------
    n : int, default 1
        Size of the n-grams (1 = unigram, 2 = bigram, ...). Must be >= 1.
    train, test : pandas.DataFrame, optional
        Frames with a "label" column ("spam" / "ham") and a "tokens" column
        holding one list of tokens per message. When omitted, the
        notebook-level ``train_df`` / ``test_df`` are used.

    Returns
    -------
    float
        Fraction of test messages whose predicted label equals the true label.
        The same value is also printed as a percentage.

    Raises
    ------
    ValueError
        If ``n`` is smaller than 1 or the test set is empty.
    """
    if n < 1:
        raise ValueError(f"n must be a positive integer, got {n!r}")
    # Fall back to the notebook globals only when no frame is passed explicitly.
    if train is None:
        train = train_df
    if test is None:
        test = test_df
    if len(test) == 0:
        raise ValueError("test set is empty; accuracy is undefined")

    def ngrams_of(tokens):
        # Sliding window of width n; yields nothing when len(tokens) < n.
        return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    def count_ngrams(label):
        counts = Counter()
        for tokens in train.loc[train["label"] == label, "tokens"]:
            counts.update(ngrams_of(tokens))
        return counts

    counts_by_label = {"spam": count_ngrams("spam"), "ham": count_ngrams("ham")}

    # V = number of distinct n-grams seen in either class (add-one smoothing term).
    vocab_size = len(counts_by_label["spam"].keys() | counts_by_label["ham"].keys())
    # Smoothed denominator per class: total n-gram occurrences in the class + V.
    denominators = {
        label: sum(counts.values()) + vocab_size
        for label, counts in counts_by_label.items()
    }

    def message_log_prob(tokens, label):
        counts = counts_by_label[label]
        denominator = denominators[label]
        return sum(
            (log((counts.get(ng, 0) + 1) / denominator) for ng in ngrams_of(tokens)),
            0.0,
        )

    y_true = test["label"].tolist()
    y_pred = [
        "spam"
        if message_log_prob(tokens, "spam") > message_log_prob(tokens, "ham")
        else "ham"
        for tokens in test["tokens"]
    ]

    accuracy = sum(truth == guess for truth, guess in zip(y_true, y_pred)) / len(y_true)
    print(f"\n✅ Accuracy of {n}-gram model: {accuracy*100:.2f}%")
    return accuracy


In [None]:

# ---- Unigram Model ----
# Single tokens as features (n = 1); the trailing `;` keeps the cell output
# limited to the printed accuracy line.
train_ngram_model(1);


In [None]:

# ---- Bigram Model ----
# Pairs of consecutive tokens as features (n = 2); the trailing `;` keeps the
# cell output limited to the printed accuracy line.
train_ngram_model(2);


In [None]:

# ---- Trigram Model ----
# Triples of consecutive tokens as features (n = 3); the trailing `;` keeps the
# cell output limited to the printed accuracy line.
train_ngram_model(3);


In [None]:

# ---- Random Forest Model (TF-IDF) ----
# The vectorizer is fitted on the training messages only and then reused,
# unchanged, to transform the test messages.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_df["message"])
X_test = vectorizer.transform(test_df["message"])

# Fixed seed so repeated runs build the same forest.
clf = RandomForestClassifier(random_state=42).fit(X_train, train_df["label"])
pred = clf.predict(X_test)

print("\n✅ Random Forest Model Results:")
print(classification_report(y_true=test_df["label"], y_pred=pred))
