<a href="https://colab.research.google.com/gist/sumitra288/eeaa2d2c7d1ccfa8243bc8824386c20b/experiment4.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from sklearn.datasets import fetch_20newsgroups
import matplotlib.pyplot as plt
import seaborn as sns
from collections import defaultdict
import re

In [2]:
class MultinomialNaiveBayes:
    """Multinomial Naive Bayes classifier with additive (Laplace) smoothing.

    The model stores one prior per class and one smoothed per-feature
    probability vector per class:

        P(feature | class) = (count + alpha) / (total + alpha * n_features)

    Prediction is carried out in log space and normalised with a
    max-subtraction softmax so that long documents do not underflow.

    Parameters
    ----------
    alpha : float, default 1.0
        Additive smoothing strength added to every feature count.
    """

    def __init__(self, alpha=1.0):
        self.alpha = alpha
        # class label -> prior probability, filled in by fit()
        self.class_priors = {}
        # class label -> 1-D array of smoothed feature probabilities
        self.word_probs = {}
        # Kept for interface compatibility; not populated by this class.
        self.vocabulary = set()
        # Sorted unique labels seen in fit(); column order of predict_proba()
        self.classes = []

    def fit(self, X, y):
        """Estimate class priors and smoothed feature probabilities.

        Parameters
        ----------
        X : 2-D count/weight matrix of shape (n_samples, n_features).
            Anything exposing ``.shape``, boolean row indexing and
            ``.sum(axis=0)`` works (NumPy arrays, SciPy sparse matrices).
        y : sequence of n_samples labels (list, array, ...).

        Raises
        ------
        ValueError
            If ``y`` is not one-dimensional with one label per row of ``X``.
        """
        # A plain list would only compare element-wise by accident of
        # NumPy's reflected __eq__; make the array conversion explicit.
        y = np.asarray(y)
        n_samples, n_features = X.shape
        if y.ndim != 1 or y.shape[0] != n_samples:
            raise ValueError(
                f"y must be 1-D with {n_samples} labels, got shape {y.shape}"
            )

        self.classes = np.unique(y)
        # Reset BOTH tables so a refit never keeps entries for classes that
        # are absent from the new training data.
        self.class_priors = {}
        self.word_probs = {}

        for class_label in self.classes:
            class_mask = (y == class_label)
            self.class_priors[class_label] = np.sum(class_mask) / n_samples

            # Sparse inputs return a (1, n_features) matrix here; flatten so
            # the stored vector is always 1-D.
            word_counts = np.asarray(X[class_mask].sum(axis=0)).ravel()
            total_words = word_counts.sum()

            self.word_probs[class_label] = (
                (word_counts + self.alpha)
                / (total_words + self.alpha * n_features)
            )

    def predict_proba(self, X):
        """Return posterior class probabilities for each row of ``X``.

        The result has shape (n_samples, n_classes); columns follow the
        order of ``self.classes`` and every row sums to 1.

        Raises
        ------
        ValueError
            If the model has not been fitted.
        """
        if len(self.classes) == 0:
            raise ValueError("MultinomialNaiveBayes must be fitted before predicting")

        n_samples = X.shape[0]
        log_joint = np.zeros((n_samples, len(self.classes)))

        for i, class_label in enumerate(self.classes):
            log_prior = np.log(self.class_priors[class_label])
            word_log_probs = np.log(self.word_probs[class_label])
            # log P(class) + sum_j x_j * log P(feature_j | class)
            log_joint[:, i] = log_prior + np.asarray(X.dot(word_log_probs)).ravel()

        # Subtract the row maximum before exponentiating for numerical
        # stability; the shift cancels in the normalisation below.
        log_joint = log_joint - np.max(log_joint, axis=1, keepdims=True)
        probabilities = np.exp(log_joint)
        probabilities = probabilities / np.sum(probabilities, axis=1, keepdims=True)

        return probabilities

    def predict(self, X):
        """Return the most probable class label for each row of ``X``."""
        probabilities = self.predict_proba(X)
        return self.classes[np.argmax(probabilities, axis=1)]

In [6]:
def create_realistic_spam_dataset(n_variations=15, seed=42):
    """Build a small synthetic spam/ham corpus from fixed templates.

    Each of the 20 spam and 20 ham template messages is expanded into
    ``n_variations`` variants. A variant is produced by dropping 1-3 words
    from the template and shuffling the remaining words (templates of five
    words or fewer are kept unchanged).

    Parameters
    ----------
    n_variations : int, default 15
        Number of variants generated per template.
    seed : int or None, default 42
        Seed for the private random generator. With the defaults the output
        is the same as the historical ``random.seed(42)`` behaviour, but the
        process-wide ``random`` state is no longer touched. ``None`` gives a
        non-reproducible dataset.

    Returns
    -------
    (messages, labels) : tuple[list[str], list[int]]
        All spam variants followed by all ham variants; labels are 1 for
        spam and 0 for ham.
    """
    import random

    spam_messages = [
        "free money now call immediately urgent offer limited time",
        "win big prizes click here now exclusive deal today only",
        "urgent business proposal money transfer prince nigeria help needed",
        "congratulations you won lottery claim prize now click link",
        "limited time offer act now save money huge discount",
        "call now free consultation prize money winner selected today",
        "click here for instant money rewards cash prize waiting",
        "urgent reply needed money waiting transfer funds immediately",
        "exclusive offer limited time big savings discount percentage off",
        "free gift card click claim now winner announcement today",
        "make money fast work from home easy income guaranteed",
        "lose weight quick miracle pill doctor approved formula",
        "viagra cheap online pharmacy prescription drugs discount prices",
        "refinance mortgage lowest rates approved guaranteed bad credit ok",
        "casino bonus free spins slots jackpot winner play now",
        "investment opportunity guaranteed returns millionaire secrets revealed",
        "credit card debt relief lawyer help bankruptcy avoid",
        "insurance quote save hundreds dollars coverage protection family",
        "dating singles nearby meet tonight local women interested",
        "degree online university accredited diploma fast track program"
    ]

    ham_messages = [
        "meeting scheduled for tomorrow afternoon conference room available",
        "please review the attached document and provide feedback soon",
        "lunch plans with friends this weekend restaurant reservation confirmed",
        "project deadline extended until friday team meeting scheduled",
        "thanks for your help with the presentation slides look great",
        "reminder about dentist appointment tuesday morning don't forget",
        "looking forward to vacation next month flight tickets booked",
        "conference call rescheduled to thursday same time different day",
        "birthday party invitation for saturday bring friends family welcome",
        "grocery list includes milk bread eggs cheese vegetables fruits",
        "weather forecast shows rain tomorrow umbrella might be needed",
        "book recommendation mystery novel author writing style excellent",
        "exercise routine includes running swimming cycling strength training variety",
        "recipe ingredients chicken vegetables spices cooking instructions included",
        "movie tickets purchased weekend plans entertainment family time",
        "university course registration deadline approaching choose classes carefully",
        "library books due next week return policy late fees",
        "garden flowers blooming spring season beautiful colors nature",
        "computer software update available security patches bug fixes",
        "travel itinerary flight hotel rental car confirmation numbers"
    ]

    # A private generator keeps the dataset reproducible without reseeding
    # the global `random` module behind the caller's back.
    rng = random.Random(seed)

    def make_variants(templates):
        """Return n_variations perturbed copies of every template."""
        variants = []
        for _ in range(n_variations):
            for msg in templates:
                words = msg.split()
                if len(words) > 5:
                    # Always keep at least three words of the template.
                    remove_count = rng.randint(1, min(3, len(words) - 3))
                    # sample() both drops words and shuffles the survivors.
                    words = rng.sample(words, len(words) - remove_count)
                variants.append(' '.join(words))
        return variants

    # Spam is generated before ham so the random draws happen in the same
    # order as the original two-loop implementation.
    all_spam = make_variants(spam_messages)
    all_ham = make_variants(ham_messages)

    messages = all_spam + all_ham
    labels = [1] * len(all_spam) + [0] * len(all_ham)

    return messages, labels

In [10]:
def evaluate_model(y_true, y_pred, model_name, vectorizer_name):
    """
    Score a set of predictions, print a short report, and return the scores.

    The report shows accuracy, precision, recall, F1 and the confusion
    matrix for `y_pred` against `y_true`. The returned dictionary holds the
    model name, the vectorizer name and the four scalar metrics under the
    keys 'Model', 'Vectorizer', 'Accuracy', 'Precision', 'Recall', 'F1'.
    """
    # Compute every metric up front; the keys double as result-table columns.
    scores = {
        'Accuracy': accuracy_score(y_true, y_pred),
        'Precision': precision_score(y_true, y_pred),
        'Recall': recall_score(y_true, y_pred),
        'F1': f1_score(y_true, y_pred),
    }

    print(f"\nModel: {model_name} | Vectorizer: {vectorizer_name}")
    # Labels are left-padded to nine characters so the colons line up.
    for label, key in (
        ('Accuracy', 'Accuracy'),
        ('Precision', 'Precision'),
        ('Recall', 'Recall'),
        ('F1 Score', 'F1'),
    ):
        print(f"{label:<9}: {scores[key]:.4f}")
    print("Confusion Matrix:")
    print(confusion_matrix(y_true, y_pred))

    return {'Model': model_name, 'Vectorizer': vectorizer_name, **scores}


# Build the synthetic corpus and hold out 30% of it, stratified by label.
# NOTE(review): each template is expanded into many near-duplicate variants
# and the split is made per message, so variants of the same template can
# end up on both sides of the split. Read the scores below with that in mind.
messages, labels = create_realistic_spam_dataset()

X_train_text, X_test_text, y_train, y_test = train_test_split(
    messages, labels, test_size=0.3, random_state=42, stratify=labels
)

print("Dataset Information:")
print(f"Training samples: {len(X_train_text)}")
print(f"Testing samples: {len(X_test_text)}")
print(f"Spam ratio: {np.mean(labels):.2f}")

# One dict per (model, vectorizer, regularisation) run, as returned by
# evaluate_model() plus a 'Reg. λ' entry.
results = []

print("\n" + "="*60)
print("LOGISTIC REGRESSION EXPERIMENT")
print("="*60)

# Regularisation strengths λ; scikit-learn takes the inverse, C = 1/λ.
reg_params = [0.01, 0.1, 1.0, 10.0]

for vec_name, vectorizer in [('Count', CountVectorizer(stop_words='english', max_features=1000)),
                           ('TF-IDF', TfidfVectorizer(stop_words='english', max_features=1000))]:
    # The vocabulary is learned from the training split only.
    X_train_vec = vectorizer.fit_transform(X_train_text)
    X_test_vec = vectorizer.transform(X_test_text)

    for reg_param in reg_params:
        lr = LogisticRegression(C=1/reg_param, random_state=42, max_iter=1000)
        lr.fit(X_train_vec, y_train)
        y_pred = lr.predict(X_test_vec)

        result = evaluate_model(y_test, y_pred, 'Logistic Regression', vec_name)
        result['Reg. λ'] = reg_param
        results.append(result)

print("\n" + "="*60)
print("NAIVE BAYES EXPERIMENT")
print("="*60)

for vec_name, vectorizer in [('Count', CountVectorizer(stop_words='english', max_features=1000)),
                           ('TF-IDF', TfidfVectorizer(stop_words='english', max_features=1000))]:
    X_train_vec = vectorizer.fit_transform(X_train_text)
    X_test_vec = vectorizer.transform(X_test_text)

    # The hand-written classifier is fed dense arrays.
    X_train_dense = X_train_vec.toarray()
    X_test_dense = X_test_vec.toarray()

    nb = MultinomialNaiveBayes(alpha=1.0)
    nb.fit(X_train_dense, y_train)

    y_pred = nb.predict(X_test_dense)
    y_pred_proba = nb.predict_proba(X_test_dense)

    result = evaluate_model(y_test, y_pred, 'Naive Bayes', vec_name)
    # Naive Bayes has no λ; a dash keeps the summary table rectangular.
    result['Reg. λ'] = '–'
    results.append(result)

    print(f"\nSample predictions with {vec_name} Vectorizer:")
    # Column 1 of predict_proba corresponds to label 1 (spam) because the
    # classifier orders its columns by sorted label and the labels are 0/1.
    for true_label, pred_label, prob_spam in zip(y_test[:5], y_pred[:5], y_pred_proba[:5, 1]):
        actual = "Spam" if true_label == 1 else "Ham"
        predicted = "Spam" if pred_label == 1 else "Ham"
        print(f"Actual: {actual:4}, Predicted: {predicted:4}, P(Spam): {prob_spam:.3f}")

print("\n" + "="*60)
print("COMPARATIVE ANALYSIS")
print("="*60)

results_df = pd.DataFrame(results)
column_order = ['Model', 'Vectorizer', 'Reg. λ', 'Accuracy', 'Precision', 'Recall', 'F1']
results_df = results_df[column_order]

print("\nResults Summary Table:")
print("-" * 90)
# Format a copy so results_df keeps numeric metric columns for later use.
results_display = results_df.copy()
for col in ['Accuracy', 'Precision', 'Recall', 'F1']:
    results_display[col] = results_display[col].apply(lambda x: f"{x:.4f}")
print(results_display.to_string(index=False))

print("\n" + "="*50)
print("BEST PERFORMING MODELS")
print("="*50)

# idxmax() returns the index LABEL of the first maximum, so the row must be
# fetched with .loc; .iloc only agrees while the index is the default range.
best_accuracy_idx = results_df['Accuracy'].idxmax()
best_f1_idx = results_df['F1'].idxmax()

best_accuracy = results_df.loc[best_accuracy_idx]
best_f1 = results_df.loc[best_f1_idx]

print(f"\nBest Accuracy: {best_accuracy['Model']} with {best_accuracy['Vectorizer']} vectorizer")
print(f"Accuracy: {best_accuracy['Accuracy']:.4f}, F1: {best_accuracy['F1']:.4f}")

print(f"\nBest F1-Score: {best_f1['Model']} with {best_f1['Vectorizer']} vectorizer")
print(f"Accuracy: {best_f1['Accuracy']:.4f}, F1: {best_f1['F1']:.4f}")

Dataset Information:
Training samples: 420
Testing samples: 180
Spam ratio: 0.50

LOGISTIC REGRESSION EXPERIMENT

Model: Logistic Regression | Vectorizer: Count
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1 Score : 1.0000
Confusion Matrix:
[[90  0]
 [ 0 90]]

Model: Logistic Regression | Vectorizer: Count
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1 Score : 1.0000
Confusion Matrix:
[[90  0]
 [ 0 90]]

Model: Logistic Regression | Vectorizer: Count
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1 Score : 1.0000
Confusion Matrix:
[[90  0]
 [ 0 90]]

Model: Logistic Regression | Vectorizer: Count
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1 Score : 1.0000
Confusion Matrix:
[[90  0]
 [ 0 90]]

Model: Logistic Regression | Vectorizer: TF-IDF
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000
F1 Score : 1.0000
Confusion Matrix:
[[90  0]
 [ 0 90]]

Model: Logistic Regression | Vectorizer: TF-IDF
Accuracy : 1.0000
Precision: 1.0000
Recall   : 1.0000

In [9]:
print("\n" + "="*50)
print("CONFUSION MATRICES")
print("="*50)

# Human-readable name and (row, column) position of each confusion-matrix
# cell, with rows as actual and columns as predicted labels (0 = ham, 1 = spam).
cm_cells = [
    ("True Negatives (Ham predicted as Ham)", (0, 0)),
    ("False Positives (Ham predicted as Spam)", (0, 1)),
    ("False Negatives (Spam predicted as Ham)", (1, 0)),
    ("True Positives (Spam predicted as Spam)", (1, 1)),
]

# Refit Naive Bayes on count features so its matrix and priors can be shown.
count_vec = CountVectorizer(stop_words='english', max_features=1000)
X_train_count = count_vec.fit_transform(X_train_text).toarray()
X_test_count = count_vec.transform(X_test_text).toarray()

nb_count = MultinomialNaiveBayes(alpha=1.0)
nb_count.fit(X_train_count, y_train)
y_pred_nb_count = nb_count.predict(X_test_count)

print("\nConfusion Matrix - Naive Bayes with Count Vectorizer:")
cm_nb = confusion_matrix(y_test, y_pred_nb_count)
for cell_name, cell_pos in cm_cells:
    print(f"{cell_name}: {cm_nb[cell_pos]}")

# Refit logistic regression on TF-IDF features with C=10.
tfidf_vec = TfidfVectorizer(stop_words='english', max_features=1000)
X_train_tfidf = tfidf_vec.fit_transform(X_train_text)
X_test_tfidf = tfidf_vec.transform(X_test_text)

lr_best = LogisticRegression(C=10, random_state=42, max_iter=1000)
lr_best.fit(X_train_tfidf, y_train)
y_pred_lr_tfidf = lr_best.predict(X_test_tfidf)

print("\nConfusion Matrix - Logistic Regression with TF-IDF:")
cm_lr = confusion_matrix(y_test, y_pred_lr_tfidf)
for cell_name, cell_pos in cm_cells:
    print(f"{cell_name}: {cm_lr[cell_pos]}")

print("\n" + "="*60)
print("ALGORITHM IMPLEMENTATION DETAILS")
print("="*60)

print("\nNaive Bayes Formula Implementation:")
print("P(spam|document) ∝ P(spam) × ∏ P(word|spam)")
print("With Laplace smoothing: P(word|class) = (count + α) / (total + α × vocab_size)")
print(f"Alpha (smoothing parameter): {nb_count.alpha}")

print(f"\nClass Priors:")
for class_label, prior in nb_count.class_priors.items():
    class_name = "Ham" if class_label == 0 else "Spam"
    print(f"P({class_name}) = {prior:.4f}")

print("\n" + "="*60)
print("PERFORMANCE SUMMARY")
print("="*60)

# results_df comes from the experiment cell; slice it per model family.
nb_results = results_df[results_df['Model'] == 'Naive Bayes']
lr_results = results_df[results_df['Model'] == 'Logistic Regression']

print(f"Naive Bayes - Average Performance:")
print(f"  Accuracy: {nb_results['Accuracy'].mean():.4f}")
print(f"  F1-Score: {nb_results['F1'].mean():.4f}")

print(f"\nLogistic Regression - Average Performance:")
print(f"  Accuracy: {lr_results['Accuracy'].mean():.4f}")
print(f"  F1-Score: {lr_results['F1'].mean():.4f}")

print(f"\nVectorization Impact:")
count_results = results_df[results_df['Vectorizer'] == 'Count']
tfidf_results = results_df[results_df['Vectorizer'] == 'TF-IDF']

count_f1_mean = count_results['F1'].mean()
tfidf_f1_mean = tfidf_results['F1'].mean()

print(f"Count Vectorizer - Average F1: {count_f1_mean:.4f}")
print(f"TF-IDF Vectorizer - Average F1: {tfidf_f1_mean:.4f}")

# A tie must not be reported as a win for either vectorizer; previously
# equal averages fell through to the "Count ... better" message.
if np.isclose(tfidf_f1_mean, count_f1_mean):
    print("→ Both vectorizers show equal performance overall")
elif tfidf_f1_mean > count_f1_mean:
    print("→ TF-IDF shows better performance overall")
else:
    print("→ Count vectorizer shows better performance overall")


CONFUSION MATRICES

Confusion Matrix - Naive Bayes with Count Vectorizer:
True Negatives (Ham predicted as Ham): 90
False Positives (Ham predicted as Spam): 0
False Negatives (Spam predicted as Ham): 0
True Positives (Spam predicted as Spam): 90

Confusion Matrix - Logistic Regression with TF-IDF:
True Negatives (Ham predicted as Ham): 90
False Positives (Ham predicted as Spam): 0
False Negatives (Spam predicted as Ham): 0
True Positives (Spam predicted as Spam): 90

ALGORITHM IMPLEMENTATION DETAILS

Naive Bayes Formula Implementation:
P(spam|document) ∝ P(spam) × ∏ P(word|spam)
With Laplace smoothing: P(word|class) = (count + α) / (total + α × vocab_size)
Alpha (smoothing parameter): 1.0

Class Priors:
P(Ham) = 0.5000
P(Spam) = 0.5000

PERFORMANCE SUMMARY
Naive Bayes - Average Performance:
  Accuracy: 1.0000
  F1-Score: 1.0000

Logistic Regression - Average Performance:
  Accuracy: 1.0000
  F1-Score: 1.0000

Vectorization Impact:
Count Vectorizer - Average F1: 1.0000
TF-IDF Vectorize