In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from google.colab import files
# Colab-only: opens a file picker; the recorded output shows the chosen file saved as spam.csv.
files.upload()

# Keep the label/text columns and add a numeric target: ham -> 0, spam -> 1.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")[['Category', 'Message']]  # adjust filename if needed
    .assign(Label=lambda frame: frame['Category'].map({'ham': 0, 'spam': 1}))
)

# Features (X) are the raw message text; target (y) is the binary label.
X, y = df['Message'], df['Label']


Saving spam.csv to spam.csv


In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score


# Load the labelled messages and encode the label column: ham -> 0, spam -> 1.
# NOTE(review): unlike the other cells, this read does not pass encoding="latin-1".
df = (
    pd.read_csv("spam.csv")[['Category', 'Message']]
    .assign(Label=lambda frame: frame['Category'].map({'ham': 0, 'spam': 1}))
)

X_text = df['Message']
# Column vector of shape (n, 1) so it lines up with the (n, 1) result of X.dot(theta).
y = df['Label'].values.reshape(-1, 1)


# Feature Extraction (Bag of Words): dense term-count matrix, English stop
# words removed, vocabulary capped at 2000 terms.
vectorizer = CountVectorizer(stop_words='english', max_features=2000)
X = vectorizer.fit_transform(X_text).toarray()
# NOTE(review): the vocabulary is fitted on the full corpus before the split
# below, so the test messages take part in choosing which terms are kept.

# Train/test split: 80/20, stratified on the label, fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Add bias column: a leading column of ones, so theta[0] is the intercept.
X_train_b = np.hstack([np.ones((X_train.shape[0], 1)), X_train])
X_test_b = np.hstack([np.ones((X_test.shape[0], 1)), X_test])


# Applying Logistic Regression Functions
def sigmoid(z):
    """Logistic function ``1 / (1 + exp(-z))``, evaluated without overflow.

    Parameters
    ----------
    z : scalar or array-like
        Input value(s), e.g. the linear scores ``X.dot(theta)``.

    Returns
    -------
    numpy.float64 or numpy.ndarray
        Element-wise sigmoid of ``z``, with the same shape as ``z``.
    """
    # The direct form computes exp(-z), which overflows to inf (emitting a
    # RuntimeWarning) for large negative z. log(1 + exp(-z)) is exactly
    # logaddexp(0, -z), which NumPy evaluates stably, so exponentiating its
    # negation gives the same value with no overflow at either extreme.
    return np.exp(-np.logaddexp(0, np.negative(z)))

def compute_cost(X, y, theta):
    """Mean binary cross-entropy of a logistic model with weights ``theta``.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix (include the bias column if the model has one).
    y : array-like of m labels in {0, 1}
        Accepted as a flat vector or as an (m, 1) column.
    theta : array of shape (n,) or (n, 1)
        Model weights.

    Returns
    -------
    numpy.float64
        ``-(1/m) * sum(y*log(h) + (1-y)*log(1-h))`` with ``h = sigmoid(X.theta)``.

    Raises
    ------
    ValueError
        If ``y`` is empty or its length does not match the number of rows
        scored by ``X.dot(theta)``.
    """
    # Flatten both sides so a 1-D y and an (m, 1) prediction column cannot
    # silently broadcast into an (m, m) matrix.
    logits = np.asarray(X.dot(theta), dtype=float).reshape(-1)
    targets = np.asarray(y, dtype=float).reshape(-1)
    if targets.size == 0 or targets.size != logits.size:
        raise ValueError(
            f"expected one label per sample, got {targets.size} labels "
            f"for {logits.size} predictions"
        )

    # Per-sample loss written in terms of the logit z:
    #   -[y*log(s(z)) + (1-y)*log(1-s(z))] == log(1 + exp(z)) - y*z
    # logaddexp(0, z) evaluates log(1 + exp(z)) stably, so no epsilon is
    # needed inside the log and saturated predictions are not capped.
    per_sample = np.logaddexp(0, logits) - targets * logits
    return per_sample.mean()

def gradient_descent(X, y, theta, alpha, iterations):
    """Fit logistic-regression weights with batch gradient descent.

    Parameters
    ----------
    X : array of shape (m, n)
        Design matrix (include the bias column if the model has one).
    y : array-like of m labels in {0, 1}
        Accepted as a flat vector or as an (m, 1) column.
    theta : array of shape (n,) or (n, 1)
        Initial weights. The array passed in is left untouched.
    alpha : float
        Learning rate (step size).
    iterations : int
        Number of full-batch update steps.

    Returns
    -------
    (numpy.ndarray, list)
        The fitted weights as a new float array with the shape of ``theta``,
        and the cost recorded after each update (one entry per iteration).
    """
    # Work on a float copy: an in-place update on the caller's array would
    # overwrite their initial weights and fails outright for an integer theta.
    weights = np.array(theta, dtype=float)
    # Give the labels the same trailing shape as the predictions ((m, 1) for a
    # column theta, (m,) for a flat one) so `h - targets` is element-wise
    # rather than an (m, m) broadcast.
    targets = np.asarray(y, dtype=float).reshape(-1, *weights.shape[1:])
    m = targets.shape[0]
    cost_history = []

    for _ in range(iterations):
        h = sigmoid(X.dot(weights))
        # Gradient of the mean cross-entropy with respect to the weights.
        gradient = (1/m) * X.T.dot(h - targets)
        weights -= alpha * gradient
        cost_history.append(compute_cost(X, targets, weights))
    return weights, cost_history


# Train Model: start from all-zero weights (bias included) and run batch
# gradient descent with a fixed learning rate.
alpha = 0.1
iterations = 1000
theta = np.zeros((X_train_b.shape[1], 1))

theta_final, cost_history = gradient_descent(X_train_b, y_train, theta, alpha, iterations)

print("Final Cost:", cost_history[-1])


# Predictions & Evaluation: score the held-out rows and threshold the
# predicted probabilities at 0.5 to obtain hard 0/1 labels.
y_pred_probs = sigmoid(X_test_b @ theta_final)
y_pred = (y_pred_probs >= 0.5).astype(int)

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))


Final Cost: 0.14584248087659726
Confusion Matrix:
 [[962   4]
 [ 52  97]]
Accuracy: 0.9497757847533632
Precision: 0.9603960396039604
Recall: 0.6510067114093959
F1 Score: 0.776


In [8]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score, classification_report


# Step 1: Load Dataset
# Keep the label/text columns and encode the label: ham -> 0, spam -> 1.
df = (
    pd.read_csv("spam.csv", encoding="latin-1")[['Category', 'Message']]
    .assign(Label=lambda frame: frame['Category'].map({'ham': 0, 'spam': 1}))
)

X_text, y = df['Message'], df['Label']


# Step 2: Feature Extraction (Bag of Words)
# Term counts with English stop words removed and the vocabulary capped at
# 2000 terms; the result stays sparse here.
vectorizer = CountVectorizer(stop_words='english', max_features=2000)
X = vectorizer.fit_transform(X_text)
# NOTE(review): the vocabulary is fitted on the full corpus before the split
# below, so the test messages take part in choosing which terms are kept.

# Train/test split: 80/20, stratified on the label, fixed seed.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


# Step 3: Train Logistic Regression (Sklearn)
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Predictions: hard 0/1 labels for the held-out messages.
y_pred = model.predict(X_test)


# Step 4: Evaluation
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)
for metric_label, metric_fn in (
    ("Accuracy:", accuracy_score),
    ("Precision:", precision_score),
    ("Recall:", recall_score),
    ("F1 Score:", f1_score),
):
    print(metric_label, metric_fn(y_test, y_pred))


Confusion Matrix:
 [[965   1]
 [ 23 126]]
Accuracy: 0.97847533632287
Precision: 0.9921259842519685
Recall: 0.8456375838926175
F1 Score: 0.9130434782608695


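On the held-out test set, the from-scratch model achieved Accuracy = 0.95, Precision = 0.96, Recall = 0.65, F1 = 0.78. It was good at correctly labeling non-spam (only 4 of 966 ham messages were flagged as spam) but weak at catching spam, missing 52 of the 149 spam messages.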
The sklearn model performed better on every metric, with Accuracy = 0.98, Precision = 0.99, Recall = 0.85, F1 = 0.91. It missed only 23 of the 149 spam messages while raising a single false alarm, striking a stronger balance between catching spam and avoiding false alarms.
