In [2]:
# ===============================
# Cell 1: Import Libraries
# ===============================

import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [4]:
# ===============================
# Cell 2: Load Dataset
# ===============================

# Location of the CSV file (this looks like a Google Colab path); change it
# here when running the notebook somewhere else.
DATA_PATH = "/content/spam.csv"

data = pd.read_csv(DATA_PATH)

# Later cells read the 'Category' and 'Message' columns, so stop here with a
# clear message if the file does not provide them instead of failing further
# down with a bare KeyError.
missing_columns = {"Category", "Message"} - set(data.columns)
if missing_columns:
    raise ValueError(
        f"{DATA_PATH} is missing required columns: {sorted(missing_columns)}"
    )

print("Dataset shape:", data.shape)
data.head()

Dataset shape: (5572, 2)


Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# ===============================
# Cell 3: Class Distribution
# ===============================

# Tally how many messages fall under each Category value.
category_counts = data['Category'].value_counts()
print(category_counts)

Category
ham     4825
spam     747
Name: count, dtype: int64


In [6]:
# ===============================
# Cell 4: Encode Labels
# ===============================

# Integer code for each text category: ham -> 0, spam -> 1.
LABEL_MAP = {'ham': 0, 'spam': 1}

print("Before encoding:")
print(data['Category'].unique())

encoded = data['Category'].map(LABEL_MAP)

# Series.map leaves NaN for every value that is not a key of the mapping
# (e.g. 'Spam' or ' ham'). Fail fast instead of letting NaN labels reach the
# train/test split and the classifier.
unmapped = data.loc[encoded.isna(), 'Category'].unique()
if len(unmapped) > 0:
    raise ValueError(
        f"Cannot encode unexpected Category values: {list(unmapped)}"
    )

# With no NaN left, the labels can be stored as plain integers.
data['label'] = encoded.astype('int64')

print("\nAfter encoding:")
print(data['label'].unique())

Before encoding:
['ham' 'spam']

After encoding:
[0 1]


In [7]:
# ===============================
# Cell 5: Train-Test Split
# ===============================

# Hold out 20% of the rows for testing. The split is stratified on the label
# and uses a fixed random_state so it is repeatable.
messages = data['Message']
labels = data['label']

X_train, X_test, y_train, y_test = train_test_split(
    messages, labels, test_size=0.2, random_state=42, stratify=labels
)

print("Training samples:", len(X_train))
print("Testing samples:", len(X_test))

Training samples: 4457
Testing samples: 1115


In [8]:
# ===============================
# Cell 6: TF-IDF Vectorization
# ===============================

# Unigrams and bigrams, English stop words removed, at most 3000 features.
tfidf_options = {
    "stop_words": 'english',
    "max_features": 3000,
    "ngram_range": (1, 2),
}
vectorizer = TfidfVectorizer(**tfidf_options)

# The vectorizer is fitted on the training messages only; the test messages
# are transformed with that already-fitted vocabulary.
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

for split_name, matrix in (("Train", X_train_tfidf), ("Test", X_test_tfidf)):
    print(f"TF-IDF {split_name} shape:", matrix.shape)

TF-IDF Train shape: (4457, 3000)
TF-IDF Test shape: (1115, 3000)


In [9]:
# ===============================
# Cell 7: Extracted Features
# ===============================

feature_names = vectorizer.get_feature_names_out()

# Preview the first 20 entries of the fitted vocabulary (header line first,
# then the array on its own line).
print("Sample features:", feature_names[:20], sep="\n")

Sample features:
['00' '00 sub' '000' '000 bonus' '000 cash' '02' '02 06' '0207' '03'
 '03 2nd' '04' '05' '06' '06 03' '0800' '0800 542' '08000839402'
 '08000839402 2stoptxt' '08000839402 call2optout' '08000930705']


In [10]:
# ===============================
# Cell 8: Train Model
# ===============================

# fit() returns the estimator itself, so building and training the
# Multinomial Naive Bayes classifier can be chained into one statement.
model = MultinomialNB().fit(X_train_tfidf, y_train)

print("Model training completed successfully")

Model training completed successfully


In [11]:
# ===============================
# Cell 9: Predictions
# ===============================

# Predict a label for every TF-IDF row of the test split.
y_pred = model.predict(X_test_tfidf)

# Header line first, then the first 20 predicted labels on their own line.
print("First 20 predictions:", y_pred[:20], sep="\n")

First 20 predictions:
[0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [12]:
# ===============================
# Cell 10: Actual vs Predicted
# ===============================

# Side-by-side view of the first 10 test messages: the text, its true label
# and the label the model predicted. The frame keeps the original row index
# of the test messages.
comparison = (
    X_test.iloc[:10]
    .to_frame("Message")
    .assign(Actual=y_test.iloc[:10], Predicted=y_pred[:10])
)

comparison

Unnamed: 0,Message,Actual,Predicted
2825,No need to buy lunch for me.. I eat maggi mee..,0,0
3695,Ok im not sure what time i finish tomorrow but...,0,0
3904,Waiting in e car 4 my mum lor. U leh? Reach ho...,0,0
576,"You have won ?1,000 cash or a ?2,000 prize! To...",1,1
2899,If you r @ home then come down within 5 min,0,0
3456,No need lar. Jus testing e phone card. Dunno n...,0,0
5128,Wot about on wed nite I am 3 then but only til 9!,0,0
919,Hey you gave them your photo when you register...,0,0
2505,"Hello, my boytoy! I made it home and my consta...",0,0
17,Eh u remember how 2 spell his name... Yes i di...,0,0


In [13]:
# ===============================
# Cell 11: Evaluation
# ===============================

# Compute all three metrics from the test labels and predictions first,
# then print them together.
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
report = classification_report(y_test, y_pred)

print("Accuracy:", accuracy)
print("\nConfusion Matrix:\n", conf_matrix)
print("\nClassification Report:\n", report)

Accuracy: 0.9757847533632287

Confusion Matrix:
 [[966   0]
 [ 27 122]]

Classification Report:
               precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.82      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99      0.91      0.94      1115
weighted avg       0.98      0.98      0.97      1115



In [14]:
# ===============================
# Cell 12: Test on New Message
# ===============================

sample_msg = ["Congratulations! You have won a free prize. Call now"]

# Transform the new text with the already-fitted vectorizer, then classify it.
sample_vec = vectorizer.transform(sample_msg)
prediction = model.predict(sample_vec)

# Label 1 is spam (see the encoding cell); anything else is reported as ham.
if prediction[0] == 1:
    verdict = "SPAM"
else:
    verdict = "HAM"

print("Message:", sample_msg[0])
print("Prediction:", verdict)

Message: Congratulations! You have won a free prize. Call now
Prediction: SPAM
