In [20]:
# Step 1: Import libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

print("✅ Libraries imported successfully!")

✅ Libraries imported successfully!


In [21]:
#  Load dataset \

url = "https://raw.githubusercontent.com/amankharwal/Website-data/master/spam.csv"
data = pd.read_csv(url, encoding='latin-1')

# 👇 Check first 5 rows to see actual column names

print(data.columns)
data.head()

Index(['label', 'text'], dtype='object')


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [22]:
data = data.rename(columns={'Category': 'label', 'Message': 'message'})
data.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [23]:
print(data.columns)

Index(['label', 'text'], dtype='object')


In [24]:
# Step 4: Split data & convert text to numeric features

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

# Split into train & test using correct column names

X_train, X_test, y_train, y_test = train_test_split(
data['text'], data['label'], test_size=0.2, random_state=42)

# Convert text messages into numeric vectors using TF-IDF

vectorizer = TfidfVectorizer(stop_words='english', lowercase=True)
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

print("✅ Data prepared successfully!")


✅ Data prepared successfully!


In [25]:

# Step 5: Train model

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_tf, y_train)
print("✅ Model trained successfully!")


✅ Model trained successfully!


In [26]:
# Step 6: Evaluate performance

from sklearn.metrics import accuracy_score, classification_report

y_pred = model.predict(X_test_tf)
print("✅ Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

✅ Accuracy: 0.9668161434977578

Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.86       150

    accuracy                           0.97      1115
   macro avg       0.98      0.88      0.92      1115
weighted avg       0.97      0.97      0.96      1115



In [27]:
# Step 7: Test your own messages

sample_msgs = [
"Congratulations! You've won a free iPhone! Click to claim.",
"Let's meet tomorrow for the project discussion."
]

sample_tf = vectorizer.transform(sample_msgs)
predictions = model.predict(sample_tf)

for msg, pred in zip(sample_msgs, predictions):
    label = "🚫 Spam" if pred == 'spam' else "✅ Not Spam"
print(f"{label} --> {msg}")


✅ Not Spam --> Let's meet tomorrow for the project discussion.
