In [None]:
# Assignment no.3

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

# Step 1: Fetch the SMS dataset
# The file is read as tab-separated text without a header row, so the two
# column names are supplied explicitly.
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
column_names = ["label", "message"]
data = pd.read_table(url, header=None, names=column_names)

# Sanity check: show the first rows and the (rows, columns) shape.
print(" Dataset Loaded Successfully")
first_rows = data.head()
print(first_rows, "\n")
print("Dataset Size:", data.shape)

# Step 2: Add a numeric target column (ham -> 0, spam -> 1)
label_codes = {"ham": 0, "spam": 1}
data["label_num"] = data["label"].map(label_codes)

# Step 3: Split data
# Hold out 20% of the messages for testing. The split is stratified on the
# numeric label so the train and test partitions keep the same ham/spam
# ratio as the full dataset; spam is the minority class, so an unstratified
# shuffle could skew the evaluation. random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data["message"],
    data["label_num"],
    test_size=0.2,
    random_state=42,
    stratify=data["label_num"],
)

# Step 4: Convert raw text into TF-IDF features
# The vectorizer is fitted on the training messages only; the test messages
# are then transformed with that already-fitted vectorizer.
vectorizer = TfidfVectorizer(stop_words="english")
X_train_vec, X_test_vec = (
    vectorizer.fit_transform(X_train),
    vectorizer.transform(X_test),
)

# Step 5: Fit a multinomial Naive Bayes classifier on the training features
# (fit() returns the estimator itself, so construction and training chain).
model = MultinomialNB().fit(X_train_vec, y_train)

# Step 6: Predict labels for the held-out test features
y_pred = model.predict(X_test_vec)

# Step 7: Score the predictions against the held-out labels
print("\n Model Evaluation:")

# Accuracy is rounded to four decimals for display.
accuracy = round(accuracy_score(y_test, y_pred), 4)
print("Accuracy:", accuracy)

conf_matrix = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:\n", conf_matrix)

report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

# Step 8: Test with custom messages
# Hand-written messages are run through the already-fitted vectorizer and
# model to spot-check the classifier.
samples = [
    "Congratulations! You’ve won a $1,000 Walmart gift card. Go to http://bit.ly/123456",
    "Hey John, are we meeting for lunch today?",
    "URGENT! Your account has been compromised. Reset your password now!",
    "Let's catch up tomorrow at office.",
]
sample_vec = vectorizer.transform(samples)
predictions = model.predict(sample_vec)

# Maximum number of message characters shown per output line.
PREVIEW_WIDTH = 70

print("\n Sample Predictions:")
for msg, pred in zip(samples, predictions):
    # Labels were encoded as spam=1 / ham=0, so a truthy prediction is spam.
    verdict = "SPAM" if pred else "HAM"
    # Append an ellipsis only when the message was actually truncated, so
    # short messages are not displayed as if text had been cut off.
    shown = msg if len(msg) <= PREVIEW_WIDTH else f"{msg[:PREVIEW_WIDTH]}..."
    print(f"{verdict}   {shown}")


 Dataset Loaded Successfully
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro... 

Dataset Size: (5572, 2)

 Model Evaluation:
Accuracy: 0.9785

Confusion Matrix:
 [[966   0]
 [ 24 125]]

Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       966
           1       1.00      0.84      0.91       149

    accuracy                           0.98      1115
   macro avg       0.99      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115


 Sample Predictions:
HAM   Congratulations! You’ve won a $1,000 Walmart gift card. Go to http://b...
HAM   Hey John, are we meeting for lunch today?...
HAM   