In [29]:
!pip install pandas scikit-learn numpy matplotlib seaborn streamlit



In [30]:
from google.colab import files

uploaded = files.upload()

for fn in uploaded.keys():
  print('User uploaded file "{name}" with length {length} bytes'.format(
      name=fn, length=len(uploaded[fn])))

Saving spam.csv to spam (2).csv
User uploaded file "spam (2).csv" with length 503663 bytes


In [31]:
import pandas as pd

data = pd.read_csv("spam.csv", encoding="latin-1")[['v1','v2']]
data.columns = ['label', 'message']

print(data.head())

print("Total messages:", len(data))
print(data['label'].value_counts())

  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...
Total messages: 5572
label
ham     4825
spam     747
Name: count, dtype: int64


In [32]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

data['label'] = data['label'].map({'ham': 0, 'spam': 1})

# Drop rows with NaN in the 'label' column
data.dropna(subset=['label'], inplace=True)

X = data['message']
y = data['label']

vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(X)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [33]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Train model
model = MultinomialNB()
model.fit(X_train, y_train)

# Predict
y_pred = model.predict(X_test)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9748878923766816
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       0.99      0.82      0.90       149

    accuracy                           0.97      1115
   macro avg       0.98      0.91      0.94      1115
weighted avg       0.98      0.97      0.97      1115

Confusion Matrix:
 [[965   1]
 [ 27 122]]


In [34]:
test_msg = ["Congratulations! You've won a $500 gift card. Call now!",
            "Hi, are we meeting tomorrow?"]

test_vec = vectorizer.transform(test_msg)
pred = model.predict(test_vec)

for msg, label in zip(test_msg, pred):
    print(msg, "->", "Spam" if label==1 else "Ham")


Congratulations! You've won a $500 gift card. Call now! -> Spam
Hi, are we meeting tomorrow? -> Ham


In [38]:
import joblib

# Save trained model
joblib.dump(model, "spam_model.pkl")

# Save fitted vectorizer
joblib.dump(vectorizer, "vectorizer.pkl")

!ls -lh



total 1.7M
drwxr-xr-x 1 root root 4.0K Aug 15 13:35  sample_data
-rw-r--r-- 1 root root 492K Aug 19 07:20 'spam (1).csv'
-rw-r--r-- 1 root root 492K Aug 19 07:43 'spam (2).csv'
-rw-r--r-- 1 root root 492K Aug 19 07:17  spam.csv
-rw-r--r-- 1 root root  95K Aug 19 07:45  spam_model.pkl
-rw-r--r-- 1 root root 105K Aug 19 07:45  vectorizer.pkl


In [39]:
from google.colab import files
files.download("spam_model.pkl")
files.download("vectorizer.pkl")




<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>