In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from tqdm import tqdm

In [2]:
# Load the SMS dataset (v1 = class label, v2 = message text) and drop exact
# duplicate rows. Chaining replaces inplace=True so nothing is mutated in place.
data = pd.read_csv('spam.csv', encoding='latin-1').drop_duplicates()

# The previous identity .map({'ham': 'ham', 'spam': 'spam'}) was a no-op for
# valid labels and silently turned any other value into NaN. Validate
# explicitly so bad labels fail here instead of surfacing later during the
# split or the model fit.
unexpected_labels = set(data['v1'].unique()) - {'ham', 'spam'}
if unexpected_labels:
    raise ValueError(f"Unexpected values in label column 'v1': {unexpected_labels}")
data['label'] = data['v1']

X = data['v2']     # raw message text fed to the vectorizer
y = data['label']  # class label: 'ham' or 'spam'

In [3]:
# Hold out 20% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit the TF-IDF vectorizer on the training messages only and produce the
# training feature matrix in the same step.
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

In [4]:
# Train a multinomial Naive Bayes model on the TF-IDF training matrix
# (fit() returns the estimator itself, so it can be bound in one step).
classifier = MultinomialNB().fit(X_train_tfidf, y_train)

# Transform the held-out messages with the already-fitted vectorizer, then
# predict a label for each of them.
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = classifier.predict(X_test_tfidf)
print(y_pred)

['ham' 'ham' 'ham' ... 'spam' 'ham' 'ham']


In [5]:
# Overall fraction of test messages classified correctly.
accuracy = accuracy_score(y_test, y_pred)

# Pin the label order explicitly: target_names are matched to labels by
# position, so relying on the implicit sorted order could silently swap the
# display names if the label values ever change.
report = classification_report(
    y_test,
    y_pred,
    labels=['ham', 'spam'],
    target_names=['Legitimate SMS', 'Spam SMS'],
)

# Purely cosmetic progress bar (it tracks no real work). The context manager
# closes it in this cell; `progress_bar` stays bound afterwards and tqdm's
# close() returns immediately on an already-closed bar, so a later
# progress_bar.close() remains harmless.
with tqdm(total=100, position=0, leave=True) as progress_bar:
    for i in range(10, 101, 10):
        progress_bar.update(10)
        progress_bar.set_description(f'Progress: {i}%')

Progress: 100%: 100%|██████████| 100/100 [00:00<00:00, 15924.91it/s]

In [6]:
# Close the tqdm bar opened in the metrics cell; by this point it has already
# been advanced to its total of 100.
progress_bar.close()

Progress: 100%: 100%|██████████| 100/100 [00:10<00:00,  9.38it/s]   


In [7]:
# Emit the accuracy (two decimals), a heading, and the per-class report,
# one per line, in a single print call.
print(
    f'Accuracy: {accuracy:.2f}',
    'Classification Report:',
    report,
    sep='\n',
)

Accuracy: 0.96
Classification Report:
                precision    recall  f1-score   support

Legitimate SMS       0.95      1.00      0.97       889
      Spam SMS       1.00      0.68      0.81       145

      accuracy                           0.96      1034
     macro avg       0.98      0.84      0.89      1034
  weighted avg       0.96      0.96      0.95      1034

