In [1]:
import pandas as pd
import numpy as np
df = pd.read_csv('SMSSpamCollection', sep='\t',  names=["label", "message"])
df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [2]:
df['message'][4]

"Nah I don't think he goes to usf, he lives around here though"

In [3]:
#now first make 'label' column as label encoding
df['category'] = df['label'].apply(lambda x: 1 if x=='spam' else 0)
df.head()

Unnamed: 0,label,message,category
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [4]:
df.drop('label', axis=1, inplace=True)

In [5]:
df['category'].value_counts()

0    4825
1     747
Name: category, dtype: int64

In [6]:
# first split data
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df.message, df.category, test_size=0.2, random_state=0)

In [7]:
print(X_train.shape)
print(X_test.shape)

(4457,)
(1115,)


In [8]:
#Now apply BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
transformed = cv.fit_transform(X_train)

In [9]:
transformed.toarray()[0]

array([0, 0, 0, ..., 0, 0, 0], dtype=int64)

In [10]:
cv.get_feature_names_out()

array(['00', '000', '000pes', ..., 'èn', 'ú1', '〨ud'], dtype=object)

In [11]:
X_train_np = transformed.toarray()
X_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
np.where(X_train_np[0] != 0)

(array([ 961, 2948, 3201, 3534, 3620, 3666, 3774, 3786, 4089, 4628, 4814,
        4937, 6866], dtype=int64),)

In [13]:
X_train_np[0][961]

1

In [14]:
from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB()
mnb.fit(X_train_np, y_train)

In [15]:
# i need to apply count vectorizer first to predict on X_test for that
X_test_cv = cv.transform(X_test)

In [16]:
y_pred = mnb.predict(X_test_cv)

In [17]:
from sklearn.metrics import accuracy_score, classification_report
print(classification_report(y_test, y_pred))
print("total accuracy is: ",accuracy_score(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      1.00      0.99       955
           1       0.98      0.93      0.96       160

    accuracy                           0.99      1115
   macro avg       0.98      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

total accuracy is:  0.9874439461883409


In [18]:
email = ['Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&Cs apply 08452810075over18s',
        'Nah I dont think he goes to usf, he lives around here though']
email_pred = cv.transform(email)
mnb.predict(email_pred)

array([1, 0], dtype=int64)

In [19]:
from sklearn.ensemble import RandomForestClassifier

In [20]:
# also we can use sklearn sklearn pipeline to do all above steps in sort
from sklearn.pipeline import Pipeline
clf = Pipeline([
    ('vectorizer',CountVectorizer()),
    ('rfc', RandomForestClassifier(criterion='entropy', n_estimators=50))
])
clf.fit(X_train, y_train)

In [21]:
y_preds = clf.predict(X_test)

In [22]:
print(classification_report(y_preds, y_test))

              precision    recall  f1-score   support

           0       1.00      0.97      0.99       982
           1       0.83      1.00      0.91       133

    accuracy                           0.98      1115
   macro avg       0.92      0.99      0.95      1115
weighted avg       0.98      0.98      0.98      1115



In [24]:
import pickle
with open('spam-ham.pkl', 'wb') as f:
    pickle.dump(mnb, f)

In [25]:
import pickle
with open('cv.pkl', 'wb') as f:
    pickle.dump(cv, f)