In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [3]:
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
df["spam"] = df["Category"].apply(lambda x: 1 if x=="spam" else 0)

In [6]:
df

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,1
5568,ham,Will ü b going to esplanade fr home?,0
5569,ham,"Pity, * was in mood for that. So...any other s...",0
5570,ham,The guy did some bitching but I acted like i'd...,0


In [7]:
df.shape


(5572, 3)

### Train test split


In [8]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [9]:
X_train.shape


(4457,)

In [10]:
X_test.shape


(1115,)

In [11]:
type(X_train)


pandas.core.series.Series

In [12]:
X_train[:4]


520     Usually the person is unconscious that's in ch...
1851    Then cant get da laptop? My matric card wif ü ...
1018               Shall i send that exe to your mail id.
2711    Hope you enjoyed your new content. text stop t...
Name: Message, dtype: object

### Create bag of words representation using CountVectorizer


In [13]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

X_train_cv = v.fit_transform(X_train.values)
X_train_cv

<4457x7787 sparse matrix of type '<class 'numpy.int64'>'
	with 59488 stored elements in Compressed Sparse Row format>

In [14]:
X_train_cv.toarray()[:2][0]


array([0, 0, 0, ..., 0, 0, 0])

In [15]:
X_train_cv.shape


(4457, 7787)

In [16]:
v.get_feature_names_out()[1771]


'cheaper'

In [17]:
v.vocabulary_


{'usually': 7270,
 'the': 6841,
 'person': 5182,
 'is': 3769,
 'unconscious': 7178,
 'that': 6837,
 'in': 3660,
 'children': 1803,
 'but': 1587,
 'adults': 838,
 'they': 6866,
 'may': 4408,
 'just': 3896,
 'behave': 1313,
 'abnormally': 766,
 'll': 4180,
 'call': 1621,
 'you': 7747,
 'now': 4860,
 'then': 6853,
 'cant': 1657,
 'get': 3147,
 'da': 2167,
 'laptop': 4037,
 'my': 4677,
 'matric': 4396,
 'card': 1666,
 'wif': 7555,
 'lei': 4092,
 'shall': 6080,
 'send': 6029,
 'exe': 2726,
 'to': 6962,
 'your': 7752,
 'mail': 4330,
 'id': 3607,
 'hope': 3513,
 'enjoyed': 2626,
 'new': 4772,
 'content': 2004,
 'text': 6812,
 'stop': 6518,
 '61610': 586,
 'unsubscribe': 7219,
 'help': 3421,
 '08712400602450p': 108,
 'provided': 5484,
 'by': 1604,
 'tones2you': 6995,
 'co': 1891,
 'uk': 7159,
 'ha': 3313,
 'nan': 4704,
 'yalrigu': 7710,
 'heltini': 3428,
 'iyo': 3797,
 'kothi': 3995,
 'chikku': 1799,
 'shared': 6088,
 'many': 4363,
 'things': 6872,
 'wit': 7595,
 'me': 4417,
 'so': 6298,
 'far

In [18]:
X_train_np = X_train_cv.toarray()
X_train_np[0]

array([0, 0, 0, ..., 0, 0, 0])

In [19]:
np.where(X_train_np[0]!=0)


(array([ 766,  838, 1313, 1587, 1621, 1803, 3660, 3769, 3896, 4180, 4408,
        4860, 5182, 6837, 6841, 6866, 7178, 7270, 7747]),)

### Train the naive bayes model


In [24]:
from sklearn.naive_bayes import MultinomialNB

model = MultinomialNB()
model.fit(X_train_cv, y_train)

In [25]:
X_test_cv = v.transform(X_test)

### Evaluate Performance

In [26]:
from sklearn.metrics import classification_report

y_pred = model.predict(X_test_cv)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       968
           1       0.95      0.93      0.94       147

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [27]:
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

### Train the model using sklearn pipeline and reduce number of lines of code


In [28]:
from sklearn.pipeline import Pipeline

clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [29]:

clf.fit(X_train, y_train)

In [30]:

y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       968
           1       0.95      0.93      0.94       147

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

