In [70]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

In [71]:
df = pd.read_csv('spam.csv')
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [72]:
df.Category.value_counts()

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,4825
spam,747


In [73]:
df.Category.value_counts()/len(df)*100

Unnamed: 0_level_0,count
Category,Unnamed: 1_level_1
ham,86.593683
spam,13.406317


In [74]:
df['spam'] = df['Category'].apply(lambda x: 1 if x == 'spam' else 0)

In [75]:
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [76]:
new_df = pd.read_csv('spam.csv')
new_df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [77]:
new_df['Category'] = new_df['Category'].replace({'ham': 0, 'spam': 1})

  new_df['Category'] = new_df['Category'].replace({'ham': 0, 'spam': 1})


In [78]:
new_df.head()

Unnamed: 0,Category,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [79]:
df.shape

(5572, 3)

In [80]:
from numpy.random import test
x_train, x_test, y_train, y_test = train_test_split(df.Message, df.spam, test_size=0.2)

In [81]:
len(x_train)

4457

In [82]:
x_train[:4]

Unnamed: 0,Message
4822,Im good! I have been thinking about you...
1598,URGENT! Your Mobile number has been awarded wi...
588,Pete can you please ring meive hardly gotany c...
3424,Had your mobile 10 mths? Update to latest Oran...


In [83]:
y_train[:4]

Unnamed: 0,spam
4822,0
1598,1
588,0
3424,1


#### Create bag of words representation using CountVectoriser

In [84]:
vectorizer = CountVectorizer()
x_train_count_vectorizer = vectorizer.fit_transform(x_train.values)
x_test_count_vectorizer = vectorizer.transform(x_test.values)

In [85]:
x_train_count_vectorizer.shape

(4457, 7833)

In [86]:
vectorizer.vocabulary_

{'im': 3663,
 'good': 3230,
 'have': 3400,
 'been': 1329,
 'thinking': 6947,
 'about': 783,
 'you': 7793,
 'urgent': 7301,
 'your': 7798,
 'mobile': 4604,
 'number': 4919,
 'has': 3389,
 'awarded': 1204,
 'with': 7630,
 '2000': 364,
 'prize': 5492,
 'guaranteed': 3311,
 'call': 1648,
 '09061790121': 211,
 'from': 3059,
 'land': 4059,
 'line': 4187,
 'claim': 1865,
 '3030': 452,
 'valid': 7340,
 '12hrs': 305,
 'only': 5009,
 '150ppm': 326,
 'pete': 5234,
 'can': 1671,
 'please': 5310,
 'ring': 5870,
 'meive': 4495,
 'hardly': 3384,
 'gotany': 3246,
 'credit': 2115,
 'had': 3339,
 '10': 272,
 'mths': 4689,
 'update': 7287,
 'to': 7039,
 'latest': 4086,
 'orange': 5043,
 'camera': 1668,
 'video': 7378,
 'phones': 5251,
 'for': 2979,
 'free': 3023,
 'save': 6004,
 'texts': 6895,
 'weekend': 7532,
 'calls': 1662,
 'text': 6886,
 'yes': 7777,
 'callback': 1651,
 'orno': 5057,
 'opt': 5033,
 'out': 5074,
 'if': 3648,
 'we': 7510,
 'one': 5003,
 'partnership': 5157,
 'going': 3217,
 'will': 76

In [87]:
x_train_np = x_train_count_vectorizer.toarray()
x_train_np

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [88]:
np.where(x_train_np[0] != 0)

(array([ 783, 1329, 3230, 3400, 3663, 6947, 7793]),)

### Naive Bayes Classifier

In [89]:
model = MultinomialNB()
model.fit(x_train_count_vectorizer, y_train)

In [90]:
y_pred = model.predict(x_test_count_vectorizer)

In [92]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       990
           1       0.95      0.93      0.94       125

    accuracy                           0.99      1115
   macro avg       0.97      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115



In [96]:
# Testing on a random datapoint
message = {"Upto 20% off on parking, exclusing offer just for you"}
message_cnt = vectorizer.transform(message)
model.predict(message_cnt)

array([0])