In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the labelled message dataset; the path is relative to the notebook's
# working directory, so the kernel must be started next to the `datasets/` folder.
df = pd.read_csv("datasets/spam.csv")

In [3]:
# Preview the first rows to confirm the Category (label) and Message (text) columns.
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Class balance check: the recorded output shows ham (4825) heavily outnumbering
# spam (747), so per-class precision/recall matter more than raw accuracy.
df.Category.value_counts()

ham     4825
spam     747
Name: Category, dtype: int64

In [5]:
# (rows, columns) of the raw frame.
df.shape

(5572, 2)

In [10]:
# Count missing labels: .isna() flags each row and .sum() totals the True flags.
# (.count() on the boolean mask only returns the number of rows, never the
# number of NaNs, so it cannot detect missing values.)
df['Category'].isna().sum()

0

In [11]:
def get_spam_number(x):
    """Encode a category label as a binary spam flag.

    Returns 1 when ``x`` equals the string 'spam' and 0 for any other value.
    """
    return 1 if x == 'spam' else 0

In [12]:
# Derive the numeric target column: each Category label is passed through
# get_spam_number, giving 1 for 'spam' and 0 otherwise.
df['spam'] = df['Category'].map(get_spam_number)

In [13]:
# Confirm the new `spam` column lines up with Category (spam -> 1, ham -> 0).
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [16]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the messages for evaluation. The seed is fixed so the split,
# and every metric computed from it, is reproducible across kernel restarts.
# NOTE: outputs recorded further down came from an unseeded run, so the exact
# figures may shift slightly when the notebook is re-run.
X_train, X_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.2, random_state=42
)

In [17]:
# Number of messages in the training split.
X_train.shape

(4457,)

In [18]:
# Number of messages in the held-out test split.
X_test.shape

(1115,)

In [19]:
# The split keeps the messages as a pandas Series of raw text.
type(X_train)

pandas.core.series.Series

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

In [21]:
# `.values` exposes the underlying NumPy array of strings.
type(X_train.values)

numpy.ndarray

In [23]:
# Bag-of-words encoding: learn the vocabulary from the training messages only
# (so nothing from the test split leaks in) and convert them to a count matrix.
v = CountVectorizer()
X_train_cv = v.fit_transform(X_train.values)

In [24]:
# (training messages, vocabulary size) of the count matrix.
X_train_cv.shape

(4457, 7680)

In [26]:
from sklearn.naive_bayes import MultinomialNB

# Fit a multinomial Naive Bayes classifier on the token counts, with default
# hyperparameters.
model = MultinomialNB()
model.fit(X_train_cv,y_train)

MultinomialNB()

In [27]:
# Encode the test messages with the vocabulary already fitted on the training
# data: transform only, never fit, so the columns match what the model saw.
X_test_cv = v.transform(X_test)

In [28]:
from sklearn.metrics import classification_report

In [29]:
# Predicted labels (0 = ham, 1 = spam) for the held-out messages.
y_pred = model.predict(X_test_cv)

In [30]:
# Per-class precision/recall/F1 on the test split; class 1 (spam) is the
# minority class, so its row is the one to watch.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       974
           1       0.95      0.92      0.94       141

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115



In [31]:
# Sanity check on two hand-written messages: the first reads like a personal
# note, the second like a promotional offer. The strings are fed to the model
# exactly as written (spelling included), since they are the input data.
emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

# Encode with the already-fitted vectorizer, then classify.
emails_count = v.transform(emails)
model.predict(emails_count)

array([0, 1])

In [32]:
from sklearn.pipeline import Pipeline

# Bundle vectorization and classification into a single estimator so raw text
# can be passed straight to fit/predict without a separate transform step.
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [33]:
# Fit both pipeline steps on the raw training messages.
clf.fit(X_train, y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [34]:
# Evaluate the pipeline on the same held-out messages. Note that this rebinds
# `y_pred`, replacing the predictions from the standalone model above.
y_pred = clf.predict(X_test)

print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.99      0.99      0.99       974
           1       0.95      0.92      0.94       141

    accuracy                           0.98      1115
   macro avg       0.97      0.96      0.96      1115
weighted avg       0.98      0.98      0.98      1115

