# Naive Bayes
<br>
The method is called *naive* Bayes because it makes the naive assumption that all features are conditionally independent of each other given the class label.

In [6]:
import pandas as pd

# Read the spam dataset with ISO-8859-1 encoding and discard the three
# 'Unnamed: N' columns in the same expression, so `df` is built in one
# non-mutating step and the cell can be re-run safely.
df = (
    pd.read_csv('spam.csv', encoding='ISO-8859-1')
    .drop(columns=['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'])
)
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Summary statistics per label: group the rows by Category and describe
# the remaining columns within each group.
df.groupby(by='Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [8]:
# Encode the Category column as a numeric target: 1 for 'spam', 0 for
# anything else. (The Message text is vectorized separately, later on.)
df['spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [9]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the messages for evaluation.
# random_state pins the shuffle so the split -- and therefore every score
# computed from it -- is the same on each Restart & Run All instead of
# changing from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    df.Message, df.spam, test_size=0.2, random_state=42
)

In [11]:
# CountVectorizer converts the Message text to a numeric representation:
# it learns a vocabulary from the training messages and produces, for each
# message, the count of every vocabulary token it contains.
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
x_train_count = cv.fit_transform(x_train.values)
# Preview the first three rows. Slice the sparse matrix first and densify
# only those rows; calling .toarray() on the full matrix would allocate a
# dense (n_messages x vocabulary_size) array just to display three of them.
x_train_count[:3].toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [12]:
from sklearn.naive_bayes import MultinomialNB

# Train a multinomial Naive Bayes classifier on the training count matrix
# and the matching 0/1 spam labels.
model = MultinomialNB()
model.fit(X=x_train_count, y=y_train)

MultinomialNB()

In [14]:
# Two hand-written messages for a quick spot check of the trained model.
emails = [
    "Hey Logan, let's get together for lunch tomorrow?",
    "Save 40% discount this week on parking, exclusive offer just for you. Hurry don't miss out on this reward!",
]
# Use transform (not fit_transform) on the already-fitted vectorizer so the
# new messages are encoded against the vocabulary the model was trained on.
emails_count = cv.transform(raw_documents=emails)
model.predict(X=emails_count)

array([0, 1], dtype=int64)

In [15]:
# Evaluate on the held-out messages: encode them with the fitted
# vectorizer, then score the classifier (mean accuracy) against the labels.
x_test_count = cv.transform(raw_documents=x_test)
model.score(X=x_test_count, y=y_test)

0.9838565022421525

In [16]:
# The vectorize-then-classify steps above can be bundled into a single
# Pipeline, which applies the CountVectorizer transform automatically
# before handing the counts to MultinomialNB.
from sklearn.pipeline import Pipeline
clf = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('mnb', MultinomialNB()),
])

In [17]:
# The pipeline is trained on the raw message text; its vectorizer step
# handles the conversion to counts internally.
clf.fit(X=x_train, y=y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('mnb', MultinomialNB())])

In [18]:
# Score the pipeline on the raw held-out messages; no separate transform
# call is needed.
clf.score(X=x_test, y=y_test)

0.9838565022421525

In [19]:
# Classify the sample emails directly from their text via the pipeline.
clf.predict(X=emails)

array([0, 1], dtype=int64)