#  Naive Bayes Classifiers

In [1]:
import numpy as np
import pandas as pd
from sklearn.naive_bayes import GaussianNB,BernoulliNB,MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer

## Naive Bayes
### Using Naive Bayes to predict spam

In [2]:
# spam.csv contains characters that are not valid UTF-8, so decode it as Latin-1.
data = pd.read_csv(filepath_or_buffer="spam.csv", encoding="latin-1")

In [3]:
# Row and column counts of the loaded frame.
data.shape

(5572, 2)

In [4]:
# Preview the first rows to check the column names and the label values.
data.head()

Unnamed: 0,type,email
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
from collections import Counter

# Tally the messages per label to see how balanced the two classes are.
Counter(data["type"])

Counter({'ham': 4825, 'spam': 747})

In [10]:
# Features are the raw message text; the target is the ham/spam label.
X, y = data["email"], data["type"]

In [11]:
# Peek at the first few raw messages held in X.
X.head()

0    Go until jurong point, crazy.. Available only ...
1                        Ok lar... Joking wif u oni...
2    Free entry in 2 a wkly comp to win FA Cup fina...
3    U dun say so early hor... U c already then say...
4    Nah I don't think he goes to usf, he lives aro...
Name: email, dtype: object

## Vectorization: transforming text into TF-IDF vectors

In [12]:
# Turn each message into a TF-IDF weighted bag-of-words row.
# The vectorizer is fitted on the text column rather than on X so that this
# cell can be re-run: X is rebound to the resulting matrix here, and feeding
# that matrix back into fit_transform would fail.
# NOTE(review): the vectorizer is fitted on every message before the
# train/test split, so the IDF weights also see the held-out messages.
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data["email"])

# get_feature_names() was removed in scikit-learn 1.2. Prefer its replacement
# when available and fall back so older installations keep working; list()
# keeps the result a plain list in both cases.
if hasattr(vectorizer, "get_feature_names_out"):
    feature_names = list(vectorizer.get_feature_names_out())
else:
    feature_names = vectorizer.get_feature_names()

In [18]:
# Show what kind of object the vectorizer produced (type(type(X)) would
# always evaluate to `type`, which says nothing about X).
# X is deliberately left as returned by the vectorizer instead of being
# converted with X.toarray(): BernoulliNB and train_test_split both accept
# sparse input, a dense copy stores one float per (message, vocabulary term)
# pair, and rebinding X to a NumPy array makes any later .toarray() call fail.
# Densify only if you switch to an estimator that requires dense input
# (GaussianNB does).
type(X)

In [14]:
# Hold out 30% of the messages for evaluation; the fixed seed keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=10,
)

In [23]:
#Fitting Naive Bayes algo
# Fit a Bernoulli Naive Bayes model and predict labels for the held-out messages.
# BernoulliNB is already imported in the notebook's top import cell.
# The feature matrices are passed as-is: BernoulliNB accepts both sparse and
# dense input, whereas calling .toarray() on them raises AttributeError when X
# was densified before the split (NumPy arrays have no .toarray()).
# Note that BernoulliNB binarizes its input (default threshold 0.0), so it
# uses only whether a term occurs in a message, not its TF-IDF weight.
model = BernoulliNB()
model.fit(X_train, y_train)
y_predict = model.predict(X_test)

In [24]:
# Report, one after another: overall accuracy on the held-out set, the
# actual-vs-predicted contingency table, and per-class precision/recall/F1.
print(
    accuracy_score(y_test, y_predict),
    pd.crosstab(y_test, y_predict),
    classification_report(y_test, y_predict),
    sep="\n",
)

0.97966507177
col_0   ham  spam
type             
ham    1446     0
spam     34   192
             precision    recall  f1-score   support

        ham       0.98      1.00      0.99      1446
       spam       1.00      0.85      0.92       226

avg / total       0.98      0.98      0.98      1672



## Checking a new email for spam

In [25]:
# A single hand-written message, wrapped in a Series so it can be handed to
# the fitted vectorizer in the next step.
NewEmail = pd.Series(data=["Hi , we have a meeting.."])
NewEmail


0    Hi , we have a meeting..
dtype: object

In [26]:
# Use transform (not fit_transform) so the new message is mapped onto the
# vocabulary and weights the vectorizer already learned when it was fitted.
NewEmail_transformed = vectorizer.transform(NewEmail)

In [27]:
# Predict the label for the vectorized message; the result holds one label per input row.
model.predict(NewEmail_transformed)

array(['ham'], 
      dtype='<U4')

In [28]:
from sklearn.metrics import roc_auc_score

In [33]:
# Show the labels predicted for the held-out test set (NumPy abbreviates the long array).
y_predict

array(['ham', 'ham', 'ham', ..., 'ham', 'ham', 'ham'], 
      dtype='<U4')