<h2 style='color:purple' align='center'>Naive Bayes Tutorial Part 1: Predicting survival in the Titanic disaster</h2>

# Gaussian Naive Bayes

In [147]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.feature_selection import RFE

from sklearn.preprocessing import StandardScaler

from sklearn.decomposition import PCA

from sklearn.model_selection import train_test_split, cross_val_score

from sklearn import datasets


# Supervised ML Algo’s
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn import svm
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
	
from sklearn.metrics import (
mean_absolute_error, 
median_absolute_error, 
mean_squared_error,
mean_squared_log_error,
r2_score,
explained_variance_score,
max_error,

confusion_matrix, 
classification_report, 
accuracy_score,  
precision_score, 
recall_score, 
f1_score, 
auc, 
log_loss, 
roc_auc_score, 
roc_curve, 
precision_recall_curve, 
plot_precision_recall_curve,
silhouette_score
)

# mlxtend is another library which comes handy sometimes. 

# UnSupervised ML Algo’s

from sklearn.cluster import KMeans
import scipy.cluster.hierarchy as sch
from sklearn.cluster import AgglomerativeClustering

# Ensemble ML Algo’s
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
import xgboost as xgb

# Load the dataset

In [148]:
# Read the Titanic passenger data from titanic.csv (path is relative to the working directory).
df = pd.read_csv("titanic.csv")
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


In [149]:
# Discard the columns this tutorial does not use.
# The frame is rebound instead of mutated with inplace=True, and errors='ignore'
# lets the cell be re-run without a KeyError once the columns are already gone.
df = df.drop(
    columns=['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked'],
    errors='ignore',
)
df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


In [150]:
# Separate the label column (Survived) from the feature columns.
target = df['Survived']
inputs = df.drop(columns=['Survived'])

In [151]:
inputs.head()

Unnamed: 0,Pclass,Sex,Age,Fare
0,3,male,22.0,7.25
1,1,female,38.0,71.2833
2,3,female,26.0,7.925
3,1,female,35.0,53.1
4,3,male,35.0,8.05


In [152]:
target.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

# Feature Engineering:

In [153]:
# One-hot encode the categorical Sex column: one indicator column per category.
# An alternative is an ordinal mapping:
#   inputs.Sex = inputs.Sex.map({'male': 1, 'female': 2})

dummies = pd.get_dummies(inputs['Sex'])
dummies.head(3)

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0


In [154]:
# Append the indicator columns to the feature frame, side by side.
inputs = pd.concat([inputs, dummies], axis=1)
inputs.head(3)

Unnamed: 0,Pclass,Sex,Age,Fare,female,male
0,3,male,22.0,7.25,0,1
1,1,female,38.0,71.2833,1,0
2,3,female,26.0,7.925,1,0


**The `male` column is dropped as well to avoid the dummy variable trap: a single column is enough to represent male vs. female.**

In [155]:
# 'Sex' is now represented by the indicator columns, and 'male' is redundant given 'female'.
# Rebinding (instead of inplace=True) with errors='ignore' keeps this cell safe to re-run.
inputs = inputs.drop(columns=['Sex', 'male'], errors='ignore')
inputs.head(3)

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1


In [156]:
# List the feature columns that contain at least one missing value.

inputs.columns[inputs.isnull().any(axis=0)]

Index(['Age'], dtype='object')

In [157]:
inputs.Age[:10]

0    22.0
1    38.0
2    26.0
3    35.0
4    35.0
5     NaN
6    54.0
7     2.0
8    27.0
9    14.0
Name: Age, dtype: float64

In [158]:
# Fill missing ages with the column mean.
# NOTE(review): the mean is taken over all rows, before the train/test split below,
# so rows that end up in the test set contribute to the imputed value.
inputs['Age'] = inputs['Age'].fillna(inputs['Age'].mean())
inputs.head()

Unnamed: 0,Pclass,Age,Fare,female
0,3,22.0,7.25,0
1,1,38.0,71.2833,1
2,3,26.0,7.925,1
3,1,35.0,53.1,1
4,3,35.0,8.05,0


# Model Build:

In [159]:
# Hold out 30% of the rows for testing. A fixed random_state pins the shuffle so the
# split, and every metric computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(
    inputs, target, test_size=0.3, random_state=42
)

In [160]:
model = GaussianNB()

In [161]:
model.fit(X_train,y_train)

GaussianNB()

In [164]:
y_predict = model.predict(X_test)

In [165]:
# 10-fold cross-validation of a fresh GaussianNB, using the training split only.
print(
    " Cross Validation Score: \n\n",
    cross_val_score(estimator=GaussianNB(), X=X_train, y=y_train, cv=10),
)

 Cross Validation Score: 

 [0.76190476 0.84126984 0.6984127  0.79032258 0.74193548 0.72580645
 0.79032258 0.85483871 0.74193548 0.77419355]


**Calculate the Metrics**

In [166]:
def classification_metrics_udf(y_test, y_predict, y_score=None):
    """Print a set of classification metrics for one set of predictions.

    Prints, in order: confusion matrix, classification report, accuracy,
    misclassification rate (1 - accuracy), recall, precision, F1 and ROC AUC.
    Recall, precision and F1 use scikit-learn's default settings.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predict : array-like
        Predicted class labels.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When given, ROC AUC is computed from
        these scores. When omitted, ROC AUC falls back to the hard labels in
        ``y_predict`` (the previous behaviour), which reduces the ROC curve to
        a single operating point.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("Confusion Matrix :\n\n", confusion_matrix(y_test, y_predict))

    print("\n\n Classification Report: \n\n", classification_report(y_test, y_predict))

    # Compute accuracy once and reuse it for the misclassification rate.
    accuracy = accuracy_score(y_test, y_predict)

    print("\n\n Accuracy Score: \n\n", accuracy)

    print("\n\n Missclassfication Rate: \n\n", 1 - accuracy)

    print("\n\n Recall Score: \n\n", recall_score(y_test, y_predict))

    print("\n\n Precision Score: \n\n", precision_score(y_test, y_predict))

    print("\n\n f1 Score: \n\n", f1_score(y_test, y_predict))

    # Prefer continuous scores for ROC AUC when the caller supplies them.
    auc_input = y_predict if y_score is None else y_score
    print("\n\n roc_auc_score:\n\n", roc_auc_score(y_test, auc_input))


In [167]:
classification_metrics_udf(y_test, y_predict)

Confusion Matrix :

 [[122  38]
 [ 18  90]]


 Classification Report: 

               precision    recall  f1-score   support

           0       0.87      0.76      0.81       160
           1       0.70      0.83      0.76       108

    accuracy                           0.79       268
   macro avg       0.79      0.80      0.79       268
weighted avg       0.80      0.79      0.79       268



 Accuracy Score: 

 0.7910447761194029


 Missclassfication Rate: 

 0.20895522388059706


 Recall Score: 

 0.8333333333333334


 Precision Score: 

 0.703125


 f1 Score: 

 0.7627118644067796


 roc_auc_score:

 0.7979166666666667


<h2 style='color:purple' align='center'>Naive Bayes Tutorial Part 2: Predicting Spam Email </h2>

# Multinomial Naive Bayes

In [168]:
# Read the spam dataset from spam.csv (path is relative to the working directory).
# Note: this rebinds `df`, replacing the Titanic frame used in Part 1.
df = pd.read_csv("spam.csv")
df.head()

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [169]:
df.groupby('Category').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Category,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,641,Please call our customer service representativ...,4


In [170]:
# Encode the Category label as a binary target: 1 for 'spam', 0 for anything else.
# A vectorised comparison replaces the per-row Python lambda.

df['spam'] = df['Category'].eq('spam').astype('int64')
df.head()

Unnamed: 0,Category,Message,spam
0,ham,"Go until jurong point, crazy.. Available only ...",0
1,ham,Ok lar... Joking wif u oni...,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,ham,U dun say so early hor... U c already then say...,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",0


In [171]:
# Features are the raw message text; the target is the 0/1 spam flag.
X, y = df['Message'], df['spam']

In [172]:
# Default split size (25% held out). A fixed random_state pins the shuffle so the
# split, and every metric computed from it, is reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [173]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4179,), (1393,), (4179,), (1393,))

In [174]:
from sklearn.feature_extraction.text import CountVectorizer

v = CountVectorizer()

# CountVectorizer creates one column (feature) for each distinct word that appears across the messages.
# Each row then holds how many times that word occurs in the corresponding message.
# See the video below for further explanation:
# https://www.youtube.com/watch?v=nHIUYwN-5rM

In [175]:
X_train.values

# These are the raw training messages, before vectorization.

array(['We left already we at orchard now.',
       'Take us out shopping and Mark will distract Isaiah.=D',
       'ELLO BABE U OK?', ...,
       "An excellent thought by a misundrstud frnd: I knw u hate me bt the day wen u'll knw the truth u'll hate urself:-( Gn:-)",
       'HOT LIVE FANTASIES call now 08707509020 Just 20p per min NTT Ltd, PO Box 1327 Croydon CR9 5WB 0870 is a national rate call',
       'Since when, which side, any fever, any vomitin.'], dtype=object)

In [176]:
# Learn the vocabulary from the training messages and convert them to word counts in one step.
X_train_count = v.fit_transform(X_train)

# The test messages are only transformed: the vectorizer must not be fitted on test data,
# so they are mapped onto the vocabulary learned from the training set.
X_test_count = v.transform(X_test)

X_train_count

<4179x7346 sparse matrix of type '<class 'numpy.int64'>'
	with 55512 stored elements in Compressed Sparse Row format>

In [177]:
X_train_count.toarray()[:2]

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [178]:
from sklearn.naive_bayes import MultinomialNB

# Note: this rebinds `model`, replacing the GaussianNB instance from Part 1.
model = MultinomialNB()

model.fit(X_train_count,y_train) # note: fit on the vectorized counts (X_train_count), not the raw text

MultinomialNB()

In [179]:
y_predict = model.predict(X_test_count)

In [180]:
model.score(X_test_count,y_test)

0.9820531227566404

In [181]:
# 10-fold cross-validation of a fresh MultinomialNB on the vectorized training split.
print(
    "Cross Validation Score: \n\n",
    cross_val_score(estimator=MultinomialNB(), X=X_train_count, y=y_train, cv=10),
)

Cross Validation Score: 

 [0.98086124 0.98325359 0.99282297 0.97607656 0.98086124 0.98803828
 0.97129187 0.97368421 0.98086124 0.98081535]


In [182]:
# NOTE(review): a function with this same name is already defined in Part 1 of this
# notebook; this later definition shadows it. Consider keeping a single definition.
def classification_metrics_udf(y_test, y_predict, y_score=None):
    """Print a set of classification metrics for one set of predictions.

    Prints, in order: confusion matrix, classification report, accuracy,
    misclassification rate (1 - accuracy), recall, precision, F1 and ROC AUC.
    Recall, precision and F1 use scikit-learn's default settings.

    Parameters
    ----------
    y_test : array-like
        True labels.
    y_predict : array-like
        Predicted class labels.
    y_score : array-like, optional
        Continuous scores for the positive class, e.g.
        ``model.predict_proba(X)[:, 1]``. When given, ROC AUC is computed from
        these scores. When omitted, ROC AUC falls back to the hard labels in
        ``y_predict`` (the previous behaviour), which reduces the ROC curve to
        a single operating point.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    print("Confusion Matrix :\n\n", confusion_matrix(y_test, y_predict))

    print("\n\n Classification Report: \n\n", classification_report(y_test, y_predict))

    # Compute accuracy once and reuse it for the misclassification rate.
    accuracy = accuracy_score(y_test, y_predict)

    print("\n\n Accuracy Score: \n\n", accuracy)

    print("\n\n Missclassfication Rate: \n\n", 1 - accuracy)

    print("\n\n Recall Score: \n\n", recall_score(y_test, y_predict))

    print("\n\n Precision Score: \n\n", precision_score(y_test, y_predict))

    print("\n\n f1 Score: \n\n", f1_score(y_test, y_predict))

    # Prefer continuous scores for ROC AUC when the caller supplies them.
    auc_input = y_predict if y_score is None else y_score
    print("\n\n roc_auc_score:\n\n", roc_auc_score(y_test, auc_input))


In [183]:
classification_metrics_udf(y_test, y_predict)

Confusion Matrix :

 [[1203    6]
 [  19  165]]


 Classification Report: 

               precision    recall  f1-score   support

           0       0.98      1.00      0.99      1209
           1       0.96      0.90      0.93       184

    accuracy                           0.98      1393
   macro avg       0.97      0.95      0.96      1393
weighted avg       0.98      0.98      0.98      1393



 Accuracy Score: 

 0.9820531227566404


 Missclassfication Rate: 

 0.017946877243359638


 Recall Score: 

 0.8967391304347826


 Precision Score: 

 0.9649122807017544


 f1 Score: 

 0.9295774647887324


 roc_auc_score:

 0.9458881756392274


In [184]:
# Let's create some new messages and test the model on them.

emails = [
    'Hey mohan, can we get together to watch footbal game tomorrow?',
    'Upto 20% discount on parking, exclusive offer just for you. Dont miss this reward!'
]

emails_count = v.transform(emails)

model.predict(emails_count)

# The first email is classified as not spam (0); the second is detected as spam (1).

array([0, 1])

**Sklearn Pipeline**

In [185]:
# The word vectorization performed explicitly above can instead be bundled with the
# classifier as steps of a scikit-learn Pipeline, as below.

from sklearn.pipeline import Pipeline

model = Pipeline(steps=[
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB()),
])

In [186]:
model.fit(X_train, y_train) 

# The raw X_train is passed here rather than X_train_count: the vectorizer is a step of
# the pipeline, so there is no need to vectorize explicitly.

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [187]:
model.score(X_test,y_test)

0.9820531227566404

In [188]:
model.predict(emails) # the pipeline takes the raw 'emails' list directly, not emails_count

array([0, 1])

In [189]:
# TODO: add an example for the Bernoulli Naive Bayes method.