In [87]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from mpl_toolkits.mplot3d import Axes3D


### Naive Bayes

- The conditional probability of Bayes Theorem, where you have a prior probability that you condition on to obtain a posterior probability, is the foundaiton of Naive Bayes as a classfication method in supervised machine learning.  Like in Bayes Theroem, there is an assumption of independence among the predictors given the prior.  The power of Naive Bayes is the ability to handle multidimensional data (datasets with a large number of predictors).  
- In practice the assumption of independence is loosely held, as the model works well for many large datasets without the explicit condition of independence.  

### Spam Data
- This data set is composed of SMS texts labeled as spam or ham given the text keyword content on the messages.  
- We have categorical response variable that takes on two possible responses (spam or ham), predicted from our vector of features $(X = {x_0,x_1...x_p})$.
- Our prediction function is a classification function that uses Naive Bayes.
- We want to minimize loss, $Loss(y,yhat)$ for our classification function, which in the case of Naive Bayes tyring to obtain the smallest probability of misclassification

In [88]:
# Read in training data

trainB = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainB.csv")
trainyB = pd.read_csv("https://bitbucket.org/remcc/rob-data-sets/downloads/smsTrainyB.csv")

In [89]:
# checking dimensions of datasets
print(trainB.shape)
print(trainyB.shape)

(4169, 1139)
(4169, 1)


In [90]:
# preview datasets
trainB.head()

Unnamed: 0,£wk,€˜m,€˜s,abiola,abl,abt,accept,access,account,across,...,yahoo,yar,yeah,year,yep,yes,yesterday,yet,yoga,yup
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [91]:
trainyB.head()

Unnamed: 0,smsTrainyB
0,0
1,0
2,0
3,1
4,1


In [92]:
#subset training data by spam or not spam, further subset by index into spam or ham given adult/age columns (indicated by S or H)
iisspam = trainyB.loc[trainyB['smsTrainyB'] == 1]
isnspam = trainyB.loc[trainyB['smsTrainyB'] == 0]
ageS = trainB.loc[iisspam.index, :]
ageS = ageS['age']
ageH = trainB.loc[isnspam.index, :]
ageH = ageH['age']
adultS = trainB.loc[iisspam.index, :]
adultS = adultS['adult']
adultH = trainB.loc[isnspam.index, :]
adultH = adultH['adult']

In [93]:
# Find the proportion of spam and ham in the train dataset
propS = len(iisspam)/len(trainyB)
propH = len(isnspam)/len(trainyB)
print(f"The proportion of Ham is: {propH}")
print(f"The proportion of Spam is: {propS}")

The proportion of Ham is: 0.8647157591748621
The proportion of Spam is: 0.13528424082513793


In [94]:
# Display Joint Frequencies of age and adult for the ham observations
age_adult_ham = pd.crosstab(index=ageH, columns=adultH, margins=True)
age_adult_ham

adult,0,1,All
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,3598,2,3600
1,5,0,5
All,3603,2,3605


In [95]:
# Display Joint Frequencies of age and adult for the spam observations (note: you can specify col and row labels in args for crosstab)
age_adult_spam = pd.crosstab(index=ageS, columns=adultS, margins=True)
age_adult_spam

adult,0,1,All
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,549,3,552
1,12,0,12
All,561,3,564


- Make Conditional Probability tables, pretty granular

In [96]:
# Joint Probability table for spam
pd.crosstab(ageS, adultS, normalize='all',margins=True)

adult,0,1,All
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.973404,0.005319,0.978723
1,0.021277,0.0,0.021277
All,0.994681,0.005319,1.0


In [97]:
# Joint probability table for ham
pd.crosstab(ageH,adultH, normalize = 'all', margins=True)

adult,0,1,All
age,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0,0.998058,0.000555,0.998613
1,0.001387,0.0,0.001387
All,0.999445,0.000555,1.0


In [98]:
# P(ages=yes|ham) from Joint Probability table above for ham
P = .001387

In [99]:
# P(ham|adult=no,age=yes)
P = (0.999445*0.001387*propH)/(.001388*0.999445*propH+0.994681*0.021277*propS)
P

0.2950494894173434

In [100]:
# P(ham|adult=no,age=yes) not independent, then P(A|B) = P(A)
P = .001387


In [101]:
# P(ham|adult=yes,age=yes) without Bayesian assumption
P = 0