# Ground calculation Naive Bayes on Titanic dataset

Assumptions
* Bayes classifier assumes that the presence of a particular feature in a class is unrelated to the presence of any other feature.

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

Data=pd.read_csv("./titanic_dataset/train.csv")


data1=Data.drop(columns=["Name","Ticket","Cabin","Embarked","PassengerId"])

# Check missing values
data2=data1.dropna()

# Remap categorical values to number
remap_sex = {"Sex":{"male": 0, "female": 1}}
data3=data2.replace(remap_sex)

# shaffle data
data4=data3.sample(frac=1)

# split data to the train and test set
train=data4[:600]
test=data4[600:]

In [2]:
data4.head(2)

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare
22,1,3,1,15.0,0,0,8.0292
357,0,2,1,38.0,0,0,13.0


# Naive Bayes: One feature

$$ P(survived|sex)=\frac{P(sex|survived)*P(survived)}{P(sex)}$$

In [3]:
## P(sex|survived)
Frequency_table_sex_survived=pd.pivot_table(train,values="Pclass",index=["Survived","Sex"],aggfunc='count')
Frequency_table_sex_survived=Frequency_table_sex_survived.reset_index()
N_NotSurvived=Frequency_table_sex_survived[Frequency_table_sex_survived["Survived"]==0]["Pclass"].sum()
N_Survived=Frequency_table_sex_survived[Frequency_table_sex_survived["Survived"]==1]["Pclass"].sum()


Frequency_table_sex_survived.loc[Frequency_table_sex_survived["Survived"]==0,"Pclass"]=Frequency_table_sex_survived.loc[Frequency_table_sex_survived["Survived"]==0,"Pclass"]/N_NotSurvived
Frequency_table_sex_survived.loc[Frequency_table_sex_survived["Survived"]==1,"Pclass"]=Frequency_table_sex_survived.loc[Frequency_table_sex_survived["Survived"]==1,"Pclass"]/N_Survived
Prob_table_sex_survived=Frequency_table_sex_survived.rename(columns={"Pclass":"Prob"})

print("#########################")
print("P(sex|survived) \n",Prob_table_sex_survived)
print("#########################")

## P(survived)
Frequency_table_survived=pd.pivot_table(train,values="Pclass",index=["Survived"],aggfunc='count')
Prob_table_survived=Frequency_table_survived/Frequency_table_survived["Pclass"].sum()
Prob_table_survived=Prob_table_survived.rename(columns={"Pclass":"Prob"})
Prob_table_survived.reset_index(inplace=True)
print(" P(survived) \n",Prob_table_survived)
print("#########################")


## P(sex)
Frequency_table_sex=pd.pivot_table(train,values="Pclass",index=["Sex"],aggfunc='count')
Prob_table_sex=Frequency_table_sex/Frequency_table_sex["Pclass"].sum()
Prob_table_sex=Prob_table_sex.rename(columns={"Pclass":"Prob"})
Prob_table_sex.reset_index(inplace=True)
print(" P(sex) \n",Prob_table_sex)
print("#########################")


## Byaes rule
Prob_table_survived_sex=Prob_table_sex_survived.copy()

result=[]
for index,row in Prob_table_sex_survived.iterrows():
    sex=int(row["Sex"])
    survived=int(row["Survived"])
    
    prob=row["Prob"]*Prob_table_survived["Prob"][survived]/Prob_table_sex["Prob"][sex]
    result.append({"Sex":sex,"Survived":survived,"Prob":prob})


Results=pd.DataFrame(result)
print(Results.sort_values(by="Sex"))


#########################
P(sex|survived) 
    Survived  Sex      Prob
0         0    0  0.845070
1         0    1  0.154930
2         1    0  0.326531
3         1    1  0.673469
#########################
 P(survived) 
    Survived      Prob
0         0  0.591667
1         1  0.408333
#########################
 P(sex) 
    Sex      Prob
0    0  0.633333
1    1  0.366667
#########################
   Sex  Survived      Prob
0    0         0  0.789474
2    0         1  0.210526
1    1         0  0.250000
3    1         1  0.750000


In [4]:
predict_train=np.zeros(train.shape[0])
predict_train[train["Sex"]==1]=1
accuracy_train=(train["Survived"]==predict_train).mean()
print("train accuracy={}".format(accuracy_train))

predict_test=np.zeros(test.shape[0])
predict_test[test["Sex"]==1]=1
accuracy_test=(test["Survived"]==predict_test).mean()
print("test accuracy={}".format(accuracy_test))


train accuracy=0.775
test accuracy=0.8070175438596491


### Compare with sklearn result

In [18]:
X_train=train["Sex"].values.reshape(-1,1)
y_train=train["Survived"]

X_test=test["Sex"].values.reshape(-1,1)
y_test=test["Survived"]

In [19]:
from sklearn.naive_bayes import BernoulliNB
nb=BernoulliNB()
nb.fit(X_train, y_train)

print("train score={}".format(nb.score(X_train,y_train)))
print("test score={}".format(nb.score(X_test,y_test)))

print(nb.predict_proba(X_test[:2]))
print(X_test[:2])

train score=0.775
test score=0.8070175438596491
[[0.78837754 0.21162246]
 [0.25272639 0.74727361]]
[[0]
 [1]]


In [34]:
from sklearn.naive_bayes import MultinomialNB
nbM=MultinomialNB(fit_prior=True)
nbM.fit(X_train, y_train)

print(nbM.feature_count_)
print(nbM.feature_log_prob_)
print(nbM.n_features_in_)
print(np.exp(nbM.class_log_prior_))

print("train score={}".format(nb.score(X_train,y_train)))
print("test score={}".format(nb.score(X_test,y_test)))


print(nb.predict_proba(X_test[:2]))
print(X_test[:2])



[[ 55.]
 [165.]]
[[0.]
 [0.]]
1
[0.59166667 0.40833333]
train score=0.5916666666666667
test score=0.6052631578947368
[[0.59166667 0.40833333]
 [0.59166667 0.40833333]]
[[0]
 [1]]


In [37]:
X_train[X_train==1].sum()/X_train.shape[0]

0.36666666666666664

# Naive Bayes: Two features
Predict survival using two features Sex and Pclass <br>

$$ P(survived|sex,Pclass)=\frac{P(sex|survived)*P(Pclass|survived)*P(survived)}{P(sex)}$$

under independt features assumption
P(survived/Sex,Pclass)=P(Sex/survived)*P(Pclass/survived)*P(survived)/P(Sex,Pclass) <br>

In [96]:
print(Prob_table_sex_survived)
print(Prob_table_survived)
print(Prob_table_sex)

   Survived  Sex      Prob
0         0    0  0.844193
1         0    1  0.155807
2         1    0  0.311741
3         1    1  0.688259
   Survived      Prob
0         0  0.588333
1         1  0.411667
   Sex   Prob
0    0  0.625
1    1  0.375


In [97]:
Frequency_table_Pclass=pd.pivot_table(train,values="Age",index=["Survived","Pclass"],aggfunc="count")
Frequency_table_Pclass.rename(columns={"Age":"Prob"},inplace=True)
Frequency_table_Pclass.reset_index(inplace=True)

Frequency_table_Pclass.loc[Frequency_table_Pclass["Survived"]==0,"Prob"]=Frequency_table_Pclass.loc[Frequency_table_Pclass["Survived"]==0,"Prob"]/N_NotSurvived
Frequency_table_Pclass.loc[Frequency_table_Pclass["Survived"]==1,"Prob"]=Frequency_table_Pclass.loc[Frequency_table_Pclass["Survived"]==1,"Prob"]/N_Survived
print(Frequency_table_Pclass)

   Survived  Pclass      Prob
0         0       1  0.155807
1         0       2  0.220963
2         0       3  0.623229
3         1       1  0.412955
4         1       2  0.303644
5         1       3  0.283401


In [160]:
dict_posterio_results=[]
Survived_list=[0,1]
Sex_list=[0,1]
Pclass_list=[1,2,3]


for sex in Sex_list:
    for pclass in Pclass_list:
        for surv in Survived_list:
            prob_sex_survived=Prob_table_sex_survived.loc[(Prob_table_sex_survived["Sex"]==sex) & (Prob_table_sex_survived["Survived"]==surv),"Prob"].values[0]
            prob_surv=Prob_table_survived.loc[Prob_table_survived["Survived"]==surv,"Prob"].values[0]
            prob_sex=Prob_table_sex.loc[Prob_table_sex["Sex"]==sex,"Prob"].values[0]
            prob_pclass_surv=Frequency_table_Pclass.loc[(Frequency_table_Pclass["Survived"]==surv) & (Frequency_table_Pclass["Pclass"]==pclass),"Prob"].values[0]
            prob_result=prob_sex_survived* prob_pclass_surv*prob_surv/prob_sex
            dict_res={"Sex":sex,"Pclass":pclass,"Survived":surv,"Prob":prob_result}
            dict_posterio_results.append(dict_res)
dict_posterio_results_df=pd.DataFrame(dict_posterio_results)

In [161]:
df1=dict_posterio_results_df[dict_posterio_results_df["Survived"]==0]
df2=dict_posterio_results_df[dict_posterio_results_df["Survived"]==1]
                                                      
df3=pd.merge(df1,df2,how="left",on=["Sex","Pclass"],suffixes=('_surv_0', '_surv_1'))
df3.drop(columns=["Survived_surv_0","Survived_surv_1"],inplace=True)

In [162]:
df3

Unnamed: 0,Sex,Pclass,Prob_surv_0,Prob_surv_1
0,0,1,0.123815,0.084794
1,0,2,0.175592,0.062348
2,0,3,0.49526,0.058192
3,1,1,0.038086,0.312011
4,1,2,0.054013,0.22942
5,1,3,0.152345,0.214125


## Compare with Scikit-learn

In [154]:
X_train=train[["Sex","Pclass"]].values
y_train=train["Survived"]

X_test=test[["Sex","Pclass"]].values
y_test=test["Survived"]

from sklearn.naive_bayes import BernoulliNB,MultinomialNB,CategoricalNB 
nb_2 = BernoulliNB()
nb_MB = CategoricalNB()
nb_2.fit(X_train, y_train)
nb_MB.fit(X_train,y_train)



print("train score={}".format(nb_2.score(X_train,y_train)))
print("test score={}".format(nb_2.score(X_test,y_test)))

train score=0.78
test score=0.7807017543859649


In [155]:
nb_2.feature_count_

array([[ 55., 353.],
       [170., 247.]])

In [156]:
X_train_test=[[0,1],[0,2],[0,3],[1,1],[1,2],[1,3]]

In [157]:
nb_2.predict_proba(X_train_test)

array([[0.79369651, 0.20630349],
       [0.79369651, 0.20630349],
       [0.79369651, 0.20630349],
       [0.24736903, 0.75263097],
       [0.24736903, 0.75263097],
       [0.24736903, 0.75263097]])

In [159]:
nb_MB.predict_proba(X_train_test)

array([[0.59495544, 0.40504456],
       [0.73741602, 0.26258398],
       [0.89372324, 0.10627676],
       [0.11149533, 0.88850467],
       [0.19349392, 0.80650608],
       [0.41807103, 0.58192897]])