In [214]:
import csv
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import BernoulliNB

#### Section 1 - Describing the dataset 
The Titanic dataset is loaded using pandas. The dataframe is named df1 and can be found below.
The dataset is described below :

Variable Definition Key survival Survival 0 = No, 1 = Yes pclass Ticket class 1 = 1st, 2 = 2nd, 3 = 3rd sex Sex
Age: Age in years
sibsp: # of siblings / spouses aboard the Titanic
parch: # of parents / children aboard the Titanic
ticket: Ticket number
fare: Passenger fare
cabin: Cabin number
embarked: Port of Embarkation

pclass: A proxy for socio-economic status (SES) 1st = Upper 2nd = Middle 3rd = Lower

age: Age is fractional if less than 1. If the age is estimated, is it in the form of xx.5

sibsp: The dataset defines family relations in this way... Sibling = brother, sister, stepbrother, stepsister Spouse = husband, 

wife (mistresses and fiancés were ignored)

parch: The dataset defines family relations in this way... Parent = mother, father Child = daughter, son, stepdaughter, stepson 
Some children travelled only with a nanny, therefore parch=0 for them.

In [215]:
df1= pd.read_csv("C:\\Users\\ar1\\Documents\\Kaggle\\train.csv")
df1

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.2500,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.9250,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1000,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.0500,,S
5,6,0,3,"Moran, Mr. James",male,,0,0,330877,8.4583,,Q
6,7,0,1,"McCarthy, Mr. Timothy J",male,54.0,0,0,17463,51.8625,E46,S
7,8,0,3,"Palsson, Master. Gosta Leonard",male,2.0,3,1,349909,21.0750,,S
8,9,1,3,"Johnson, Mrs. Oscar W (Elisabeth Vilhelmina Berg)",female,27.0,0,2,347742,11.1333,,S
9,10,1,2,"Nasser, Mrs. Nicholas (Adele Achem)",female,14.0,1,0,237736,30.0708,,C


#### Section 2 - Loading the dataset
The train and test for this dataset are provided. 

In [216]:
def load_data():
    ### Loading the test and train data. Removing 'Survived' from the existing train_data dataset 
    train_data= pd.read_csv("C:\\Users\\ar1\\Documents\\Kaggle\\train.csv")
    target_value = train_data.Survived
    train_data.drop('Survived',1,inplace=True)  
    test_data= pd.read_csv("C:\\Users\\ar1\\Documents\\Kaggle\\test.csv")
    data= train_data.append(test_data)
    return data 
    
    

In [217]:
data= load_data()

#### Section 3 - Binarize Features

In [218]:
### Binarize Pclass - Creating dummies for Pclass.
def binarize_Pclass(data):
    data1= data
    data1.Pclass.fillna(data1['Pclass'].median(), inplace=True)
    Pclass_dummy = pd.get_dummies(data1['Pclass'], prefix='Pclass')
    data1 = pd.concat([data1,Pclass_dummy], axis=1)
    data1.drop('Pclass', axis=1, inplace=True)
    return data1

### Binarize Sex - Male : 1, Female : 0
def binarize_sex(data):
    data['Sex']=data['Sex'].map({'male':1,'female':0})
    return data

""" Binarize Age - Fill the missing value using median and then binarize the attribute. 
    attribute value > mean then 1 else 0 
"""
def binarize_age(data):
    data['Age'].fillna(data['Age'].median(),inplace=True)
    for i,rows in data.iterrows():
        if rows['Age']>data['Age'].mean():
            data['Age']=data['Age'].replace(rows['Age'],1)
        else:
            data['Age']=data['Age'].replace(rows['Age'],0)
    data['Age']=data.Age.astype(int)
    return data  

### Binarize - SibSp : Creating dummies for SibSp
def binarize_SibSp(data):
    data.SibSp.fillna(data['SibSp'].median(), inplace=True)
    SibSp_dummy = pd.get_dummies(data['SibSp'], prefix='SibSp')
    data = pd.concat([data,SibSp_dummy], axis=1)
    data.drop('SibSp', axis=1, inplace=True)
    return data

### Binarize - Parch : Creating dummies for Parch
def binarize_Parch(data):
    data.Parch.fillna(data['Parch'].median(), inplace=True)
    Parch_dummy = pd.get_dummies(data['Parch'], prefix='Parch')
    data = pd.concat([data,Parch_dummy], axis=1)
    data.drop('Parch', axis=1, inplace=True)
    return data

""" Binarize Fare - Fill the missing value using median and then binarize the attribute. 
    attribute value > mean then 1 else 0 
"""
def binarize_Fare(data):
    data['Fare'].fillna(data['Fare'].median(),inplace=True)
    for i,rows in data.iterrows():
        if rows['Fare']>data['Fare'].mean():
            data['Fare']=data['Fare'].replace(rows['Fare'],1)
        else:
            data['Fare']=data['Fare'].replace(rows['Fare'],0)
    data['Fare']=data.Fare.astype(int)
    return data

### Binarize Cabin : Fill the missing value using 'N' and then used dummies to binarize Cabin.
def binarize_Cabin(data):
    data.Cabin.fillna('N', inplace=True)
    Cabin_dummy = pd.get_dummies(data['Cabin'], prefix='Cabin')
    data = pd.concat([data,Cabin_dummy], axis=1)
    data.drop('Cabin', axis=1, inplace=True)
    return data

### Binarize Embarked : Fill the missing value using 'N' and then used dummies to binarize Embarked.
def binarize_Embarked(data):
    data.Embarked.fillna('N', inplace=True)
    Embarked_dummy = pd.get_dummies(data['Embarked'], prefix='Embarked')
    data = pd.concat([data,Embarked_dummy], axis=1)
    data.drop('Embarked', axis=1, inplace=True)
    return data
    

In [219]:
### Dropping PassengerId, Name and Ticket as other features add more value than them. 

data.drop('PassengerId',axis=1,inplace=True)
data.drop('Name',axis=1,inplace=True)
data.drop('Ticket',axis=1,inplace=True)

In [220]:
data = binarize_Pclass(data)
data = binarize_sex(data)
data = binarize_age(data)
data = binarize_SibSp(data)
data = binarize_Parch(data)
data = binarize_Fare(data)
data = binarize_Cabin(data)
data = binarize_Embarked(data)

In [221]:
data.shape

(1309, 212)

#### The final dataset after binarizing all values can be found below.  

In [222]:
data

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,SibSp_0,SibSp_1,SibSp_2,SibSp_3,...,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_N,Cabin_T,Embarked_C,Embarked_N,Embarked_Q,Embarked_S
0,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
5,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
6,1,0,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
8,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
9,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0


#### Section 4 - BernoulliNB Model

In [223]:
def get_train_test_data():
    extract_target = pd.read_csv('C:\\Users\\ar1\\Documents\\Kaggle\\train.csv')
    targets_data_val = extract_target.Survived
    train_data_val = data.head(891)
    test_data_val = data.iloc[891:]
    
    return train_data_val, test_data_val, targets_data_val

In [224]:
train,test,target= get_train_test_data()

In [225]:
train

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,SibSp_0,SibSp_1,SibSp_2,SibSp_3,...,Cabin_F33,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_N,Cabin_T,Embarked_C,Embarked_N,Embarked_Q,Embarked_S
0,1,0,0,0,0,1,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
1,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,1,0,0,0
2,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
3,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1
4,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
5,1,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,1,0
6,1,0,1,1,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1
7,1,0,0,0,0,1,0,0,0,1,...,0,0,0,0,1,0,0,0,0,1
8,0,0,0,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,0,1
9,0,0,0,0,1,0,0,1,0,0,...,0,0,0,0,1,0,1,0,0,0


In [227]:
clf = BernoulliNB()
clf.fit(train,target)
train_check= clf.predict(train)
accuracy_score(target, train_check)


0.74410774410774416

In [229]:
### The accuracy of the model can be found below
predicted_val= clf.predict(test)
y_true= pd.read_csv("C:\\Users\\ar1\\Documents\\Kaggle\\gender_submission.csv")
y_true.drop('PassengerId',axis=1,inplace=True)
accuracy_score(y_true, predicted_val)


0.73205741626794263

In [230]:
data1= data.iloc[1300:]
Y_val= predicted_val[409:]
data1 = data1.assign(Y=Y_val)
data1

Unnamed: 0,Sex,Age,Fare,Pclass_1,Pclass_2,Pclass_3,SibSp_0,SibSp_1,SibSp_2,SibSp_3,...,Cabin_F38,Cabin_F4,Cabin_G6,Cabin_N,Cabin_T,Embarked_C,Embarked_N,Embarked_Q,Embarked_S,Y
409,0,0,1,0,0,1,0,1,0,0,...,0,0,0,1,0,0,0,0,1,1
410,0,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,1,0,0
411,0,0,1,1,0,0,0,1,0,0,...,0,0,0,0,0,0,0,1,0,1
412,0,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
413,1,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
414,0,0,1,1,0,0,1,0,0,0,...,0,0,0,0,0,1,0,0,0,1
415,1,1,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
416,1,0,0,0,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,1,0
417,1,0,1,0,0,1,0,1,0,0,...,0,0,0,1,0,1,0,0,0,1


#### Section 5-  calculating total evidence

Note: The test set is sampled due to computation power. 


In [231]:
evidence_df=pd.DataFrame(clf.predict_proba(test), columns=clf.classes_)
###total= pd.DataFrame['0'].sum()
###total
evidence_df.columns=['Neg','Pos']
evidence_df

Unnamed: 0,Neg,Pos
0,0.947002,0.052998
1,0.507219,0.492781
2,0.783153,0.216847
3,0.993205,0.006795
4,0.190883,0.809117
5,0.979104,0.020896
6,0.648498,0.351502
7,0.315919,0.684081
8,0.717651,0.282349
9,0.952296,0.047704


In [232]:
### Finding out the total 1s and 0s in column Y. Column Y is the predicted value given by BernoulliNB model
data1['Y'].value_counts().reset_index()

Unnamed: 0,index,Y
0,0,5
1,1,4


In [233]:
### final_yes_Y will contain the total number of 1s that column 'Y' contains in dataframe data1.
### final_no_Y will contain the total number of 0s that column 'Y' contains in dataframe data1.
yes_val = 0
no_val = 0
val=pd.value_counts(data1['Y'].values)
final_yes_Y= val[1]
final_no_Y=val[0]
final_yes_Y += 2
final_no_Y += 2

In [234]:
data1.shape

(9, 213)

In [235]:
##creating new numpy array and finding evidence
evidence_array=np.zeros((9,213),dtype=np.float64)

for rows in range(data1.shape[0]):
    for column in range(data1.shape[1]):
        for index, row_val in data1.iterrows():
            if data1.iloc[rows]['Y'] == 1:
                if data1.iloc[rows][column] == row_val[column]:
                    yes_val += 1
            if data1.iloc[rows]['Y'] == 0:
                if data1.iloc[rows][column] == row_val[column]:
                    no_val += 1
        log_yes= np.log((yes_val+1)/final_yes_Y)
        log_no= np.log((no_val+1)/final_no_Y)
        log_evidence = log_yes-log_no
        evidence_array[rows][column]= log_evidence
        yes_val=0
        no_val=0
        

In [170]:
evidence_array

array([[ 2.2617631 ,  2.66722821,  2.66722821, ...,  2.54944517,
         2.2617631 ,  1.85629799],
       [-1.32175584, -1.72722095, -1.72722095, ..., -0.62860866,
        -1.13943428, -1.47590652],
       [ 2.2617631 ,  2.66722821,  2.66722821, ...,  1.56861592,
         2.07944154,  1.85629799],
       ..., 
       [-1.13943428, -0.22314355, -0.22314355, ..., -1.60943791,
        -1.32175584, -1.47590652],
       [-1.13943428, -1.72722095, -1.72722095, ..., -1.60943791,
        -1.32175584, -1.47590652],
       [-1.13943428, -1.72722095, -1.72722095, ..., -1.60943791,
        -1.13943428, -1.47590652]])

In [171]:
np.sum(evidence_array,axis=1).tolist()

[582.8457689660648,
 -385.1015779674053,
 579.1977115064711,
 -386.26472877721096,
 -386.08240722041705,
 580.316943082342,
 -383.07425242686446,
 -386.08240722041705,
 -381.83836632871703]

### Section 6 - Evidence 

In [213]:
#displaying individual evidence for each object
evidence_dataframe=pd.DataFrame(evidence_array)
evidence_dataframe

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,203,204,205,206,207,208,209,210,211,212
0,2.261763,2.667228,2.667228,2.549445,2.772589,2.549445,1.856298,1.856298,2.772589,2.772589,...,2.772589,2.772589,2.772589,2.549445,2.772589,2.549445,2.772589,2.549445,2.261763,1.856298
1,-1.321756,-1.727221,-1.727221,-1.609438,-1.832581,-1.609438,-1.475907,-1.475907,-1.832581,-1.832581,...,-1.832581,-1.832581,-1.832581,-1.609438,-1.832581,-1.609438,-1.832581,-0.628609,-1.139434,-1.475907
2,2.261763,2.667228,2.667228,1.568616,2.772589,1.568616,1.856298,1.856298,2.772589,2.772589,...,2.772589,2.772589,2.772589,1.568616,2.772589,2.549445,2.772589,1.568616,2.079442,1.856298
3,-1.321756,-1.727221,-1.727221,-1.609438,-1.832581,-1.609438,-1.475907,-1.475907,-1.832581,-1.832581,...,-1.832581,-1.832581,-1.832581,-1.609438,-1.832581,-1.609438,-1.832581,-1.609438,-1.321756,-1.475907
4,-1.139434,-1.727221,-1.727221,-1.609438,-1.832581,-1.609438,-1.475907,-1.475907,-1.832581,-1.832581,...,-1.832581,-1.832581,-1.832581,-1.609438,-1.832581,-1.609438,-1.832581,-1.609438,-1.321756,-1.475907
5,2.261763,2.667228,2.667228,1.568616,2.772589,1.568616,2.415914,2.415914,2.772589,2.772589,...,2.772589,2.772589,2.772589,1.568616,2.772589,1.568616,2.772589,2.549445,2.079442,1.856298
6,-1.139434,-0.223144,-0.223144,-1.609438,-1.832581,-1.609438,-1.475907,-1.475907,-1.832581,-1.832581,...,-1.832581,-1.832581,-1.832581,-1.609438,-1.832581,-1.609438,-1.832581,-1.609438,-1.321756,-1.475907
7,-1.139434,-1.727221,-1.727221,-1.609438,-1.832581,-1.609438,-1.475907,-1.475907,-1.832581,-1.832581,...,-1.832581,-1.832581,-1.832581,-1.609438,-1.832581,-1.609438,-1.832581,-1.609438,-1.321756,-1.475907
8,-1.139434,-1.727221,-1.727221,-1.609438,-1.832581,-1.609438,-0.916291,-0.916291,-1.832581,-1.832581,...,-1.832581,-1.832581,-1.832581,-1.609438,-1.832581,-0.628609,-1.832581,-1.609438,-1.139434,-1.475907


###### Total Negative Evidence

In [186]:
total= evidence_df['Neg'].sum()
total

265.95802643979886

###### Total Positive Evidence

In [188]:
total= evidence_df['Pos'].sum()
total

152.04197356020143