In [132]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [36]:
df = sns.load_dataset('titanic')
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [14]:
print('Totatl enrtries are',df.shape[0])

Totatl enrtries are 891


In [37]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 15 columns):
survived       891 non-null int64
pclass         891 non-null int64
sex            891 non-null object
age            714 non-null float64
sibsp          891 non-null int64
parch          891 non-null int64
fare           891 non-null float64
embarked       889 non-null object
class          891 non-null category
who            891 non-null object
adult_male     891 non-null bool
deck           203 non-null category
embark_town    889 non-null object
alive          891 non-null object
alone          891 non-null bool
dtypes: bool(2), category(2), float64(2), int64(4), object(5)
memory usage: 80.6+ KB


From above it is clear that age, deck and embarked columns contains some missing entries

In [38]:
df.describe()

Unnamed: 0,survived,pclass,age,sibsp,parch,fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,0.0,1.0,0.42,0.0,0.0,0.0
25%,0.0,2.0,20.125,0.0,0.0,7.9104
50%,0.0,3.0,28.0,0.0,0.0,14.4542
75%,1.0,3.0,38.0,1.0,0.0,31.0
max,1.0,3.0,80.0,8.0,6.0,512.3292


### Servived column is a Binary column can also be considered as categorical with two categories 0 and 1 

In [39]:
df.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [40]:
df.columns#fetch the column names

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'deck', 'embark_town',
       'alive', 'alone'],
      dtype='object')

In [45]:
#Lets Select columns for the final data
df1 = df[['survived','sex', 'age', 'parch', 'fare']]

In [81]:
df1

Unnamed: 0,survived,sex,age,parch,fare
0,0,male,22.0,0,7.2500
1,1,female,38.0,0,71.2833
2,1,female,26.0,0,7.9250
3,1,female,35.0,0,53.1000
4,0,male,35.0,0,8.0500
...,...,...,...,...,...
886,0,male,27.0,0,13.0000
887,1,female,19.0,0,30.0000
888,0,female,,2,23.4500
889,1,male,26.0,0,30.0000


In [82]:
#Sex column contains string data so we need to convert it into numbers using encoding 
#either we can go with OneHotEncoder or else we can use get_dummies method
d = pd.get_dummies(df.sex)

In [83]:
#Insert the dummy values into data by concatenation
data = pd.concat([df1, d],axis ='columns')

In [84]:
data

Unnamed: 0,survived,sex,age,parch,fare,female,male
0,0,male,22.0,0,7.2500,0,1
1,1,female,38.0,0,71.2833,1,0
2,1,female,26.0,0,7.9250,1,0
3,1,female,35.0,0,53.1000,1,0
4,0,male,35.0,0,8.0500,0,1
...,...,...,...,...,...,...,...
886,0,male,27.0,0,13.0000,0,1
887,1,female,19.0,0,30.0000,1,0
888,0,female,,2,23.4500,1,0
889,1,male,26.0,0,30.0000,0,1


In [85]:
#drop sex column
data.drop('sex', inplace = True, axis = 1)


In [92]:
data.isna().sum()

survived      0
age         177
parch         0
fare          0
female        0
male          0
dtype: int64

In [97]:
data.age = data.age.fillna(np.mean(data.age))

In [98]:
X = data[['age','parch','fare','female','male']]
X

Unnamed: 0,age,parch,fare,female,male
0,22.000000,0,7.2500,0,1
1,38.000000,0,71.2833,1,0
2,26.000000,0,7.9250,1,0
3,35.000000,0,53.1000,1,0
4,35.000000,0,8.0500,0,1
...,...,...,...,...,...
886,27.000000,0,13.0000,0,1
887,19.000000,0,30.0000,1,0
888,29.699118,2,23.4500,1,0
889,26.000000,0,30.0000,0,1


In [99]:
y = data.survived

In [100]:
y

0      0
1      1
2      1
3      1
4      0
      ..
886    0
887    1
888    0
889    1
890    0
Name: survived, Length: 891, dtype: int64

In [125]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.2)

In [126]:
model = GaussianNB()

In [127]:
model.fit(X_train, y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [128]:
model.score(X_train, y_train)*100

78.08988764044943

In [129]:
X_test[:10]

Unnamed: 0,age,parch,fare,female,male
366,60.0,0,75.25,1,0
35,42.0,0,52.0,0,1
605,36.0,0,15.55,0,1
472,33.0,2,27.75,1,0
370,25.0,0,55.4417,0,1
615,24.0,2,65.0,1,0
647,56.0,0,35.5,0,1
97,23.0,1,63.3583,0,1
278,7.0,1,29.125,0,1
61,38.0,0,80.0,1,0


In [130]:
y_test[:10]

366    1
35     0
605    0
472    1
370    1
615    1
647    1
97     1
278    0
61     1
Name: survived, dtype: int64

In [138]:
y_pred = model.predict(X_test)
y_pred

array([1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0,
       0, 1, 1], dtype=int64)

In [140]:
print('Prediction accuracy is',accuracy_score(y_pred,y_test)*100)# Lets find the prediction accuracy

Prediction accuracy is 81.56424581005587


In [141]:
#lets use now LogisticRegression on the data
from sklearn.linear_model import LogisticRegression

In [155]:
#lets create an object for LR
model1 = LogisticRegression()


In [156]:
#Training of LR
model1.fit(X_train,y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [157]:
#Lets check the training accuracy
model.score(X_train,y_train)*100

78.08988764044943

In [158]:
#Test the model
pred1 = model1.predict(X_test)

In [159]:
#Prediction accuracy 
accuracy_score(pred1,y_test)*100

81.56424581005587

# From the accuracy score we conclude that GaussianNB and LogisticRegression has same accuracy of 81.56%