In [1]:
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the Titanic passenger table; the path is resolved relative to the
# current working directory.
titanic_csv = "titanic.csv"
df = pd.read_csv(titanic_csv)

# Preview the first rows
df.head()

Unnamed: 0,PassengerId,Name,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Survived
0,1,"Braund, Mr. Owen Harris",3,male,22.0,1,0,A/5 21171,7.25,,S,0
1,2,"Cumings, Mrs. John Bradley (Florence Briggs Th...",1,female,38.0,1,0,PC 17599,71.2833,C85,C,1
2,3,"Heikkinen, Miss. Laina",3,female,26.0,0,0,STON/O2. 3101282,7.925,,S,1
3,4,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",1,female,35.0,1,0,113803,53.1,C123,S,1
4,5,"Allen, Mr. William Henry",3,male,35.0,0,0,373450,8.05,,S,0


### DATA PREPROCESSING

In [3]:
# Remove the columns that are not used in the rest of the notebook.
# NOTE: this mutates `df` in place, so re-running this cell on its own
# raises KeyError because the columns are already gone.
unused_columns = ['PassengerId', 'Name', 'SibSp', 'Parch', 'Ticket', 'Cabin', 'Embarked']
df.drop(columns=unused_columns, inplace=True)

df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived
0,3,male,22.0,7.25,0
1,1,female,38.0,71.2833,1
2,3,female,26.0,7.925,1
3,1,female,35.0,53.1,1
4,3,male,35.0,8.05,0


### One-hot encoding using pandas `get_dummies`

In [4]:
# One indicator column per distinct value of `Sex`.
# NOTE(review): with two categories the columns are complementary
# (female == 1 - male), so one of them carries no extra information —
# consider whether both should be fed to the model.
dummies = pd.get_dummies(df["Sex"])
dummies.head()

Unnamed: 0,female,male
0,0,1
1,1,0
2,1,0
3,1,0
4,0,1


In [5]:
# Place the indicator columns to the right of the existing columns;
# rows are aligned on the index shared by `df` and `dummies`.
final_df = pd.concat((df, dummies), axis=1)
final_df.head()

Unnamed: 0,Pclass,Sex,Age,Fare,Survived,female,male
0,3,male,22.0,7.25,0,0,1
1,1,female,38.0,71.2833,1,1,0
2,3,female,26.0,7.925,1,1,0
3,1,female,35.0,53.1,1,1,0
4,3,male,35.0,8.05,0,0,1


In [6]:
# `Sex` is now represented by the indicator columns, so the original
# text column is removed (in place).
final_df.drop(columns=['Sex'], inplace=True)
final_df.head()

Unnamed: 0,Pclass,Age,Fare,Survived,female,male
0,3,22.0,7.25,0,0,1
1,1,38.0,71.2833,1,1,0
2,3,26.0,7.925,1,1,0
3,1,35.0,53.1,1,1,0
4,3,35.0,8.05,0,0,1


In [7]:
# List the columns that contain at least one missing value
has_missing = final_df.isna().any()
final_df.columns[has_missing]

Index(['Age'], dtype='object')

In [8]:
# Mean of the known ages (missing values are skipped by default)
final_df["Age"].mean()

29.69911764705882

In [9]:
# Median of the known ages (missing values are skipped by default)
final_df["Age"].median()

28.0

In [10]:
# Fill the missing ages with the median age.
# Only `Age` is imputed: a frame-wide fillna(final_df.median()) would
# silently impute every column (including the `Survived` target) and
# needs every column to be numeric.
# NOTE(review): the median is computed over all rows, i.e. before the
# train/test split below, so test rows influence the imputed value.
age_median = final_df["Age"].median()
final_df["Age"] = final_df["Age"].fillna(age_median)

# Confirm that no column has missing values left
final_df.columns[final_df.isnull().any()]

Index([], dtype='object')

### TRAINING

In [11]:
# Feature matrix: every column except the target
X = final_df.drop(columns=['Survived'])

X.head()

Unnamed: 0,Pclass,Age,Fare,female,male
0,3,22.0,7.25,0,1
1,1,38.0,71.2833,1,0
2,3,26.0,7.925,1,0
3,1,35.0,53.1,1,0
4,3,35.0,8.05,0,1


In [12]:
# Target vector: the `Survived` column
y = final_df['Survived']

y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state makes
# the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [14]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes classifier created with its default parameters
model = GaussianNB()

In [15]:
# Fit the classifier on the training split
model.fit(X_train, y_train)

In [16]:
# Evaluate on the held-out split (for scikit-learn classifiers `score`
# returns the mean accuracy)
model.score(X_test, y_test)

0.776536312849162

In [17]:
# True labels of the first five test rows (positional slice)
y_test.iloc[:5]

709    1
439    0
840    0
720    1
39     1
Name: Survived, dtype: int64

In [18]:
# Predicted labels for the same first five test rows
model.predict(X_test.iloc[:5])

array([0, 0, 0, 1, 1], dtype=int64)

The model predicted 4 of the first 5 test samples correctly; it misclassified only the first one (actual 1, predicted 0).

In [32]:
# Class probabilities for the first five test rows.
# The column labels assume model.classes_ is [0, 1] — TODO confirm.
prob = model.predict_proba(X_test.iloc[:5])
probability_labels = ["0 (not survived)", "1 (survived)"]
prob_df = pd.DataFrame(data=prob, columns=probability_labels)
prob_df.head()

Unnamed: 0,0 (not survived),1 (survived)
0,0.990535,0.009465
1,0.979993,0.020007
2,0.989016,0.010984
3,0.014645,0.985355
4,0.046583,0.953417
