# Titanic Survival Prediction

## Importing Necessary Libraries

In [1]:
import pandas as pd
pd.set_option("display.max_columns",100)

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import warnings
warnings.simplefilter('ignore')

In [2]:
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB

from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

from xgboost import XGBClassifier

from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, classification_report

## Loading the Datasets

In [3]:
# Read the two CSV files from the working directory: df1 <- train.csv, df2 <- test.csv.
df1, df2 = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [4]:
df1.shape, df2.shape

((891, 12), (418, 11))

In [5]:
# Stack the train and test rows into a single frame so every cleaning step is
# applied to both; they are separated again before modelling.
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0. Like
# append, it keeps each frame's original row labels (so labels repeat).
df = pd.concat([df1, df2])

In [6]:
df.shape

(1309, 12)

In [7]:
# Preview ten random rows; the fixed random_state keeps the preview identical across re-runs.
df.sample(10, random_state = 42)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
717,718,1.0,2,"Troutt, Miss. Edwina Celia ""Winnie""",female,27.0,0,0,34218,10.5,E101,S
742,743,1.0,1,"Ryerson, Miss. Susan Parker ""Suzette""",female,21.0,2,2,PC 17608,262.375,B57 B59 B63 B66,C
191,192,0.0,2,"Carbines, Mr. William",male,19.0,0,0,28424,13.0,,S
580,581,1.0,2,"Christy, Miss. Julie Rachel",female,25.0,1,1,237789,30.0,,S
887,888,1.0,1,"Graham, Miss. Margaret Edith",female,19.0,0,0,112053,30.0,B42,S
493,494,0.0,1,"Artagaveytia, Mr. Ramon",male,71.0,0,0,PC 17609,49.5042,,C
268,1160,,3,"Howard, Miss. May Elizabeth",female,,0,0,A. 2. 39186,8.05,,S
631,632,0.0,3,"Lundahl, Mr. Johan Svensson",male,51.0,0,0,347743,7.0542,,S
866,867,1.0,2,"Duran y More, Miss. Asuncion",female,27.0,1,0,SC/PARIS 2149,13.8583,,C
120,121,0.0,2,"Hickman, Mr. Stanley George",male,21.0,2,0,S.O.C. 14879,73.5,,S


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  1309 non-null   int64  
 1   Survived     891 non-null    float64
 2   Pclass       1309 non-null   int64  
 3   Name         1309 non-null   object 
 4   Sex          1309 non-null   object 
 5   Age          1046 non-null   float64
 6   SibSp        1309 non-null   int64  
 7   Parch        1309 non-null   int64  
 8   Ticket       1309 non-null   object 
 9   Fare         1308 non-null   float64
 10  Cabin        295 non-null    object 
 11  Embarked     1307 non-null   object 
dtypes: float64(3), int64(4), object(5)
memory usage: 132.9+ KB


In [9]:
df.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
dtype: int64

In [10]:
# Extract the title from Name with a regular expression: the first run of
# non-whitespace characters that is immediately followed by a period.
# The pattern is a raw string so that "\S" and "\." reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.
df["Title"] = df["Name"].str.extract(r"(\S+)\.", expand = False)
# Stricter alternative (letters only, preceded by a space):
#df["Title2"] = df["Name"].str.extract(r" ([A-Za-z]+)\.", expand = False)

In [11]:
df["Title"].value_counts()

Mr          757
Miss        260
Mrs         197
Master       61
Rev           8
Dr            8
Col           4
Mlle          2
Major         2
Ms            2
Capt          1
Lady          1
Sir           1
Jonkheer      1
Countess      1
Dona          1
Mme           1
Don           1
Name: Title, dtype: int64

In [307]:
# Fold the rare titles into the common ones. Each key is the title that is
# kept, each value the list of raw titles that are rewritten to it.
title_groups = {
    "Miss": ["Ms", "Mlle"],
    "Mrs": ["Mme", "Countess", "Lady", "Dona"],
    "Mr": ["Dr", "Major", "Col", "Sir", "Rev", "Jonkheer", "Capt", "Don"],
}
# Flatten to {raw title: kept title} and apply it in a single replace call.
title_map = {rare: common for common, rares in title_groups.items() for rare in rares}
df["Title"] = df["Title"].replace(title_map)

In [308]:
df["Title"].value_counts()

Mr        783
Miss      264
Mrs       201
Master     61
Name: Title, dtype: int64

In [309]:
df.groupby("Sex")["Age"].median()

Sex
female    27.0
male      28.0
Name: Age, dtype: float64

In [310]:
df.isnull().sum()

PassengerId       0
Survived        418
Pclass            0
Name              0
Sex               0
Age             263
SibSp             0
Parch             0
Ticket            0
Fare              1
Cabin          1014
Embarked          2
Title             0
dtype: int64

In [311]:
# Fill the missing ages with the median age of passengers sharing the same title.
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df["Age"] acts on an intermediate object and does not update df when
# pandas Copy-on-Write is active.
df["Age"] = df["Age"].fillna(df.groupby("Title")["Age"].transform("median"))
# Show the per-row title median that was used as the fill value.
df.groupby("Title")["Age"].transform("median")

0      30.0
1      35.5
2      22.0
3      35.5
4      30.0
       ... 
413    30.0
414    35.5
415    30.0
416    30.0
417     4.0
Name: Age, Length: 1309, dtype: float64

In [313]:
# Drop Cabin: only 295 of the 1309 rows (about 23%) have a value, so most of the column is missing.
del df["Cabin"]

In [314]:
# Count each Embarked value; the most frequent one will be used to fill the blanks.
df["Embarked"].value_counts()

S    914
C    270
Q    123
Name: Embarked, dtype: int64

In [315]:
# Share of each Embarked value. S makes up roughly 70% of the rows, so both missing entries are filled with S.
df["Embarked"].value_counts(normalize=True)

S    0.699311
C    0.206580
Q    0.094109
Name: Embarked, dtype: float64

In [316]:
# Fill the missing Embarked entries with "S", the most frequent value.
# Assigned back to the column because an in-place fillna on df["Embarked"]
# does not update df when pandas Copy-on-Write is active.
df["Embarked"] = df["Embarked"].fillna("S")

In [317]:
# Fill the single missing Fare with the column mean.
# Assigned back to the column because an in-place fillna on df["Fare"]
# does not update df when pandas Copy-on-Write is active.
df["Fare"] = df["Fare"].fillna(df["Fare"].mean())

In [318]:
# FamilySize = SibSp + Parch + 1 (the extra 1 presumably counts the passenger).
df["FamilySize"] = df["SibSp"].add(df["Parch"]).add(1)

In [319]:
df["FamilySize"].value_counts()

1     790
2     235
3     159
4      43
6      25
5      22
7      16
11     11
8       8
Name: FamilySize, dtype: int64

In [321]:
# Name is no longer needed once Title has been extracted from it.
df.drop(columns = "Name", inplace = True)

In [322]:
# Remove the PassengerId and Ticket columns from the working frame.
unused_columns = ["PassengerId", "Ticket"]
df.drop(columns = unused_columns, inplace = True)

In [323]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1309 entries, 0 to 417
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Survived    891 non-null    float64
 1   Pclass      1309 non-null   int64  
 2   Sex         1309 non-null   object 
 3   Age         1309 non-null   float64
 4   SibSp       1309 non-null   int64  
 5   Parch       1309 non-null   int64  
 6   Fare        1309 non-null   float64
 7   Embarked    1309 non-null   object 
 8   Title       1309 non-null   object 
 9   FamilySize  1309 non-null   int64  
dtypes: float64(3), int64(4), object(3)
memory usage: 112.5+ KB


In [324]:
# One-hot encode the remaining object columns (Sex, Embarked, Title).
# drop_first=True drops the first level of each column, so k categories become k-1 dummy columns.
df_temp = pd.get_dummies(df, drop_first = True)

In [325]:
# Split the encoded frame back into its train and test parts.
# The rows from df1 (train.csv) were stacked first, so the boundary is len(df1)
# rather than a hard-coded row count.
n_train = len(df1)
# Take explicit copies: both frames have their Survived column deleted later,
# and doing that on a slice of df_temp triggers SettingWithCopy behaviour.
df_train = df_temp.iloc[:n_train].copy()
df_test = df_temp.iloc[n_train:].copy()

In [326]:
df_train.shape, df_test.shape

((891, 13), (418, 13))

In [327]:
df_train.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs
0,0.0,3,22.0,1,0,7.25,2,1,0,1,0,1,0
1,1.0,1,38.0,1,0,71.2833,2,0,0,0,0,0,1
2,1.0,3,26.0,0,0,7.925,1,0,0,1,1,0,0
3,1.0,1,35.0,1,0,53.1,2,0,0,1,0,0,1
4,0.0,3,35.0,0,0,8.05,1,1,0,1,0,1,0


In [328]:
# Move the target out of the feature frame: pop returns the Survived column
# and removes it from df_train in one step.
y = df_train.pop("Survived")

In [329]:
# Fixed seed so the randomised estimators produce the same model on every run.
SEED = 42

# One instance of each classifier to compare, with default hyper-parameters.
g = GaussianNB()
b = BernoulliNB()
k = KNeighborsClassifier()
log = LogisticRegression()
gbc = GradientBoostingClassifier(random_state = SEED)
r = RandomForestClassifier(random_state = SEED)
d = DecisionTreeClassifier(random_state = SEED)
xgbc = XGBClassifier(random_state = SEED)

In [330]:
# Keep each display label next to its estimator so the two lists cannot drift
# out of order, then split the pairs into the parallel lists used downstream.
labelled_models = [
    ("GaussianNB", g),
    ("BernoulliNB", b),
    ("K Nearest", k),
    ("Logistic", log),
    ("GradientBoosting", gbc),
    ("RandomForest", r),
    ("Decision Tree", d),
    ("XGBC", xgbc),
]
algorithms = [model for label, model in labelled_models]
names = [label for label, model in labelled_models]

In [331]:
def algo_test(X, y, algorithms = algorithms, names = names, X_eval = None, y_eval = None):
    """Fit every classifier on (X, y) and tabulate its classification metrics.

    Parameters
    ----------
    X : features used to fit every estimator.
    y : binary target aligned with X.
    algorithms : list of estimators exposing fit/predict. Each entry is fitted
        and written back into the list, so the fitted models remain usable
        after the call.
    names : row labels for the result, in the same order as algorithms.
    X_eval, y_eval : optional held-out features and target. When both are
        given the metrics are computed on them. When omitted the metrics are
        computed on (X, y) itself, i.e. they are training-set scores and will
        be optimistic for models that can memorise the data.

    Returns
    -------
    pandas.DataFrame indexed by names with the columns Accuracy, Precision,
    Recall and F1, sorted by F1 in descending order.

    Raises
    ------
    ValueError
        If only one of X_eval / y_eval is given, or if algorithms and names
        differ in length.
    """
    if (X_eval is None) != (y_eval is None):
        raise ValueError("X_eval and y_eval must be provided together")
    if len(algorithms) != len(names):
        raise ValueError("algorithms and names must have the same length")

    # fit the data
    for position, model in enumerate(algorithms):
        algorithms[position] = model.fit(X, y)

    # Score on the held-out data when supplied, otherwise on the training data.
    if X_eval is None:
        X_score, y_true = X, y
    else:
        X_score, y_true = X_eval, y_eval

    # collect metrics; predict once per model and reuse it for all four scores
    metric_columns = ["Accuracy", "Precision", "Recall", "F1"]
    rows = []
    for model in algorithms:
        y_pred = model.predict(X_score)
        rows.append({
            "Accuracy": accuracy_score(y_true, y_pred),
            "Precision": precision_score(y_true, y_pred),
            "Recall": recall_score(y_true, y_pred),
            "F1": f1_score(y_true, y_pred),
        })
    metrics = pd.DataFrame(rows, index = names, columns = metric_columns)
    return metrics.sort_values("F1", ascending = False)

In [332]:
# Fit all models on the full training frame and score them on that same data,
# so the table below shows training-set metrics, not held-out performance.
algo_test(df_train, y)



Unnamed: 0,Accuracy,Precision,Recall,F1
RandomForest,0.979798,0.97929,0.967836,0.973529
Decision Tree,0.979798,0.993902,0.953216,0.973134
XGBC,0.962963,0.969605,0.932749,0.95082
GradientBoosting,0.897868,0.908795,0.815789,0.859784
Logistic,0.823793,0.789969,0.736842,0.762481
K Nearest,0.818182,0.790323,0.716374,0.751534
GaussianNB,0.794613,0.752381,0.692982,0.721461
BernoulliNB,0.785634,0.735202,0.690058,0.711916


In [333]:
df_test.shape

(418, 13)

In [334]:
df_test.head()

Unnamed: 0,Survived,Pclass,Age,SibSp,Parch,Fare,FamilySize,Sex_male,Embarked_Q,Embarked_S,Title_Miss,Title_Mr,Title_Mrs
0,,3,34.5,0,0,7.8292,1,1,1,0,0,1,0
1,,3,47.0,1,0,7.0,2,0,0,1,0,0,1
2,,2,62.0,0,0,9.6875,1,1,1,0,0,1,0
3,,3,27.0,0,0,8.6625,1,1,0,1,0,1,0
4,,3,22.0,1,1,12.2875,3,0,0,1,0,0,1


In [335]:
# The test rows have no Survived value; remove the column so the features match df_train.
df_test.drop(columns = "Survived", inplace = True)

In [336]:
# r is the RandomForestClassifier instance; it is only usable here because
# algo_test fitted it earlier, so that cell must have been run first.
predict = r.predict(df_test)

In [337]:
# Start the submission frame from the test-set passenger ids.
# .copy() makes result an independent frame, so adding the Survived column
# afterwards neither warns about nor depends on a selection of df2.
result = df2[["PassengerId"]].copy()

In [338]:
# Attach the predictions; the array is assigned positionally, one value per row of result.
result["Survived"] = predict

In [339]:
# The predictions are floats (0.0 / 1.0); store them as integers.
result["Survived"] = result["Survived"].astype("int64")

In [340]:
result

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,1
4,896,0
...,...,...
413,1305,0
414,1306,1
415,1307,0
416,1308,0


In [341]:
result.shape

(418, 2)

In [342]:
# Write the PassengerId / Survived table to result.csv; index=False leaves the row index out of the file.
result.to_csv("result.csv", index = False)