# Titanic Survivors Prediction


## Import libraries

In [24]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np


%matplotlib inline

In [25]:
# Read the training set from the working directory and show its (rows, columns).
df = pd.read_csv("./train.csv")
df.shape


(891, 12)

In [26]:
# Work on an independent (deep) copy so the raw frame `df` stays untouched.
df_train = df.copy(deep=True)

## Remove unwanted columns
### Name, Ticket, Cabin

In [27]:
# Drop the free-text / identifier columns that are not used as features.
df_train = df_train.drop(columns=['Name', 'Ticket', 'Cabin'])

## Convert Discrete value columns to numeric
### Sex,Embarked

In [28]:
# Lookup tables turning the categorical labels into integer codes.
gender_map = dict(male=1, female=0)
embarked_map = dict(S=1, C=2, Q=3)

# Values absent from a lookup table (including missing ones) become NaN after .map().
df_train = df_train.assign(
    Sex=df_train['Sex'].map(gender_map),
    Embarked=df_train['Embarked'].map(embarked_map),
)

df_train.head(5)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,1.0
1,2,1,1,0,38.0,1,0,71.2833,2.0
2,3,1,3,0,26.0,0,0,7.925,1.0
3,4,1,1,0,35.0,1,0,53.1,1.0
4,5,0,3,1,35.0,0,0,8.05,1.0


### Convert NaN to zero in Embarked column


In [29]:
# Unmapped / missing ports get the code 0, after which the column can be stored as int.
df_train['Embarked'] = df_train['Embarked'].fillna(0).astype(int)
df_train.head(10)

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,1,22.0,1,0,7.25,1
1,2,1,1,0,38.0,1,0,71.2833,2
2,3,1,3,0,26.0,0,0,7.925,1
3,4,1,1,0,35.0,1,0,53.1,1
4,5,0,3,1,35.0,0,0,8.05,1
5,6,0,3,1,,0,0,8.4583,3
6,7,0,1,1,54.0,0,0,51.8625,1
7,8,0,3,1,2.0,3,1,21.075,1
8,9,1,3,0,27.0,0,2,11.1333,1
9,10,1,2,0,14.0,1,0,30.0708,2


## Remove the NaN from Age column

In [30]:

# Impute missing ages with the mean age rounded up to a whole year, then store as int.
mean_age = df_train['Age'].mean()
mean = np.ceil(mean_age)
df_train['Age'] = df_train['Age'].fillna(mean).astype(int)

# Sanity check: True if any NaN is left anywhere in the frame.
df_train.isnull().values.any()

False

## Check Survival Rate

In [31]:
# Count survivors / non-survivors and print each count with its share of the total.
survived_flags = df_train['Survived']
num_true = int((survived_flags == 1).sum())
num_false = int((survived_flags == 0).sum())
num_total = num_true + num_false
print("Number of true cases: {0} ({1:2.2f}%)".format(num_true, (num_true/num_total)*100))
print("Number of false cases: {0} ({1:2.2f}%)".format(num_false, (num_false/num_total)*100))


Number of true cases: 342 (38.38%)
Number of false cases: 549 (61.62%)


## Import Test and Submission data

In [32]:
# Read the test set; keep the raw frame and do all cleaning on a separate copy.
df_test_original = pd.read_csv("./test.csv")
df_test = df_test_original.copy(deep=True)
df_test.shape


(418, 11)

In [33]:
# Preview the first five rows of the raw test data.
df_test.head(n=5)

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


## Clean test data
### Remove unwanted columns
### Remove NaN values
### Get enumerations
### Convert into int

In [34]:
# Drop the same unused columns that were removed from the training frame.
df_test = df_test.drop(columns=['Name', 'Ticket', 'Cabin'])
df_test.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,male,34.5,0,0,7.8292,Q
1,893,3,female,47.0,1,0,7.0,S
2,894,2,male,62.0,0,0,9.6875,Q
3,895,3,male,27.0,0,0,8.6625,S
4,896,3,female,22.0,1,1,12.2875,S


In [35]:
# Encode the categorical columns with the same integer codes used for the training data.
gender_map = dict(male=1, female=0)
embarked_map = dict(S=1, C=2, Q=3)

df_test['Sex'] = df_test['Sex'].map(gender_map)
# Ports that do not map to a code become NaN, are replaced by 0, and the column is cast to int.
df_test['Embarked'] = df_test['Embarked'].map(embarked_map).fillna(0).astype(int)
df_test.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34.5,0,0,7.8292,3
1,893,3,0,47.0,1,0,7.0,1
2,894,2,1,62.0,0,0,9.6875,3
3,895,3,1,27.0,0,0,8.6625,1
4,896,3,0,22.0,1,1,12.2875,1


In [36]:
# Remove null values in Age: impute with the mean age rounded up, then store as int.
mean_age = df_test['Age'].mean()
mean = np.ceil(mean_age)
df_test['Age'] = df_test['Age'].fillna(mean).astype(int)

# Remove null values in Fare: impute with the mean *fare*.
# Fix: the original computed the fare mean but then filled with `mean`
# (the rounded-up mean age), so missing fares received an age value.
mean_fare = df_test['Fare'].mean()
df_test['Fare'] = df_test['Fare'].fillna(mean_fare)
# NOTE(review): this cast truncates fares to whole numbers, while the training
# frame keeps Fare as float - confirm whether the mismatch is intended.
df_test['Fare'] = df_test['Fare'].astype(int)

In [37]:
# True if any missing value is left in the cleaned test frame.
df_test.isna().values.any()

False

## Merge test and submission data


In [38]:
# Independent copy of the cleaned test frame, reserved for the merge step.
test_merge = df_test.copy(deep=True)



In [39]:
# Merge dataframes
# NOTE(review): the merge below is disabled. `sub_merge` is not defined in any
# visible cell, so uncommenting these lines as-is would raise a NameError.
#new_merge=pd.merge(test_merge, sub_merge, on="PassengerId")
#new_merge.head(5)

## Train models

In [40]:
# Working copies for modelling, plus a spare backup of the cleaned training frame.
df_train_copy, df_train_backup = df_train.copy(), df_train.copy()
df_test_copy = df_test.copy()
df_test_copy.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,892,3,1,34,0,0,7,3
1,893,3,0,47,1,0,7,1
2,894,2,1,62,0,0,9,3
3,895,3,1,27,0,0,8,1
4,896,3,0,22,1,1,12,1


## Prepare training/test data

In [44]:
# Columns fed to the models, and the column holding the label.
feature_col_names = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']
predicted_class_name = ['Survived']

# Feature matrices have one row per passenger and one column per feature (m x 7);
# the label array has one column (m x 1) because it is selected with a list.
X_train = df_train_copy[feature_col_names].values
y_train = df_train_copy[predicted_class_name].values
X_test = df_test_copy[feature_col_names].values

# The test set has no labels: add an integer 'Survived' column of placeholder zeros
# that the model cells below overwrite with their predictions.
df_test_copy['Survived'] = np.zeros(len(df_test_copy), dtype=int)

# y_test therefore contains only the placeholder zeros (1 = true, 0 = false encoding).
y_test = df_test_copy[predicted_class_name].values
df_test_copy.head(5)

Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,892,3,1,34,0,0,7,3,0
1,893,3,0,47,1,0,7,1,0
2,894,2,1,62,0,0,9,3,0
3,895,3,1,27,0,0,8,1,0
4,896,3,0,22,1,1,12,1,0


## Training algorithm -  Naive Bayes

In [49]:
from sklearn.naive_bayes import GaussianNB
from sklearn import metrics  # performance metrics, also used by the later model cells

# Fit a Gaussian Naive Bayes classifier; ravel() flattens the (m x 1) labels to 1-D.
nb_model = GaussianNB()
nb_model.fit(X_train, y_train.ravel())

# Predict on both splits and store the test predictions in the test frame.
nb_predict_train = nb_model.predict(X_train)
nb_predict_test = nb_model.predict(X_test)
df_test_copy['Survived'] = nb_predict_test

# Only training accuracy is reported: y_test holds placeholder zeros, not true labels.
nb_train_accuracy = metrics.accuracy_score(y_train, nb_predict_train)
print("Train data Accuracy: {0:.4f}".format(nb_train_accuracy))

df_test_copy.head(5)

Train data Accuracy: 0.7924


Unnamed: 0,PassengerId,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked,Survived
0,892,3,1,34,0,0,7,3,0
1,893,3,0,47,1,0,7,1,1
2,894,2,1,62,0,0,9,3,0
3,895,3,1,27,0,0,8,1,0
4,896,3,0,22,1,1,12,1,1


## Training algorithm - Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression

# Logistic regression with C=0.5 and a fixed seed.
lr_model = LogisticRegression(C=0.5, random_state=42)
lr_model.fit(X_train, y_train.ravel())

# Predict on both splits and store the test predictions in the test frame.
lr_predict_train = lr_model.predict(X_train)
lr_predict_test = lr_model.predict(X_test)
df_test_copy['Survived'] = lr_predict_test

# Only training accuracy is reported: y_test holds placeholder zeros, not true labels.
lr_train_accuracy = metrics.accuracy_score(y_train, lr_predict_train)
print("Train Accuracy: {0:.4f}".format(lr_train_accuracy))

Train Accuracy: 0.8070


## Training algorithm - Logistic Regression(Cross Validation)

In [54]:
from sklearn.linear_model import LogisticRegressionCV

# Cross-validated logistic regression: 10 folds over 3 candidate C values,
# balanced class weights, and n_jobs=-1 to parallelize across all cores.
lr_cv_model = LogisticRegressionCV(n_jobs=-1, random_state=42, Cs=3, cv=10, refit=False, class_weight="balanced")
lr_cv_model.fit(X_train, y_train.ravel())

lr_cv_predict_train = lr_cv_model.predict(X_train)
lr_cv_predict_test = lr_cv_model.predict(X_test)
# Fix: store THIS model's test predictions. The original wrote the Naive Bayes
# predictions (nb_predict_test) into the frame here.
df_test_copy['Survived'] = lr_cv_predict_test

# Only training accuracy is reported: y_test holds placeholder zeros, not true labels.
print("Train data Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, lr_cv_predict_train)))

Train data Accuracy: 0.7924


## Training algorithm - Random Forest

In [55]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with a fixed seed so the fit is repeatable.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train.ravel())

# Predict on both splits and store the test predictions in the test frame.
rf_predict_train = rf_model.predict(X_train)
rf_predict_test = rf_model.predict(X_test)
df_test_copy['Survived'] = rf_predict_test

# Only training accuracy is reported: y_test holds placeholder zeros, not true labels.
rf_train_accuracy = metrics.accuracy_score(y_train, rf_predict_train)
print("Train data Accuracy: {0:.4f}".format(rf_train_accuracy))

Train data Accuracy: 0.9630


## Training algorithm - SVM

In [56]:
from sklearn import svm

# Support vector classifier with a linear kernel.
svc = svm.SVC(kernel='linear')
svc.fit(X_train, y_train.ravel())

sv_train = svc.predict(X_train)
sv_test = svc.predict(X_test)
# Fix: store THIS model's test predictions. The original wrote the Random Forest
# predictions (rf_predict_test) into the frame here.
df_test_copy['Survived'] = sv_test

# Only training accuracy is reported: y_test holds placeholder zeros, not true labels.
print("Train data Accuracy: {0:.4f}".format(metrics.accuracy_score(y_train, sv_train)))

Train data Accuracy: 0.7868


# Conclusion

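## Accuracy on test data

*Note: the test-accuracy print statements in the cells above are commented out, so the figures below are not reproduced by running the notebook as shown.*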

### SVM                                    - 100.00%
### Logistic Regression          - 95.93%
### Naive Bayes                       - 92.82%
### Logistic RegressionCV     - 90.19%
### Random Forest                  - 81.58%


## My Submission

In [69]:
# Build the submission table: passenger ids paired with one model's test predictions
# (here the cross-validated logistic regression; any other *_predict_test array works).
submission_columns = {
    'PassengerId': df_test_copy['PassengerId'],
    'Survived': lr_cv_predict_test,
}
my_submission = pd.DataFrame(submission_columns)

# Write without the index column; the file name is arbitrary.
my_submission.to_csv('submission106.csv', index=False)

## Training Algorithm - KNN

In [65]:
from sklearn.neighbors import KNeighborsClassifier

# k-nearest-neighbours classifier using the 2 closest training points.
knn = KNeighborsClassifier(n_neighbors=2)
knn.fit(X_train, y_train.ravel())

# Predict on both splits and store the test predictions in the test frame.
knn_predict_train = knn.predict(X_train)
knn_predict_test = knn.predict(X_test)
df_test_copy['Survived'] = knn_predict_test

# Only training accuracy is reported: y_test holds placeholder zeros, not true labels.
knn_train_accuracy = metrics.accuracy_score(y_train, knn_predict_train)
print("Train data Accuracy: {0:.4f}".format(knn_train_accuracy))


Train data Accuracy: 0.8395
