# Kaggle Titanic dataset. Predicting whether passengers will survive 

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn import preprocessing

In [156]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
PassengerId    891 non-null int64
Survived       891 non-null int64
Pclass         891 non-null int64
Name           891 non-null object
Sex            891 non-null object
Age            714 non-null float64
SibSp          891 non-null int64
Parch          891 non-null int64
Ticket         891 non-null object
Fare           891 non-null float64
Cabin          204 non-null object
Embarked       889 non-null object
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


In [5]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [10]:
train.isna().sum()

PassengerId      0
Survived         0
Pclass           0
Name             0
Sex              0
Age            177
SibSp            0
Parch            0
Ticket           0
Fare             0
Cabin          687
Embarked         2
dtype: int64

In [18]:
train['Ticket'].value_counts()

1601                7
347082              7
CA. 2343            7
347088              6
CA 2144             6
                   ..
C.A. 29395          1
392092              1
376566              1
STON/O2. 3101290    1
349206              1
Name: Ticket, Length: 681, dtype: int64

In [115]:
train['Name'].nunique()

891

In [157]:
#Name - no informational value
#cabin - too many null values
#ticket - few unique counts for each ticket
train = train.drop(['Name','Cabin','Ticket','PassengerId'], axis=1)

In [158]:
dummies = pd.get_dummies(train)
train2 = dummies

In [150]:
dummies.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Age', 'SibSp', 'Parch', 'Fare',
       'Sex_female', 'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S'],
      dtype='object')

In [159]:
#impute age column with median
dummies['Age'].fillna(dummies['Age'].median(),inplace = True)

In [160]:
dummies.dropna(how='all',inplace=True)

In [161]:
#drop target class
import numpy as np
X = dummies.drop(columns=['Survived'])
y = dummies['Survived']
X = np.array(X)
y = np.array(y)

In [162]:
scaler = preprocessing.MinMaxScaler()
X=scaler.fit_transform(X)

In [163]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [164]:
from sklearn.linear_model import LogisticRegression
logreg=LogisticRegression()
logreg.fit(x_train,y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [165]:
y_pred = logreg.predict(x_test)

In [166]:
print("Accuracy:",round(metrics.accuracy_score(y_test,y_pred),3))
print("Precision:",metrics.precision_score(y_test,y_pred))
print("Recall:",round(metrics.recall_score(y_test,y_pred),3))
print("F1 score:",round(metrics.f1_score(y_test,y_pred),3))

Accuracy: 0.726
Precision: 0.8082191780821918
Recall: 0.628
F1 score: 0.707


WITHOUT THE AGE IMPUTATION

In [167]:
train2.dropna(how='all',inplace=True)

In [168]:
X1 = train2.drop(columns=['Survived'])
y1 = train2['Survived']
X1.columns
X1 = np.array(X1)
y1 = np.array(y1)


In [169]:
X1 = scaler.fit_transform(X1)

In [170]:
from sklearn.model_selection import train_test_split
x_train1,x_test1,y_train1,y_test1 = train_test_split(X1,y1,test_size=0.2)

In [171]:
logreg.fit(x_train1,y_train1)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [172]:
y_pred1 = logreg.predict(x_test1)

In [173]:
print("Accuracy:",round(metrics.accuracy_score(y_test1,y_pred1),3))
print("Precision:",metrics.precision_score(y_test1,y_pred1))
print("Recall:",round(metrics.recall_score(y_test1,y_pred1),3))
print("F1 score:",round(metrics.f1_score(y_test1,y_pred1),3))

Accuracy: 0.804
Precision: 0.7575757575757576
Recall: 0.725
F1 score: 0.741


# Predicting the survived in the test data 

In [174]:
#Using the more accurate pipeline to predict the Survived in the test set.
test.drop(['Name', 'Ticket', 'Cabin'], 1, inplace = True)#Dropping the unuseful columns
test.dropna(inplace = True)#Dropping all other Nan values

test = pd.get_dummies(test, columns = ['Sex', 'Embarked'])#Dummifying the object columns

#Selecting the useful columns as features and saving them to test_X as an array
#Note, the number and order of the columns have to be the same as to how the ML was trained
test_X = test[['Pclass', 'Age', 'SibSp', 'Parch', 'Fare', 'Sex_female', 
                  'Sex_male', 'Embarked_C', 'Embarked_Q', 'Embarked_S']].values

test_X = scaler.fit_transform(test_X)#Preprocessing the features
predicted_survivors = logreg.predict(test_X)#And using the ML from the previous training step to predict the Survived

In [175]:
#Saving the predicted survived labels into the analysed testset as column Survived
test['Survived'] = predicted_survivors

In [176]:
test.head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Embarked_C,Embarked_Q,Embarked_S,Survived
0,892,3,34.5,0,0,7.8292,0,1,0,1,0,0
1,893,3,47.0,1,0,7.0,1,0,0,0,1,0
2,894,2,62.0,0,0,9.6875,0,1,0,1,0,0
3,895,3,27.0,0,0,8.6625,0,1,0,0,1,0
4,896,3,22.0,1,1,12.2875,1,0,0,0,1,1


In [177]:
#Saving the resulting test dataframe as a csv file
test.to_csv('predictions.csv')