# Titanic: Machine Learning

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
df = pd.read_csv('Data/train.csv')
df = df[[
    'PassengerId',
    'Survived',
    'Pclass',
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Embarked'
]]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [3]:
df.count()

PassengerId    891
Survived       891
Pclass         891
Sex            891
Age            714
SibSp          891
Parch          891
Fare           891
Embarked       889
dtype: int64

### Deleting the rows with NaN Embarked values

In [4]:
df = df[df['Embarked'].notnull() == True]
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1,0,3,male,22.0,1,0,7.25,S
1,2,1,1,female,38.0,1,0,71.2833,C
2,3,1,3,female,26.0,0,0,7.925,S
3,4,1,1,female,35.0,1,0,53.1,S
4,5,0,3,male,35.0,0,0,8.05,S


In [5]:
df = df.reset_index().drop('index', axis = 'columns')

### Filling the NaN Age values and Labeling the Sex column

In [6]:
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer

In [7]:
imputer = SimpleImputer(strategy = 'median')
imputer = imputer.fit(df[['PassengerId','Age']])
df[['PassengerId','Age']] = imputer.transform(df[['PassengerId','Age']])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1.0,0,3,male,22.0,1,0,7.25,S
1,2.0,1,1,female,38.0,1,0,71.2833,C
2,3.0,1,3,female,26.0,0,0,7.925,S
3,4.0,1,1,female,35.0,1,0,53.1,S
4,5.0,0,3,male,35.0,0,0,8.05,S


In [8]:
# 1 for male and 0 for female
df['Sex'] = LabelEncoder().fit_transform(df['Sex'])
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,1.0,0,3,1,22.0,1,0,7.25,S
1,2.0,1,1,0,38.0,1,0,71.2833,C
2,3.0,1,3,0,26.0,0,0,7.925,S
3,4.0,1,1,0,35.0,1,0,53.1,S
4,5.0,0,3,1,35.0,0,0,8.05,S


In [9]:
df.count()

PassengerId    889
Survived       889
Pclass         889
Sex            889
Age            889
SibSp          889
Parch          889
Fare           889
Embarked       889
dtype: int64

### Feature Scaling

In [10]:
from sklearn.preprocessing import StandardScaler

In [12]:
scaler = StandardScaler()
df[['Age', 'Fare']] = scaler.fit_transform(df[['Age', 'Fare']])

### Creating Dummy Variables for Pclass and Embarked

In [13]:
dummies1 = pd.get_dummies(df['Pclass']).drop([3], axis='columns')
dummies1.columns = ['Pclass_1', 'Pclass_2']
dummies1.head()

Unnamed: 0,Pclass_1,Pclass_2
0,0,0
1,1,0
2,0,0
3,1,0
4,0,0


In [14]:
dummies2 = pd.get_dummies(df['Embarked']).drop(['Q'], axis='columns')
dummies2.head()

Unnamed: 0,C,S
0,0,1
1,1,0
2,0,1
3,0,1
4,0,1


In [15]:
df_enc = df.drop(['Pclass', 'Embarked'], axis='columns')
df_enc = pd.concat([df_enc, dummies1], axis = 'columns')
df_enc = pd.concat([df_enc, dummies2], axis = 'columns')

In [16]:
df_enc.head()

Unnamed: 0,PassengerId,Survived,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,C,S
0,1.0,0,1,-0.563674,1,0,-0.50024,0,0,0,1
1,2.0,1,0,0.669217,1,0,0.788947,1,0,1,0
2,3.0,1,0,-0.255451,0,0,-0.48665,0,0,0,1
3,4.0,1,0,0.43805,1,0,0.422861,1,0,0,1
4,5.0,0,1,0.43805,0,0,-0.484133,0,0,0,1


### Creating X_train and y_train

In [17]:
X_train = df_enc[[
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Pclass_1',
    'Pclass_2',
    'C',
    'S'
]]
X_train.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,C,S
0,1,-0.563674,1,0,-0.50024,0,0,0,1
1,0,0.669217,1,0,0.788947,1,0,1,0
2,0,-0.255451,0,0,-0.48665,0,0,0,1
3,0,0.43805,1,0,0.422861,1,0,0,1
4,1,0.43805,0,0,-0.484133,0,0,0,1


In [18]:
y_train = df_enc['Survived']
y_train.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [19]:
from sklearn import model_selection, metrics

### Creating Random Forest Classifier

In [20]:
from sklearn.ensemble import RandomForestClassifier

In [21]:
clf_rf = RandomForestClassifier(n_estimators = 1000).fit(X_train, y_train)

#### Approx 98% training accuracy ( It is probably overfitting )

In [22]:
clf_rf.score(X_train, y_train)

0.9797525309336333

### Importing Test Data

In [43]:
test = pd.read_csv('Data/test.csv')
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [44]:
test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            332
SibSp          418
Parch          418
Ticket         418
Fare           417
Cabin           91
Embarked       418
dtype: int64

### Encoding and Filling NaN values

In [45]:
test[['Age', 'Fare']] = imputer.transform(test[['Age', 'Fare']])
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [46]:
# 1 for male and 0 for female
test['Sex'] = LabelEncoder().fit_transform(test['Sex'])
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",1,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",0,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",1,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",1,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",0,22.0,1,1,3101298,12.2875,,S


In [47]:
test.count()

PassengerId    418
Pclass         418
Name           418
Sex            418
Age            418
SibSp          418
Parch          418
Ticket         418
Fare           418
Cabin           91
Embarked       418
dtype: int64

### Feature Scaling

In [48]:
test[['Age', 'Fare']] = scaler.transform(test[['Age', 'Fare']])

### Creatig Dummy Variables for Pclass and Embarked

In [49]:
dummies1 = pd.get_dummies(test['Pclass']).drop([3], axis='columns')
dummies1.columns = ['Pclass_1', 'Pclass_2']
dummies2 = pd.get_dummies(test['Embarked']).drop(['Q'], axis='columns')
X_test = test_df.drop(['Pclass', 'Embarked'], axis='columns')
X_test = pd.concat([X_test, dummies1], axis = 'columns')
X_test = pd.concat([X_test, dummies2], axis = 'columns')

### Creating X_test

In [50]:
X_test = X_test[[
    'Sex',
    'Age',
    'SibSp',
    'Parch',
    'Fare',
    'Pclass_1',
    'Pclass_2',
    'C',
    'S'
]]

In [51]:
X_test.head()

Unnamed: 0,Sex,Age,SibSp,Parch,Fare,Pclass_1,Pclass_2,C,S
0,1,0.399522,0,0,-0.488579,0,0,0,0
1,0,1.362718,1,0,-0.505273,0,0,0,1
2,1,2.518553,0,0,-0.451165,0,1,0,0
3,1,-0.178396,0,0,-0.471802,0,0,0,1
4,0,-0.563674,1,1,-0.398819,0,0,0,1


### Random Forest Predictions

In [52]:
rf_predict = clf_rf.predict(X_test)
rf_predict

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

### Trying Logistic Regression

In [53]:
from sklearn.linear_model import LogisticRegression

In [54]:
clf_lr = LogisticRegression().fit(X_train, y_train)



In [55]:
clf_lr.score(X_train, y_train)

0.8053993250843644

### Trying Cross Validation

In [56]:
train_pred = model_selection.cross_val_predict(clf_lr, X_train, y_train, cv=10, n_jobs = -1)

In [57]:
metrics.accuracy_score(y_train, train_pred)

0.7986501687289089

### It looks like Logistic Regression is working well

In [58]:
survived = pd.Series(clf_lr.predict(X_test))
survived.head()

0    0
1    0
2    0
3    0
4    1
dtype: int64

In [59]:
submission = pd.DataFrame()
submission['PassengerId'] = test['PassengerId'].astype(int)
submission['Survived'] = survived
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [60]:
submission.count()

PassengerId    418
Survived       418
dtype: int64

In [61]:
submission.to_csv('First_Submission.csv', index = False)