In [182]:
import numpy as np
import pandas as pd

%matplotlib inline
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure
import seaborn as sns

In [183]:
from tqdm import tqdm # progress bar
from datetime import datetime 
import json
import os

In [184]:
from sklearn.ensemble import RandomForestClassifier

from sklearn import metrics
from sklearn import preprocessing
from sklearn.impute import SimpleImputer

In [None]:
### Helper functions

# Raise pandas' display caps so wide or long frames are not truncated
def showAllData():
    """Set the pandas column and row display limits to 500 each."""
    for option_name in ('display.max_columns', 'display.max_rows'):
        pd.set_option(option_name, 500)


showAllData()

## Reading the data

In [278]:
# Load the training split first, then the test split, from the local data folder.
# `fetch_from` is rebound for each file and is left naming the test CSV.
loaded_frames = []
for fetch_from in ('./data/train.csv', './data/test.csv'):
    loaded_frames.append(pd.read_csv(fetch_from))
train, test = loaded_frames

## Basic check of the data

In [280]:
# Summary statistics (count, mean, std, quartiles) for the numeric columns;
# a count below the row total reveals missing values in that column.
train.describe()

Unnamed: 0,PassengerId,Survived,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,0.383838,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.486592,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,0.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,0.0,2.0,20.125,0.0,0.0,7.9104
50%,446.0,0.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,1.0,3.0,38.0,1.0,0.0,31.0
max,891.0,1.0,3.0,80.0,8.0,6.0,512.3292


### Continuous variables

* Age is the only numeric column with missing values (714 of 891 rows are populated); every other numeric column is complete.

In [279]:
# Same summary restricted to object-dtype (text) columns: count, unique, top, freq
train.describe(include=['O'])

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Peter, Mrs. Catherine (Catherine Rizk)",male,347082,C23 C25 C27,S
freq,1,577,7,4,644


### Removing Name (due to irrelevance) & Cabin (due to lack of data)

In [281]:
# Remove the two unused columns from both splits, modifying each frame in place
for split_frame in (train, test):
    split_frame.drop(columns=['Name', 'Cabin'], inplace=True)

In [287]:
# Quick sanity check: (rows, columns) of the test frame
test.shape

(418, 9)

## Visualizations

Display scatter plot for the numerical variables

Display count plot (bar plot) for the categorical variables

In [277]:
# Preview the first rows of the training frame.
# NOTE(review): the captured output shows integer-encoded Sex/Ticket/Embarked,
# which only exist after edaBasic() has run in a later cell; the output looks
# stale from out-of-order execution. Restart and run all to refresh it.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Sex,Age,SibSp,Parch,Ticket,Fare,Embarked
0,1.0,0.0,3.0,1,22.0,1.0,0.0,523,7.25,2
1,2.0,1.0,1.0,0,38.0,1.0,0.0,596,71.2833,0
2,3.0,1.0,3.0,0,26.0,0.0,0.0,669,7.925,2
3,4.0,1.0,1.0,0,35.0,1.0,0.0,49,53.1,2
4,5.0,0.0,3.0,1,35.0,0.0,0.0,472,8.05,2


In [284]:
# Text columns: fill missing values with a constant, then label-encode.
# Every other column: cast to float and fill missing values with the mean.
def edaBasic(data, fitted=None):
    """Impute missing values and label-encode the text columns of ``data`` in place.

    Text (object/string) columns have missing entries replaced by the literal
    ``'missing_value'``; each value is then replaced by its position in the
    sorted list of distinct values (int64 codes). All remaining columns are
    cast to float64 and their missing entries are replaced by the column mean.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to transform. It is modified in place.
    fitted : dict, optional
        Per-column state shared between calls. With the default ``None`` every
        call derives its own label codes and fill values from ``data`` alone.
        When a dict is passed, columns missing from it are fitted on ``data``
        and stored (a sorted list of labels for text columns, the mean for the
        others); columns already present reuse the stored state, so several
        frames can be encoded consistently::

            state = {}
            edaBasic(train, state)   # fits and stores
            edaBasic(test, state)    # reuses the train labels and means

        Labels that are not in the stored list are encoded as ``-1``.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object that was passed in.
    """
    state = {} if fitted is None else fitted

    for col in data.columns:
        column = data[col]

        if col in state:
            # Follow whatever was decided when the column was first fitted, so
            # every frame sharing this state goes through the same branch.
            is_text = isinstance(state[col], list)
        else:
            # Also recognise pandas string dtypes, not only object columns.
            is_text = (pd.api.types.is_object_dtype(column)
                       or pd.api.types.is_string_dtype(column))

        if is_text:
            filled = column.astype(object).fillna('missing_value')
            if col not in state:
                state[col] = sorted(filled.unique())
            # get_indexer returns the position of each value in the label list
            # and -1 for values that are not in it (labels unseen at fit time).
            codes = pd.Index(state[col], dtype=object).get_indexer(filled)
            data[col] = codes.astype(np.int64)
        else:
            values = column.astype(np.float64)
            if col not in state:
                # mean() skips missing entries; it is NaN for an all-missing
                # column, in which case the column is left unfilled.
                state[col] = float(values.mean())
            data[col] = values.fillna(state[col])

    return data
# Impute and encode both splits in place.
# NOTE(review): each call derives its own label codes and fill values, so a
# given code (e.g. for Ticket) need not denote the same raw value in train and
# test. Consider fitting on train only and reusing that mapping for test.
for split_frame in (train, test):
    edaBasic(split_frame)

In [250]:
# Preview the encoded training frame.
# NOTE(review): the captured output still lists Name and Cabin, which an
# earlier cell drops; it appears to come from an older run. Restart and run
# all to refresh it.
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1.0,0.0,3.0,108,1,22.0,1.0,0.0,523,7.25,147,2
1,2.0,1.0,1.0,190,0,38.0,1.0,0.0,596,71.2833,81,0
2,3.0,1.0,3.0,353,0,26.0,0.0,0.0,669,7.925,147,2
3,4.0,1.0,1.0,272,0,35.0,1.0,0.0,49,53.1,55,2
4,5.0,0.0,3.0,15,1,35.0,0.0,0.0,472,8.05,147,2


In [285]:
# Target vector first, then the feature matrix (every column except the target)
Y_train = train['Survived']
X_train = train.drop(columns=['Survived'])

In [288]:
# Random forest with out-of-bag scoring enabled as a built-in validation estimate.
# random_state pins the bootstrap and feature sampling so that re-running the
# notebook reproduces the same model and the same printed scores.
ml = RandomForestClassifier(
    n_estimators=100,
    min_samples_leaf=3,
    max_features=0.5,
    oob_score=True,
    n_jobs=-1,
    random_state=0,
)
ml.fit(X_train, Y_train)
# [out-of-bag accuracy, accuracy on the training rows themselves]
print([ml.oob_score_, ml.score(X_train, Y_train)])

[0.8361391694725028, 0.9315375982042648]


In [289]:
# Predict a class for every test row, then cast the labels to plain integers
raw_prediction = ml.predict(test)
prediction = raw_prediction.astype(int)

In [290]:
# Two-column submission frame: integer passenger id plus the predicted label
submission = pd.DataFrame(
    {
        'PassengerId': test['PassengerId'].astype(int),
        'Survived': prediction,
    }
)
submission.head()

Unnamed: 0,PassengerId,Survived
0,892,0
1,893,0
2,894,0
3,895,0
4,896,1


In [238]:
# Write the submission to disk without the DataFrame index column
filename = 'Titanic Predictions 1.csv'
submission.to_csv(path_or_buf=filename, index=False)