# Kaggle - Titanic Challenge
## Machine Learning from Disaster

This project uses the [Titanic - Machine Learning from Disaster](https://www.kaggle.com/c/titanic/overview) Kaggle Challenge to use data to predict whether a person that was in the Titanic survived or not to the disaster.


The purpose of this project was to apply new knowledge of Machine Learning and Neural Networks, as well as to practice new tools of the TensorFlow, pandas and Scikit-Learn.

In [1]:
import pandas as pd
import numpy as np
import os


# Learning and Processing the Data

In [2]:
train = pd.read_csv(os.path.join("Data","train.csv"))
test =  pd.read_csv(os.path.join("Data","test.csv"))


In [3]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [12]:
def process_sex_into_binary(word):
    if word == 'male':
        return(0)
    else:
        return(1)

train['Classify_Sex'] = train['Sex'].map(process_sex_into_binary)
test['Classify_Sex'] = test['Sex'].map(process_sex_into_binary)

In [13]:
#Checking the train
train[['Sex','Classify_Sex']]

Unnamed: 0,Sex,Classify_Sex
0,male,0
1,female,1
2,female,1
3,female,1
4,male,0
...,...,...
886,male,0
887,female,1
888,female,1
889,male,0


In [50]:
from sklearn.ensemble import RandomForestClassifier

In [57]:
def train_model(X_train, X_dev, y_train, y_dev):
    model = {}

    # Training
    arch = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=0)
    model['architecture'] = arch
    arch.fit(X_train,y_train)

    # Predicting
    model['predict_train'] = arch.predict(X_train)
    model['predict_dev'] = arch.predict(X_dev)

    # Evaluating
    model['train_accuracy'] = np.mean(model['predict_train'] == y_train)
    model['dev_accuracy'] = np.mean(model['predict_dev'] == y_dev)
    
    return model


In [58]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

In [80]:
variables = ['Classify_Sex', 'Age']
X = train[variables]
X = X.fillna(-1)
y = train['Survived']

# Splitting the Set
#np.random.seed(1)
#X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.3)

# Several splits of the training-dev sets
n_fold = 4
models = []
for seed in range(5):
    kf = KFold(n_fold, shuffle=True, random_state=seed)
    for train_indexes, dev_indexes in kf.split(X):
        X_train = X.iloc[train_indexes]
        X_dev = X.iloc[dev_indexes]
        y_train = y.iloc[train_indexes]
        y_dev = y.iloc[dev_indexes]

        model = train_model(X_train, X_dev, y_train, y_dev)
        models.append(model)
print('Train sizes: ', train_indexes.shape[0])
print('Dev sizes: ', dev_indexes.shape[0])

Train sizes:  669
Dev sizes:  222


In [82]:
accuracy = 0

for i in range(len(models)):
    print('Model ', i+1)
    print("Train accuracy: ", models[i]['train_accuracy'])
    print("Dev accuracy: ", models[i]['dev_accuracy'])
    print()
    accuracy += models[i]['dev_accuracy']
accuracy /= len(models)

print("Mean dev accuracy: ", accuracy)

Model  1
Train accuracy:  0.8248502994011976
Dev accuracy:  0.7713004484304933

Model  2
Train accuracy:  0.8338323353293413
Dev accuracy:  0.757847533632287

Model  3
Train accuracy:  0.8233532934131736
Dev accuracy:  0.7443946188340808

Model  4
Train accuracy:  0.820627802690583
Dev accuracy:  0.7972972972972973

Model  5
Train accuracy:  0.8293413173652695
Dev accuracy:  0.757847533632287

Model  6
Train accuracy:  0.8338323353293413
Dev accuracy:  0.7443946188340808

Model  7
Train accuracy:  0.8233532934131736
Dev accuracy:  0.7757847533632287

Model  8
Train accuracy:  0.8146487294469357
Dev accuracy:  0.7927927927927928

Model  9
Train accuracy:  0.8293413173652695
Dev accuracy:  0.7399103139013453

Model  10
Train accuracy:  0.8203592814371258
Dev accuracy:  0.7847533632286996

Model  11
Train accuracy:  0.8173652694610778
Dev accuracy:  0.8026905829596412

Model  12
Train accuracy:  0.828101644245142
Dev accuracy:  0.7612612612612613

Model  13
Train accuracy:  0.823353293413

## Submission

In [47]:
# Accuracy if I predict all women were survived
gender_predict = (X_dev['Classify_Sex'] == 1).astype(np.int64)
gender_accuracy = np.mean(gender_predict)
print("Gender accuracy: |", dev_accuracy)


Gender accuracy: | 0.7313432835820896


In [69]:
X_test = test[variables]
X_test = X_test.fillna(-1)

In [71]:
predict = models[5]['architecture'].predict(X_test)

In [72]:
predict

array([0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0,
       1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1,
       1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1,
       0, 0, 1, 1, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       0, 0, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1,
       0, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
       0, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [73]:
subscription = pd.Series(predict, index=test['PassengerId'], name='Survived')
subscription.to_csv("Scikit_RandomForestClassifier1.csv", header=True)

In [74]:
!head -n10 Scikit_RandomForestClassifier.csv

PassengerId,Survived
892,0
893,1
894,0
895,1
896,1
897,0
898,1
899,0
900,1
