In [63]:
import pandas as pd
import numpy as np
import os


# Learning and Processing the Data

In [64]:
train = pd.read_csv(os.path.join("Data","train.csv"))
test =  pd.read_csv(os.path.join("Data","test.csv"))


In [65]:
train.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [66]:
test.head()

Unnamed: 0,PassengerId,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,892,3,"Kelly, Mr. James",male,34.5,0,0,330911,7.8292,,Q
1,893,3,"Wilkes, Mrs. James (Ellen Needs)",female,47.0,1,0,363272,7.0,,S
2,894,2,"Myles, Mr. Thomas Francis",male,62.0,0,0,240276,9.6875,,Q
3,895,3,"Wirz, Mr. Albert",male,27.0,0,0,315154,8.6625,,S
4,896,3,"Hirvonen, Mrs. Alexander (Helga E Lindqvist)",female,22.0,1,1,3101298,12.2875,,S


In [67]:
def process_sex_into_binary(word):
    if word == 'male':
        return(0)
    else:
        return(1)

train['Classify_Sex'] = train['Sex'].map(process_sex_into_binary)
test['Classify_Sex'] = test['Sex'].map(process_sex_into_binary)

In [68]:
def process_dataset(dataset):
    dataset['Embarked_S'] = (dataset['Embarked'] == 'S').astype(int)
    dataset['Embarked_C'] = (dataset['Embarked'] == 'C').astype(int)

    dataset['Null_cabin'] = ( dataset['Cabin'].isnull() ).astype(int)

    dataset['Name_contains_Miss'] = (dataset["Name"].str.contains("Miss")).astype(int)
    dataset['Name_contains_Mrs'] = (dataset["Name"].str.contains("Mrs")).astype(int)
    dataset['Name_contains_Master'] = (dataset["Name"].str.contains("Master")).astype(int)
    dataset['Name_contains_Col'] = (dataset["Name"].str.contains("Col")).astype(int)
    dataset['Name_contains_Major'] = (dataset["Name"].str.contains("Major")).astype(int)
    dataset['Name_contains_Mr'] = (dataset["Name"].str.contains("Mr")).astype(int)

process_dataset(train)
process_dataset(test)

In [69]:
#Checking the train
train[['Sex','Classify_Sex']]

Unnamed: 0,Sex,Classify_Sex
0,male,0
1,female,1
2,female,1
3,female,1
4,male,0
...,...,...
886,male,0
887,female,1
888,female,1
889,male,0


In [70]:
from sklearn.ensemble import RandomForestClassifier

In [71]:
def train_model(X_train, X_dev, y_train, y_dev):
    model = {}

    # Training
    arch = RandomForestClassifier(n_estimators=100,n_jobs=-1,random_state=0)
    model['architecture'] = arch
    arch.fit(X_train,y_train)

    # Predicting
    model['predict_train'] = arch.predict(X_train)
    model['predict_dev'] = arch.predict(X_dev)

    # Evaluating
    model['train_accuracy'] = np.mean(model['predict_train'] == y_train)
    model['dev_accuracy'] = np.mean(model['predict_dev'] == y_dev)
    
    return model


In [72]:
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold

In [73]:
variables1 = ['Classify_Sex', 'Age', 'Pclass','SibSp','Parch','Fare']
variables2 = ['Classify_Sex', 'Age', 'Pclass','SibSp','Parch','Fare','Embarked_S','Embarked_C','Null_cabin','Name_contains_Miss','Name_contains_Mrs', 'Name_contains_Master', 'Name_contains_Col', 'Name_contains_Major', 'Name_contains_Mr']

variables = variables2
X = train[variables]
X = X.fillna(-1)
y = train['Survived']

# Splitting the Set
#np.random.seed(1)
#X_train, X_dev, y_train, y_dev = train_test_split(X, y, test_size=0.3)

# Several splits of the training-dev sets
models = []
kf = RepeatedKFold(n_splits=3, n_repeats=5, random_state=10)
for train_indexes, dev_indexes in kf.split(X):
    X_train = X.iloc[train_indexes]
    X_dev = X.iloc[dev_indexes]
    y_train = y.iloc[train_indexes]
    y_dev = y.iloc[dev_indexes]

    model = train_model(X_train, X_dev, y_train, y_dev)
    models.append(model)
print('Train sizes: ', train_indexes.shape[0])
print('Dev sizes: ', dev_indexes.shape[0])

Train sizes:  594
Dev sizes:  297


In [74]:
accuracy = 0

for i in range(len(models)):
    print('Model ', i+1)
    print("Train accuracy: ", models[i]['train_accuracy'])
    print("Dev accuracy: ", models[i]['dev_accuracy'])
    print()
    accuracy += models[i]['dev_accuracy']
accuracy /= len(models)

print("Mean dev accuracy: ", accuracy)

Model  1
Train accuracy:  0.9915824915824916
Dev accuracy:  0.8181818181818182

Model  2
Train accuracy:  0.9882154882154882
Dev accuracy:  0.8148148148148148

Model  3
Train accuracy:  0.9949494949494949
Dev accuracy:  0.8047138047138047

Model  4
Train accuracy:  0.9932659932659933
Dev accuracy:  0.8114478114478114

Model  5
Train accuracy:  0.9882154882154882
Dev accuracy:  0.7878787878787878

Model  6
Train accuracy:  0.9865319865319865
Dev accuracy:  0.8013468013468014

Model  7
Train accuracy:  0.9915824915824916
Dev accuracy:  0.8047138047138047

Model  8
Train accuracy:  0.9848484848484849
Dev accuracy:  0.8249158249158249

Model  9
Train accuracy:  0.9949494949494949
Dev accuracy:  0.8181818181818182

Model  10
Train accuracy:  0.9831649831649831
Dev accuracy:  0.8417508417508418

Model  11
Train accuracy:  0.9932659932659933
Dev accuracy:  0.8080808080808081

Model  12
Train accuracy:  0.9932659932659933
Dev accuracy:  0.8114478114478114

Model  13
Train accuracy:  0.99326599

## Error analysis

In [75]:
previous_model = models[-1]
check_X_dev = train.iloc[dev_indexes].copy()

check_X_dev['Predicted'] = previous_model['predict_dev'] 
errors = check_X_dev[ check_X_dev['Predicted'] != check_X_dev['Survived'] ]
errors.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,...,Embarked_S,Embarked_C,Null_cabin,Name_contains_Miss,Name_contains_Mrs,Name_contains_Master,Name_contains_Col,Name_contains_Major,Name_contains_Mr,Predicted
14,15,0,3,"Vestrom, Miss. Hulda Amanda Adolfina",female,14.0,0,0,350406,7.8542,...,1,0,1,1,0,0,0,0,0,1
27,28,0,1,"Fortune, Mr. Charles Alexander",male,19.0,3,2,19950,263.0,...,1,0,0,0,0,0,0,0,1,1
38,39,0,3,"Vander Planke, Miss. Augusta Maria",female,18.0,2,0,345764,18.0,...,1,0,1,1,0,0,0,0,0,1
41,42,0,2,"Turpin, Mrs. William John Robert (Dorothy Ann ...",female,27.0,1,0,11668,21.0,...,1,0,1,0,1,0,0,0,1,1
49,50,0,3,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)",female,18.0,1,0,349237,17.8,...,1,0,1,0,1,0,0,0,1,1


In [76]:
errors = errors [['PassengerId', 'Predicted','Survived', 'Sex', 'Age','Pclass', 'SibSp', 'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked','Name']]
errors.head()

Unnamed: 0,PassengerId,Predicted,Survived,Sex,Age,Pclass,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name
14,15,1,0,female,14.0,3,0,0,350406,7.8542,,S,"Vestrom, Miss. Hulda Amanda Adolfina"
27,28,1,0,male,19.0,1,3,2,19950,263.0,C23 C25 C27,S,"Fortune, Mr. Charles Alexander"
38,39,1,0,female,18.0,3,2,0,345764,18.0,,S,"Vander Planke, Miss. Augusta Maria"
41,42,1,0,female,27.0,2,1,0,11668,21.0,,S,"Turpin, Mrs. William John Robert (Dorothy Ann ..."
49,50,1,0,female,18.0,3,1,0,349237,17.8,,S,"Arnold-Franchi, Mrs. Josef (Josefine Franchi)"


In [77]:
women = errors[ errors['Sex'] == 'female' ]
men = errors[ errors['Sex'] == 'male' ]

In [78]:
women.sort_values('Survived')

Unnamed: 0,PassengerId,Predicted,Survived,Sex,Age,Pclass,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name
14,15,1,0,female,14.0,3,0,0,350406,7.8542,,S,"Vestrom, Miss. Hulda Amanda Adolfina"
807,808,1,0,female,18.0,3,0,0,347087,7.775,,S,"Pettersson, Miss. Ellen Natalia"
654,655,1,0,female,18.0,3,0,0,365226,6.75,,Q,"Hegarty, Miss. Hanora ""Nora"""
617,618,1,0,female,26.0,3,1,0,A/5. 3336,16.1,,S,"Lobb, Mrs. William Arthur (Cordelia K Stanlick)"
578,579,1,0,female,,3,1,0,2689,14.4583,,C,"Caram, Mrs. Joseph (Maria Elias)"
567,568,1,0,female,29.0,3,0,4,349909,21.075,,S,"Palsson, Mrs. Nils (Alma Cornelia Berglund)"
415,416,1,0,female,,3,0,0,343095,8.05,,S,"Meek, Mrs. Thomas (Annie Louise Rowley)"
362,363,1,0,female,45.0,3,0,1,2691,14.4542,,C,"Barbara, Mrs. (Catherine David)"
357,358,1,0,female,38.0,2,0,0,237671,13.0,,S,"Funk, Miss. Annie Clemmer"
312,313,1,0,female,26.0,2,1,1,250651,26.0,,S,"Lahtinen, Mrs. William (Anna Sylfven)"


In [61]:
men.sort_values("Survived")

Unnamed: 0,PassengerId,Predicted,Survived,Sex,Age,Pclass,SibSp,Parch,Ticket,Fare,Cabin,Embarked,Name
27,28,1,0,male,19.0,1,3,2,19950,263.0,C23 C25 C27,S,"Fortune, Mr. Charles Alexander"
793,794,1,0,male,,1,0,0,PC 17600,30.6958,,C,"Hoyt, Mr. William Fisher"
782,783,1,0,male,29.0,1,0,0,113501,30.0,D6,S,"Long, Mr. Milton Clyde"
748,749,1,0,male,19.0,1,1,0,113773,53.1,D30,S,"Marvin, Mr. Daniel Warner"
667,668,1,0,male,,3,0,0,312993,7.775,,S,"Rommetvedt, Mr. Knud Paust"
636,637,1,0,male,32.0,3,0,0,STON/O 2. 3101292,7.925,,S,"Leinonen, Mr. Antti Gustaf"
544,545,1,0,male,50.0,1,1,0,PC 17761,106.425,C86,C,"Douglas, Mr. Walter Donald"
826,827,1,0,male,,3,0,0,1601,56.4958,,S,"Lam, Mr. Len"
434,435,1,0,male,50.0,1,1,0,13507,55.9,E44,S,"Silvey, Mr. William Baird"
382,383,1,0,male,32.0,3,0,0,STON/O 2. 3101293,7.925,,S,"Tikkanen, Mr. Juho"


## Submission

In [79]:
# Accuracy if I predict all women were survived
gender_predict = (X['Classify_Sex'] == 1).astype(np.int64)
gender_accuracy = np.mean(gender_predict == y)
print("Gender accuracy: |", gender_accuracy)


Gender accuracy: | 0.7867564534231201


In [80]:
model = RandomForestClassifier(n_estimators=1000,n_jobs=-1,random_state=0)
model.fit(X[variables],y)

RandomForestClassifier(n_estimators=1000, n_jobs=-1, random_state=0)

In [81]:
X_test = test[variables]
X_test = X_test.fillna(-1)

In [82]:
predict = model.predict(X_test)

In [83]:
predict

array([0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 1, 0, 1, 1, 0, 0, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1,
       1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0,
       0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1,
       1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 1, 0,
       1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1,
       0, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0,

In [84]:
subscription = pd.Series(predict, index=test['PassengerId'], name='Survived')
subscription.to_csv("Scikit_RandomForestClassifier3.csv", header=True)

In [85]:
!head -n10 Scikit_RandomForestClassifier2.csv

PassengerId,Survived
892,0
893,0
894,1
895,1
896,0
897,0
898,0
899,0
900,1
