In [None]:
import os
import string

import numpy as np
import pandas as pd

try:
    # scikit-learn >= 0.18; the old cross_validation module was removed in 0.20.
    from sklearn.model_selection import train_test_split
except ImportError:
    from sklearn.cross_validation import train_test_split

In [None]:
# Directory that holds the CSV files read below (relative path).
data_path = '../data/titanic/'

In [None]:
# Read the training set, the test set and the sample submission from data_path.
train, test, sub = (
    pd.read_csv(os.path.join(data_path, csv_name))
    for csv_name in ('train.csv', 'test.csv', 'gender_submission.csv')
)

In [None]:
# Fill missing test fares with the mean training fare.
# The previous chained assignment (test['Fare'][152] = ...) could write to a
# temporary copy and hard-coded the row label.
# NOTE(review): assumes row 152 was targeted because it is the only missing
# Fare in test.csv — confirm against the data.
test['Fare'] = test['Fare'].fillna(train['Fare'].mean())

In [None]:
# Preview the first rows of the training frame as loaded.
train.head()

In [None]:
# Keep the test passenger ids: the column is dropped below but is needed
# again when the submission file is assembled.
test_pID = test['PassengerId']

# Remove the columns that are not used as model inputs.
train = train.drop(['Ticket', 'Embarked', 'PassengerId', 'Cabin'], axis=1)
test = test.drop(['Ticket', 'Embarked', 'PassengerId', 'Cabin'], axis=1)

# FamilySize = SibSp + Parch.
train['FamilySize'] = train['SibSp'] + train['Parch']
test['FamilySize'] = test['SibSp'] + test['Parch']

# Total Fare = (FamilySize + 1) * Fare; the raw Fare column is then dropped.
train['Total Fare'] = (train['FamilySize'] + 1) * train['Fare']
test['Total Fare'] = (test['FamilySize'] + 1) * test['Fare']
train = train.drop(['Fare'], axis=1)
test = test.drop(['Fare'], axis=1)


In [None]:
# Preview the training frame after the column drops and the FamilySize / Total Fare additions.
train.head()

In [None]:
# Titles searched for in passenger names. Order matters: the lookup returns
# the first entry of this list that it finds.
title_list = [
    'Mr', 'Mrs', 'Dr', 'Miss',
    'Master', 'Rev', 'Ms', 'Capt',
    'Sir', 'Major', 'Don', 'Mlle',
    'Mme', 'Jonkheer', 'Countess', 'Col',
]

In [None]:
# Reference : https://triangleinequality.wordpress.com/2013/09/08/basic-feature-engineering-with-the-titanic-data/

def title_in_name(name, target_list):
    """Return the first title from ``target_list`` that occurs in ``name``.

    Titles are matched as whole words first (surrounding punctuation is
    ignored), so 'Mrs.' is no longer reported as 'Mr' merely because 'Mr'
    comes earlier in the list. If no whole word matches, the original
    substring search is used as a fallback, so a name that matched before
    (e.g. 'Dona.' via 'Don') still yields a title.

    Parameters
    ----------
    name : str
        Passenger name to search.
    target_list : iterable of str
        Candidate titles in priority order. This argument is now honoured;
        previously the global ``title_list`` was read instead.

    Returns
    -------
    str or float
        The matching title, or ``np.nan`` when nothing matches (the name is
        printed in that case so unmatched names can be inspected).
    """
    # Whole-word pass: '(Lily' -> 'Lily', 'Mrs.' -> 'Mrs'.
    words = {word.strip(string.punctuation) for word in name.split()}
    for title in target_list:
        if title in words:
            return title

    # Fallback: plain substring search, same priority order.
    for title in target_list:
        if title in name:
            return title

    print(name)
    return np.nan


In [None]:
# Derive a Title column from each passenger name, then drop the raw name.
train['Title'] = train['Name'].apply(title_in_name, args=(title_list,))
train = train.drop(['Name'], axis=1)

test['Title'] = test['Name'].apply(title_in_name, args=(title_list,))
test = test.drop(['Name'], axis=1)


In [None]:
# Preview the test frame with the new Title column (the train preview is left disabled).
#train.head()
test.head()

In [None]:
def replace_titles(x):
    """Collapse a row's title into one of 'Mr', 'Mrs', 'Miss' or 'Master'.

    Parameters
    ----------
    x : mapping
        A row (or any mapping) with a 'Title' entry and a 'Sex' entry.

    Returns
    -------
    The collapsed title. 'Dr' becomes 'Mr' when ``x['Sex'] == 'male'`` and
    'Mrs' otherwise. A title not listed here is returned unchanged.
    """
    title = x['Title']

    # 'Dr' is the only title whose replacement depends on the passenger's sex.
    if title == 'Dr':
        return 'Mr' if x['Sex'] == 'male' else 'Mrs'

    # 'Sir' is now collapsed as well: it can be produced by the title
    # extraction but is not a key of the later title/age lookups.
    if title in ['Don', 'Major', 'Capt', 'Jonkheer', 'Rev', 'Col', 'Sir']:
        return 'Mr'
    if title in ['Countess', 'Mme', 'Mme.']:
        return 'Mrs'
    if title in ['Mlle', 'Ms', 'Mlle.', 'Ms.']:
        return 'Miss'
    return title
    
# Collapse the titles row by row in both frames.
train = train.assign(Title=train.apply(replace_titles, axis=1))
test = test.assign(Title=test.apply(replace_titles, axis=1))

In [None]:
# Encode Sex as an integer flag: 1 for 'male', 0 for anything else.
train['Sex'] = train['Sex'].eq('male').astype('int')
test['Sex'] = test['Sex'].eq('male').astype('int')

In [None]:
# Model inputs, in the column order used for both the training and the test matrix.
feature_columns = ['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'FamilySize', 'Title', 'Total Fare']

X = train[feature_columns]
# Kept as a one-column frame; it is flattened after the conversion to NumPy.
Y = train[['Survived']]

X_test = test[feature_columns]


In [None]:
# Hold out a fraction of the training rows as a CV set (fixed seed).
test_split = 0.1
X_train, X_CV, Y_train, Y_CV = train_test_split(
    X, Y, test_size=test_split, random_state=42
)

# Report the shapes of both splits.
for _label, _features, _labels in (('Train : ', X_train, Y_train),
                                   ('CV : ', X_CV, Y_CV)):
    print(_label)
    print(_features.shape)
    print(_labels.shape)

In [None]:
def get_means_age(df):
    """Return the mean known age for each passenger title.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Title' column and a numeric 'Age' column. Rows whose
        age is NaN are left out of the averages.

    Returns
    -------
    dict
        Maps title -> mean age. 'Mr', 'Mrs', 'Miss' and 'Master' are always
        present; a title with no known age gets the overall mean age of
        ``df`` (previously this raised ZeroDivisionError). Titles outside
        those four are included as well instead of raising KeyError.
    """
    # Default for titles that have no known age in df.
    overall_mean = df['Age'].mean()
    mean_val = dict.fromkeys(['Mr', 'Mrs', 'Miss', 'Master'], overall_mean)

    # The grouped mean skips NaN ages; a group with no known age yields NaN
    # and is dropped so that the default above is kept.
    per_title = df.groupby('Title')['Age'].mean().dropna()
    mean_val.update(per_title.to_dict())

    return mean_val

# Per-title mean ages, computed on the full training frame (this includes the rows held out as the CV split).
mean_ages = get_means_age(train)

def replace_nan_ages(x):
    """Fill a missing 'Age' in row ``x`` with the mean age of the row's title.

    Reads the module-level ``mean_ages`` mapping. The row is modified in
    place when its age is NaN and is returned in either case.
    """
    age_is_known = not np.isnan(x['Age'])
    if age_is_known:
        return x

    x['Age'] = mean_ages[x['Title']]
    return x




In [None]:
# Impute missing ages row by row in every feature frame.
X_train, X_CV, X_test = (
    frame.apply(replace_nan_ages, axis=1)
    for frame in (X_train, X_CV, X_test)
)


In [None]:
# Integer code assigned to each collapsed title.
title = {'Mr': 0, 'Master': 1, 'Miss': 2, 'Mrs': 3}


def title2num(x):
    """Return the integer code of row ``x``'s 'Title' (KeyError for an unknown title)."""
    row_title = x['Title']
    return title[row_title]


# Replace the Title strings with their codes in every feature frame.
for _frame in (X_train, X_CV, X_test):
    _frame['Title'] = _frame.apply(title2num, axis=1)

In [None]:
# Preview the training features after age imputation and title encoding.
X_train.head()

In [None]:
# Preview the training labels (the 'Survived' column).
Y_train.head()

In [None]:
# Convert the feature frames to NumPy arrays.
X_train, X_CV, X_test = np.array(X_train), np.array(X_CV), np.array(X_test)

# Labels: convert the one-column frames and flatten them to 1-D vectors.
Y_train = np.array(Y_train).ravel()
Y_CV = np.array(Y_CV).ravel()

# Model Training

In [None]:
# Append the CV rows back onto the training arrays (row-wise).
X_train = np.concatenate([X_train, X_CV], axis=0)
Y_train = np.concatenate([Y_train, Y_CV], axis=0)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score, RandomizedSearchCV
from sklearn import metrics
import matplotlib.pyplot as plt
from scipy.stats import randint

In [None]:
# Search space for the random forest: lists are sampled from directly,
# min_samples_leaf is drawn from a scipy randint(1, 9) distribution.
params = dict(
    n_estimators=[1000, 5000],
    criterion=["gini", "entropy"],
    min_samples_leaf=randint(1, 9),
    max_depth=[1, None],
)

In [None]:
# Base estimator handed to the hyper-parameter search.
# random_state pins the forest's randomness so repeated runs build the same
# trees (42 is the seed already used for the train/CV split).
clf = RandomForestClassifier(verbose=1, n_jobs=8, random_state=42)

In [None]:
# Randomized search over `params` with 5-fold cross-validation.
# random_state makes the sampled parameter settings reproducible between runs
# (42 is the seed already used for the train/CV split).
best_model = RandomizedSearchCV(clf, params, cv=5, random_state=42)

In [None]:
# Run the search on the merged training data; with cv=5 every sampled setting is fit on five folds.
best_model.fit(X_train, Y_train)

In [None]:
# Mean cross-validated score of the best parameter setting found.
best_model.best_score_

In [None]:
# Parameter setting that achieved the best cross-validated score.
best_model.best_params_

In [None]:
#print( np.mean(cross_val_score(best_model, X_train, Y_train, cv=4, scoring='f1')) )

In [None]:
# The search object has no feature_importances_ attribute of its own;
# the importances live on the refitted best forest.
best_model.best_estimator_.feature_importances_

In [None]:
# Disabled evaluation code, kept inside a bare string literal so none of it runs.
# NOTE(review): it reads Y_pred_CV, which no visible cell defines, and it scores
# `clf` rather than the fitted search object — revisit both before re-enabling.
'''
print('Accuracy : ', clf.score(X_CV, Y_CV))

fpr, tpr, thresholds = metrics.roc_curve(Y_CV, Y_pred_CV, pos_label=1)
roc_auc = metrics.auc(fpr, tpr)
print('AUC       : ', roc_auc)

f1 = metrics.f1_score(Y_CV, Y_pred_CV, average='binary')
print('F1 score  : ', f1)

cm = metrics.confusion_matrix(Y_CV, Y_pred_CV)
print('Confusion Matrix : ')
print(cm)

plt.figure()
plt.plot(fpr, tpr, color='darkorange', lw=1, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")
plt.grid()
'''

# Making Predictions

In [None]:
# `clf` is only the unfitted template: the search fits clones of it, so
# calling clf.predict would fail. The fitted search object predicts with the
# best estimator it refitted.
prediction = best_model.predict(X_test)

In [None]:
# Test PassengerId values (saved before the column was dropped) as a NumPy array.
id_list = np.array(test_pID)

In [None]:
# Columns of the submission file: passenger id and predicted survival.
submission = dict(PassengerId=id_list, Survived=prediction)

In [None]:
# Build the submission table; each dict key becomes a column.
df = pd.DataFrame(submission)

In [None]:
# Write the submission to 'submission.csv' (relative path) without the index column.
df.to_csv('submission.csv', index=False)