In [1]:
import os
import pandas as pd
from sklearn.preprocessing import Imputer
from sklearn.base import TransformerMixin
import numpy as np

In [2]:
class CategoricalImputer(TransformerMixin):
    def fit(self, X, y=None):
        # uniques, counts = np.unique(X, return_counts=True)
        self.fill = pd.Series([X[c].value_counts().index[0]
                               if X[c].dtype == np.dtype('O') else X[c].mean() for c in X],
                              index=X.columns)

        return self

    def transform(self, X, y=None):
        return X.fillna(self.fill)

In [3]:
# data_dir = '/home/joan/PycharmProjects/titanic/data'
data_dir = '../data'
train_file = os.path.join(data_dir, 'train.csv')

In [4]:
data = pd.read_csv(train_file)
def drop_cols(data):
    data = data.drop(['PassengerId', 'Name', 'Ticket', 'Cabin'], axis=1)
    return data

data = drop_cols(data)
data.head()

Unnamed: 0,Survived,Pclass,Sex,Age,SibSp,Parch,Fare,Embarked
0,0,3,male,22.0,1,0,7.25,S
1,1,1,female,38.0,1,0,71.2833,C
2,1,3,female,26.0,0,0,7.925,S
3,1,1,female,35.0,1,0,53.1,S
4,0,3,male,35.0,0,0,8.05,S


In [5]:
def impute_data(data):
    data[['Age', 'Fare']] = Imputer().fit_transform(data[['Age', 'Fare']].values)
    embark_imputer = CategoricalImputer()
    data['Embarked'] = embark_imputer.fit_transform(data[['Embarked']])
    return data

data = impute_data(data)
data.isnull().sum()

Survived    0
Pclass      0
Sex         0
Age         0
SibSp       0
Parch       0
Fare        0
Embarked    0
dtype: int64

In [6]:
data.groupby(['Sex'])['Survived'].sum() * 100 /  data.groupby(['Sex'])['Survived'].count()

Sex
female    74.203822
male      18.890815
Name: Survived, dtype: float64

In [7]:
data.groupby(['Sex']).count()

Unnamed: 0_level_0,Survived,Pclass,Age,SibSp,Parch,Fare,Embarked
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
female,314,314,314,314,314,314,314
male,577,577,577,577,577,577,577


In [8]:
data.groupby(['Pclass'])['Survived'].sum() * 100 / data.groupby(['Pclass'])['Survived'].count()

Pclass
1    62.962963
2    47.282609
3    24.236253
Name: Survived, dtype: float64

In [9]:
attr = 'SibSp'
data.groupby([attr])['Survived'].sum() * 100 / data.groupby([attr])['Survived'].count()

SibSp
0    34.539474
1    53.588517
2    46.428571
3    25.000000
4    16.666667
5     0.000000
8     0.000000
Name: Survived, dtype: float64

In [10]:
attr = 'Parch'
data.groupby([attr])['Survived'].sum() * 100 / data.groupby([attr])['Survived'].count()

Parch
0    34.365782
1    55.084746
2    50.000000
3    60.000000
4     0.000000
5    20.000000
6     0.000000
Name: Survived, dtype: float64

In [11]:
attr = 'Embarked'
data.groupby([attr])['Survived'].sum() * 100 / data.groupby([attr])['Survived'].count()

Embarked
C    55.357143
Q    38.961039
S    33.900929
Name: Survived, dtype: float64

In [12]:
def cat_to_num(series):
    series = series.astype('category')
    return series.cat.codes

In [13]:
def data_to_cat(data):
    data[['Sex', 'Embarked']] = data[['Sex', 'Embarked']].apply(cat_to_num)
    return data

data = data_to_cat(data)
data[['Sex', 'Embarked']].head()


Unnamed: 0,Sex,Embarked
0,1,2
1,0,0
2,0,2
3,0,2
4,1,2


In [14]:
from sklearn.model_selection import train_test_split
train, dev = train_test_split(data, test_size=0.2)

In [15]:
# build a random forest

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
clf = RandomForestClassifier()
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [17]:
def check_accuracy(clf, X, y, test_X, test_y):
#   X, y are dataframes 
    clf = clf.fit(X, y)
    pred = clf.predict(test_X)
    return accuracy_score(test_y, pred)

In [18]:
# train on the training set
X = train[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
y = train[['Survived']]
dev_X = dev[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]
dev_y = dev[['Survived']]
# check_accuracy(clf, X, y, test_X, test_y)

# fit the model
clf = clf.fit(X, y)

# predict train set
train_yhat = clf.predict(X)
accuracy_score(y, train_yhat)

  if __name__ == '__main__':


0.9620786516853933

In [19]:
# predict the dev set - to see what the variance
dev_yhat = clf.predict(dev_X)
accuracy_score(dev_y, dev_yhat)

0.81005586592178769

In [20]:
# build a decision tree so we can do visualization
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier()
tree_clf = tree_clf.fit(X, y)
tree_clf

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [21]:
# visualize the tree before hyperparameter tuning

from sklearn import tree
with open('titanic.dot', 'w') as f:
    f = tree.export_graphviz(tree_clf,
                            feature_names=['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked'],
                            out_file=f)

In [26]:
# hyper parameter tuning
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials

space = {
    'max_depth': hp.quniform('max_depth', 5, 20, 1),
    'max_leaf_nodes': hp.quniform('max_leaf_nodes', 30, 100, 5),
    'n_estimators': hp.quniform('n_estimators', 10, 50, 5),
}

def score(params):
    params['n_estimators'] = int(params['n_estimators'])
    clf = RandomForestClassifier(**params)
    return {'loss': 1-check_accuracy(clf), 'status': STATUS_OK}

trials = Trials()
best = fmin(score, space, algo=tpe.suggest, trials=trials, max_evals=250)

TypeError: 'generator' object is not subscriptable

In [None]:
# try with test data
test = pd.read_csv(os.path.join(data_dir, 'test.csv'))
passenger_id = test[['PassengerId']]
test = drop_cols(test)
test = impute_data(test)
t = data_to_cat(test)

test_X = rt[['Pclass', 'Sex', 'Age', 'SibSp', 'Parch', 'Fare', 'Embarked']]

pred = clf.predict(test_X)
pred

In [None]:
# write prediction to file
outfile = os.path.join(data_dir, 'prediction.csv')
df = pd.DataFrame()
df['PassengerId'] = passenger_id
df['Survived'] = pred
df.to_csv(outfile, index=False)