In [1]:
%load_ext autoreload
%autoreload 2

%matplotlib inline

In [2]:
from fastai.imports import *
from fastai.structured import *

from pandas_summary import DataFrameSummary
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from IPython.display import FileLinks

from sklearn import metrics

1. add_datepart (not relevant)
2. train_cats
3. proc_df

### Import and explore

In [67]:
df_trn_raw = pd.read_csv("data/titanic-train.csv", low_memory=False)

In [68]:
df_trn_raw.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### Remove columns

In [69]:
del df_trn_raw["PassengerId"]
del df_trn_raw["Name"]
del df_trn_raw["Ticket"]

### Replace null values

1. Convert Cabin
2. Convert strings to categorical

In [70]:
df_trn_raw["Cabin"] = df_trn_raw["Cabin"].fillna("N").str[0]
train_cats(df_trn_raw)
X_trn, y_trn, nas = proc_df(df_trn_raw, "Survived")

3. Scale and center values

In [71]:
X_sc, y_sc, nas = proc_df(df_trn_raw, "Survived")

In [74]:
X_sc.head(5).T

Unnamed: 0,0,1,2,3,4
Pclass,3,1,3,1,3
Sex,2,1,1,1,2
Age,22,38,26,35,35
SibSp,1,1,0,1,0
Parch,0,0,0,0,0
Fare,7.25,71.2833,7.925,53.1,8.05
Cabin,8,3,8,3,8
Embarked,3,1,3,3,3
Age_na,False,False,False,False,False


In [78]:
X_sc["Age"] = (X_sc["Age"] - np.mean(X_sc["Age"])) / np.std(X_sc["Age"])
X_sc["Fare"] = (X_sc["Fare"] - np.mean(X_sc["Fare"])) / np.std(X_sc["Fare"])

### Create validation set

In [48]:
def split_val(a, n): return a[:n], a[n:]

def rmse(x, y): return math.sqrt(((x - y) ** 2).mean())

def rf_eval(m, X_trn, y_trn, X_val, y_val):
    print("RMSE trn: {}\nRMSE val: {}\nScore trn: {}\nScore val: {}".format(
        rmse(m.predict(X_trn), y_trn),
        rmse(m.predict(X_val), y_val),
        m.score(X_trn, y_trn),
        m.score(X_val, y_val)
    ))

In [50]:
n_val = math.ceil(len(df_trn_raw) * 0.25)
n_trn = 1 - n_val 

In [79]:
X_trn_sc, X_val_sc = split_val(X_sc, n_trn)
y_trn_sc, y_val_sc = split_val(y_sc, n_trn)

In [80]:
m = RandomForestClassifier(n_jobs=-1)
m.fit(X_trn_sc, y_trn_sc)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

#### Original

In [52]:
rf_eval(m, X_trn, y_trn, X_val, y_val)

RMSE trn: 0.16852476941780506
RMSE val: 0.3970612769556579
Score trn: 0.9715994020926756
Score val: 0.8423423423423423


#### Scaled & centered

In [81]:
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

RMSE trn: 0.1771726122433938
RMSE val: 0.3855498004780299
Score trn: 0.968609865470852
Score val: 0.8513513513513513


#### Increase num trees & leaf size 

In [85]:
m = RandomForestClassifier(n_estimators=1000, min_samples_leaf=3, n_jobs=-1,)
% time m.fit(X_trn_sc, y_trn_sc)
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

CPU times: user 1.54 s, sys: 160 ms, total: 1.7 s
Wall time: 1.34 s
RMSE trn: 0.3280599310882849
RMSE val: 0.3796631983009996
Score trn: 0.8923766816143498
Score val: 0.8558558558558559


#### Reduce max features

In [127]:
m = RandomForestClassifier(n_estimators=1000, min_samples_leaf=3, max_features=0.5, n_jobs=-1,)
% time m.fit(X_trn_sc, y_trn_sc)
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

CPU times: user 1.54 s, sys: 140 ms, total: 1.68 s
Wall time: 1.35 s
RMSE trn: 0.31881694613367556
RMSE val: 0.3736838766118223
Score trn: 0.898355754857997
Score val: 0.8603603603603603


In [116]:
rf_feat_importance(m, X_trn_sc)

Unnamed: 0,cols,imp
1,Sex,0.391241
5,Fare,0.193441
2,Age,0.153267
0,Pclass,0.095162
6,Cabin,0.05659
3,SibSp,0.043196
7,Embarked,0.033916
4,Parch,0.020815
8,Age_na,0.012373


In [126]:
m = RandomForestClassifier(n_estimators=1000, min_samples_leaf=3, max_features="log2", n_jobs=-1,)
% time m.fit(X_trn_sc, y_trn_sc)
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

CPU times: user 1.57 s, sys: 208 ms, total: 1.78 s
Wall time: 1.45 s
RMSE trn: 0.32577377131183904
RMSE val: 0.39134786718871123
Score trn: 0.8938714499252616
Score val: 0.8468468468468469


In [125]:
m = RandomForestClassifier(n_estimators=1000, min_samples_leaf=3, max_features="sqrt", n_jobs=-1,)
% time m.fit(X_trn_sc, y_trn_sc)
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

CPU times: user 1.45 s, sys: 124 ms, total: 1.58 s
Wall time: 1.35 s
RMSE trn: 0.3280599310882849
RMSE val: 0.39134786718871123
Score trn: 0.8923766816143498
Score val: 0.8468468468468469


#### Use subsamples

In [110]:
reset_rf_samples()

In [92]:
m = RandomForestClassifier(n_estimators=1000, min_samples_leaf=3, max_features=0.5, n_jobs=-1,)
% time m.fit(X_trn_sc, y_trn_sc)
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

CPU times: user 2.16 s, sys: 400 ms, total: 2.56 s
Wall time: 1.85 s
RMSE trn: 0.32115263264204263
RMSE val: 0.3796631983009996
Score trn: 0.8968609865470852
Score val: 0.8558558558558559


#### ExtraTreesClassifier

In [93]:
from sklearn.ensemble import ExtraTreesClassifier

In [96]:
m = ExtraTreesClassifier(n_estimators=1000, n_jobs=-1)
% time m.fit(X_trn_sc, y_trn_sc)
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

CPU times: user 1.5 s, sys: 280 ms, total: 1.78 s
Wall time: 1.4 s
RMSE trn: 0.1222607177678836
RMSE val: 0.44010645722790037
Score trn: 0.9850523168908819
Score val: 0.8063063063063063


In [97]:
m = ExtraTreesClassifier(n_estimators=1000, min_samples_leaf=3, n_jobs=-1)
% time m.fit(X_trn_sc, y_trn_sc)
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

CPU times: user 1.33 s, sys: 188 ms, total: 1.52 s
Wall time: 1.2 s
RMSE trn: 0.36881420294366996
RMSE val: 0.4026936331284146
Score trn: 0.8639760837070254
Score val: 0.8378378378378378


In [98]:
m = ExtraTreesClassifier(n_estimators=1000, min_samples_leaf=3, max_features=0.5, n_jobs=-1)
% time m.fit(X_trn_sc, y_trn_sc)
rf_eval(m, X_trn_sc, y_trn_sc, X_val_sc, y_val_sc)

CPU times: user 1.6 s, sys: 244 ms, total: 1.84 s
Wall time: 1.41 s
RMSE trn: 0.3522297117020086
RMSE val: 0.39134786718871123
Score trn: 0.8759342301943199
Score val: 0.8468468468468469


In [135]:
pd.crosstab(index=df_trn_raw["Sex"], columns=df_trn_raw["Survived"], margins=True)

Survived,0,1,All
Sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
female,81,233,314
male,468,109,577
All,549,342,891


### Predict training data

In [128]:
df_test_raw = pd.read_csv("data/titanic-test.csv", low_memory=False)

In [129]:
df_test = df_test_raw.copy()
df_test["_"] = "_"

del df_test["PassengerId"]
del df_test["Name"]
del df_test["Ticket"]

In [130]:
df_test["Cabin"] = df_test["Cabin"].fillna("N").str[0]
train_cats(df_test)
X_test, _, nas = proc_df(df_test, "_")

In [131]:
X_test["Age"] = (X_test["Age"] - np.mean(X_test["Age"])) / np.std(X_test["Age"])
X_test["Fare"] = (X_test["Fare"] - np.mean(X_test["Fare"])) / np.std(X_test["Fare"])

In [132]:
del X_test["Fare_na"]

In [133]:
preds_test = m.predict(X_test)

In [134]:
subm = pd.DataFrame({
    "PassengerId": df_test_raw["PassengerId"],
    "Survived": preds_test
})

subm.to_csv("data/titanic-rf3-submission.csv", index=False)
FileLinks("data/")