In [28]:
from sklearn.ensemble import RandomForestRegressor
# The error metric: in this case we'll use the c-statistic (a.k.a. ROC AUC)
from sklearn.metrics import roc_auc_score
import pandas as pd
# Load the training data; the path is relative to the notebook's working directory.
X=pd.read_csv('titanic/train.csv')
# pop() removes the 'Survived' column from X in place and returns it as the target.
y=X.pop('Survived')

In [29]:
# Summary statistics of the numeric columns; a count below the row total signals missing values.
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,714.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,14.526497,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,20.125,0.0,0.0,7.9104
50%,446.0,3.0,28.0,0.0,0.0,14.4542
75%,668.5,3.0,38.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


In [30]:
# Impute missing ages with the mean age.
# The result is assigned back to the column instead of calling
# fillna(inplace=True) on the X['Age'] selection: with pandas Copy-on-Write the
# in-place call operates on a temporary object and leaves X unchanged.
X['Age'] = X['Age'].fillna(X['Age'].mean())
X.describe()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
count,891.0,891.0,891.0,891.0,891.0,891.0
mean,446.0,2.308642,29.699118,0.523008,0.381594,32.204208
std,257.353842,0.836071,13.002015,1.102743,0.806057,49.693429
min,1.0,1.0,0.42,0.0,0.0,0.0
25%,223.5,2.0,22.0,0.0,0.0,7.9104
50%,446.0,3.0,29.699118,0.0,0.0,14.4542
75%,668.5,3.0,35.0,1.0,0.0,31.0
max,891.0,3.0,80.0,8.0,6.0,512.3292


# fitting model with numerical columns only

In [31]:
# Collect the names of the numeric columns.
# select_dtypes(include="number") keeps only integer/float columns, whereas the
# previous `dtypes != "object"` test would also let through any other non-numeric
# dtype (e.g. string, category or datetime columns) that the model cannot consume.
numeric_variables = X.select_dtypes(include="number").columns.tolist()
X[numeric_variables].head()

Unnamed: 0,PassengerId,Pclass,Age,SibSp,Parch,Fare
0,1,3,22.0,1,0,7.25
1,2,1,38.0,1,0,71.2833
2,3,3,26.0,0,0,7.925
3,4,1,35.0,1,0,53.1
4,5,3,35.0,0,0,8.05


In [32]:
# Baseline forest of 100 trees fitted on the numeric columns only.
# oob_score=True requests the out-of-bag estimate; random_state=42 makes the model exactly replicable.
model=RandomForestRegressor(n_estimators=100, oob_score=True, random_state=42)
model.fit(X[numeric_variables], y)
# Out-of-bag score of the fitted forest (for a regressor this is the R^2 of the OOB predictions).
model.oob_score_

0.1361695005913669

In [33]:
# Out-of-bag predictions for the training rows, as exposed by the fitted forest.
y_oob=model.oob_prediction_
# Benchmark metric: c-statistic (ROC AUC) of the OOB predictions against the true labels.
print("c-stat:", roc_auc_score(y, y_oob))

c-stat: 0.73995515504


# Introducing categorical columns in the model

In [34]:
# Helper that displays descriptive statistics for the categorical variables
def describe_categorical(X):
    """Render ``describe()`` of the object-dtype columns of ``X`` as an HTML table.

    X: DataFrame whose object-dtype columns should be summarised.
    Displays the table in the notebook; returns nothing.
    """
    from IPython.display import HTML, display

    object_columns = X.columns[X.dtypes == "object"]
    summary = X[object_columns].describe()
    display(HTML(summary.to_html()))

In [35]:
# Summarise the object-dtype (categorical) columns of the feature frame.
describe_categorical(X)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked
count,891,891,891,204,889
unique,891,2,681,147,3
top,"Clarke, Mrs. Charles V (Ada Maria Winfield)",male,1601,B96 B98,S
freq,1,577,7,4,644


In [36]:
# Remove the Name, Ticket and PassengerId columns before modelling.
X = X.drop(columns=['Name', 'Ticket', 'PassengerId'])

In [37]:
# Preview the frame after dropping Name, Ticket and PassengerId.
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,,S
1,1,female,38.0,1,0,71.2833,C85,C
2,3,female,26.0,0,0,7.925,,S
3,1,female,35.0,1,0,53.1,C123,S
4,3,male,35.0,0,0,8.05,,S


In [38]:
#Reduce the cabin value to its first character, or "None" when it is missing
def clean_cabin(x):
    """Return the first character of a cabin value.

    x: a cabin value; any indexable object, or a non-indexable placeholder
       such as float NaN or None for a missing cabin.

    Returns ``x[0]``, or the string "None" when ``x`` cannot be indexed or is
    empty.
    """
    try:
        return x[0]
    except (TypeError, IndexError):
        # TypeError: x is not subscriptable (NaN / None).
        # IndexError: x is an empty string or sequence.
        return "None"
# Replace every cabin entry with the value returned by clean_cabin.
X['Cabin'] = X['Cabin'].map(clean_cabin)

In [39]:
# Preview the frame after the Cabin clean-up.
X.head()

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked
0,3,male,22.0,1,0,7.25,,S
1,1,female,38.0,1,0,71.2833,C,C
2,3,female,26.0,0,0,7.925,,S
3,1,female,35.0,1,0,53.1,C,S
4,3,male,35.0,0,0,8.05,,S


In [40]:
categorical_variables=['Sex', 'Cabin', 'Embarked']

# Fill missing data with the word "Missing" so it gets its own dummy column.
# The filled columns are assigned back instead of calling fillna(inplace=True)
# on X[variable]: with pandas Copy-on-Write that in-place call works on a
# temporary object and the missing values would stay in X.
X[categorical_variables] = X[categorical_variables].fillna("Missing")

# One-hot encode all categorical variables in a single call. get_dummies drops
# the source columns, prefixes each dummy with the source column name, and
# appends the dummies after the remaining columns in the order listed above.
X = pd.get_dummies(X, columns=categorical_variables)

In [41]:
# Preview the frame with the categorical variables replaced by dummy columns.
X.head()

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_None,Cabin_T,Embarked_C,Embarked_Missing,Embarked_Q,Embarked_S
0,3,22.0,1,0,7.25,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
1,1,38.0,1,0,71.2833,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0
2,3,26.0,0,0,7.925,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,1,35.0,1,0,53.1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,3,35.0,0,0,8.05,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1


In [42]:
# Show every column of the dataset at once
def printall(X, max_rows=10):
    """Display ``X`` as an HTML table, passing ``max_rows`` on to ``to_html``.

    X: DataFrame to render.
    max_rows: row limit forwarded to ``DataFrame.to_html`` (default 10).
    """
    from IPython.display import HTML, display

    rendered = X.to_html(max_rows=max_rows)
    display(HTML(rendered))


printall(X)

Unnamed: 0,Pclass,Age,SibSp,Parch,Fare,Sex_female,Sex_male,Cabin_A,Cabin_B,Cabin_C,Cabin_D,Cabin_E,Cabin_F,Cabin_G,Cabin_None,Cabin_T,Embarked_C,Embarked_Missing,Embarked_Q,Embarked_S
0,3,22.000000,1,0,7.2500,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
1,1,38.000000,1,0,71.2833,1,0,0,0,1,0,0,0,0,0,0,1,0,0,0
2,3,26.000000,0,0,7.9250,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
3,1,35.000000,1,0,53.1000,1,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,3,35.000000,0,0,8.0500,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,2,27.000000,0,0,13.0000,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1
887,1,19.000000,0,0,30.0000,1,0,0,1,0,0,0,0,0,0,0,0,0,0,1
888,3,29.699118,1,2,23.4500,1,0,0,0,0,0,0,0,0,1,0,0,0,0,1
889,1,26.000000,0,0,30.0000,0,1,0,0,1,0,0,0,0,0,0,1,0,0,0


In [44]:
# Refit the forest on every column of X (numeric plus dummy-encoded variables)
# and report the c-statistic of its out-of-bag predictions.
model = RandomForestRegressor(
    n_estimators=100,
    n_jobs=-1,
    oob_score=True,
    random_state=42,
)
model.fit(X, y)
print("c-stat:", roc_auc_score(y, model.oob_prediction_))

c-stat: 0.863521128261


# Variable importance measures

In [48]:
# Raw importance scores, one per column of the X the model was fitted on.
model.feature_importances_

array([  9.11384671e-02,   2.38891052e-01,   4.43567267e-02,
         2.15831071e-02,   2.15047796e-01,   1.43423437e-01,
         1.58822440e-01,   2.95342368e-03,   3.79055011e-03,
         6.47116172e-03,   4.30998991e-03,   8.59480266e-03,
         1.02403226e-03,   8.12054428e-04,   2.67741854e-02,
         6.64265010e-05,   1.06189189e-02,   0.00000000e+00,
         6.00379221e-03,   1.53176370e-02])

In [None]:
#Simple version that shows all of the variables
# Series.sort() is no longer available in pandas; sort_values() returns the
# sorted Series (ascending), so the result is kept rather than sorted in place.
feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values()
# The plotting keyword is `figsize`; `fig_size` is not a recognised option.
feature_importances.plot(kind='barh', figsize=(7, 6));

# Parameter Optimization

Parameters that make your model better
    
    n_estimators = number of trees in the forest. Choose as high a value as your computer can handle.
    
    max_features = number of features to consider when looking for the best split. Try [None, 'log2', 0.9 and 0.2]; older scikit-learn versions also accepted 'auto', which for a regressor means all features (same as None or 1.0).
    
    min_samples_leaf = minimum number of samples in a newly created leaf.


Parameters that will make it easier to train your model
   
   n_jobs = determines whether multiple processors are used to train and test the model. Always set it to -1 (use all available processors).

### n_jobs

In [49]:
%%timeit
# Timing baseline: fit 1000 trees with n_jobs=1.
model=RandomForestRegressor(n_estimators=1000, n_jobs=1, oob_score=True, random_state=42)
model.fit(X, y)

8.4 s ± 397 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [50]:
%%timeit
# Same fit with n_jobs=-1, for comparison against the n_jobs=1 timing.
model=RandomForestRegressor(n_estimators=1000, n_jobs=-1, oob_score=True, random_state=42)
model.fit(X, y)

5.91 s ± 244 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


### n_estimators

In [54]:
# Quick look at the target values (the 'Survived' column popped from X).
y.head()

0    0
1    1
2    1
3    1
4    0
Name: Survived, dtype: int64

In [55]:
# Sweep the forest size and record the out-of-bag c-statistic for each setting.
n_estimator_options = [30, 50, 100, 200, 500, 1000]
results = []
for trees in n_estimator_options:
    model = RandomForestRegressor(
        n_estimators=trees,
        n_jobs=-1,
        oob_score=True,
        random_state=42,
    )
    model.fit(X, y)
    print(trees, "trees")
    roc = roc_auc_score(y, model.oob_prediction_)
    print("C-stat: ", roc)
    results.append(roc)
    print(" ")
# Plot the c-statistic against the number of trees.
pd.Series(results, index=n_estimator_options).plot()

30 trees
C-stat:  0.853870407652
 
50 trees
C-stat:  0.860698345743
 
100 trees
C-stat:  0.863521128261
 
200 trees
C-stat:  0.862192290076
 
500 trees
C-stat:  0.863739494456
 
1000 trees
C-stat:  0.864043076726
 


<matplotlib.axes._subplots.AxesSubplot at 0x335d849ba8>

# Final Model

In [56]:
# Final model: 1000 trees with at least 5 samples per leaf.
# max_features=1.0 considers every feature at each split, which is what "auto"
# meant for RandomForestRegressor; the "auto" value is rejected by
# scikit-learn >= 1.3, while the float form is accepted by old and new versions.
model=RandomForestRegressor(n_estimators=1000, n_jobs=-1, oob_score=True, random_state=42, max_features=1.0, min_samples_leaf=5)
model.fit(X, y)
# c-statistic of the out-of-bag predictions against the true labels.
roc=roc_auc_score(y, model.oob_prediction_)
print("C-stat: ", roc)

C-stat:  0.874269005848
