# Random Forest Classifier

## Imports and Globals

In [1]:
import mahalangur as mhgr
import numpy as np
import pandas as pd
import sklearn
import sqlite3

from hashlib import sha256
from sklearn.model_selection import LeaveOneGroupOut, GridSearchCV
from sklearn.ensemble import RandomForestClassifier

## Read Data

In [2]:
sql = 'SELECT * FROM model_base WHERE expedition_year >= 1970;'

conn = sqlite3.connect(mhgr.DATABASE_PATH)
hdb_df = pd.read_sql(sql, conn, index_col=['expedition_id', 'member_id'])
conn.close()

In [3]:
def partition(id_values, n_partitions=10):
    return (int(sha256('-'.join(id_values).encode('utf-8')).hexdigest(), 16) % n_partitions) + 1

In [4]:
hdb_df['fold'] = hdb_df.index.map(lambda x: partition(x, 10))
hdb_df['test_indicator'] = hdb_df.index.map(lambda x: partition(x, 5) == 1)

## Create Model

In [5]:
model_df = pd.get_dummies(data=pd.concat([
    hdb_df['himal'],
    hdb_df['height'],
    hdb_df['expedition_year'],
    hdb_df['season'],
    pd.Series(hdb_df['commercial_route'] == 'Y', name='commercial_route', dtype=np.uint8),
    hdb_df['total_members'],
    hdb_df['total_hired'],
    hdb_df['age'],
    pd.Series(hdb_df['sex'] == 'F', name='female', dtype=np.uint8),
    pd.Series(hdb_df['o2_used'] == 'Y', name='o2_used', dtype=np.uint8)
], axis=1), columns = ['season', 'himal'])

model_df.columns = map(str.lower, model_df.columns)

In [6]:
model_df.head(25)

Unnamed: 0_level_0,Unnamed: 1_level_0,height,expedition_year,commercial_route,total_members,total_hired,age,female,o2_used,season_autumn,season_spring,...,himal_northern,himal_palchunghamga,himal_pamari,himal_peri,himal_rolwaling,himal_saipal,himal_serang,himal_singalila,himal_umbak,himal_yokapahar
expedition_id,member_id,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
ACHN15301,1,6055,2015,0,5,0,23,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15301,2,6055,2015,0,5,0,23,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15301,3,6055,2015,0,5,0,19,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15301,4,6055,2015,0,5,0,25,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15301,5,6055,2015,0,5,0,22,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15302,1,6055,2015,0,9,2,58,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15302,2,6055,2015,0,9,2,60,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15302,3,6055,2015,0,9,2,50,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15302,4,6055,2015,0,9,2,66,1,0,1,0,...,0,0,0,0,0,0,0,0,0,0
ACHN15302,5,6055,2015,0,9,2,54,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [7]:
X_train = model_df[hdb_df['test_indicator'] == 0]
y_train = hdb_df['successful_summit'][hdb_df['test_indicator'] == 0]
folds = hdb_df['fold'][hdb_df['test_indicator'] == 0]

X_test = model_df[hdb_df['test_indicator'] == 1]
y_test = hdb_df['successful_summit'][hdb_df['test_indicator'] == 1]

In [8]:
cv = LeaveOneGroupOut()
param_grid = {
    'n_estimators': [50, 60, 70, 80, 90, 100, 110, 120],
    'max_depth': [None, 5, 10, 15]
}

gscv_rf = GridSearchCV(RandomForestClassifier(criterion='gini', oob_score=False), param_grid=param_grid, cv=cv.get_n_splits(X_train, y_train, folds))

In [9]:
gscv_rf.fit(X_train, y_train)

GridSearchCV(cv=8, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False,
                                              rando

In [10]:
gscv_rf.best_score_

0.695144205091686

In [11]:
model_rf = gscv_rf.best_estimator_
model_rf

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=5, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=90,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [12]:
sum(model_rf.predict(X_train) == y_train)/len(y_train)

0.7675583051450953

In [13]:
sum(model_rf.predict(X_test) == y_test)/len(y_test)

0.769053934571176

In [14]:
pd.DataFrame({
    'feature': X_train.columns,
    'importance': 100*model_rf.feature_importances_
}).sort_values(by='importance', axis=0, ascending=False)

Unnamed: 0,feature,importance
7,o2_used,50.741831
1,expedition_year,13.015392
0,height,9.964661
2,commercial_route,7.141121
13,himal_barun,3.79277
4,total_hired,3.654419
12,himal_annapurna,2.188598
3,total_members,1.812888
9,season_spring,1.610214
5,age,1.419049
