In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# read in data: feature table and label table, loaded from the data/ folder

train_values, train_labels = (
    pd.read_csv(csv_path)
    for csv_path in ('data/Proj5_train_values.csv', 'data/Proj5_train_labels.csv')
)

## Modeling with 10% of data
- Only the first 10% of rows are used so that the models train faster

In [3]:
# grab first 10% of rows
#
# Features and labels are paired purely by row position further down (X comes
# from train_values, y from train_labels), so confirm the two frames line up
# before slicing; a mismatch would attach the wrong damage_grade to a building.
assert len(train_values) == len(train_labels), (
    'train_values and train_labels have different row counts'
)
if 'building_id' in train_values.columns and 'building_id' in train_labels.columns:
    assert np.array_equal(train_values['building_id'].values,
                          train_labels['building_id'].values), (
        'train_values and train_labels are not in the same building_id order'
    )

# a single shared row count guarantees both subsets have the same length
n_subset_rows = int(len(train_values) * 0.1)

train_values_10pct = train_values.head(n_subset_rows)
train_labels_10pct = train_labels.head(n_subset_rows)

#### Baseline + train/test split (TTS)

In [10]:
# baseline model: share of each damage_grade class in the 10% subset
# (the largest share is the accuracy of always predicting that class)

train_labels_10pct.loc[:, 'damage_grade'].value_counts(normalize=True)

2    0.567421
3    0.336109
1    0.096470
Name: damage_grade, dtype: float64

In [4]:
# establish X + y
# X: every column except the building_id identifier; y: the damage_grade target

X = train_values_10pct.drop(['building_id'], axis=1)
y = train_labels_10pct.loc[:, 'damage_grade']

In [5]:
# train/test split (tts), stratified on y with a fixed seed;
# no test_size is given, so scikit-learn's default split proportion applies

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=123,
    stratify=y,
)

#### Modeling

In [6]:
# Random Forest: one-hot encode -> standardize -> forest, tuned with a grid search

pipe_forest = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    StandardScaler(),
    RandomForestClassifier(random_state=123, n_jobs=-1),
)

# search space; keys are '<pipeline step name>__<hyperparameter>'
params = {
    'randomforestclassifier__max_depth': list(range(6, 12)),
    'randomforestclassifier__max_features': [15, 20, 30, 35],
}

grid_forest = GridSearchCV(estimator=pipe_forest, param_grid=params)
grid_forest.fit(X_train, y_train)

# score the refit best pipeline on both splits to gauge over-fitting
forest_train_score = grid_forest.score(X_train, y_train)
forest_test_score = grid_forest.score(X_test, y_test)
print(f'Train Score: {forest_train_score}')
print(f'Test Score: {forest_test_score}')

# last expression: the winning hyperparameter combination
grid_forest.best_params_

Train Score: 0.7649015093374264
Test Score: 0.6836531082118189


{'randomforestclassifier__max_depth': 11,
 'randomforestclassifier__max_features': 35}

In [34]:
# grab feature importances from the best random-forest pipeline,
# labelled with the one-hot encoder's output column names

best_forest_steps = grid_forest.best_estimator_.named_steps

forest_fi_df = (
    pd.DataFrame({
        'importances': best_forest_steps['randomforestclassifier'].feature_importances_,
        'name': best_forest_steps['onehotencoder'].get_feature_names(),
    })
    .sort_values(by='importances', ascending=False)
)

# five most important features
forest_fi_df.head(5)

Unnamed: 0,importances,name
0,0.322698,geo_level_1_id
1,0.091739,geo_level_2_id
2,0.076912,geo_level_3_id
10,0.064281,foundation_type_r
4,0.056407,age


In [37]:
# test to ensure the encoder's output feature names + feature_importances are
# same length (the importances are labelled positionally with those names, so
# a length mismatch would mean the labels cannot be trusted)

forest_steps = grid_forest.best_estimator_.named_steps
n_forest_importances = len(forest_steps['randomforestclassifier'].feature_importances_)
n_forest_names = len(forest_steps['onehotencoder'].get_feature_names())

print(n_forest_importances)
print(n_forest_names)

# fail loudly instead of relying on someone eyeballing the two printed numbers
assert n_forest_importances == n_forest_names, (
    f'{n_forest_importances} feature importances but {n_forest_names} feature names'
)

68
68


In [7]:
# Extra Trees: one-hot encode -> standardize -> extra-trees, tuned with a grid search

pipe_trees = make_pipeline(
    OneHotEncoder(use_cat_names=True),
    StandardScaler(),
    ExtraTreesClassifier(random_state=123, n_jobs=-1),
)

# search space; keys are '<pipeline step name>__<hyperparameter>'
params = {
    'extratreesclassifier__max_depth': list(range(6, 12)),
    'extratreesclassifier__max_features': [15, 20, 30, 35],
}

grid_trees = GridSearchCV(estimator=pipe_trees, param_grid=params)
grid_trees.fit(X_train, y_train)

# score the refit best pipeline on both splits to gauge over-fitting
trees_train_score = grid_trees.score(X_train, y_train)
trees_test_score = grid_trees.score(X_test, y_test)
print(f'Train Score: {trees_train_score}')
print(f'Test Score: {trees_test_score}')

# last expression: the winning hyperparameter combination
grid_trees.best_params_

Train Score: 0.7266820158608339
Test Score: 0.664313123561013


{'extratreesclassifier__max_depth': 11,
 'extratreesclassifier__max_features': 35}

In [32]:
# grab feature importances from the best extra-trees pipeline,
# labelled with the one-hot encoder's output column names

best_trees_steps = grid_trees.best_estimator_.named_steps

trees_fi_df = (
    pd.DataFrame({
        'importances': best_trees_steps['extratreesclassifier'].feature_importances_,
        'name': best_trees_steps['onehotencoder'].get_feature_names(),
    })
    .sort_values(by='importances', ascending=False)
)

# five most important features
trees_fi_df.head(5)

Unnamed: 0,importances,name
0,0.286205,geo_level_1_id
10,0.11876,foundation_type_r
19,0.054393,ground_floor_type_v
42,0.041385,has_superstructure_mud_mortar_stone
1,0.026127,geo_level_2_id


In [31]:
# test to ensure the encoder's output feature names + feature_importances are
# same length (the importances are labelled positionally with those names, so
# a length mismatch would mean the labels cannot be trusted)

trees_steps = grid_trees.best_estimator_.named_steps
n_trees_importances = len(trees_steps['extratreesclassifier'].feature_importances_)
n_trees_names = len(trees_steps['onehotencoder'].get_feature_names())

print(n_trees_importances)
print(n_trees_names)

# fail loudly instead of relying on someone eyeballing the two printed numbers
assert n_trees_importances == n_trees_names, (
    f'{n_trees_importances} feature importances but {n_trees_names} feature names'
)

68
68
