In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder, OrdinalEncoder
from sklearn.pipeline import make_pipeline
from category_encoders import OneHotEncoder
from sklearn.metrics import f1_score, precision_score, recall_score

from sklearn.linear_model import LogisticRegression
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier, BaggingClassifier, ExtraTreesClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_selection import RFE

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [2]:
# read in data

train_values = pd.read_csv('data/Proj5_train_values.csv')
train_labels = pd.read_csv('data/Proj5_train_labels.csv')

#### Label Encode

In [3]:
# Label Encode categorical features

le = LabelEncoder()
train_enc = train_values.apply(le.fit_transform)
train_enc

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,198723,6,482,11250,1,6,5,3,2,2,...,0,0,0,0,0,0,0,0,0,0
1,7210,8,891,2602,1,2,7,5,1,2,...,0,0,0,0,0,0,0,0,0,0
2,23774,21,359,8286,1,2,4,3,2,2,...,0,0,0,0,0,0,0,0,0,0
3,146212,22,413,9868,1,2,5,3,2,2,...,0,0,0,0,0,0,0,0,0,0
4,50437,11,129,1371,2,6,7,7,2,2,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
260596,170650,25,1322,1491,0,11,5,1,0,2,...,0,0,0,0,0,0,0,0,0,0
260597,165884,17,708,1893,1,0,5,3,2,2,...,0,0,0,0,0,0,0,0,0,0
260598,149084,17,50,7546,2,11,5,5,2,2,...,0,0,0,0,0,0,0,0,0,0
260599,37871,26,38,1702,1,2,13,4,2,2,...,0,0,0,0,0,0,0,0,0,0


## Modeling with 10% of data
- For faster processing

In [4]:
# grab first 10% of rows

train_enc_10pct = train_enc.head(int(len(train_values) * 0.1))
train_labels_10pct = train_labels.head(int(len(train_labels) * 0.1))

#### Baseline + TTS

In [25]:
# baseline model

train_labels_10pct['damage_grade'].value_counts(normalize = True)

2    0.567421
3    0.336109
1    0.096470
Name: damage_grade, dtype: float64

In [5]:
# establish X + y

X = train_enc_10pct.drop(columns = ['building_id'])
y = train_labels_10pct['damage_grade']

In [6]:
# tts

X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 123)

#### Modeling

In [30]:
# Random Forest

pipe_forest = make_pipeline(StandardScaler(), RandomForestClassifier(n_jobs = -1, random_state = 123))

params = {'randomforestclassifier__max_depth' : [6, 7, 8, 9, 10, 11],
          'randomforestclassifier__max_features' : [15, 20, 30, 35]}

grid_forest = GridSearchCV(pipe_forest, param_grid = params)

grid_forest.fit(X_train, y_train)

print(f'Train Score: {grid_forest.score(X_train, y_train)}')
print(f'Test Score: {grid_forest.score(X_test, y_test)}')

grid_forest.best_params_

Train Score: 0.7793809158352519
Test Score: 0.688871834228703


{'randomforestclassifier__max_depth': 11,
 'randomforestclassifier__max_features': 35}

In [32]:
# grab feature importances

pipe_forest_fi = make_pipeline(StandardScaler(), RandomForestClassifier(n_jobs = -1, random_state = 123, max_depth = 11, max_features = 15))
pipe_forest_fi.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=11, max_features=15,
                                        n_jobs=-1, random_state=123))])

In [33]:
forest_fi_df = pd.DataFrame({'importances': pipe_forest_fi.named_steps['randomforestclassifier'].feature_importances_, 
                             'name': X_train.columns}).sort_values('importances', ascending = False)
forest_fi_df[:5]

Unnamed: 0,importances,name
0,0.310246,geo_level_1_id
1,0.093202,geo_level_2_id
2,0.07557,geo_level_3_id
8,0.064637,foundation_type
4,0.063691,age


In [85]:
# test to ensure X_train.columns + feature_importances are same length

print(len(X_train.columns))
print(len(pipe_forest_fi.named_steps['randomforestclassifier'].feature_importances_))

38
38


In [7]:
# Extra Trees

pipe_trees = make_pipeline(StandardScaler(), ExtraTreesClassifier(n_jobs = -1, random_state = 123))

params = {'extratreesclassifier__max_depth' : [6, 7, 8, 9, 10, 11],
          'extratreesclassifier__max_features' : [15, 20, 30, 35]}

grid_trees = GridSearchCV(pipe_trees, param_grid = params)

grid_trees.fit(X_train, y_train)

print(f'Train Score: {grid_trees.score(X_train, y_train)}')
print(f'Test Score: {grid_trees.score(X_test, y_test)}')

grid_trees.best_params_

Train Score: 0.7532872857508314
Test Score: 0.6785878741366078


{'extratreesclassifier__max_depth': 11,
 'extratreesclassifier__max_features': 35}

In [8]:
# grab feature importances

pipe_trees_fi = make_pipeline(StandardScaler(), ExtraTreesClassifier(n_jobs = -1, random_state = 123, max_depth = 6, max_features = 35))
pipe_trees_fi.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('extratreesclassifier',
                 ExtraTreesClassifier(max_depth=6, max_features=35, n_jobs=-1,
                                      random_state=123))])

In [9]:
trees_fi_df = pd.DataFrame({'importances': pipe_trees_fi.named_steps['extratreesclassifier'].feature_importances_, 
                             'name': X_train.columns}).sort_values('importances', ascending = False)
trees_fi_df[:5]

Unnamed: 0,importances,name
0,0.395387,geo_level_1_id
15,0.212578,has_superstructure_mud_mortar_stone
14,0.093145,has_superstructure_adobe_mud
18,0.063215,has_superstructure_mud_mortar_brick
8,0.039315,foundation_type


In [10]:
# test to ensure X_train.columns + feature_importances are same length

print(len(X_train.columns))
print(len(pipe_trees_fi.named_steps['extratreesclassifier'].feature_importances_))

38
38
