## FOR DEMO PURPOSES ONLY --

- Used our best model estimator (RandomForestClassifier) and top 5 permutation feature importances to create the Streamlit app
- Since we're only using 5 features, had to re-GridSearchCV to find the best hyperparams for these subset of features (max_depth = 4, max_features = 2)
- The test score continued to perform slightly better than the baseline accuracy score, so it was still ok to utilize for demo purposes

In [1]:
from os import lseek
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.pipeline import make_pipeline
import pickle
from sklearn.model_selection import GridSearchCV

In [4]:
# load in data
train_values = pd.read_csv('../data/Proj5_train_values.csv')
train_labels = pd.read_csv('../data/Proj5_train_labels.csv')

# Label Encode features

le = LabelEncoder()
train_enc = train_values.apply(le.fit_transform)
train_enc

# merge train values + labels

earthquake_encoded = pd.merge(train_enc, train_labels, on = 'building_id')

# set a subset of X + y using the 5 permutation feature importances
X = earthquake_encoded[['age', 'count_families', 'foundation_type', 'roof_type', 'has_superstructure_mud_mortar_stone']]
y = earthquake_encoded['damage_grade']

# tts
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify = y, random_state = 123)

# scale
sscaler = StandardScaler()
X_train_scaled = sscaler.fit_transform(X_train)
X_test_scaled = sscaler.transform(X_test)

In [5]:
# baseline score -- reminder

y.value_counts(normalize = True)

2    0.569332
3    0.334892
1    0.095776
Name: damage_grade, dtype: float64

In [13]:
# Random Forest

pipe_forest = make_pipeline(StandardScaler(), RandomForestClassifier(n_jobs = -1, random_state = 123))

params = {'randomforestclassifier__max_depth' : [2, 3, 4, 5],
          'randomforestclassifier__max_features' : [1, 2, 3, 4, 5]}

grid_forest = GridSearchCV(pipe_forest, param_grid = params)

grid_forest.fit(X_train, y_train)

print(f'Train Score: {grid_forest.score(X_train, y_train)}')
print(f'Test Score: {grid_forest.score(X_test, y_test)}')

grid_forest.best_params_

Train Score: 0.5694096894613775
Test Score: 0.5694683731233079


{'randomforestclassifier__max_depth': 4,
 'randomforestclassifier__max_features': 2}