In [1]:
import pandas as pd

### This notebook is for the [Richter Predictor competition](https://www.drivendata.org/competitions/57/nepal-earthquake/)

There are three data sets here - a test set, a training set and training labels. This classification problem is a multi-class problem with a target variable, `damage_grade` with three possible values: 1, 2, 3.

In [2]:
# Loading test set
test_values = pd.read_csv('test_values.csv', index_col='building_id')
test_values.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
300051,17,596,11307,3,20,7,6,t,r,n,...,0,0,0,0,0,0,0,0,0,0
99355,6,141,11987,2,25,13,5,t,r,n,...,1,0,0,0,0,0,0,0,0,0
890251,22,19,10044,2,5,4,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
745817,26,39,633,1,0,19,3,t,r,x,...,0,0,1,0,0,0,0,0,0,0
421793,17,289,7970,3,15,8,7,t,r,q,...,0,0,0,0,0,0,0,0,0,0


In [3]:
#Loading train values & train labels

train_values = pd.read_csv('train_values.csv', index_col='building_id')
train_labels = pd.read_csv('train_labels.csv', index_col='building_id')

display(train_values.head())

train_labels.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,o,r,n,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,t,r,n,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,t,r,n,...,0,0,0,0,0,0,0,0,0,0


Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
802906,3
28830,2
94947,3
590882,2
201944,3


In [4]:
# Getting integer-type columns, and subsetting train values and test values
train_int_cols = [col for col in train_values.columns.tolist() if train_values[col].dtype in ['int64']]

train_df = train_values[train_int_cols]
test_df = test_values[train_int_cols]
train_df.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,0,0
28830,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,0,0
94947,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
590882,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
201944,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [5]:
# Merging train values & labels
df = pd.merge(train_df, train_labels, left_index=True, right_index=True)
df.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other,damage_grade
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,6,487,12198,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,0,3
28830,8,900,2812,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,0,2
94947,21,363,8973,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,0,3
590882,22,418,10694,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,0,2
201944,11,131,1488,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,0,3


In [6]:
# splitting new combined training set into X & y, with 'damage_grade' as the target
X = df.drop('damage_grade', axis=1)
y= df['damage_grade']

In [7]:
# Using mean encoding to scale and encode the geo_level columns. 

geo_level_1_mean = X.geo_level_1_id.mean()
geo_level_2_mean = X.geo_level_2_id.mean()
geo_level_3_mean = X.geo_level_3_id.mean()

X['geo_level_1_id'] = X['geo_level_1_id'] - geo_level_1_mean
X['geo_level_2_id'] = X['geo_level_2_id'] - geo_level_2_mean
X['geo_level_3_id'] = X['geo_level_3_id'] - geo_level_3_mean



In [8]:
X.head()

Unnamed: 0_level_0,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,...,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
building_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
802906,-7.900353,-214.074685,5940.123852,2,30,6,5,1,1,0,...,0,0,0,0,0,0,0,0,0,0
28830,-5.900353,198.925315,-3445.876148,2,10,8,7,0,1,0,...,0,0,0,0,0,0,0,0,0,0
94947,7.099647,-338.074685,2715.123852,2,10,5,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
590882,8.099647,-283.074685,4436.123852,2,10,6,5,0,1,0,...,0,0,0,0,0,0,0,0,0,0
201944,-2.900353,-570.074685,-4769.876148,3,30,8,9,1,0,0,...,0,0,0,0,0,0,0,0,0,0


In [9]:
X.var().sort_values(ascending=False)

geo_level_3_id                            1.329601e+07
geo_level_2_id                            1.703301e+05
age                                       5.411947e+03
geo_level_1_id                            6.453900e+01
area_percentage                           1.929169e+01
height_percentage                         3.680328e+00
count_floors_pre_eq                       5.294957e-01
has_superstructure_timber                 1.899696e-01
has_superstructure_mud_mortar_stone       1.813908e-01
count_families                            1.750493e-01
has_secondary_use                         9.936313e-02
has_superstructure_adobe_mud              8.078745e-02
has_superstructure_bamboo                 7.778458e-02
has_superstructure_cement_mortar_brick    6.960327e-02
has_superstructure_mud_mortar_brick       6.350927e-02
has_secondary_use_agriculture             6.023380e-02
has_superstructure_rc_non_engineered      4.077626e-02
has_superstructure_stone_flag             3.315361e-02
has_second

In [10]:
# Subsetting training set for columsn with higher variance
training_set = X[['geo_level_3_id', 'geo_level_2_id', 'age', 'geo_level_1_id', 'area_percentage', 
                  'height_percentage', 'count_floors_pre_eq', 'has_superstructure_timber', 
                  'has_superstructure_mud_mortar_stone']]

In [11]:
# Getting dummy variables
training_set = pd.get_dummies(training_set)

In [12]:
# Splitting training set into train/test sets

# This is possibly not a necessary step, since I already have a dataframe of training values/labels
# and a dataframe of test values

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(training_set, y, random_state=42, test_size=0.2)

In [31]:
# Scaling using RobustScaler

from sklearn.preprocessing import RobustScaler

rs = RobustScaler()
X_train_scaled = rs.fit_transform(X_train, y_train)
X_test_scaled = rs.transform(X_test)

In [14]:
# Initially, I loaded all of these to test out different algorithms.
# The kernel kept crashing, so I settled on RandomForest and just proceeded from there

from sklearn.naive_bayes import BernoulliNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import ExtraTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.semi_supervised import LabelPropagation
from sklearn.semi_supervised import LabelSpreading
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.svm import LinearSVC #(setting multi_class=”crammer_singer”)
from sklearn.linear_model import LogisticRegression #(setting multi_class=”multinomial”)
from sklearn.linear_model import LogisticRegressionCV #(setting multi_class=”multinomial”)
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import RidgeClassifierCV

from sklearn.metrics import f1_score

In [15]:
# Baseline RandomForest model

random_forest_model = RandomForestClassifier(random_state=42)
random_forest_model.fit(X_train_scaled, y_train)
y_pred = random_forest_model.predict(X_test_scaled)
random_forest_score = f1_score(y_test, y_pred, average='micro')

In [16]:
random_forest_model

RandomForestClassifier(random_state=42)

In [20]:
# Tuning RF parameters and using HalvingGridSearch to find the best parameter set for the model

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV
import numpy as np

# Number of trees in random forest
n_estimators = np.arange(100, 1000, 100)
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree


param_grid = {#'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
             }


rf_gs = HalvingGridSearchCV(RandomForestClassifier(random_state=42), param_grid, resource='n_estimators',
                             max_resources=10, scoring='f1_micro', random_state=0)

rf_grid = rf_gs.fit(X_train_scaled, y_train)
rf_optimal = rf_grid.best_estimator_
result = rf_gs.cv_results_

print('Best Parameters: ', rf_gs.best_params_)
print('Best Score: ', rf_gs.best_score_)

#rf_optimal.fit(X_train, y_train)

Best Parameters:  {'max_depth': 30, 'max_features': 'sqrt', 'min_samples_leaf': 4, 'min_samples_split': 10, 'n_estimators': 9}
Best Score:  0.7135504604758252


In [21]:
best_model = rf_optimal

In [23]:
# Subsetting test_set to prepare for predictions
test_set = test_values[['geo_level_3_id', 'geo_level_2_id', 'age', 'geo_level_1_id', 'area_percentage', 
                  'height_percentage', 'count_floors_pre_eq', 'has_superstructure_timber', 
                  'has_superstructure_mud_mortar_stone']]

In [24]:
# Getting dummies

test_set = pd.get_dummies(test_set)

In [26]:
# Using my optimal RandomForest to fit to the training data, then predict on the test values
rf_optimal.fit(X_train_scaled, y_train)

predictions = rf_optimal.predict(test_set)


In [28]:
# Preparing the submission

submission_format = pd.read_csv('submission_format.csv', index_col='building_id')

my_submission = pd.DataFrame(data=predictions,
                             columns=submission_format.columns,
                             index=submission_format.index)

In [29]:
# Submission dataframe

my_submission.head()


Unnamed: 0_level_0,damage_grade
building_id,Unnamed: 1_level_1
300051,2
99355,2
890251,2
745817,2
421793,2


In [30]:
# Submission csv received an f1_score of .5643. 
# This is much better than the .33 I received on my first submission

my_submission.to_csv('submission_attempt_2.csv')

In [33]:
import pickle

filename = 'improved_random_forest_model.sav'
pickle.dump(best_model, open(filename, 'wb'))

### END OF NOTEBOOK

In [None]:
bernoulli_model = BernoulliNB()
bernoulli_model.fit(X_train_scaled, y_train)
y_pred = bernoulli_model.predict(X_test_scaled)
bernoulli_score = f1_score(y_test, y_pred, average='micro')

In [None]:
decision_tree_model = DecisionTreeClassifier(random_state=42)
decision_tree_model.fit(X_train_scaled, y_train)
y_pred = decision_tree_model.predict(X_test_scaled)
decision_tree_score = f1_score(y_test, y_pred, average='micro')

In [None]:
extra_tree_classifier = ExtraTreeClassifier(random_state=42)
extra_tree_classifier.fit(X_train_scaled, y_train)
y_pred = extra_tree_classifier.predict(X_test_scaled)
extra_tree_score = f1_score(y_test, y_pred, average='micro')

In [None]:
extra_trees_classifier = ExtraTreesClassifier(random_state=42)
extra_trees_classifier.fit(X_train_scaled, y_train)
y_pred = extra_trees_classifier.predict(X_test_scaled)
extra_trees_score = f1_score(y_test, y_pred, average='micro')

In [None]:
gaussian_nb_model = GaussianNB()
gaussian_nb_model.fit(X_train_scaled, y_train)
y_pred = gaussian_nb_model.predict(X_test_scaled)
gaussian_nb_score = f1_score(y_test, y_pred, average='micro')

In [None]:
kneighbors_model = KNeighborsClassifier()
kneighbors_model.fit(X_train_scaled, y_train)
y_pred = kneighbors_model.predict(X_test_scaled)
kneighbors_score = f1_score(y_test, y_pred, average='micro')

In [None]:
ld_model = LinearDiscriminantAnalysis()
ld_model.fit(X_train_scaled, y_train)
y_pred = ld_model.predict(X_test_scaled)
ld_score = f1_score(y_test, y_pred, average='micro')

In [None]:
linear_svc_model = LinearSVC(multi_class='crammer_singer', random_state=42)
linear_svc_model.fit(X_train_scaled, y_train)
y_pred = linear_svc_model.predict(X_test_scaled)
linear_svc_score = f1_score(y_test, y_pred, average='micro')

In [None]:
logreg_model = LogisticRegression(multi_class='multinomial', random_state=42)
logreg_model.fit(X_train_scaled, y_train)
y_pred = logreg_model.predict(X_test_scaled)
logreg_score = f1_score(y_test, y_pred, average='micro')

In [None]:
logreg_cv_model = LogisticRegressionCV(multi_class='multinomial', random_state=42)
logreg_cv_model.fit(X_train_scaled, y_train)
y_pred = logreg_cv_model.predict(X_test_scaled)
logreg_cv_score = f1_score(y_test, y_pred, average='micro')

In [None]:
mlp_model = MLPClassifier(random_state=42)
mlp_model.fit(X_train_scaled, y_train)
y_pred = mlp_model.predict(X_test_scaled)
mlp_score = f1_score(y_test, y_pred, average='micro')

In [None]:
nearest_centroid_model = NearestCentroid()
nearest_centroid_model.fit(X_train_scaled, y_train)
y_pred = nearest_centroid_model.predict(X_test_scaled)
nearest_centroid_score = f1_score(y_test, y_pred, average='micro')

In [None]:
quad_model = QuadraticDiscriminantAnalysis()
quad_model.fit(X_train_scaled, y_train)
y_pred = quad_model.predict(X_test_scaled)
quad_model_score = f1_score(y_test, y_pred, average='micro')

In [None]:
ridge_model = RidgeClassifier()
ridge_model.fit(X_train_scaled, y_train)
y_pred = ridge_model.predict(X_test_scaled)
ridge_model_score = f1_score(y_test, y_pred, average='micro')

In [None]:
ridge_cv_model = RidgeClassifierCV()
ridge_cv_model.fit(X_train_scaled, y_train)
y_pred = ridge_cv_model.predict(X_test_scaled)
ridge_cv_score = f1_score(y_test, y_pred, average='micro')

In [None]:
scores_df = pd.DataFrame({'Model': ['bernoulli_model', 'decision_tree_model', 'extra_tree_classifier', 
                                    'extra_trees_classifier', 'gaussian_nb_model', 'kneighbors_model',
                                    'ld_model', 'linear_svc_model', 'logreg_model', 'logreg_cv_model',
                                    'mlp_model', 'nearest_centroid_model', 'quad_model',
                                    'random_forest_model', 'ridge_model', 'ridge_cv_model'
                                   ], 
                          'F1 Score': [bernoulli_score, decision_tree_score, extra_tree_score,
                                       extra_trees_score, gaussian_nb_score, kneighbors_score,
                                       ld_score, linear_svc_score, logreg_score, logreg_cv_score,
                                       mlp_score, nearest_centroid_score, quad_model_score,
                                       random_forest_score, ridge_model_score, ridge_cv_score
                                      ]})

scores_df.sort_values(by='F1 Score', ascending=False).head(4)

In [None]:
mlp_new = MLPClassifier(hidden_layer_sizes=(64,64,64),
                   activation="relu" ,
                   random_state=42, 
                   max_iter=2000).fit(X_train_scaled, y_train)
y_pred = mlp_new.predict(X_test_scaled)
mlp_new_score = f1_score(y_test, y_pred, average='micro')

In [None]:
mlp_new_score

In [None]:
from yellowbrick.classifier import PrecisionRecallCurve

viz = PrecisionRecallCurve(
    RandomForestClassifier(n_estimators=300),
    per_class=False,
    cmap="Set1"
)
viz.fit(X_train_scaled, y_train)
viz.score(X_test_scaled, y_test)
viz.show()

In [None]:
from sklearn.model_selection import RandomizedSearchCV
import numpy as np

# Number of trees in random forest
n_estimators = np.arange(100, 1000, 100)
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree


param_grid = {'n_estimators': n_estimators,
              'max_features': max_features,
              'max_depth': max_depth,
              'min_samples_split': min_samples_split,
              'min_samples_leaf': min_samples_leaf,
             }

rf_gs = RandomizedSearchCV(RandomForestClassifier(),
                      return_train_score=True,
                      param_distributions=param_grid,
                      scoring='f1_micro',
                      cv=5)

rf_grid = rf_gs.fit(X_train_scaled, y_train)
rf_optimal = rf_grid.best_estimator_
result = rf_gs.cv_results_

print('Best Parameters: ', rf_gs.best_params_)
print('Best Score: ', rf_gs.best_score_)

#rf_optimal.fit(X_train, y_train)
