In [52]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score

In [53]:
#Set viewing options
pd.set_option('display.max_columns', None)  
pd.set_option('display.expand_frame_repr', False)
pd.set_option('display.max_colwidth', None)

In [54]:
trainlabel = pd.read_csv('train_10pct_labels.csv')

In [55]:
train = pd.read_csv('train_10pct.csv')

In [56]:
trainlabel.shape

(26059, 2)

In [57]:
train.shape

(26059, 39)

In [58]:
train.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,802906,6,487,12198,2,30,6,5,t,r,n,f,q,t,d,1,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
1,28830,8,900,2812,2,10,8,7,o,r,n,x,q,s,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
2,94947,21,363,8973,2,10,5,5,t,r,n,f,x,t,d,0,1,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
3,590882,22,418,10694,2,10,6,5,t,r,n,f,x,s,d,0,1,0,0,0,0,1,1,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0
4,201944,11,131,1488,3,30,8,9,t,r,n,f,x,s,d,1,0,0,0,0,0,0,0,0,0,0,v,1,0,0,0,0,0,0,0,0,0,0,0


In [9]:
train.isna().sum()


building_id                               0
geo_level_1_id                            0
geo_level_2_id                            0
geo_level_3_id                            0
count_floors_pre_eq                       0
age                                       0
area_percentage                           0
height_percentage                         0
land_surface_condition                    0
foundation_type                           0
roof_type                                 0
ground_floor_type                         0
other_floor_type                          0
position                                  0
plan_configuration                        0
has_superstructure_adobe_mud              0
has_superstructure_mud_mortar_stone       0
has_superstructure_stone_flag             0
has_superstructure_cement_mortar_stone    0
has_superstructure_mud_mortar_brick       0
has_superstructure_cement_mortar_brick    0
has_superstructure_timber                 0
has_superstructure_bamboo       

In [11]:
#null model
trainlabel['damage_grade'].value_counts(normalize=True)

2    0.567405
3    0.336122
1    0.096473
Name: damage_grade, dtype: float64

In [12]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26059 entries, 0 to 26058
Data columns (total 39 columns):
 #   Column                                  Non-Null Count  Dtype 
---  ------                                  --------------  ----- 
 0   building_id                             26059 non-null  int64 
 1   geo_level_1_id                          26059 non-null  int64 
 2   geo_level_2_id                          26059 non-null  int64 
 3   geo_level_3_id                          26059 non-null  int64 
 4   count_floors_pre_eq                     26059 non-null  int64 
 5   age                                     26059 non-null  int64 
 6   area_percentage                         26059 non-null  int64 
 7   height_percentage                       26059 non-null  int64 
 8   land_surface_condition                  26059 non-null  object
 9   foundation_type                         26059 non-null  object
 10  roof_type                               26059 non-null  object
 11  gr

In [13]:
train.columns

Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_i

In [59]:
##https://intellipaat.com/community/1161/label-encoding-across-multiple-columns-in-scikit-learn
le = LabelEncoder()
train_enc = train.apply(le.fit_transform)
train_enc

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
0,19838,6,435,7525,1,6,5,3,2,2,0,0,1,3,2,1,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
1,678,8,802,1716,1,2,7,5,1,2,0,3,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
2,2319,21,322,5535,1,2,4,3,2,2,0,0,3,3,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
3,14549,22,370,6597,1,2,5,3,2,2,0,0,3,2,2,0,1,0,0,0,0,1,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
4,5001,11,115,928,2,6,7,7,2,2,0,0,3,2,2,1,0,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
26054,11229,20,169,1422,0,3,4,2,2,2,0,0,0,2,2,0,1,0,0,0,0,1,1,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
26055,347,20,1047,894,1,2,7,3,2,2,1,3,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
26056,5972,10,1268,7344,2,13,5,5,2,2,0,0,1,2,2,1,1,1,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
26057,24218,17,208,6169,2,5,7,5,2,2,1,0,1,3,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0


In [16]:
train_enc.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26059 entries, 0 to 26058
Data columns (total 39 columns):
 #   Column                                  Non-Null Count  Dtype
---  ------                                  --------------  -----
 0   building_id                             26059 non-null  int64
 1   geo_level_1_id                          26059 non-null  int64
 2   geo_level_2_id                          26059 non-null  int64
 3   geo_level_3_id                          26059 non-null  int64
 4   count_floors_pre_eq                     26059 non-null  int64
 5   age                                     26059 non-null  int64
 6   area_percentage                         26059 non-null  int64
 7   height_percentage                       26059 non-null  int64
 8   land_surface_condition                  26059 non-null  int32
 9   foundation_type                         26059 non-null  int32
 10  roof_type                               26059 non-null  int32
 11  ground_floor_ty

In [17]:
train_enc.columns


Index(['building_id', 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id',
       'count_floors_pre_eq', 'age', 'area_percentage', 'height_percentage',
       'land_surface_condition', 'foundation_type', 'roof_type',
       'ground_floor_type', 'other_floor_type', 'position',
       'plan_configuration', 'has_superstructure_adobe_mud',
       'has_superstructure_mud_mortar_stone', 'has_superstructure_stone_flag',
       'has_superstructure_cement_mortar_stone',
       'has_superstructure_mud_mortar_brick',
       'has_superstructure_cement_mortar_brick', 'has_superstructure_timber',
       'has_superstructure_bamboo', 'has_superstructure_rc_non_engineered',
       'has_superstructure_rc_engineered', 'has_superstructure_other',
       'legal_ownership_status', 'count_families', 'has_secondary_use',
       'has_secondary_use_agriculture', 'has_secondary_use_hotel',
       'has_secondary_use_rental', 'has_secondary_use_institution',
       'has_secondary_use_school', 'has_secondary_use_i

In [60]:
X = train_enc
y = trainlabel['damage_grade']

In [61]:
X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y, random_state=123)

In [52]:
X_train.head()

Unnamed: 0,building_id,geo_level_1_id,geo_level_2_id,geo_level_3_id,count_floors_pre_eq,age,area_percentage,height_percentage,land_surface_condition,foundation_type,roof_type,ground_floor_type,other_floor_type,position,plan_configuration,has_superstructure_adobe_mud,has_superstructure_mud_mortar_stone,has_superstructure_stone_flag,has_superstructure_cement_mortar_stone,has_superstructure_mud_mortar_brick,has_superstructure_cement_mortar_brick,has_superstructure_timber,has_superstructure_bamboo,has_superstructure_rc_non_engineered,has_superstructure_rc_engineered,has_superstructure_other,legal_ownership_status,count_families,has_secondary_use,has_secondary_use_agriculture,has_secondary_use_hotel,has_secondary_use_rental,has_secondary_use_institution,has_secondary_use_school,has_secondary_use_industry,has_secondary_use_health_post,has_secondary_use_gov_office,has_secondary_use_use_police,has_secondary_use_other
20786,5672,20,140,1028,0,2,8,1,2,2,1,0,0,2,2,0,0,0,0,0,0,0,0,1,0,0,2,0,1,0,0,0,0,0,1,0,0,0,0
1125,16994,4,470,5781,1,3,5,2,2,2,0,0,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
13343,8474,6,756,6727,2,4,11,6,2,2,1,0,3,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
6760,15570,20,1047,2939,0,2,5,2,0,2,0,3,0,2,2,0,1,1,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0
26012,6201,17,577,7355,1,3,5,5,2,2,1,0,1,2,2,0,1,0,0,0,0,0,0,0,0,0,2,1,0,0,0,0,0,0,0,0,0,0,0


In [100]:
#DECISION TREE CLASSIFIER
pipe_forest = make_pipeline(StandardScaler(), DecisionTreeClassifier())
params = {'decisiontreeclassifier__max_depth' : [2, 3, 4, 5]
             }
grid_forest = GridSearchCV(pipe_forest, param_grid = params)

In [101]:
grid_forest.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__max_depth': [2, 3, 4, 5]})

In [102]:

grid_forest.score(X_test,y_test)

0.646815042210284

In [103]:
pred = grid_forest.predict(X_test)

In [105]:
f1_score(y_test,pred, average='micro')

0.646815042210284

In [61]:
grid_forest.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=5))])

In [108]:
# min_samples_split and max_depth adjusted
pipe_forest = make_pipeline(StandardScaler(), DecisionTreeClassifier())
params = {'decisiontreeclassifier__max_depth' : [2, 3, 4, 5],
          'decisiontreeclassifier__random_state' : [123]
          ,'decisiontreeclassifier__min_samples_split' : [2, 3, 4]
         }
grid_forest = GridSearchCV(pipe_forest, param_grid = params)

In [109]:
grid_forest.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__max_depth': [2, 3, 4, 5],
                         'decisiontreeclassifier__min_samples_split': [2, 3, 4],
                         'decisiontreeclassifier__random_state': [123]})

In [110]:

grid_forest.score(X_test,y_test)

0.646815042210284

In [111]:
pred = grid_forest.predict(X_test)

In [112]:
f1_score(y_test,pred, average='micro')

0.646815042210284

In [66]:
grid_forest.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=5, random_state=123))])

In [113]:
# max_depth adjusted
pipe_forest = make_pipeline(StandardScaler(), DecisionTreeClassifier())
params = {'decisiontreeclassifier__max_depth' : [5,6,7,8],
          'decisiontreeclassifier__random_state' : [123]
          ,'decisiontreeclassifier__min_samples_split' : [2, 3, 4]
         }
grid_forest = GridSearchCV(pipe_forest, param_grid = params)

In [114]:
grid_forest.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__max_depth': [5, 6, 7, 8],
                         'decisiontreeclassifier__min_samples_split': [2, 3, 4],
                         'decisiontreeclassifier__random_state': [123]})

In [115]:

grid_forest.score(X_test,y_test)

0.6660015349194167

In [116]:
pred = grid_forest.predict(X_test)

In [117]:
f1_score(y_test,pred, average='micro')

0.6660015349194167

In [70]:
grid_forest.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=7, min_samples_split=4,
                                        random_state=123))])

In [118]:
# clean DT pipeline
pipe_forest = make_pipeline(StandardScaler(), DecisionTreeClassifier())
params = {
          'decisiontreeclassifier__random_state' : [123]}
          
grid_forest = GridSearchCV(pipe_forest, param_grid = params)

In [119]:
grid_forest.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__random_state': [123]})

In [120]:

grid_forest.score(X_test,y_test)

0.5938603223330775

In [121]:
pred = grid_forest.predict(X_test)

In [122]:
f1_score(y_test,pred, average='micro')

0.5938603223330775

In [66]:
grid_forest.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=5, random_state=123))])

In [123]:
# min_samples_split adjusted again
pipe_forest = make_pipeline(StandardScaler(), DecisionTreeClassifier())
params = {'decisiontreeclassifier__max_depth' : [5,6,7,8],
          'decisiontreeclassifier__random_state' : [123]
          ,'decisiontreeclassifier__min_samples_split' : [4,5,6,7]
         }
grid_forest = GridSearchCV(pipe_forest, param_grid = params)

In [124]:
grid_forest.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__max_depth': [5, 6, 7, 8],
                         'decisiontreeclassifier__min_samples_split': [4, 5, 6,
                                                                       7],
                         'decisiontreeclassifier__random_state': [123]})

In [125]:
# best score
grid_forest.score(X_test,y_test)

0.6658480429777437

In [126]:
pred = grid_forest.predict(X_test)

In [127]:
f1_score(y_test,pred, average='micro')

0.6658480429777437

In [78]:
grid_forest.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=7, min_samples_split=7,
                                        random_state=123))])

In [128]:
# higher max_depth AND min_samples_split
pipe_forest = make_pipeline(StandardScaler(), DecisionTreeClassifier())
params = {'decisiontreeclassifier__max_depth' : [7,8,9,10,11],
          'decisiontreeclassifier__random_state' : [123]
          ,'decisiontreeclassifier__min_samples_split' : [6,7,8,9,10]
         }
grid_forest = GridSearchCV(pipe_forest, param_grid = params)

In [129]:
grid_forest.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('decisiontreeclassifier',
                                        DecisionTreeClassifier())]),
             param_grid={'decisiontreeclassifier__max_depth': [7, 8, 9, 10, 11],
                         'decisiontreeclassifier__min_samples_split': [6, 7, 8,
                                                                       9, 10],
                         'decisiontreeclassifier__random_state': [123]})

In [81]:
# no change from best score
grid_forest.score(X_test,y_test)

0.6658480429777437

In [116]:
pred = grid_forest.predict(X_test)

In [117]:
f1_score(y_test,pred, average='micro')

0.6660015349194167

In [82]:
grid_forest.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('decisiontreeclassifier',
                 DecisionTreeClassifier(max_depth=7, min_samples_split=7,
                                        random_state=123))])

In [130]:
#BAGGING ESTIMATOR
pipe_bagged = make_pipeline(StandardScaler(), BaggingClassifier())
params = {'baggingclassifier__n_estimators' : [10,20,30,40,50],
          'baggingclassifier__random_state' : [123]
          ,'baggingclassifier__max_features' : [1,6,7,8,9,10],
          'baggingclassifier__max_samples' : [1, 6,7,8,9,10]
         }
grid_bagged = GridSearchCV(pipe_bagged, param_grid = params)

In [131]:
grid_bagged.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('baggingclassifier',
                                        BaggingClassifier())]),
             param_grid={'baggingclassifier__max_features': [1, 6, 7, 8, 9, 10],
                         'baggingclassifier__max_samples': [1, 6, 7, 8, 9, 10],
                         'baggingclassifier__n_estimators': [10, 20, 30, 40,
                                                             50],
                         'baggingclassifier__random_state': [123]})

In [132]:
grid_bagged.score(X_test,y_test)

0.5613200306983883

In [133]:
pred = grid_bagged.predict(X_test)

In [134]:
f1_score(y_test,pred, average='micro')

0.5613200306983883

In [86]:
grid_bagged.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('baggingclassifier',
                 BaggingClassifier(max_features=10, max_samples=8,
                                   n_estimators=40, random_state=123))])

In [62]:
# Bagging with higher max_features
pipe_bagged = make_pipeline(StandardScaler(), BaggingClassifier())
params = {
          'baggingclassifier__random_state' : [123]
          ,'baggingclassifier__max_features' : [9,10, 12, 14, 16, 20]
         }
grid_bagged = GridSearchCV(pipe_bagged, param_grid = params)

In [63]:
grid_bagged.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('baggingclassifier',
                                        BaggingClassifier())]),
             param_grid={'baggingclassifier__max_features': [9, 10, 12, 14, 16,
                                                             20],
                         'baggingclassifier__random_state': [123]})

In [64]:
# best score
grid_bagged.score(X_test,y_test)

0.6681504221028396

In [138]:
pred = grid_bagged.predict(X_test)

f1_score(y_test,pred, average='micro')

0.6681504221028396

In [139]:
grid_bagged.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('baggingclassifier',
                 BaggingClassifier(max_features=9, random_state=123))])

In [140]:
#CLEAN BAGGING ESTIMATOR
pipe_bagged = make_pipeline(StandardScaler(), BaggingClassifier())
params = {
          'baggingclassifier__random_state' : [123]
}
grid_bagged = GridSearchCV(pipe_bagged, param_grid = params)

In [141]:
grid_bagged.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('baggingclassifier',
                                        BaggingClassifier())]),
             param_grid={'baggingclassifier__random_state': [123]})

In [142]:
#not as good
grid_bagged.score(X_test,y_test)

0.6555640828856485

In [146]:
pred = grid_bagged.predict(X_test)

f1_score(y_test,pred, average='micro')

0.6555640828856485

In [144]:
grid_bagged.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('baggingclassifier', BaggingClassifier(random_state=123))])

In [20]:
#CLEAN Adaboost ESTIMATOR
pipe_ada = make_pipeline(StandardScaler(), AdaBoostClassifier())
params = {
          'adaboostclassifier__random_state' : [123]
}
grid_ada = GridSearchCV(pipe_ada, param_grid = params)

In [21]:
grid_ada.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             param_grid={'adaboostclassifier__random_state': [123]})

In [22]:
#not as good
grid_ada.score(X_test,y_test)

0.6526477359938603

In [23]:
pred = grid_ada.predict(X_test)

f1_score(y_test,pred, average='micro')

0.6526477359938603

In [24]:
grid_ada.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('adaboostclassifier', AdaBoostClassifier(random_state=123))])

In [26]:
#n_estimators, learning_rate, algorithm adjusted
pipe_ada = make_pipeline(StandardScaler(), AdaBoostClassifier())
params = {'adaboostclassifier__n_estimators':[30,40,50,60,70],
    'adaboostclassifier__learning_rate':[.1,.3,1.0,1.3,3],
    'adaboostclassifier__algorithm':['SAMME.R','SAMME'],
     'adaboostclassifier__random_state' : [123]
}
grid_ada = GridSearchCV(pipe_ada, param_grid = params)

In [27]:
grid_ada.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             param_grid={'adaboostclassifier__algorithm': ['SAMME.R', 'SAMME'],
                         'adaboostclassifier__learning_rate': [0.1, 0.3, 1.0,
                                                               1.3, 3],
                         'adaboostclassifier__n_estimators': [30, 40, 50, 60,
                                                              70],
                         'adaboostclassifier__random_state': [123]})

In [28]:
#not quite as good
grid_ada.score(X_test,y_test)

0.6600153491941673

In [29]:
pred = grid_ada.predict(X_test)

f1_score(y_test,pred, average='micro')

0.6600153491941673

In [30]:
grid_ada.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('adaboostclassifier',
                 AdaBoostClassifier(learning_rate=1.3, n_estimators=70,
                                    random_state=123))])

In [32]:
#n_estimators, learning_rate adjusted more
pipe_ada = make_pipeline(StandardScaler(), AdaBoostClassifier())
params = {'adaboostclassifier__n_estimators':[70,80,90,100,110],
    'adaboostclassifier__learning_rate':[1.3,2, 2.3, 3],
    'adaboostclassifier__random_state' : [123]
}
grid_ada = GridSearchCV(pipe_ada, param_grid = params)

In [33]:
grid_ada.fit(X_train,y_train)

GridSearchCV(estimator=Pipeline(steps=[('standardscaler', StandardScaler()),
                                       ('adaboostclassifier',
                                        AdaBoostClassifier())]),
             param_grid={'adaboostclassifier__learning_rate': [1.3, 2, 2.3, 3],
                         'adaboostclassifier__n_estimators': [70, 80, 90, 100,
                                                              110],
                         'adaboostclassifier__random_state': [123]})

In [34]:
#not quite as good
grid_ada.score(X_test,y_test)

0.6600153491941673

In [35]:
pred = grid_ada.predict(X_test)

f1_score(y_test,pred, average='micro')

0.6600153491941673

In [36]:
grid_ada.best_estimator_

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('adaboostclassifier',
                 AdaBoostClassifier(learning_rate=1.3, n_estimators=80,
                                    random_state=123))])

In [49]:
#clean GradientBoostClassifier
# pipe_gr = make_pipeline(StandardScaler(), GradientBoostingClassifier())
# params = {'gradientboostingtclassifier__n_estimators':[30,40,50,60,70]
#     #'gradientboostingtclassifier__learning_rate':[.1,.3,1.0,1.3,3],
#     #'gradientboostingtclassifier__algorithm':['SAMME.R','SAMME'],
#      #'gradientboostingtclassifier__random_state' : [123]
# }
# grid_gr = GridSearchCV(pipe_gr, param_grid = params)

In [51]:
#grid_gr.fit(X_train,y_train)


ValueError: Invalid parameter gradientboostingtclassifier for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
                ('gradientboostingclassifier', GradientBoostingClassifier())]). Check the list of available parameters with `estimator.get_params().keys()`.

In [None]:
#grid_gr.score(X_test,y_test)

In [None]:
#pred = grid_gr.predict(X_test)

#f1_score(y_test,pred, average='micro')

In [None]:
####grid_gr.best_estimator_

In [80]:
### BELOW: INSTALL ELI5

In [71]:
!pip install eli5

Collecting eli5
  Downloading eli5-0.11.0-py2.py3-none-any.whl (106 kB)
Collecting tabulate>=0.7.7
  Downloading tabulate-0.8.9-py3-none-any.whl (25 kB)
Installing collected packages: tabulate, eli5
Successfully installed eli5-0.11.0 tabulate-0.8.9


In [73]:
import eli5

In [78]:
xx = eli5.explain_weights(pipe_bagged.named_steps['baggingclassifier'], top=50, feature_names=train_enc.columns)

In [79]:
xx