In [15]:
import pandas as pd
import altair as alt
import itertools

# Use Altair's "vegafusion" data transformer for the charts in this notebook.
alt.data_transformers.enable("vegafusion")

# Pandas display limits; uncomment the first line to lift the column limit too.
#pd.options.display.max_columns = None
pd.options.display.max_rows = 500

A useful guide to [Multiclass and multioutput algorithms](https://scikit-learn.org/stable/modules/multiclass.html) indicates that, as we have a single target with cardinality >2, we should use a 'multiclass' estimator.

In [39]:
# Read just the first rows of each file to get an idea of the dtypes before
# deciding how the full files should be parsed.
peek_options = dict(index_col='building_id', nrows=10)
train_values = pd.read_csv('data/train_values.csv', **peek_options)
train_labels = pd.read_csv('data/train_labels.csv', **peek_options)
test_values = pd.read_csv('data/test_values.csv', **peek_options)

display(train_values.dtypes)

# Shape plus a transposed head (one column per building) for features and labels.
for sample in (train_values, train_labels):
    display(sample.shape)
    display(sample.head().T)

geo_level_1_id                             int64
geo_level_2_id                             int64
geo_level_3_id                             int64
count_floors_pre_eq                        int64
age                                        int64
area_percentage                            int64
height_percentage                          int64
land_surface_condition                    object
foundation_type                           object
roof_type                                 object
ground_floor_type                         object
other_floor_type                          object
position                                  object
plan_configuration                        object
has_superstructure_adobe_mud               int64
has_superstructure_mud_mortar_stone        int64
has_superstructure_stone_flag              int64
has_superstructure_cement_mortar_stone     int64
has_superstructure_mud_mortar_brick        int64
has_superstructure_cement_mortar_brick     int64
has_superstructure_t

(10, 38)

building_id,802906,28830,94947,590882,201944
geo_level_1_id,6,8,21,22,11
geo_level_2_id,487,900,363,418,131
geo_level_3_id,12198,2812,8973,10694,1488
count_floors_pre_eq,2,2,2,2,3
age,30,10,10,10,30
area_percentage,6,8,5,6,8
height_percentage,5,7,5,5,9
land_surface_condition,t,o,t,t,t
foundation_type,r,r,r,r,r
roof_type,n,n,n,n,n


(10, 1)

building_id,802906,28830,94947,590882,201944
damage_grade,3,2,3,2,3


In [40]:
# Collect the object-dtype (string) columns and remember their names.
objects = train_values.select_dtypes('object')
object_columns = list(objects.columns)

# A notebook cell only renders its final expression, so the distinct values
# per column have to be displayed explicitly; previously this result was
# computed and then thrown away.
display(objects.apply(pd.unique))


In [41]:
# Categorical features, as identified by the dataset description.
categorical_columns = [
    'land_surface_condition',
    'foundation_type',
    'roof_type',
    'ground_floor_type',
    'other_floor_type',
    'position',
    'plan_configuration',
    'legal_ownership_status',
]

In [42]:
# Per the dataset description the boolean flags are exactly the columns whose
# name starts with "has_".
is_flag = train_values.columns.str.startswith('has_')
boolean_columns = train_values.columns[is_flag].tolist()

In [7]:
# Geographic region in which the building exists, ordered from the largest
# region (level 1) down to the most specific sub-region (level 3).
# Possible values: level 1: 0-30, level 2: 0-1427, level 3: 0-12567.
# Nota bene: the levels may not be independent if lower-level ids repeat...
location_columns = [f'geo_level_{level}_id' for level in (1, 2, 3)]

In [43]:
# Whatever numeric columns remain once the boolean flags and the location ids
# are removed are treated as the numerical features.
not_numerical = boolean_columns + location_columns
numbers = train_values.drop(columns=not_numerical).select_dtypes('number')
numerical_columns = numbers.columns.tolist()
numbers.describe()

Unnamed: 0,count_floors_pre_eq,age,area_percentage,height_percentage,count_families
count,10.0,10.0,10.0,10.0,10.0
mean,2.0,14.0,7.4,5.6,1.0
std,0.471405,11.005049,2.674987,1.505545,0.0
min,1.0,0.0,3.0,4.0,1.0
25%,2.0,10.0,6.0,5.0,1.0
50%,2.0,10.0,8.0,5.0,1.0
75%,2.0,22.5,8.0,6.0,1.0
max,3.0,30.0,13.0,9.0,1.0


In [50]:
# Geographic region in which building exists, from largest (level 1) to most specific sub-region (level 3).
# Possible values: level 1: 0-30, level 2: 0-1427, level 3: 0-12567.
# Specific CategoricalDtype are required to keep them as int64...
type1 = pd.CategoricalDtype(categories=range(0, 30 + 1), ordered=False)
type2 = pd.CategoricalDtype(categories=range(0, 1427 + 1), ordered=False)
type3 = pd.CategoricalDtype(categories=range(0, 12567 + 1), ordered=False)

# Column -> dtype mapping handed to read_csv: string categoricals become
# 'category', the has_* flags become nullable 'boolean', and each location id
# gets its own integer-backed categorical dtype.
dtype = {
    **{column: 'category' for column in categorical_columns},
    **{column: 'boolean' for column in boolean_columns},
    'geo_level_1_id': type1,
    'geo_level_2_id': type2,
    'geo_level_3_id': type3,
}

nrows = None # None for all... i.e. 260601

train_values = pd.read_csv('data/train_values.csv', index_col='building_id', dtype=dtype, nrows=nrows)
train_labels = pd.read_csv('data/train_labels.csv', index_col='building_id', nrows=nrows)
test_values = pd.read_csv('data/test_values.csv', index_col='building_id', dtype=dtype, nrows=nrows)

# DataFrame.info() prints its report itself and returns None; wrapping it in
# display() only added a stray "None" to the cell output.
train_values.info()
for location_column in location_columns:
    display(train_values[location_column].dtype)

<class 'pandas.core.frame.DataFrame'>
Index: 260601 entries, 802906 to 747594
Data columns (total 38 columns):
 #   Column                                  Non-Null Count   Dtype   
---  ------                                  --------------   -----   
 0   geo_level_1_id                          260601 non-null  category
 1   geo_level_2_id                          260601 non-null  category
 2   geo_level_3_id                          260601 non-null  category
 3   count_floors_pre_eq                     260601 non-null  int64   
 4   age                                     260601 non-null  int64   
 5   area_percentage                         260601 non-null  int64   
 6   height_percentage                       260601 non-null  int64   
 7   land_surface_condition                  260601 non-null  category
 8   foundation_type                         260601 non-null  category
 9   roof_type                               260601 non-null  category
 10  ground_floor_type               

None

CategoricalDtype(categories=range(0, 31), ordered=False, categories_dtype=int64)

CategoricalDtype(categories=range(0, 1428), ordered=False, categories_dtype=int64)

CategoricalDtype(categories=range(0, 12568), ordered=False, categories_dtype=int64)

In [26]:
from ydata_profiling import ProfileReport

# Only the "auto" correlation is calculated; every individually named measure
# is switched off.
correlation_settings = {
    "auto": {"calculate": True, 'threshold': 0.0, 'warn_high_correlations': True},
}
for measure in ("pearson", "spearman", "kendall", "phi_k", "cramers"):
    correlation_settings[measure] = {"calculate": False}

profile = train_values.profile_report(
    title="Richter's Train Dataset Pandas profile",
    correlations=correlation_settings,
)

# to_widgets() did not complete in a reasonable time, so render as an iframe...
profile.to_notebook_iframe()


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

In [46]:
# Values each categorical feature is expected to take, keyed by column name.
possible_categorical_values = {
    'land_surface_condition': ['n', 'o', 't'],
    'foundation_type': ['h', 'i', 'r', 'u', 'w'],
    'roof_type': ['n', 'q', 'x'],
    'ground_floor_type': ['f', 'm', 'v', 'x', 'z'],
    'other_floor_type': ['j', 'q', 's', 'x'],
    'position': ['j', 'o', 's', 't'],
    'plan_configuration': ['a', 'c', 'd', 'f', 'm', 'n', 'o', 'q', 's', 'u'],
    'legal_ownership_status': ['a', 'r', 'v', 'w'],
}

display(possible_categorical_values)

# Show the values actually observed in the train and test sets for comparison.
for frame in (train_values, test_values):
    observed_values = frame[categorical_columns].apply(pd.unique)
    display(observed_values)

{'land_surface_condition': ['n', 'o', 't'],
 'foundation_type': ['h', 'i', 'r', 'u', 'w'],
 'roof_type': ['n', 'q', 'x'],
 'ground_floor_type': ['f', 'm', 'v', 'x', 'z'],
 'other_floor_type': ['j', 'q', 's', 'x'],
 'position': ['j', 'o', 's', 't'],
 'plan_configuration': ['a', 'c', 'd', 'f', 'm', 'n', 'o', 'q', 's', 'u'],
 'legal_ownership_status': ['a', 'r', 'v', 'w']}

land_surface_condition    ['t', 'o', 'n']
Categories (3, object): ['n', ...
foundation_type           ['r', 'w', 'i', 'u', 'h']
Categories (5, objec...
roof_type                 ['n', 'q', 'x']
Categories (3, object): ['n', ...
ground_floor_type         ['f', 'x', 'v', 'z', 'm']
Categories (5, objec...
other_floor_type          ['q', 'x', 'j', 's']
Categories (4, object): [...
position                  ['t', 's', 'j', 'o']
Categories (4, object): [...
plan_configuration        ['d', 'u', 's', 'q', 'm', 'c', 'a', 'n', 'f', ...
legal_ownership_status    ['v', 'a', 'r', 'w']
Categories (4, object): [...
dtype: object

land_surface_condition    ['t', 'n', 'o']
Categories (3, object): ['n', ...
foundation_type           ['r', 'w', 'u', 'h', 'i']
Categories (5, objec...
roof_type                 ['n', 'x', 'q']
Categories (3, object): ['n', ...
ground_floor_type         ['f', 'v', 'x', 'm', 'z']
Categories (5, objec...
other_floor_type          ['q', 'j', 'x', 's']
Categories (4, object): [...
position                  ['s', 't', 'j', 'o']
Categories (4, object): [...
plan_configuration        ['d', 'u', 'q', 'a', 'c', 'm', 's', 'o', 'f', ...
legal_ownership_status    ['v', 'a', 'w', 'r']
Categories (4, object): [...
dtype: object

In [51]:
# Columns: 'geo_level_1_id', 'geo_level_2_id', 'geo_level_3_id'
# TODO: Want to check if values in smaller regions are replicated in larger regions...
# The geo_level_n_id columns are a good proxy for local conditions that most
# likely affect the prediction. They are qualitative (categorical) rather than
# quantitative, so they should not be scaled, and they are specifically nominal
# rather than ordinal (ordered).
# Although they could be one-hot encoded, that would result in very
# high-dimensional data. Options considered:
# 1. OneHotEncoder -> Principal Component Analysis (PCA) to 3 components.

location_summary = train_values[location_columns].describe()
display(location_summary)

# location = train_values[location_columns].groupby(location_columns[::-1]).value_counts()
# #index = location.to_frame()
# display(location)
# #display(index)

# level0 = location.index.get_level_values(0)#.to_series()
# level1 = location.index.get_level_values(1)#.to_series()
# level2 = location.index.get_level_values(2)#.to_series()

# level0_duplicates = level0[level0.duplicated()]
# level1_duplicates = level1[level1.duplicated()]
# level2_duplicates = level2[level2.duplicated()]


# display(level0_duplicates) # geo_level_3_id, no duplication which means no overlap in geo_level_2_id
# display(level1_duplicates) # geo_level_2_id, duplication as there are N observations per geo_level_1_id
# display(level2_duplicates) # geo_level_1_id, duplication as there are N observations per geo_level_1_id

# train_values.query('geo_level_2_id in @level1_duplicates')[location_columns].groupby('geo_level_3_id').value_counts()

#, location.index.get_level_values(1).has_duplicates, location.index.get_level_values(2).has_duplicates

#index.has_duplicates, location.index.get_level_values(1).has_duplicates, location.index.get_level_values(2).has_duplicates


Unnamed: 0,geo_level_1_id,geo_level_2_id,geo_level_3_id
count,260601,260601,260601
unique,31,1414,11595
top,6,39,633
freq,24381,4038,651


With 260K+ observations, and according to [Choosing the right estimator](https://scikit-learn.org/stable/tutorial/machine_learning_map/index.html), a good option would be 'Linear SVC'...

In [52]:
# One small bar chart of counts per numerical feature, laid out four to a row.
histogram = (
    alt.Chart(train_values[numerical_columns])
    .mark_bar()
    .encode(
        x=alt.X(alt.repeat("repeat"), type='quantitative'),
        y='count()',
    )
    .properties(width=100, height=100)
)
histogram.repeat(repeat=numerical_columns, columns=4)

In [54]:
import sklearn.utils.multiclass as multiclass
from sklearn.pipeline import make_pipeline, Pipeline, make_union, FeatureUnion, FunctionTransformer
from sklearn.compose import make_column_transformer, ColumnTransformer
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.multiclass import OutputCodeClassifier
from sklearn.preprocessing import StandardScaler, OneHotEncoder, TargetEncoder
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

target_column = 'damage_grade'
type_of_target = multiclass.type_of_target(train_labels[target_column])
print(f'target variable {target_column} is {type_of_target}...')

# The combination of OneHotEncoder and PCA (Principal Component analysis) for this dataset crashed the kernel on my machine...

# Alternatives considered for encoding the high-cardinality location ids:
# CountEncoder (category_encoders), also called Frequency encoding, useful when there’s a correlation between the frequency of a category and the target variable
# HashingEncoder (category_encoders)
# CatBoostEncoder (catboost)
# TargetEncoder, each category is encoded based on a shrunk estimate of the average target values for observations belonging to the category
# Embedding (tensorflow)

# Two branches are concatenated column-wise:
#   1. the location ids, target-encoded;
#   2. the remaining features: categoricals one-hot encoded, numerical and
#      boolean columns passed through untouched.
preprocessing = make_pipeline(
    make_union(
        make_pipeline(
            make_column_transformer(
                #(OneHotEncoder(sparse_output=False, handle_unknown="ignore"), location_columns),
                # TargetEncoder shuffles its internal cross-fitting folds in
                # fit_transform; pinning random_state keeps the encoded
                # features (and every score derived from them) reproducible.
                (TargetEncoder(random_state=0), location_columns),
            ),
            #PCA(n_components=3) # 'mle' Maximum Likelihood Estimation
        ).set_output(transform='pandas'),
        make_column_transformer(
            (OneHotEncoder(sparse_output=False, handle_unknown="ignore"), categorical_columns),
            ("passthrough", numerical_columns),
            ("passthrough", boolean_columns)
        )
    )
).set_output(transform='pandas')

display(preprocessing)

# Fit on the training data (the target is needed by the TargetEncoder), then
# apply the already-fitted transformation to the test data.
X_train_transformed = preprocessing.fit_transform(train_values, train_labels[target_column])
X_test_transformed = preprocessing.transform(test_values)

target variable damage_grade is multiclass...


In [55]:
# Nota bene: The kernel crashed with OneHotEncoder/PCA...
# Transposed so that every encoded feature is a row and every building a column.
display(X_train_transformed.transpose())

building_id,802906,28830,94947,590882,201944,333020,728451,475515,441126,989500,...,560805,207683,226421,159555,827012,688636,669485,602512,151409,747594
pipeline__targetencoder__geo_level_1_id_1,0.086554,0.033635,0.020935,0.128043,0.046934,0.034035,0.141254,0.192076,0.084991,0.354995,...,0.190575,0.054627,0.033635,0.037406,0.034423,0.08455,0.0124,0.0124,0.354995,0.020935
pipeline__targetencoder__geo_level_1_id_2,0.66625,0.446438,0.393363,0.74267,0.571385,0.44683,0.690812,0.6896,0.766084,0.559881,...,0.691565,0.548133,0.446438,0.480757,0.44684,0.777496,0.183146,0.183146,0.559881,0.393363
pipeline__targetencoder__geo_level_1_id_3,0.247196,0.519923,0.585695,0.129272,0.381678,0.51913,0.167914,0.118311,0.148916,0.085091,...,0.117848,0.397239,0.519923,0.481832,0.518732,0.137947,0.804449,0.804449,0.085091,0.585695
pipeline__targetencoder__geo_level_2_id_1,0.0,0.012496,0.079456,0.017951,0.028144,0.011833,0.066239,0.744543,0.030268,0.527496,...,0.0,0.078169,0.030532,0.039337,0.028731,0.171322,0.0,0.030045,0.499642,0.0
pipeline__targetencoder__geo_level_2_id_2,0.241716,0.47883,0.308474,0.856289,0.590439,0.428143,0.706452,0.211685,0.844193,0.403578,...,0.813884,0.53406,0.470002,0.633357,0.468478,0.726822,0.064489,0.06413,0.461334,0.2394
pipeline__targetencoder__geo_level_2_id_3,0.757796,0.508108,0.612029,0.125611,0.381358,0.559738,0.227213,0.038512,0.125443,0.067724,...,0.185808,0.387762,0.499197,0.327288,0.502584,0.099924,0.935336,0.905552,0.038715,0.760204
pipeline__targetencoder__geo_level_3_id_1,0.0,0.091346,0.020586,0.040977,0.0,0.0,0.0,0.885651,0.0,0.454292,...,0.0,0.030527,0.026854,0.0,0.0,0.0,0.0,0.068018,0.220296,0.0
pipeline__targetencoder__geo_level_3_id_2,0.222772,0.716427,0.369277,0.794123,0.636661,0.619031,0.507788,0.0,0.802014,0.500789,...,0.840607,0.645741,0.604355,0.916201,0.616305,0.858838,0.025037,0.146599,0.766789,0.095741
pipeline__targetencoder__geo_level_3_id_3,0.773855,0.19057,0.609258,0.164118,0.362322,0.37774,0.479654,0.076913,0.196028,0.034595,...,0.157823,0.323345,0.367518,0.08336,0.380911,0.13713,0.9746,0.778728,0.010895,0.902702
columntransformer__onehotencoder__land_surface_condition_n,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0


In [56]:

# Fit two linear-SVC variants and report their micro-averaged F1.
# Note that each model is scored on the same rows it was fitted on.
training_grades = train_labels[target_column]
candidates = {
    # baseline
    'Baseline': LinearSVC(random_state=0, dual='auto', max_iter=2000),
    # the same estimator wrapped in an OutputCodeClassifier
    'OCC': OutputCodeClassifier(
        LinearSVC(random_state=0, dual='auto', max_iter=2000),
        code_size=2,
        random_state=0,
    ),
}

scores = {}
for label, clf in candidates.items():
    clf.fit(X_train_transformed, training_grades)
    y_pred = clf.predict(X_train_transformed)
    scores[label] = f1_score(training_grades, y_pred, average='micro')
    print(f'{label} micro f1 {scores[label]}')

baseline_micro_f1 = scores['Baseline']
occ_micro_f1 = scores['OCC']

Baseline micro f1 0.7320194473543846
OCC micro f1 0.7303310424748946


# Score History

Baseline micro f1 0.7320194473543846
OCC micro f1 0.7303310424748946

Baseline micro f1 0.63235
OCC micro f1 0.63137

Baseline micro f1 0.5820737449203955
OCC micro f1 0.5829064355086895

In [57]:
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, KFold
from sklearn.ensemble import HistGradientBoostingRegressor, HistGradientBoostingClassifier
from sklearn.metrics import make_scorer

target_column = 'damage_grade'

#split data into 80% training and 20% test sets
X_train, X_test, y_train, y_test = train_test_split(X_train_transformed, train_labels[target_column], test_size=0.2, random_state=0)

# TODO: Try XGBoost and/or CatBoost

hist_regressor = HistGradientBoostingRegressor(
    random_state=0, categorical_features="from_dtype", early_stopping=True, max_iter=1000
)

hist_classifier = HistGradientBoostingClassifier(
    random_state=0, categorical_features="from_dtype", early_stopping=True, max_iter=1000
)

boosting_params = {
    "max_depth": [3, 8],
    "max_leaf_nodes": [2, 5, 10, 20, 50, 100],
    "learning_rate": [0.01, 1],
}

# Range of grades present in the training split; rounded regression output is
# clipped to it so a prediction such as 0.4 or 3.6 maps to a real grade.
grade_min, grade_max = y_train.min(), y_train.max()


def rounded_micro_f1(y_true, y_pred):
    """Micro-averaged F1 after rounding predictions to the nearest valid grade.

    The regressor emits continuous values, which plain classification metrics
    reject, so predictions are rounded and then clipped to the grade range seen
    in training. Classifier predictions are already valid grades and pass
    through unchanged.
    """
    grades = y_pred.round().clip(grade_min, grade_max)
    return f1_score(y_true, grades, average='micro')


#cv = KFold(n_splits=4, shuffle=True, random_state=0)
results = []
for model, params in [(hist_regressor, boosting_params), 
                    (hist_classifier, boosting_params)]:
    search = RandomizedSearchCV(
        model, params,
        # scoring='f1_micro', # Classification metrics can't handle a mix of multiclass and continuous targets
        # Nota bene: round the y_pred values as this is a regression/classification problem.
        scoring=make_scorer(rounded_micro_f1, greater_is_better=True),
        n_jobs=-1,
        return_train_score=True,
        cv=2, # StratifiedKFold/KFold with shuffle=False, 
        random_state=0, # fix which parameter combinations get sampled, for reproducible runs
    ).fit(X_train, y_train)
    result = {"model": type(model), "best_estimator": search.best_estimator_, "cv_results": pd.DataFrame(search.cv_results_)}
    results.append(result)

In [58]:
# Stack the cv_results of every search into one frame, labelling each row with
# the name of the model class it belongs to.
model_names = [entry['model'].__name__ for entry in results]
dfs = [entry['cv_results'] for entry in results]
foo = pd.concat(dfs, keys=model_names, names=['model']).reset_index()
display(foo.T)

# The searched hyper-parameters are the columns prefixed with param_...
params = foo.columns[foo.columns.str.startswith('param_')].tolist()

# Mean test score against fit time and against each hyper-parameter, one
# small panel per x variable, coloured by model.
score_chart = (
    alt.Chart(foo)
    .mark_line(point=True)
    .encode(
        x=alt.X(alt.repeat("repeat"), type='quantitative'),
        y='mean_test_score:Q',
        color='model',
    )
    .properties(width=100, height=100)
)
score_chart.repeat(repeat=['mean_fit_time'] + params)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19
model,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingRegressor,HistGradientBoostingClassifier,HistGradientBoostingClassifier,HistGradientBoostingClassifier,HistGradientBoostingClassifier,HistGradientBoostingClassifier,HistGradientBoostingClassifier,HistGradientBoostingClassifier,HistGradientBoostingClassifier,HistGradientBoostingClassifier,HistGradientBoostingClassifier
level_1,0,1,2,3,4,5,6,7,8,9,0,1,2,3,4,5,6,7,8,9
mean_fit_time,5.0683,58.094998,6.718887,45.990518,88.372224,2.80121,5.289756,67.802228,3.569551,39.77144,239.33028,5.420825,120.591212,290.282187,3.517344,177.286989,252.011629,221.966794,5.031579,144.153536
std_fit_time,0.133469,0.150615,0.021859,0.069159,4.056765,0.097513,0.238647,0.05363,0.379837,0.130246,21.06178,0.023549,0.187279,30.14457,0.040324,1.309213,1.915517,1.163775,0.77127,8.072114
mean_score_time,0.617165,9.640735,0.681868,7.497446,13.988511,0.514053,0.790628,9.390869,0.547948,4.013763,60.246575,0.875028,14.251962,81.945699,0.485347,39.608501,35.530866,18.980366,0.774692,10.098283
std_score_time,0.036692,0.129156,0.009021,0.038273,1.020867,0.024248,0.039307,0.055035,0.026088,0.080233,1.509522,0.014432,0.003854,7.547659,0.003633,0.472203,0.608058,0.416553,0.141237,1.167046
param_max_leaf_nodes,100,10,2,5,100,20,5,20,10,5,20,50,2,100,100,5,20,100,20,2
param_max_depth,8,8,8,3,8,8,8,8,8,8,8,8,3,8,3,8,3,3,8,8
param_learning_rate,1,0.01,1,0.01,0.01,1,1,0.01,1,0.01,0.01,1,0.01,0.01,1,0.01,0.01,0.01,1,0.01
params,"{'max_leaf_nodes': 100, 'max_depth': 8, 'learn...","{'max_leaf_nodes': 10, 'max_depth': 8, 'learni...","{'max_leaf_nodes': 2, 'max_depth': 8, 'learnin...","{'max_leaf_nodes': 5, 'max_depth': 3, 'learnin...","{'max_leaf_nodes': 100, 'max_depth': 8, 'learn...","{'max_leaf_nodes': 20, 'max_depth': 8, 'learni...","{'max_leaf_nodes': 5, 'max_depth': 8, 'learnin...","{'max_leaf_nodes': 20, 'max_depth': 8, 'learni...","{'max_leaf_nodes': 10, 'max_depth': 8, 'learni...","{'max_leaf_nodes': 5, 'max_depth': 8, 'learnin...","{'max_leaf_nodes': 20, 'max_depth': 8, 'learni...","{'max_leaf_nodes': 50, 'max_depth': 8, 'learni...","{'max_leaf_nodes': 2, 'max_depth': 3, 'learnin...","{'max_leaf_nodes': 100, 'max_depth': 8, 'learn...","{'max_leaf_nodes': 100, 'max_depth': 3, 'learn...","{'max_leaf_nodes': 5, 'max_depth': 8, 'learnin...","{'max_leaf_nodes': 20, 'max_depth': 3, 'learni...","{'max_leaf_nodes': 100, 'max_depth': 3, 'learn...","{'max_leaf_nodes': 20, 'max_depth': 8, 'learni...","{'max_leaf_nodes': 2, 'max_depth': 8, 'learnin..."


In [59]:
from sklearn.metrics import f1_score

# Score the best estimator of each search on the held-out split.
# The regressor returns continuous values, so predictions are rounded and then
# clipped to the range of grades present in the labels; otherwise a prediction
# such as 0.4 or 3.6 would round to a grade that does not exist.
lowest_grade = train_labels[target_column].min()
highest_grade = train_labels[target_column].max()

for result in results:
    estimator = result['best_estimator']
    y_pred = estimator.predict(X_test).round().clip(lowest_grade, highest_grade)
    micro_f1 = f1_score(y_test, y_pred, average='micro')
    print(f'{result["model"].__name__} micro f1 {micro_f1}')
    print(f'{estimator.get_params()}')

HistGradientBoostingRegressor micro f1 0.7437693060378734
{'categorical_features': 'from_dtype', 'early_stopping': True, 'interaction_cst': None, 'l2_regularization': 0.0, 'learning_rate': 0.01, 'loss': 'squared_error', 'max_bins': 255, 'max_depth': 8, 'max_features': 1.0, 'max_iter': 1000, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'quantile': None, 'random_state': 0, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
HistGradientBoostingClassifier micro f1 0.7471844362157288
{'categorical_features': 'from_dtype', 'class_weight': None, 'early_stopping': True, 'interaction_cst': None, 'l2_regularization': 0.0, 'learning_rate': 0.01, 'loss': 'log_loss', 'max_bins': 255, 'max_depth': 8, 'max_features': 1.0, 'max_iter': 1000, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': 0, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.

HistGradientBoostingRegressor micro f1 0.7437693060378734
{'categorical_features': 'from_dtype', 'early_stopping': True, 'interaction_cst': None, 'l2_regularization': 0.0, 'learning_rate': 0.01, 'loss': 'squared_error', 'max_bins': 255, 'max_depth': 8, 'max_features': 1.0, 'max_iter': 1000, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'quantile': None, 'random_state': 0, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
HistGradientBoostingClassifier micro f1 0.7471844362157288
{'categorical_features': 'from_dtype', 'class_weight': None, 'early_stopping': True, 'interaction_cst': None, 'l2_regularization': 0.0, 'learning_rate': 0.01, 'loss': 'log_loss', 'max_bins': 255, 'max_depth': 8, 'max_features': 1.0, 'max_iter': 1000, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': 0, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}

HistGradientBoostingRegressor micro f1 0.7093110262658046
{'categorical_features': 'from_dtype', 'early_stopping': True, 'interaction_cst': None, 'l2_regularization': 0.0, 'learning_rate': 0.01, 'loss': 'squared_error', 'max_bins': 255, 'max_depth': 8, 'max_features': 1.0, 'max_iter': 1000, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'quantile': None, 'random_state': 0, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}
HistGradientBoostingClassifier micro f1 0.7297250628345581
{'categorical_features': 'from_dtype', 'class_weight': None, 'early_stopping': True, 'interaction_cst': None, 'l2_regularization': 0.0, 'learning_rate': 0.01, 'loss': 'log_loss', 'max_bins': 255, 'max_depth': 8, 'max_features': 1.0, 'max_iter': 1000, 'max_leaf_nodes': 100, 'min_samples_leaf': 20, 'monotonic_cst': None, 'n_iter_no_change': 10, 'random_state': 0, 'scoring': 'loss', 'tol': 1e-07, 'validation_fraction': 0.1, 'verbose': 0, 'warm_start': False}

In [61]:
# results[1] is the second search that was run, i.e. the
# HistGradientBoostingClassifier one.
estimator = results[1]['best_estimator']

# Key the predictions by building id (the index carried over from test_values)
# so they can be matched to the submission rows by id, not by row position.
predictions = pd.Series(
    estimator.predict(X_test_transformed),
    index=X_test_transformed.index,
    name="damage_grade",
)

# NOTE(review): assumes submission_format.csv has a `building_id` column, like
# the other data files - confirm against the file.
submission = pd.read_csv("data/submission_format.csv")
submission["damage_grade"] = submission["building_id"].map(predictions)

# A building without a prediction would otherwise be written out as an empty
# cell; fail loudly instead.
unmatched = submission["damage_grade"].isna()
if unmatched.any():
    raise ValueError(
        f"{int(unmatched.sum())} building_id(s) in submission_format.csv have no prediction"
    )

# Keep grades as integers in the submission file.
submission["damage_grade"] = submission["damage_grade"].astype(int)
submission.to_csv("submission.csv", index=False)