In [1]:
import pandas as pd
from utils import load_tabular_data, evaluate_model, add_series_features
import xgboost as xgb
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the baseline train/test frames and the data dictionary in one call.
# The file names are given relative to the notebook's working directory.
tabular_paths = ('baseline_train.csv', 'baseline_test.csv', 'data_dictionary.csv')
df_train, df_test, data_dict = load_tabular_data(*tabular_paths)

In [3]:
# Build the training feature matrix and target vector.
# Columns that exist only in the training frame cannot be used at prediction time.
# They are collected in frame order so the list is deterministic across runs.
columns_not_in_test = [column for column in df_train.columns if column not in df_test.columns]

# The target 'sii' is also read from df_test further down, so it is never part of
# columns_not_in_test. Drop it explicitly together with the identifier so the label
# can never end up among the features. dict.fromkeys de-duplicates while keeping order.
non_feature_columns = list(dict.fromkeys(columns_not_in_test + ['id', 'sii']))
X_train = df_train.drop(columns=non_feature_columns)
y_train = df_train['sii']

In [4]:
# Build the test feature matrix and target vector.
# columns_not_in_test lists columns that are absent from df_test by construction, so
# passing them to drop() raises KeyError as soon as that list is non-empty. Only the
# identifier and the target have to be removed here; the target is dropped explicitly
# so the label can never end up among the features.
X_test = df_test.drop(columns=['id', 'sii'])
y_test = df_test['sii']

In [5]:
# Split the model inputs into numerical and categorical feature names, using the
# 'Type' column of the data dictionary and keeping only fields present in X_train.
numeric_mask = data_dict['Type'].isin(['float', 'int'])
numerical_features = [
    field for field in data_dict.loc[numeric_mask, 'Field'].values
    if field in X_train.columns
]

categorical_mask = data_dict['Type'].isin(['str', 'categorical int'])
categorical_features = [
    field for field in data_dict.loc[categorical_mask, 'Field'].values
    if field in X_train.columns
]

In [6]:
# Numerical columns are passed through untouched; categorical columns are one-hot
# encoded, with categories unseen during fit ignored at transform time.
column_steps = [
    ('num', 'passthrough', numerical_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
]
preprocessor = ColumnTransformer(transformers=column_steps)

# Fit on the training frame only, then apply the fitted transformer to the test frame.
# NOTE: X_train / X_test are rebound to the transformed matrices here, so re-running
# this cell on its own would feed already-transformed data back into the transformer.
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [7]:
# Wrap the transformed matrices and their labels in XGBoost DMatrix containers.
# NOTE(review): none of the cells visible in this notebook read dtrain / dtest; the
# models below are fitted on X_train / y_train directly.
dtrain = xgb.DMatrix(data=X_train, label=y_train)
dtest = xgb.DMatrix(data=X_test, label=y_test)

In [74]:
# Stage 1: coarse 5-fold grid search over tree depth and minimum child weight,
# scored by accuracy, with the remaining booster settings held fixed.
param_test1 = {
    'max_depth': [3, 5, 7, 9],
    'min_child_weight': [1, 3, 5],
}

stage1_fixed_params = dict(
    objective='multi:softmax',
    eval_metric='mlogloss',
    num_class=4,
    learning_rate=0.1,
    subsample=0.8,
    n_estimators=140,
)
grid_search1 = GridSearchCV(
    estimator=xgb.XGBClassifier(**stage1_fixed_params),
    param_grid=param_test1,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)
grid_search1.fit(X_train, y_train)

# Displayed as the cell output: best parameter combination and its mean CV accuracy.
grid_search1.best_params_, grid_search1.best_score_

({'max_depth': 3, 'min_child_weight': 1}, 0.6138177486599167)

In [75]:
# Stage 2: finer grid over max_depth / min_child_weight in steps of one.
# NOTE(review): n_estimators is not passed here although grid_search1 used 140, so
# this stage runs with the library default number of trees - confirm that is intended.
param_test2 = {
    'max_depth': [2, 3, 4],
    'min_child_weight': [0, 1, 2],
}

stage2_fixed_params = dict(
    objective='multi:softmax',
    eval_metric='mlogloss',
    num_class=4,
    learning_rate=0.1,
    subsample=0.8,
)
grid_search2 = GridSearchCV(
    estimator=xgb.XGBClassifier(**stage2_fixed_params),
    param_grid=param_test2,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)
grid_search2.fit(X_train, y_train)

# Displayed as the cell output: best parameter combination and its mean CV accuracy.
grid_search2.best_params_, grid_search2.best_score_

({'max_depth': 2, 'min_child_weight': 0}, 0.6128971923555165)

In [76]:
# Stage 3: search gamma over 0.0 .. 0.4 in steps of 0.1, with max_depth=2 and
# min_child_weight=0 held fixed.
# NOTE(review): n_estimators is not passed here although grid_search1 used 140, so
# this stage runs with the library default number of trees - confirm that is intended.
param_test3 = {
    'gamma': [tenths / 10.0 for tenths in range(5)],
}

stage3_fixed_params = dict(
    objective='multi:softmax',
    eval_metric='mlogloss',
    num_class=4,
    learning_rate=0.1,
    subsample=0.8,
    max_depth=2,
    min_child_weight=0,
)
grid_search3 = GridSearchCV(
    estimator=xgb.XGBClassifier(**stage3_fixed_params),
    param_grid=param_test3,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)
grid_search3.fit(X_train, y_train)

# Displayed as the cell output: best gamma and its mean CV accuracy.
grid_search3.best_params_, grid_search3.best_score_

({'gamma': 0.0}, 0.6128971923555165)

In [77]:
# Stage 4: coarse search over row subsampling and per-tree column subsampling
# (0.6 .. 0.9 in steps of 0.1), with the tree-shape parameters and gamma held fixed.
param_test4 = {
    'subsample': [tenths / 10.0 for tenths in range(6, 10)],
    'colsample_bytree': [tenths / 10.0 for tenths in range(6, 10)],
}

stage4_fixed_params = dict(
    objective='multi:softmax',
    eval_metric='mlogloss',
    num_class=4,
    learning_rate=0.1,
    max_depth=2,
    min_child_weight=0,
    gamma=0,
)
grid_search4 = GridSearchCV(
    estimator=xgb.XGBClassifier(**stage4_fixed_params),
    param_grid=param_test4,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)
grid_search4.fit(X_train, y_train)

# Displayed as the cell output: best sampling fractions and their mean CV accuracy.
grid_search4.best_params_, grid_search4.best_score_

({'colsample_bytree': 0.6, 'subsample': 0.7}, 0.6133579929573785)

In [78]:
# Stage 5: finer search over the sampling fractions in steps of 0.05
# (subsample 0.65 .. 0.75, colsample_bytree 0.55 .. 0.65).
param_test5 = {
    'subsample': [pct / 100.0 for pct in (65, 70, 75)],
    'colsample_bytree': [pct / 100.0 for pct in (55, 60, 65)],
}

stage5_fixed_params = dict(
    objective='multi:softmax',
    eval_metric='mlogloss',
    num_class=4,
    learning_rate=0.1,
    max_depth=2,
    min_child_weight=0,
    gamma=0,
)
grid_search5 = GridSearchCV(
    estimator=xgb.XGBClassifier(**stage5_fixed_params),
    param_grid=param_test5,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)
grid_search5.fit(X_train, y_train)

# Displayed as the cell output: best sampling fractions and their mean CV accuracy.
grid_search5.best_params_, grid_search5.best_score_

({'colsample_bytree': 0.55, 'subsample': 0.7}, 0.6179273377010125)

In [79]:
# Stage 6: search the reg_alpha regularisation strength across several orders of
# magnitude, with all previously selected parameters held fixed.
param_test6 = {
    'reg_alpha': [1e-5, 1e-2, 0.1, 1, 100],
}

stage6_fixed_params = dict(
    objective='multi:softmax',
    eval_metric='mlogloss',
    num_class=4,
    learning_rate=0.1,
    max_depth=2,
    min_child_weight=0,
    gamma=0,
    subsample=0.7,
    colsample_bytree=0.55,
)
grid_search6 = GridSearchCV(
    estimator=xgb.XGBClassifier(**stage6_fixed_params),
    param_grid=param_test6,
    scoring='accuracy',
    n_jobs=-1,
    cv=5,
)
grid_search6.fit(X_train, y_train)

# Displayed as the cell output: best reg_alpha and its mean CV accuracy.
grid_search6.best_params_, grid_search6.best_score_

({'reg_alpha': 1e-05}, 0.6179273377010125)

In [8]:
# Final classifier, using the tree-shape, sampling and regularisation values chosen
# by the grid searches.
# NOTE(review): learning_rate (0.01) and n_estimators (5000) differ from the 0.1
# learning rate the grid searches were run with - confirm this combination was
# validated separately.
final_xgb_params = {
    'objective': 'multi:softmax',
    'eval_metric': 'mlogloss',
    'num_class': 4,
    'learning_rate': 0.01,
    'max_depth': 2,
    'min_child_weight': 0,
    'gamma': 0,
    'subsample': 0.7,
    'colsample_bytree': 0.55,
    'reg_alpha': 1e-5,
    'n_estimators': 5000,
}
xgb_model = xgb.XGBClassifier(**final_xgb_params)
xgb_model.fit(X_train, y_train)

In [9]:
# Score the fitted model on the held-out test split.
# The result is stored under a descriptive name rather than `eval`, which would
# shadow Python's built-in eval() for every later cell in the kernel session.
# Index 0 is reported as kappa and index 1 as accuracy, per the labels in the
# message below - presumably matching evaluate_model's return order; confirm in utils.
xgb_scores = evaluate_model(xgb_model, X_test, y_test)
print(f'XGB model accuracy: {xgb_scores[1]}, kappa: {xgb_scores[0]}')

XGB model accuracy: 0.6021897810218978, kappa: 0.3229773543964831


In [None]:
# Prepare the Kaggle test set and predict with the fitted model.
kaggle_test = pd.read_csv('test.csv')
kaggle_test = add_series_features(kaggle_test, 'series_test.parquet')

# Keep the identifiers for the submission file before they are dropped from the features.
id_column = kaggle_test['id']

# columns_not_in_test was derived from the baseline frames, so the Kaggle frame is not
# guaranteed to contain those columns. errors='ignore' removes whichever are present
# instead of raising KeyError on the missing ones. A missing 'id' still fails loudly
# on the lookup above.
kaggle_features = kaggle_test.drop(columns=columns_not_in_test + ['id'], errors='ignore')

# Apply the already-fitted preprocessor (no refit) and predict.
kaggle_test = preprocessor.transform(kaggle_features)
predictions = xgb_model.predict(kaggle_test)

In [102]:
# Assemble the submission table (one row per id with its predicted 'sii') and write
# it to disk without the DataFrame index.
submission_columns = {'id': id_column, 'sii': predictions}
submission = pd.DataFrame(submission_columns)
submission.to_csv('submission.csv', index=False)