In [1]:
import re
from collections import Counter
from datetime import datetime
from pprint import pprint

import numpy as np
import pandas as pd

from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.utils import class_weight
#from category_encoders import TargetEncoder, CatBoostEncoder

import xgboost as xgb

from hyperopt import Trials, STATUS_FAIL, STATUS_OK, tpe, hp, fmin

In [2]:
# Name of the encoding variant; it selects which processed CSV folder is read.
method = "target"
assert method in ("label", "target", "onehot", "catboost")

features = pd.read_csv(f"../data/processed/{method}/train.csv")
test = pd.read_csv(f"../data/processed/{method}/test.csv")

# Detach the "label" column from the training frame in one step:
# `targets` receives its values, `features` keeps the remaining columns.
targets = features.pop("label").values

print(features.shape, test.shape)

(80176, 20) (34365, 20)


### Hyperparameter search with Hyperopt

In [3]:
def objective(space):
    """Hyperopt objective: cross-validated micro-F1 of an XGBoost classifier.

    Parameters
    ----------
    space : dict
        One sampled point of the search space. Must provide the keys
        'n_estimators', 'max_depth', 'learning_rate', 'gamma',
        'min_child_weight', 'subsample' and 'colsample_bytree'.

    Returns
    -------
    dict
        ``{'loss': 1 - mean_score, 'status': STATUS_OK}`` where ``mean_score``
        is the mean ``f1_micro`` over the 5 stratified folds, or
        ``{'status': STATUS_FAIL}`` when that mean is NaN.

    Notes
    -----
    Reads the notebook-level ``features`` and ``targets``.
    """
    classifier = xgb.XGBClassifier(
        objective="multi:softmax",
        # These two are cast to int before being handed to XGBoost.
        n_estimators=int(space['n_estimators']),
        max_depth=int(space['max_depth']),
        learning_rate=space['learning_rate'],
        gamma=space['gamma'],
        min_child_weight=space['min_child_weight'],
        subsample=space['subsample'],
        colsample_bytree=space['colsample_bytree'],
    )
    scores = cross_val_score(
        estimator=classifier,
        X=features,
        y=targets,
        cv=StratifiedKFold(n_splits=5),
        scoring="f1_micro",
        n_jobs=10,
    )
    mean_score = scores.mean()

    # With cross_val_score's default error_score a failed fold is reported as
    # NaN, which would turn the mean (and the loss) into NaN. Report such a
    # trial as failed instead of handing the optimiser a NaN loss marked "ok".
    if np.isnan(mean_score):
        return {'status': STATUS_FAIL}

    return {
        'loss': 1 - mean_score,
        'status': STATUS_OK,
    }

In [26]:
# Search space: every dimension is a quantised uniform draw
# (label, low, high, step).
space = dict(
    n_estimators=hp.quniform('n_estimators', 100, 160, 5),
    max_depth=hp.quniform('max_depth', 3, 20, 1),
    learning_rate=hp.quniform('learning_rate', 0.01, 0.2, 0.01),
    gamma=hp.quniform('gamma', 0, 5, 0.25),
    min_child_weight=hp.quniform('min_child_weight', 1, 20, 1),
    subsample=hp.quniform('subsample', 0.1, 1, 0.01),
    colsample_bytree=hp.quniform('colsample_bytree', 0.1, 1.0, 0.01),
)

# `trials` records every evaluation; `best` receives the point that fmin
# returns after 50 evaluations of `objective` using tpe.suggest.
trials = Trials()
best = fmin(
    fn=objective,
    space=space,
    algo=tpe.suggest,
    max_evals=50,
    trials=trials,
    verbose=5,
)

100%|██████████| 50/50 [45:41<00:00, 54.83s/trial, best loss: 0.3944945956101422] 


In [6]:
# Show the dict returned by fmin. In the saved output every value is a float,
# including max_depth and n_estimators.
best

{'colsample_bytree': 0.71,
 'gamma': 1.92,
 'learning_rate': 0.03,
 'max_depth': 6.0,
 'min_child_weight': 16.0,
 'n_estimators': 135.0,
 'subsample': 0.96}

In [7]:
# max_depth and n_estimators are converted to int in place; the other
# entries of `best` are left untouched.
best.update({key: int(best[key]) for key in ("max_depth", "n_estimators")})

In [16]:
# Show `best` again, after max_depth and n_estimators were cast to int.
best

{'colsample_bytree': 0.71,
 'gamma': 1.92,
 'learning_rate': 0.03,
 'max_depth': 6,
 'min_child_weight': 16.0,
 'n_estimators': 135,
 'subsample': 0.96}

### Cross-validation check with `cross_val_score`

In [21]:
# Hyperparameters are written out by hand for this check.
# NOTE(review): gamma=5 and n_estimators=175 differ from the values in the
# saved `best` output (1.92 and 135) — confirm this override is intentional.
model = xgb.XGBClassifier(
    objective="multi:softmax",
    colsample_bytree=0.71,
    gamma=5,
    learning_rate=0.03,
    max_depth=6,
    min_child_weight=16.0,
    n_estimators=175,
    subsample=0.96,
)

# Micro-F1 with cross_val_score's default cv setting, run on 5 workers.
scores = cross_val_score(
    model,
    features,
    targets,
    scoring="f1_micro",
    n_jobs=5,
    verbose=2,
)
print(f"mean score: {np.mean(scores)}")

[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done   2 out of   5 | elapsed:   43.9s remaining:  1.1min


mean score: 0.605106280591152


[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   46.2s remaining:    0.0s
[Parallel(n_jobs=5)]: Done   5 out of   5 | elapsed:   46.2s finished


### Train best model on all data

In [22]:
# Fresh classifier, fitted on the complete training set.
# NOTE(review): gamma=5 and n_estimators=175 differ from the values in the
# saved `best` output (1.92 and 135) — confirm this override is intentional.
model = xgb.XGBClassifier(
    objective="multi:softmax",
    colsample_bytree=0.71,
    gamma=5,
    learning_rate=0.03,
    max_depth=6,
    min_child_weight=16.0,
    n_estimators=175,
    subsample=0.96,
)
# Kept as the last expression so the fitted estimator's repr is displayed.
model.fit(features, targets)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=0.71, gamma=5, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.03, max_delta_step=0, max_depth=6,
              min_child_weight=16.0, missing=nan, monotone_constraints='()',
              n_estimators=175, n_jobs=0, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.96,
              tree_method='exact', validate_parameters=1, verbosity=None)

### Submission 

In [23]:
# One row per test sample: a 0-based running Id and the predicted label.
submission = pd.DataFrame({
    "Id": range(len(test)),
    "label": model.predict(test),
})
submission.to_csv(f"../data/submissions/xgb_{method}.csv", index=False)

### Save proba

In [24]:
# Start from the predict_proba output (columns are numbered 0, 1, ...) and
# put a 0-based running "Id" column in front of it.
probas = pd.DataFrame(model.predict_proba(test))
probas.insert(0, "Id", range(len(test)))

In [25]:
# Write the `probas` table to the submissions folder, without the index.
probas_path = f"../data/submissions/xgb_probas_{method}.csv"
probas.to_csv(probas_path, index=False)

# Leaderboard score: 0.60723 (private / public)