In [1]:
import pandas as pd
import numpy as np
import h5py

from sklearn import model_selection, linear_model, metrics, pipeline, preprocessing

# Set random seed
np.random.seed(42)

In [2]:
# Load the data and separate the id's
X_data = pd.read_hdf("cat.hdf5", "train")
y_data = pd.read_hdf("cat.hdf5", "train_target")
X_test = pd.read_hdf("cat.hdf5", "test")

data_id = X_data.loc[:, "id"]
test_id = X_test.loc[:, "id"]

X_data.drop(columns="id", inplace=True)
X_test.drop(columns="id", inplace=True)

In [3]:
# Split the data with labels into training and validation.
X_train, X_val, y_train, y_val = model_selection.train_test_split(X_data, y_data, test_size=0.3)

In [4]:
print(X_train.shape, y_train.shape, X_val.shape, y_val.shape, X_test.shape)

(210000, 40) (210000,) (90000, 40) (90000,) (200000, 40)


## Logisitic Regression as baseline model

In [5]:
logistic = pipeline.Pipeline(
    [
        ("min_max_scaler", preprocessing.MinMaxScaler()),
        ("logistic_classifier", linear_model.LogisticRegressionCV(
            solver="lbfgs", max_iter=2000, cv=5, n_jobs=-1))
    ]
)

logistic.fit(X_train, y_train);

In [6]:
y_val_pred = logistic.predict_proba(X_val)[:, 1]

In [7]:
metrics.roc_auc_score(y_val, y_val_pred)

0.7663509689077292

In [8]:
def get_score(model):
    y_val_pred = model.predict_proba(X_val)[:, 1]
    val_score = metrics.roc_auc_score(y_val, y_val_pred)

    y_train_pred = model.predict_proba(X_train)[:, 1]
    train_score = metrics.roc_auc_score(y_train, y_train_pred)

    return (train_score, val_score)

In [9]:
get_score(logistic)

(0.7650231180301985, 0.7663509689077292)

## Now use XGBoost

In [10]:
from xgboost import XGBClassifier

In [11]:
# We do not use GridSearchCV:
# See https://github.com/dmlc/xgboost/issues/2819
# Instead we implement our own grid search.

from collections.abc import Iterable
import itertools

# Takes a parameter grid similar to the format in
# sklearn GridSearchCV and returns a list of names every time.
def get_grid_iter(param_grid):
    names = []
    values = []
    
    for param_name, param_values in param_grid.items():
        names.append(param_name)
        
        if isinstance(param_values, list):
            values.append(list(param_values))
        else:
            values.append(list([param_values]))
    
    it = (dict(zip(names, param)) for param in itertools.product(*values))
    item_cnt = np.prod(np.array([len(v) for v in values]))
    return it, item_cnt

# Similar to GridSearchCV
def hyper_opt(model_base, param_grid, metric, X_train, y_train, X_val, y_val, bin_prob, verbose=False):
    best_param = None
    best_param_id = -1
    best_score = -np.inf
    
    it, item_cnt = get_grid_iter(param_grid)
    
    step = 0
    
    for param in it:
        
        model = model_base(**param)
        model.fit(X_train, y_train)
        
        if bin_prob:
            y_train_pred = model.predict_proba(X_train)[:, 1]
            y_val_pred = model.predict_proba(X_val)[:, 1]
        else:
            y_train_pred = model.predict(X_train)
            y_val_pred = model.predict(X_val)
        
        train_score = metric(y_train, y_train_pred)
        val_score = metric(y_val, y_val_pred)
        
        if val_score > best_score:
            best_score = val_score
            best_param_id = step
        
        step += 1
        if verbose:
            print("[%d/%d] train:%f test:%f best:%f id:%d" %
                (step, item_cnt, train_score, val_score, best_score, best_param_id))
            
    return best_score, best_param, best_param_id

In [12]:
xgb_param_grid = {
    "n_estimators": [500, 1000, 1500, 2000, 3000],
    'max_depth':[1, 2, 3, 4, 5],
    'objective':'binary:logistic',
    'subsample':[0.6, 0.8, 1], 
    'colsample_bytree':[0.6, 0.8, 1.0],
    'learning_rate':[0.001, 0.01, 0.1],
    'tree_method':'gpu_hist',
    'evalmetric': 'auc'
}


In [13]:
%%time
score, param, param_id = hyper_opt(XGBClassifier, xgb_param_grid, metrics.roc_auc_score,
          X_train, y_train, X_val, y_val,
          bin_prob=True, verbose=True)

[1/675] train:0.671892 test:0.670447 best:0.670447 id:0
[2/675] train:0.716421 test:0.715462 best:0.715462 id:1
[3/675] train:0.761949 test:0.762208 best:0.762208 id:2
[4/675] train:0.668143 test:0.666607 best:0.762208 id:2
[5/675] train:0.716741 test:0.715839 best:0.762208 id:2
[6/675] train:0.762040 test:0.762301 best:0.762301 id:5
[7/675] train:0.668453 test:0.667074 best:0.762301 id:5
[8/675] train:0.716499 test:0.715538 best:0.762301 id:5
[9/675] train:0.762145 test:0.762317 best:0.762317 id:8
[10/675] train:0.674085 test:0.673059 best:0.762317 id:8
[11/675] train:0.717710 test:0.716821 best:0.762317 id:8
[12/675] train:0.762334 test:0.762609 best:0.762609 id:11
[13/675] train:0.670082 test:0.668960 best:0.762609 id:11
[14/675] train:0.717817 test:0.716933 best:0.762609 id:11
[15/675] train:0.762335 test:0.762573 best:0.762609 id:11
[16/675] train:0.669924 test:0.668792 best:0.762609 id:11
[17/675] train:0.717632 test:0.716733 best:0.762609 id:11
[18/675] train:0.762460 test:0.762

[142/675] train:0.680569 test:0.679047 best:0.775515 id:92
[143/675] train:0.732031 test:0.731360 best:0.775515 id:92
[144/675] train:0.766940 test:0.766940 best:0.775515 id:92
[145/675] train:0.686542 test:0.685249 best:0.775515 id:92
[146/675] train:0.732895 test:0.732318 best:0.775515 id:92
[147/675] train:0.766802 test:0.766877 best:0.775515 id:92
[148/675] train:0.682701 test:0.681313 best:0.775515 id:92
[149/675] train:0.732969 test:0.732375 best:0.775515 id:92
[150/675] train:0.766829 test:0.766867 best:0.775515 id:92
[151/675] train:0.682652 test:0.681199 best:0.775515 id:92
[152/675] train:0.732903 test:0.732282 best:0.775515 id:92
[153/675] train:0.766885 test:0.766942 best:0.775515 id:92
[154/675] train:0.686341 test:0.685503 best:0.775515 id:92
[155/675] train:0.734013 test:0.733442 best:0.775515 id:92
[156/675] train:0.766669 test:0.766727 best:0.775515 id:92
[157/675] train:0.681016 test:0.679888 best:0.775515 id:92
[158/675] train:0.734005 test:0.733428 best:0.775515 id:

[280/675] train:0.690390 test:0.689021 best:0.778349 id:230
[281/675] train:0.742455 test:0.742115 best:0.778349 id:230
[282/675] train:0.768664 test:0.768641 best:0.778349 id:230
[283/675] train:0.688587 test:0.687224 best:0.778349 id:230
[284/675] train:0.742591 test:0.742262 best:0.778349 id:230
[285/675] train:0.768727 test:0.768676 best:0.778349 id:230
[286/675] train:0.688879 test:0.687517 best:0.778349 id:230
[287/675] train:0.742586 test:0.742236 best:0.778349 id:230
[288/675] train:0.768785 test:0.768752 best:0.778349 id:230
[289/675] train:0.690412 test:0.689547 best:0.778349 id:230
[290/675] train:0.743840 test:0.743558 best:0.778349 id:230
[291/675] train:0.768338 test:0.768317 best:0.778349 id:230
[292/675] train:0.688804 test:0.687918 best:0.778349 id:230
[293/675] train:0.743844 test:0.743563 best:0.778349 id:230
[294/675] train:0.768335 test:0.768305 best:0.778349 id:230
[295/675] train:0.688804 test:0.687918 best:0.778349 id:230
[296/675] train:0.743844 test:0.743563 b

[417/675] train:0.769859 test:0.769700 best:0.779536 id:335
[418/675] train:0.694992 test:0.693834 best:0.779536 id:335
[419/675] train:0.749116 test:0.749025 best:0.779536 id:335
[420/675] train:0.769905 test:0.769716 best:0.779536 id:335
[421/675] train:0.694983 test:0.693810 best:0.779536 id:335
[422/675] train:0.749233 test:0.749133 best:0.779536 id:335
[423/675] train:0.769991 test:0.769823 best:0.779536 id:335
[424/675] train:0.696226 test:0.695085 best:0.779536 id:335
[425/675] train:0.750101 test:0.750036 best:0.779536 id:335
[426/675] train:0.769349 test:0.769263 best:0.779536 id:335
[427/675] train:0.695319 test:0.694174 best:0.779536 id:335
[428/675] train:0.750118 test:0.750050 best:0.779536 id:335
[429/675] train:0.769345 test:0.769262 best:0.779536 id:335
[430/675] train:0.695320 test:0.694175 best:0.779536 id:335
[431/675] train:0.750119 test:0.750060 best:0.779536 id:335
[432/675] train:0.769345 test:0.769256 best:0.779536 id:335
[433/675] train:0.714181 test:0.712557 b

[554/675] train:0.756393 test:0.756507 best:0.780192 id:476
[555/675] train:0.771558 test:0.771089 best:0.780192 id:476
[556/675] train:0.702814 test:0.701803 best:0.780192 id:476
[557/675] train:0.756450 test:0.756530 best:0.780192 id:476
[558/675] train:0.771633 test:0.771166 best:0.780192 id:476
[559/675] train:0.703809 test:0.702785 best:0.780192 id:476
[560/675] train:0.756828 test:0.756894 best:0.780192 id:476
[561/675] train:0.770694 test:0.770438 best:0.780192 id:476
[562/675] train:0.703757 test:0.702732 best:0.780192 id:476
[563/675] train:0.756811 test:0.756879 best:0.780192 id:476
[564/675] train:0.770694 test:0.770438 best:0.780192 id:476
[565/675] train:0.703758 test:0.702731 best:0.780192 id:476
[566/675] train:0.756814 test:0.756883 best:0.780192 id:476
[567/675] train:0.770693 test:0.770441 best:0.780192 id:476
[568/675] train:0.724437 test:0.722864 best:0.780192 id:476
[569/675] train:0.766856 test:0.764938 best:0.780192 id:476
[570/675] train:0.791079 test:0.780488 b

In [21]:
best_id = 587
list(get_grid_iter(xgb_param_grid)[0])[587]

{'n_estimators': 3000,
 'max_depth': 2,
 'objective': 'binary:logistic',
 'subsample': 1,
 'colsample_bytree': 0.6,
 'learning_rate': 0.1,
 'tree_method': 'gpu_hist',
 'evalmetric': 'auc'}