# RandomizedSearchCV for XGBoost

This is the boilerplate I use for randomized hyperparameter search with XGBoost.

!!! Important: clean the data before using this code. The SMOTE step included here does not replace data cleaning.

In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

import warnings

warnings.filterwarnings("ignore", category=UserWarning)


def balance(X, y):
    """Resample ``(X, y)`` with a default-configured SMOTE.

    Args:
        X: Feature matrix passed to ``SMOTE.fit_resample``.
        y: Target labels passed alongside ``X``.

    Returns:
        The resampled ``(X, y)`` pair produced by ``SMOTE.fit_resample``.
    """
    resampled_features, resampled_labels = SMOTE().fit_resample(X, y)
    return resampled_features, resampled_labels


def timer(start_time=None):
    """Start a stopwatch, or report the time elapsed since it was started.

    Args:
        start_time: ``None`` to start timing, or the ``datetime`` returned by
            an earlier call to report the elapsed time.

    Returns:
        The current ``datetime`` when called without a start time; ``None``
        after printing the elapsed hours, minutes and seconds otherwise.
    """
    # Guard clause: with no start time there is nothing to measure yet.
    if not start_time:
        return datetime.now()

    elapsed_seconds = (datetime.now() - start_time).total_seconds()
    hours, remainder = divmod(elapsed_seconds, 3600)
    minutes, seconds = divmod(remainder, 60)
    print(
        "\n Time taken: %i hours %i minutes and %s seconds."
        % (hours, minutes, round(seconds, 2))
    )

## Parameters of Search

Realistically, I should tune the n_estimators, since it should depend on the feature space.

In [None]:
# Hyperparameter search space: each keyword is an XGBoost parameter name and
# each list holds the candidate values for that parameter.
params = dict(
    n_estimators=list(range(800, 1201, 100)),  # 800, 900, ..., 1200
    eta=[0.1, 0.3, 0.5],
    min_child_weight=[1, 5, 8],
    gamma=[0.6, 1, 2],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.6, 0.8, 1.0],
    max_depth=[5, 8, 10],
)

# Training XGBoost

## Training Setup

Modify this cell to load your training and test data.

In [None]:
# Placeholder: assign the training DataFrame here before running the notebook.
df_train = None

# Placeholder: assign the test DataFrame here. It needs an "id" column, which
# is dropped on the next line.
df_test = None
# Test features: every column of df_test except "id".
# As written this raises AttributeError until df_test is replaced with a DataFrame.
test = df_test.drop(["id"], axis=1)

# Placeholder: names of the df_train columns to predict.
targets = []

In [None]:
# Base estimator for binary classification. It is limited to a single thread
# and keeps XGBoost's own logging silent.
xgb = XGBClassifier(
    nthread=1,
    verbosity=0,
    objective="binary:logistic",
)

In [None]:
folds = 5  # number of stratified cross-validation splits
param_comb = 225  # number of parameter settings sampled per target

# Train on exactly the columns that are scored at prediction time, so the
# fitted model and `test` always agree on the feature set.
# NOTE(review): assumes every column of `test` is also present in df_train.
X = df_train[test.columns]

predictions = {"id": df_test["id"]}

# Run one independent randomized search per target label.
for target in tqdm(targets):
    y_target = df_train[target]

    # Oversample into fresh variables: `X` must stay untouched so that it is
    # still aligned with df_train[target] for the next target.
    # NOTE(review): oversampling before cross-validation puts synthetic rows
    # into the validation folds, so the reported roc_auc is optimistic.
    # Consider resampling inside an imblearn Pipeline instead.
    X_balanced, y_balanced = balance(X, y_target)

    # Fresh splitter per target; passed as an object so the search performs
    # the split itself on the balanced data.
    skf = StratifiedKFold(n_splits=folds, shuffle=True)

    random_search = RandomizedSearchCV(
        xgb,
        param_distributions=params,
        n_iter=param_comb,
        scoring="roc_auc",
        n_jobs=-1,
        cv=skf,
        verbose=0,
    )

    start_time = timer(None)
    random_search.fit(X_balanced, y_balanced)
    timer(start_time)  # report how long the search took
    print(random_search.best_params_)

    # Class probabilities from the refitted best estimator; the second column
    # is selected when the submission frame is built below.
    predictions[target] = random_search.predict_proba(test)
    print(
        f"Finished randomized search and generating predictions for target: {target}"
    )

print("Finished all targets.")

# Assemble the submission: the id column plus one column per target, taken
# from the second column of each stored probability array.
df_pred_pos = pd.DataFrame()
df_pred_pos["id"] = predictions["id"]

for target in targets:
    df_pred_pos[target] = predictions[target][:, 1]

df_pred_pos.to_csv("pred.csv", index=False)

# Optionally Submit to Kaggle

In [None]:
# Competition identifier interpolated into the `-c` option of the submission
# command below; fill it in before enabling the submission.
COMPETITION_NAME = ""

### ////////// Uncomment the following two lines to submit to Kaggle ////////// ###

# command = f"kaggle competitions submit -c {COMPETITION_NAME} -f pred.csv -m 'submission'"
# os.system(command)