In [1]:
import pandas as pd
import numpy as np
from utils import peek
import config
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV
import pickle
import optuna
import xgboost as xgb
from xgboost import XGBClassifier
import warnings
# Silence warnings whose originating module name matches "optuna".
warnings.filterwarnings("ignore", module="optuna")

## Baseline

In [2]:
# Load the baseline training table from the path set in the project config.
# Later cells read its 'order_status_key' (target) and 'fold' columns.
train = pd.read_csv(config.TRAIN)

In [3]:
def train_lr_cv(X, y, param, verbose=True):
    """Cross-validate a logistic regression on the predefined folds 0-4.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table that also carries a ``fold`` column; rows whose
        ``fold`` equals ``k`` form the validation set of fold ``k``.
    y : pandas.Series
        Target values, aligned with ``X`` by index label.
    param : dict
        Keyword arguments forwarded to ``LogisticRegression``.
    verbose : bool, default True
        When true, print the per-fold AUCs and their mean.

    Returns
    -------
    list
        Validation ROC AUC of folds 0 through 4, in that order.
    """
    cvs = []
    for fold in range(5):
        in_valid = X['fold'] == fold
        train_x = X.loc[~in_valid].drop(columns='fold')
        valid_x = X.loc[in_valid].drop(columns='fold')
        # Explicit label-based lookup keeps X and y aligned by index.
        train_y = y.loc[train_x.index]
        valid_y = y.loc[valid_x.index]

        # A fresh estimator per fold guarantees no fitted state is carried
        # from one fold to the next (e.g. if ``warm_start`` is in ``param``).
        clf = LogisticRegression(**param)
        valid_score = clf.fit(train_x, train_y).predict_proba(valid_x)[:, 1]
        cvs.append(roc_auc_score(valid_y, valid_score))

    if verbose:
        # Join the scores first so no separator lands right after the label.
        print("five fold auc:", ", ".join(str(auc) for auc in cvs))
        print("mean auc:", np.mean(cvs))

    return cvs

In [4]:
def train_lr(X, y, param):
    """Fit a logistic regression on all of ``X``/``y`` and return it.

    ``param`` is a dict of keyword arguments for ``LogisticRegression``.
    """
    # ``fit`` returns the estimator itself, so build-and-fit can be chained.
    return LogisticRegression(**param).fit(X, y)

In [5]:
# Baseline logistic-regression settings, scored on the predefined folds.
# The 'fold' column stays in the features because train_lr_cv splits on it.
param = dict(C=0.3, max_iter=20, solver='newton-cg', class_weight='balanced')
cvs = train_lr_cv(
    train.drop(columns='order_status_key'),
    train['order_status_key'],
    param,
)

five fold auc:, 0.7520236612702366, 0.7386064278187566, 0.7496056380266907, 0.7717918728445043, 0.7620043258832011
mean auc: 0.7548063851686778


In [6]:
# Refit the baseline on every row; 'fold' is dropped so it is not a feature.
param = dict(C=0.3, max_iter=20, solver='newton-cg', class_weight='balanced')
model = train_lr(
    train.drop(columns=['order_status_key', 'fold']),
    train['order_status_key'],
    param,
)

In [7]:
# Persist the fitted baseline model. The context manager closes the file
# handle, so the pickle is fully flushed to disk before the cell returns.
with open(config.BASELINE, "wb") as model_file:
    pickle.dump(model, model_file)

## Random Forest

In [8]:
# Load the random-forest training table from the path set in the project config.
# NOTE: this rebinds `train`, replacing the table loaded for the baseline section.
train = pd.read_csv(config.TRAIN2)

In [29]:
# tune using optuna
clf = RandomForestClassifier(random_state=42, class_weight="balanced", criterion='entropy', n_jobs=-1)
param_distributions = {
    'n_estimators':optuna.distributions.IntDistribution(100, 500, log=True),
    'max_depth':optuna.distributions.IntDistribution(5, 10, step=1), 
    'min_samples_split':optuna.distributions.IntDistribution(5, 50, step=5), 
    'min_samples_leaf':optuna.distributions.IntDistribution(1, 10, step=1),
    'max_features':optuna.distributions.FloatDistribution(0.2, 0.6, step=0.1),
    'max_samples':optuna.distributions.FloatDistribution(0.4, 0.7, step=0.1)
}
# Seed the search so the hyper-parameter proposals are repeatable.
# With n_jobs=-1 trials run concurrently, so the order in which results feed
# back into the sampler may still vary between runs.
# NOTE(review): the search uses its own default CV split, not the 'fold'
# column that train_rf_cv evaluates on -- confirm this is intended.
optuna_search = optuna.integration.OptunaSearchCV(
    clf,
    param_distributions,
    n_trials=100,
    scoring="roc_auc",
    n_jobs=-1,
    random_state=42,
)
X, y = train.drop(['order_status_key', 'fold'], axis=1), train['order_status_key']
optuna_search.fit(X, y)
# In-sample predictions on the same rows the search was fitted on; these are
# not a held-out estimate of performance.
y_pred = optuna_search.predict(X)

  optuna_search = optuna.integration.OptunaSearchCV(clf, param_distributions, n_trials=100, scoring="roc_auc", n_jobs=-1)
[32m[I 2023-04-28 17:18:39,791][0m A new study created in memory with name: no-name-0847689d-3989-429d-8e06-65bef752f881[0m
[32m[I 2023-04-28 17:19:15,480][0m Trial 0 finished with value: 0.751346986587073 and parameters: {'n_estimators': 104, 'max_depth': 7, 'min_samples_split': 15, 'min_samples_leaf': 8, 'max_features': 0.5, 'max_samples': 0.7}. Best is trial 0 with value: 0.751346986587073.[0m
[32m[I 2023-04-28 17:19:20,059][0m Trial 8 finished with value: 0.7505822756244529 and parameters: {'n_estimators': 115, 'max_depth': 8, 'min_samples_split': 20, 'min_samples_leaf': 8, 'max_features': 0.4, 'max_samples': 0.4}. Best is trial 0 with value: 0.751346986587073.[0m
[32m[I 2023-04-28 17:19:20,269][0m Trial 11 finished with value: 0.7523077192907761 and parameters: {'n_estimators': 103, 'max_depth': 7, 'min_samples_split': 25, 'min_samples_leaf': 8, 'max

In [30]:
# Show the hyper-parameters of the best trial found by the Optuna search.
optuna_search.best_params_

{'n_estimators': 358,
 'max_depth': 9,
 'min_samples_split': 35,
 'min_samples_leaf': 3,
 'max_features': 0.2,
 'max_samples': 0.4}

In [10]:
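# tune using grid search (disabled: the grid below has 5*5*4*4*3*4 = 4,800 combinations and the recorded run ended in a KeyboardInterrupt)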
# params = {
#     'n_estimators':[30,40,50,100,200],
#     'max_depth':[8,9,10,11,12], 
#     'min_samples_split':[5,10,15,20], 
#     'min_samples_leaf':[1,2,4,6],
#     'max_features':[0.2,0.3,0.4],
#     'max_samples':[0.3,0.4,0.5,0.6]
#             }
# clf = RandomForestClassifier(random_state=42, class_weight="balanced", criterion='entropy', n_jobs=-1, verbose=0)
# grid_search = GridSearchCV(clf, param_grid=params, scoring="roc_auc", n_jobs=-1)
# X, y = train.drop(['order_status_key', 'fold'], axis=1), train['order_status_key']
# grid_search.fit(X, y)

KeyboardInterrupt: 

In [None]:
# grid_search.best_params_

In [11]:
def train_rf_cv(X, y, param, verbose=True):
    """Cross-validate a random forest on the predefined folds 0-4.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table that also carries a ``fold`` column; rows whose
        ``fold`` equals ``k`` form the validation set of fold ``k``.
    y : pandas.Series
        Target values, aligned with ``X`` by index label.
    param : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
    verbose : bool, default True
        When true, print the per-fold AUCs and their mean.

    Returns
    -------
    list
        Validation ROC AUC of folds 0 through 4, in that order.
    """
    cvs = []
    for fold in range(5):
        in_valid = X['fold'] == fold
        train_x = X.loc[~in_valid].drop(columns='fold')
        valid_x = X.loc[in_valid].drop(columns='fold')
        # Explicit label-based lookup keeps X and y aligned by index.
        train_y = y.loc[train_x.index]
        valid_y = y.loc[valid_x.index]

        # A fresh estimator per fold guarantees no fitted state is carried
        # from one fold to the next (e.g. if ``warm_start`` is in ``param``).
        clf = RandomForestClassifier(**param)
        valid_score = clf.fit(train_x, train_y).predict_proba(valid_x)[:, 1]
        cvs.append(roc_auc_score(valid_y, valid_score))

    if verbose:
        # Join the scores first so no separator lands right after the label.
        print("five fold auc:", ", ".join(str(auc) for auc in cvs))
        print("mean auc:", np.mean(cvs))

    return cvs

In [12]:
def train_rf(X, y, param):
    """Fit a random forest on all of ``X``/``y`` and return it.

    ``param`` is a dict of keyword arguments for ``RandomForestClassifier``.
    """
    # ``fit`` returns the estimator itself, so build-and-fit can be chained.
    return RandomForestClassifier(**param).fit(X, y)

In [40]:
# Random-forest settings: the tuned values match the best parameters shown by
# the Optuna search; the remaining keys are the fixed estimator options.
param = dict(
    n_estimators=358,
    criterion='entropy',
    max_depth=9,
    min_samples_split=35,
    min_samples_leaf=3,
    max_features=0.2,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced',
    max_samples=0.4,
)
# param.update(optuna_search.best_params_)
cvs = train_rf_cv(
    train.drop(columns='order_status_key'),
    train['order_status_key'],
    param,
)

five fold auc:, 0.7483475428680908, 0.7357026535108727, 0.7483220872694556, 0.7775678512520618, 0.7565669310261955
mean auc: 0.7533014131853354


In [41]:
# Refit the forest on every row ('fold' is dropped so it is not a feature).
clf = train_rf(train.drop(['order_status_key', 'fold'], axis=1), train['order_status_key'], param)
# The context manager closes the file handle, so the pickle is fully flushed
# to disk before the cell returns.
with open(config.RF, "wb") as model_file:
    pickle.dump(clf, model_file)

## XGBoost

In [4]:
# Columns handed to XGBoost as pandas categoricals (the DMatrix calls below
# pass enable_categorical=True).
cat_features = ['gender', 'degree', 'card_num']
# Load the XGBoost training table and cast the categorical columns in one step.
train = pd.read_csv(config.TRAIN3).astype({col: 'category' for col in cat_features})

In [None]:
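# Pasted best-trial result from a separate study run (not the run logged
# below); these values are the ones hard-coded in the final training cell.
#   Value: 0.7541475985895597
#   Params:
#     grow_policy: lossguide
#     max_depth: 5
#     eta: 2.2381136185393096e-05
#     min_child_weight: 28
#     subsample: 0.8
#     colsample_bytree: 0.4
#     scale_pos_weight: 6.0
#     max_leaves: 9
#     max_bin: 130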

In [8]:
def objective(trial):
    """Optuna objective: mean validation AUC of XGBoost over folds 0-4.

    Draws one hyper-parameter set from ``trial``, trains a booster on each
    train/validation split of the global ``train`` frame (split by its
    ``fold`` column) and returns the mean of the five validation AUCs.
    """
    frame = train.copy()
    target = 'order_status_key'
    booster_params = {
        'booster': 'gbtree',
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        'grow_policy': 'lossguide',
        'max_depth': trial.suggest_int('max_depth', 3, 10, step=1),
        'eta': trial.suggest_float('eta', 1e-6, 1e-3, log=True),
        'min_child_weight': trial.suggest_int('min_child_weight', 2, 40, step=2),
        'subsample': trial.suggest_float('subsample', 0.3, 1, step=0.1),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.2, 0.7, step=0.1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 1, 10, step=0.5),
        'max_leaves': trial.suggest_int('max_leaves', 2, 12, step=1),
        'max_bin': trial.suggest_int('max_bin', 100, 300, step=10),
    }

    # Pruning hook watching the AUC of the eval set named 'validation'.
    # NOTE(review): the same callback (and trial) is reused for all five
    # folds -- confirm the reported steps behave as intended across folds.
    prune_cb = optuna.integration.XGBoostPruningCallback(trial, "validation-auc")

    def _fold_auc(fold):
        # Rows outside the fold train the booster; rows inside validate it.
        fit_part = frame.query(f"fold != {fold}")
        held_out = frame.query(f"fold == {fold}")
        dtrain = xgb.DMatrix(
            fit_part.drop(columns=['fold', target]),
            label=fit_part[target],
            enable_categorical=True,
        )
        dvalid = xgb.DMatrix(
            held_out.drop(columns=['fold', target]),
            label=held_out[target],
            enable_categorical=True,
        )
        booster = xgb.train(
            booster_params,
            dtrain,
            num_boost_round=200,
            evals=[(dvalid, 'validation')],
            early_stopping_rounds=10,
            verbose_eval=0,
            callbacks=[prune_cb],
        )
        return roc_auc_score(held_out[target], booster.predict(dvalid))

    return np.mean([_fold_auc(fold) for fold in range(5)])


if __name__ == "__main__":
    # Seed the sampler so repeated runs propose the same sequence of trials.
    # TPESampler is the sampler create_study uses by default, so only the
    # seed changes here.
    study = optuna.create_study(
        sampler=optuna.samplers.TPESampler(seed=42),
        pruner=optuna.pruners.MedianPruner(n_warmup_steps=10),
        direction="maximize",
    )
    study.optimize(objective, n_trials=100)

    print("Number of finished trials: {}".format(len(study.trials)))

    # Report the best trial's score and the hyper-parameters that produced it.
    print("Best trial:")
    trial = study.best_trial

    print("  Value: {}".format(trial.value))

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

[32m[I 2023-05-04 15:23:38,302][0m A new study created in memory with name: no-name-e8efc702-f444-46eb-8aa4-f83faa695436[0m
[32m[I 2023-05-04 15:23:39,036][0m Trial 0 finished with value: 0.7466479764461019 and parameters: {'max_depth': 5, 'eta': 0.0002367492595841048, 'min_child_weight': 2, 'subsample': 0.3, 'colsample_bytree': 0.7, 'scale_pos_weight': 2.5, 'max_leaves': 12, 'max_bin': 170}. Best is trial 0 with value: 0.7466479764461019.[0m
[32m[I 2023-05-04 15:23:39,956][0m Trial 1 finished with value: 0.7516017900428498 and parameters: {'max_depth': 6, 'eta': 9.430384174287375e-05, 'min_child_weight': 24, 'subsample': 0.7, 'colsample_bytree': 0.30000000000000004, 'scale_pos_weight': 9.5, 'max_leaves': 2, 'max_bin': 100}. Best is trial 1 with value: 0.7516017900428498.[0m
[32m[I 2023-05-04 15:23:40,462][0m Trial 2 finished with value: 0.7482624902268016 and parameters: {'max_depth': 4, 'eta': 0.0004944493395366299, 'min_child_weight': 20, 'subsample': 0.6000000000000001, 

Number of finished trials: 100
Best trial:
  Value: 0.7532627244847865
  Params: 
    max_depth: 8
    eta: 0.00017987652249675388
    min_child_weight: 28
    subsample: 0.8
    colsample_bytree: 0.4
    scale_pos_weight: 6.0
    max_leaves: 8
    max_bin: 250


In [None]:
# Final XGBoost fit on every row, using the pasted best-trial parameters.
# 'booster' and 'objective' repeat the fixed settings of the tuning objective;
# without 'objective' the final model would not be trained with the
# binary:logistic objective that was tuned.
param = {'booster': 'gbtree',
        'objective': 'binary:logistic',
        'grow_policy': 'lossguide',
        'max_depth': 5,
        'eta': 2.2381136185393096e-05,
        'min_child_weight': 28,
        'subsample': 0.8,
        'colsample_bytree': 0.4,
        'scale_pos_weight': 6.0,
        'max_leaves': 9,
        'max_bin': 130}
train_x, train_y = train.drop(['order_status_key', 'fold'], axis=1), train['order_status_key']
dtrain = xgb.DMatrix(train_x, label=train_y, enable_categorical=True)
# No early stopping here: it needs a validation set passed via `evals`, and
# all rows are used for training in this final fit.
clf = xgb.train(param, dtrain, num_boost_round=200)
