In [2]:
import pandas as pd
import numpy as np
from utils import peek
import config
import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, learning_curve, LearningCurveDisplay, StratifiedKFold
import pickle
import optuna
import xgboost as xgb
from xgboost import XGBClassifier
import lightgbm as lgbm
from catboost import CatBoostClassifier, Pool
import warnings
import matplotlib.pyplot as plt
warnings.filterwarnings("ignore", module="optuna")
warnings.filterwarnings("ignore", module="lightgbm")

## Baseline

In [3]:
# Load the baseline training table from the path configured as config.TRAIN.
train = pd.read_csv(config.TRAIN)

In [4]:
def train_lr_cv(X, y, param, verbose=True, n_folds=5):
    """Cross-validate a logistic regression on the precomputed ``fold`` column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table that still contains the ``fold`` column. The column is
        used only to split the rows and is dropped before fitting.
    y : pandas.Series
        Target labels, indexed like ``X``.
    param : dict
        Keyword arguments forwarded to ``LogisticRegression``.
    verbose : bool, default True
        When true, print the per-fold AUCs and their mean.
    n_folds : int, default 5
        Number of folds; fold ids ``0 .. n_folds - 1`` are each held out once.

    Returns
    -------
    list of float
        Validation ROC AUC of every fold, in fold order.
    """
    clf = LogisticRegression(**param)
    cvs = []
    for fold in range(n_folds):
        held_out = X['fold'] == fold
        train_x = X.loc[~held_out].drop(columns='fold')
        valid_x = X.loc[held_out].drop(columns='fold')
        # .loc makes the label-based alignment between X and y explicit.
        train_y = y.loc[train_x.index]
        valid_y = y.loc[valid_x.index]
        # The same estimator object is re-fitted on each fold's training rows.
        valid_score = clf.fit(train_x, train_y).predict_proba(valid_x)[:, 1]
        cvs.append(roc_auc_score(valid_y, valid_score))

    if verbose:
        print("five fold auc:", *cvs, sep=", ")
        print("mean auc:", np.mean(cvs))

    return cvs

In [5]:
def train_lr(X, y, param):
    """Build a ``LogisticRegression`` from ``param``, fit it on ``(X, y)`` and return it."""
    # fit() hands back the estimator itself, so build-and-fit is one expression.
    return LogisticRegression(**param).fit(X, y)

In [14]:
# Cross-validate the baseline logistic regression on the precomputed folds.
param = dict(C=1, max_iter=30, solver='newton-cg', class_weight='balanced')
cvs = train_lr_cv(train.drop(columns='order_status_key'), train['order_status_key'], param)

five fold auc:, 0.7512034198678033, 0.7362594597183638, 0.749833558254611, 0.7793312340680761, 0.7639389569814949
mean auc: 0.7561133257780698


In [15]:
# Fit the baseline on every training row; the fold column is dropped with the target.
# NOTE(review): max_iter is 20 here, whereas the cross-validation cell above uses 30,
# so the reported CV AUC was not measured with exactly this configuration — confirm.
param = dict(C=1, max_iter=20, solver='newton-cg', class_weight='balanced')
model = train_lr(train.drop(columns=['order_status_key', 'fold']), train['order_status_key'], param)

In [16]:
# Persist the fitted baseline to config.BASELINE. The context manager closes the
# file handle deterministically instead of leaving it to garbage collection.
with open(config.BASELINE, "wb") as fh:
    pickle.dump(model, fh)

## Random Forest

In [17]:
# Rebind `train` to the table at config.TRAIN2; the Random Forest cells below read it.
train = pd.read_csv(config.TRAIN2)

In [18]:
# Tune the random forest with Optuna's scikit-learn style search wrapper.
clf = RandomForestClassifier(
    random_state=42,
    class_weight="balanced",
    criterion='entropy',
    n_jobs=-1,
)
# Search space: one Optuna distribution per tuned hyper-parameter.
param_distributions = {
    'n_estimators': optuna.distributions.IntDistribution(low=100, high=500, log=True),
    'max_depth': optuna.distributions.IntDistribution(low=5, high=10, step=1),
    'min_samples_split': optuna.distributions.IntDistribution(low=5, high=50, step=5),
    'min_samples_leaf': optuna.distributions.IntDistribution(low=1, high=10, step=1),
    'max_features': optuna.distributions.FloatDistribution(low=0.2, high=0.6, step=0.1),
    'max_samples': optuna.distributions.FloatDistribution(low=0.4, high=0.7, step=0.1),
}
# NOTE(review): no `cv` argument is passed, so the search relies on OptunaSearchCV's
# default splitting rather than the precomputed `fold` column used by train_rf_cv —
# confirm that this is intended.
optuna_search = optuna.integration.OptunaSearchCV(
    clf,
    param_distributions,
    n_trials=100,
    scoring="roc_auc",
    n_jobs=-1,
)
X = train.drop(columns=['order_status_key', 'fold'])
y = train['order_status_key']
optuna_search.fit(X, y)
# Predictions on the training rows themselves (in-sample).
y_pred = optuna_search.predict(X)

  optuna_search = optuna.integration.OptunaSearchCV(clf, param_distributions, n_trials=100, scoring="roc_auc", n_jobs=-1)
[32m[I 2023-05-10 11:47:25,605][0m A new study created in memory with name: no-name-5a398565-1abb-4ea7-b981-818c061e1ba4[0m
[32m[I 2023-05-10 11:48:13,255][0m Trial 23 finished with value: 0.7510565192904775 and parameters: {'n_estimators': 117, 'max_depth': 6, 'min_samples_split': 40, 'min_samples_leaf': 10, 'max_features': 0.4, 'max_samples': 0.4}. Best is trial 23 with value: 0.7510565192904775.[0m
[32m[I 2023-05-10 11:48:13,744][0m Trial 12 finished with value: 0.7538933504344463 and parameters: {'n_estimators': 106, 'max_depth': 7, 'min_samples_split': 35, 'min_samples_leaf': 1, 'max_features': 0.2, 'max_samples': 0.7}. Best is trial 12 with value: 0.7538933504344463.[0m
[32m[I 2023-05-10 11:48:13,981][0m Trial 34 finished with value: 0.7538975469108851 and parameters: {'n_estimators': 106, 'max_depth': 10, 'min_samples_split': 35, 'min_samples_leaf'

In [19]:
# Display the best hyper-parameters found by the search.
optuna_search.best_params_

{'n_estimators': 272,
 'max_depth': 8,
 'min_samples_split': 5,
 'min_samples_leaf': 3,
 'max_features': 0.30000000000000004,
 'max_samples': 0.5}

In [21]:
def train_rf_cv(X, y, param, verbose=True, n_folds=5):
    """Cross-validate a random forest on the precomputed ``fold`` column.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table that still contains the ``fold`` column. The column is
        used only to split the rows and is dropped before fitting.
    y : pandas.Series
        Target labels, indexed like ``X``.
    param : dict
        Keyword arguments forwarded to ``RandomForestClassifier``.
    verbose : bool, default True
        When true, print the per-fold AUCs and their mean.
    n_folds : int, default 5
        Number of folds; fold ids ``0 .. n_folds - 1`` are each held out once.

    Returns
    -------
    list of float
        Validation ROC AUC of every fold, in fold order.
    """
    clf = RandomForestClassifier(**param)
    cvs = []
    for fold in range(n_folds):
        held_out = X['fold'] == fold
        train_x = X.loc[~held_out].drop(columns='fold')
        valid_x = X.loc[held_out].drop(columns='fold')
        # .loc makes the label-based alignment between X and y explicit.
        train_y = y.loc[train_x.index]
        valid_y = y.loc[valid_x.index]
        # The same estimator object is re-fitted on each fold's training rows.
        valid_score = clf.fit(train_x, train_y).predict_proba(valid_x)[:, 1]
        cvs.append(roc_auc_score(valid_y, valid_score))

    if verbose:
        print("five fold auc:", *cvs, sep=", ")
        print("mean auc:", np.mean(cvs))

    return cvs

In [22]:
def train_rf(X, y, param):
    """Build a ``RandomForestClassifier`` from ``param``, fit it on ``(X, y)`` and return it."""
    # fit() hands back the estimator itself, so build-and-fit is one expression.
    return RandomForestClassifier(**param).fit(X, y)

In [23]:
# Random-forest configuration: the tuned values mirror the best_params_ displayed
# above (max_features written as 0.3), plus the fixed estimator settings.
param = dict(
    n_estimators=272,
    criterion='entropy',
    max_depth=8,
    min_samples_split=5,
    min_samples_leaf=3,
    max_features=0.3,
    n_jobs=-1,
    random_state=42,
    class_weight='balanced',
    max_samples=0.5,
)
# Alternative kept for reference: param.update(optuna_search.best_params_)
cvs = train_rf_cv(train.drop(columns='order_status_key'), train['order_status_key'], param)

five fold auc:, 0.7496287958616725, 0.7326432129514321, 0.7443634727845254, 0.7819523166891588, 0.7558940158615717
mean auc: 0.7528963628296721


In [24]:
# Refit the forest on every training row and pickle it to config.RF.
clf = train_rf(train.drop(['order_status_key', 'fold'], axis=1), train['order_status_key'], param)
# The context manager closes the file handle deterministically instead of leaking it.
with open(config.RF, "wb") as fh:
    pickle.dump(clf, fh)

## XGBoost

In [None]:
# Load the table at config.TRAIN3 and cast the discrete columns to pandas
# 'category' dtype (the XGBoost cells below set enable_categorical=True).
cat_features = ['gender', 'degree', 'card_num']
train = pd.read_csv(config.TRAIN3).astype({col: 'category' for col in cat_features})

In [None]:
def objective(trial):
    """Return the mean 5-fold validation ROC AUC of an XGBoost configuration.

    NOTE(review): ``trial`` is never asked to suggest a value — every entry of
    ``param`` is a constant — so each trial of a study evaluates the same
    configuration. Presumably the search space was replaced by its best values
    after tuning; confirm before re-running the study.
    """
    data = train.copy()
    # Options that were previously tried and are currently disabled:
    # eval_metric, early_stopping_rounds and the Optuna XGBoost pruning callback.
    param = dict(
        booster='gbtree',
        objective='binary:logistic',
        grow_policy='lossguide',
        n_estimators=88,
        max_leaves=28,
        max_depth=4,
        eta=0.00039413682468496394,
        min_child_weight=20,
        subsample=0.7,
        colsample_bytree=0.4,
        scale_pos_weight=6.0,
        max_bin=140,
        random_state=42,
        tree_method='approx',
        enable_categorical=True,
    )

    # Separate features and target once; the fold id only selects rows.
    features = data.drop(columns=['fold', 'order_status_key'])
    labels = data['order_status_key']

    fold_aucs = []
    for held_out in range(5):
        in_valid = data['fold'] == held_out
        booster = XGBClassifier(**param).fit(features[~in_valid], labels[~in_valid], verbose=False)
        valid_proba = booster.predict_proba(features[in_valid])[:, 1]
        fold_aucs.append(roc_auc_score(labels[in_valid], valid_proba))
    return np.mean(fold_aucs)


if __name__ == "__main__":
    # Maximise the mean CV AUC returned by objective(); no pruner is configured
    # (a MedianPruner was tried earlier and is disabled).
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    print(f"Number of finished trials: {len(study.trials)}")

    print("Best trial:")
    trial = study.best_trial

    print(f"  Value: {trial.value}")

    print("  Params: ")
    for key, value in trial.params.items():
        print(f"    {key}: {value}")

In [None]:
# Final XGBoost model, fitted on every training row (fold column dropped).
# NOTE(review): subsample (0.8) and scale_pos_weight (9) differ from the values
# used in objective() above (0.7 and 6.0) — confirm which pair was validated.
param = dict(
    booster='gbtree',
    objective='binary:logistic',
    grow_policy='lossguide',
    n_estimators=88,
    max_leaves=28,
    max_depth=4,
    eta=0.00039413682468496394,
    min_child_weight=20,
    subsample=0.8,
    colsample_bytree=0.4,
    scale_pos_weight=9,
    max_bin=140,
    random_state=42,
    tree_method='approx',
    enable_categorical=True,
)
train_x = train.drop(columns=['order_status_key', 'fold'])
train_y = train['order_status_key']
clf = XGBClassifier(**param).fit(train_x, train_y)

In [None]:
# Write the fitted XGBoost model to the path configured as config.XGBOOST.
clf.save_model(config.XGBOOST)

## Catboost

In [None]:
# Reload the table at config.TRAIN3 and cast the discrete columns to pandas
# 'category' dtype; the same column names are passed to CatBoost's Pool below.
cat_features = ['gender', 'degree', 'card_num']
train = pd.read_csv(config.TRAIN3).astype({col: 'category' for col in cat_features})

In [None]:
Number of finished trials: 100
value: 0.7557021810734862 and parameters: {'iterations': 220, 'learning_rate': 0.02200043117064521}

In [None]:
def objective(trial):
    """Optuna objective: mean 5-fold validation ROC AUC of a CatBoost classifier.

    ``iterations``, ``learning_rate``, ``depth``, ``max_leaves`` and
    ``scale_pos_weight`` are sampled from ``trial``; the other settings are fixed.
    """
    cat_cols = ['gender', 'degree', 'card_num']
    data = train.copy()
    # Previously explored but currently disabled: bagging_temperature,
    # sampling_frequency, min_data_in_leaf and border_count.
    param_distribution = {
        'loss_function': 'Logloss',
        'iterations': trial.suggest_int('iterations', 200, 250, step=5),
        'learning_rate': trial.suggest_float('learning_rate', 1e-2, 0.05, log=True),
        'subsample': 0.7,
        'depth': trial.suggest_int('depth', 5, 10, step=1),
        'grow_policy': 'Lossguide',
        'max_leaves': trial.suggest_int('max_leaves', 15, 20, step=1),
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 20, log=True),
        'random_seed': 42,
        'verbose': False,
    }

    # One classifier object is created per trial and re-fitted on every fold.
    clf = CatBoostClassifier(**param_distribution)
    features = data.drop(columns=['fold', 'order_status_key'])
    labels = data['order_status_key']

    cvs = []
    for fold in range(5):
        in_valid = data['fold'] == fold
        train_data = Pool(features[~in_valid], labels[~in_valid], cat_features=cat_cols)
        valid_data = Pool(features[in_valid], labels[in_valid], cat_features=cat_cols)
        valid_score = clf.fit(train_data).predict_proba(valid_data)[:, 1]
        cvs.append(roc_auc_score(labels[in_valid], valid_score))

    return np.mean(cvs)


# Maximise the mean CV AUC returned by objective(); no pruner is configured
# (a MedianPruner was tried earlier and is disabled).
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=100)

print(f"Number of finished trials: {len(study.trials)}")

print("Best trial:")
trial = study.best_trial

print(f"  Value: {trial.value}")

print("  Params: ")
for key, value in trial.params.items():
    print(f"    {key}: {value}")

In [None]:
# Final CatBoost model, fitted on every training row and saved to config.CATBOOST.
# random_seed is set to 42 to match the tuning objective (and the seeds used by the
# other final models in this notebook), so the saved model is trained under the
# same seeding as the cross-validated configuration.
param = {
    'loss_function': 'Logloss',
    'iterations': 225,
    'learning_rate': 0.020248583615876215,
    'depth': 7,
    'max_leaves': 17,
    'scale_pos_weight': 1.0350162147070745,
    'subsample': 0.7,
    'grow_policy': 'Lossguide',
    'random_seed': 42,
    'verbose': False,
}
train_x, train_y = train.drop(['order_status_key', 'fold'], axis=1), train['order_status_key']
train_data = Pool(train_x, train_y, cat_features=['gender', 'degree', 'card_num'])
clf = CatBoostClassifier(**param).fit(train_data)
clf.save_model(config.CATBOOST)

## LightGBM

In [26]:
# Reload the table at config.TRAIN3 for LightGBM. Unlike the XGBoost and CatBoost
# sections, no 'category' cast is applied here; the categorical columns are
# declared to LightGBM by name in the cells below.
train = pd.read_csv(config.TRAIN3)

In [None]:
Best trial
Avg of five fold auc:0.757285
Params
num_iterations:995
learning_rate:0.00471212286205368
num_leaves:5
max_depth:11
min_data_in_leaft:7
max_int:405
min_data_in_bin:19
bagging_fraction:0.6000000000000001
neg_bagging_fraction:0.3693772602188242
feature_fraction:0.3

In [34]:
def objective(trial):
    """Optuna objective: mean 5-fold validation ROC AUC of a LightGBM booster.

    Hyper-parameters are sampled from ``trial``; ``feature_fraction``,
    ``linear_tree``, ``force_col_wise``, ``verbosity`` and ``seed`` are fixed.
    Rows are split with the precomputed ``fold`` column of the global ``train``.
    """
    data = train.copy()
    cat_cols = ['gender', 'card_num', 'degree']

    # The trial names 'min_data_in_leaft' and 'max_int' are misspelled but kept
    # unchanged so that results stay comparable with the trials already recorded.
    # NOTE(review): bagging_fraction / neg_bagging_fraction are sampled, but no
    # bagging_freq is set. LightGBM's documentation says bagging is only enabled
    # when bagging_freq is non-zero, so these two may have no effect — confirm.
    param_distribution = {
        'objective': 'binary',
        'num_iterations': trial.suggest_int('num_iterations', 800, 1200, log=True),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.05, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 3, 10, step=1),
        'max_depth': trial.suggest_int('max_depth', 8, 15, step=1),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaft', 5, 8, step=1),
        'max_bin': trial.suggest_int('max_int', 370, 430, step=5),
        'min_data_in_bin': trial.suggest_int('min_data_in_bin', 17, 22, step=1),
        'bagging_fraction': trial.suggest_float('bagging_fraction', 0.4, 1, step=0.1),
        'neg_bagging_fraction': trial.suggest_float('neg_bagging_fraction', 0.1, 1, log=True),
        # 'cat_smooth': trial.suggest_float('cat_smooth', 1, 1e3, log=True),
        'feature_fraction': 0.3,
        'verbosity': 0,
        'force_col_wise': True,
        'linear_tree': True,
        'seed': 42
    }

    aucs = []
    for fold in range(0, 5):
        train_fold = data.query(f'fold != {fold}')
        valid_fold = data.query(f'fold == {fold}')
        train_x = train_fold.drop(columns=['fold', 'order_status_key'])
        train_y = train_fold['order_status_key']
        valid_x = valid_fold.drop(columns=['fold', 'order_status_key'])
        valid_y = valid_fold['order_status_key']
        # Categorical columns are declared on the Dataset: the categorical_feature
        # argument of lgbm.train() is deprecated and absent from newer releases.
        dtrain = lgbm.Dataset(train_x, label=train_y, categorical_feature=cat_cols)
        clf = lgbm.train(param_distribution, dtrain)
        valid_score = clf.predict(valid_x)
        aucs.append(roc_auc_score(valid_y, valid_score))

    return np.mean(aucs)


# Run the LightGBM study (maximising the mean CV AUC) and report the best trial.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=1000)

print("Best trial")
trial = study.best_trial

print("Avg of five fold auc:" + format(trial.value, ".6f"))

print("Params")
for key, value in trial.params.items():
    print(key, value, sep=":")

[32m[I 2023-05-10 12:53:05,950][0m A new study created in memory with name: no-name-b9002735-bed3-4dd5-acde-4ae024b1b14e[0m
[32m[I 2023-05-10 12:53:09,614][0m Trial 0 finished with value: 0.7519011817843829 and parameters: {'num_iterations': 873, 'learning_rate': 2.1057936655038092e-05, 'num_leaves': 8, 'max_depth': 11, 'min_data_in_leaft': 6, 'max_int': 375, 'min_data_in_bin': 18, 'bagging_fraction': 0.8, 'neg_bagging_fraction': 0.44267734142050535}. Best is trial 0 with value: 0.7519011817843829.[0m
[32m[I 2023-05-10 12:53:12,997][0m Trial 1 finished with value: 0.7542895972088474 and parameters: {'num_iterations': 1109, 'learning_rate': 0.010319594606706875, 'num_leaves': 4, 'max_depth': 12, 'min_data_in_leaft': 8, 'max_int': 380, 'min_data_in_bin': 19, 'bagging_fraction': 0.8, 'neg_bagging_fraction': 0.9743869850194536}. Best is trial 1 with value: 0.7542895972088474.[0m
[32m[I 2023-05-10 12:53:17,997][0m Trial 2 finished with value: 0.7503311222143234 and parameters: {'

Best trial
Avg of five fold auc:0.757862
Params
num_iterations:953
learning_rate:0.004580578839323919
num_leaves:6
max_depth:11
min_data_in_leaft:6
max_int:380
min_data_in_bin:22
bagging_fraction:0.6000000000000001
neg_bagging_fraction:0.20589882762815556


In [38]:
# Final LightGBM model, fitted on every training row and saved to config.LIGHTGBM.
# The 'max_depth' key previously carried a leading space (' max_depth'), so it did
# not match the real parameter name; it is spelled correctly here.
# NOTE(review): bagging_fraction / neg_bagging_fraction are set without bagging_freq.
# LightGBM's documentation says bagging is only enabled when bagging_freq is
# non-zero, so these two may have no effect — confirm.
param = {
    'num_iterations': 995,
    'learning_rate': 0.00471212286205368,
    'num_leaves': 5,
    'max_depth': 11,
    'min_data_in_leaf': 7,
    'max_bin': 405,
    'min_data_in_bin': 19,
    'objective': 'binary',
    'feature_fraction': 0.3,
    'verbosity': 0,
    'force_col_wise': True,
    'bagging_fraction': 0.6,
    'neg_bagging_fraction': 0.3693772602188242,
    'linear_tree': True,
    'seed': 42,
}
train_x, train_y = train.drop(['order_status_key', 'fold'], axis=1), train['order_status_key']
# Categorical columns are declared on the Dataset: the categorical_feature argument
# of lgbm.train() is deprecated and absent from newer LightGBM releases.
dtrain = lgbm.Dataset(train_x, label=train_y, categorical_feature=['degree', 'card_num', 'gender'])
clf = lgbm.train(param, dtrain)
clf.save_model(config.LIGHTGBM)

<lightgbm.basic.Booster at 0x7f6bb472ceb0>

## learning curve

In [None]:
# Set up learning curves for the logistic-regression baseline and the random forest.
train = pd.read_csv(config.TRAIN)
train_x, train_y = train.drop(['order_status_key', 'fold'], axis=1), train['order_status_key']
lr_param = {'C':1, 'max_iter':20, 'solver':'newton-cg', 'class_weight':'balanced'}
# NOTE(review): these forest settings differ from the configuration pickled in the
# Random Forest section, and the data comes from config.TRAIN rather than
# config.TRAIN2 — confirm that this is intended.
rf_param = {'n_estimators':358, 'criterion':'entropy', 'max_depth':9, 'min_samples_split':35, 'min_samples_leaf':3,
            'max_features':0.2, 'n_jobs':-1, 'random_state':42, 'class_weight':'balanced', 'max_samples':0.4}
lr = LogisticRegression(**lr_param)
rf = RandomForestClassifier(**rf_param)
common_params = {
    "X": train_x,
    "y": train_y,
    "train_sizes": np.linspace(0.1, 1.0, num=10),
    "cv": StratifiedKFold(n_splits=5, shuffle=True, random_state=42),
    # `score_name` only labels the y-axis; `scoring` is what selects the metric.
    # Without it the estimator's default score (accuracy for classifiers) would
    # be plotted under a "roc_auc" label.
    "scoring": "roc_auc",
    "score_type": "both",
    "n_jobs": 4,
    "line_kw": {"marker": "o"},
    "std_display_style": "fill_between",
    "score_name": "roc_auc",
}


In [None]:
# Draw one learning-curve panel per estimator, side by side.
fig, ax = plt.subplots(nrows=1, ncols=2, figsize=(20, 6))
for ax_idx, estimator in enumerate((lr, rf)):
    LearningCurveDisplay.from_estimator(estimator, ax=ax[ax_idx], **common_params)
    ax[ax_idx].set_title("learning curve for " + type(estimator).__name__)

通过分析学习曲线可知，增加训练样本不能改善模型性能，下一步的精力应该放在改善模型、特征工程或收集更多特征上。