In [1]:
from google.colab import drive

In [2]:
# Mount Google Drive at /content/drive; later cells read and write under /content/drive/MyDrive/Amex/.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# List the contents of the competition working directory on the mounted Drive.
!ls /content/drive/MyDrive/Amex/

Models	Predictions	 test.parquet	   train_labels.csv
OOF	test_fe.parquet  train_fe.parquet  train.parquet


In [4]:
!pip3 install optuna catboost

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Preprocessing

In [5]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb
from itertools import combinations

# ====================================================
# Configurations
# ====================================================
class CFG:
    """Notebook-wide settings, read as class attributes (the class is never instantiated here)."""

    # Directory holding the parquet inputs; read_data() loads from here.
    input_dir: str = "/content/drive/MyDrive/Amex/"
    # Name of the label column in the training frame.
    target: str = "target"

    # Cross-validation layout and the seed used for seeding and fold shuffling.
    n_folds: int = 5
    seed: int = 42  # alternative seeds previously tried: 52, 62

    # Tag embedded in the saved model / prediction file names.
    boosting_type: str = "dart"
    # Not referenced by the code visible in this notebook (an alternative value tried was None).
    metric: str = "binary_logloss"

# ====================================================
# Seed everything
# ====================================================
# 実験用に乱数seedを固定する
def seed_everything(seed):
    """Seed the global random number generators used by this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global RNG, and stores
    the seed in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: integer seed applied to every generator.
    """
    # NOTE: PYTHONHASHSEED is read at interpreter start-up, so setting it here only
    # affects processes launched afterwards, not the already-running kernel.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for apply_seed in (random.seed, np.random.seed):
        apply_seed(seed)

# ====================================================
# Read data
# ====================================================
# 保存した素性付きデータを読み込む
def read_data():
    """Load the saved feature-engineered tables from ``CFG.input_dir``.

    Returns:
        tuple: ``(train, test)`` DataFrames read from ``train_fe.parquet`` and
        ``test_fe.parquet`` respectively.
    """
    train, test = (
        pd.read_parquet(CFG.input_dir + file_name)
        for file_name in ('train_fe.parquet', 'test_fe.parquet')
    )
    return train, test

# ====================================================
# Amex metric
# ====================================================
def amex_metric(y_true, y_pred):
    """Compute the competition score: mean of a normalized weighted Gini and a top-4% capture rate.

    Rows with label 0 get weight 20 and all other rows weight 1.

    Args:
        y_true: 1-D array-like of labels.
        y_pred: 1-D array-like of predicted scores, same length as ``y_true``.

    Returns:
        float: ``0.5 * (gini_of_predictions / gini_of_perfect_ordering + top_four)``.
    """
    def _ranked_by(column):
        # Fresh (label, prediction) table sorted in descending order of the chosen column.
        table = np.transpose(np.array([y_true, y_pred]))
        return table[table[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        ranked = _ranked_by(column)
        weight = np.where(ranked[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ranked[:, 0] * weight)
        cum_pos_found = np.cumsum(ranked[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    # Share of all positives found among the highest-scored rows that make up
    # (at most) 4% of the total weight.
    by_prediction = _ranked_by(1)
    row_weights = np.where(by_prediction[:, 0] == 0, 20, 1)
    weight_budget = int(0.04 * np.sum(row_weights))
    captured = by_prediction[np.cumsum(row_weights) <= weight_budget]
    top_four = np.sum(captured[:, 0]) / np.sum(by_prediction[:, 0])

    # Gini of the model ordering, normalized by the Gini of ordering by the labels themselves.
    gini_model = _weighted_gini(1)
    gini_ideal = _weighted_gini(0)
    return 0.5 * (gini_model / gini_ideal + top_four)

# ====================================================
# XGBoost amex metric
# ====================================================
def xgb_amex_metric(y_pred: np.ndarray, dtrain: xgb.DMatrix):
    """Custom-metric adapter: score ``y_pred`` against the labels stored in ``dtrain``.

    Args:
        y_pred: predicted scores for the rows of ``dtrain``.
        dtrain: DMatrix whose labels are read with ``get_label()``.

    Returns:
        tuple: ``('amex_metric', score)`` where ``score`` comes from ``amex_metric``.
    """
    score = amex_metric(dtrain.get_label(), y_pred)
    return 'amex_metric', score

In [6]:
# Seed the RNGs with the configured seed, then load the feature-engineered train/test frames.
seed_everything(CFG.seed)
train, test = read_data()

In [7]:
# Features whose importance was zero; they are dropped from the frames before training.
IGNORE_COL = [
    'B_31_max',
    'D_108_min', 'D_111_diff1', 'D_111_min', 'D_123_min', 'D_127_min', 'D_137_min',
    'D_138_min', 'D_65_min', 'D_86_diff1', 'D_88_diff1', 'D_92_min', 'D_96_min',
    'R_10_min', 'R_13_min', 'R_14_min', 'R_15_min', 'R_17_min', 'R_19_diff1',
    'R_20_min', 'R_21_min', 'R_22_min', 'R_24_min', 'R_25_last', 'R_25_min',
    'R_26_diff1', 'R_4_min', 'R_5_min', 'R_7_min', 'R_8_min',
    'S_20_min',
]

# Optuna

In [15]:

# Feature preparation for the Optuna search.
# This cell modifies the notebook-level `train` / `test` frames and defines `features`.

# Base names of the categorical raw features.
cat_features = [
    "B_30",
    "B_38",
    "D_114",
    "D_116",
    "D_117",
    "D_120",
    "D_126",
    "D_63",
    "D_64",
    "D_66",
    "D_68"
]

# The `_last` aggregate of a categorical feature is itself categorical, so those are the
# columns that get the `category` dtype (they are later handed to CatBoost as cat_features).
cat_features = [f"{cf}_last" for cf in cat_features]
for cat_col in cat_features:
    # Missing values become their own 'nanstr' level. No label encoding is applied.
    train[cat_col] = train[cat_col].fillna('nanstr').astype('category')
    test[cat_col] = test[cat_col].fillna('nanstr').astype('category')

# Rounded (`_round2`) copies of the `_last` features are intentionally not generated here.

# Add `<base>_last_mean_diff` = last - mean for every `<base>_last` column that has a
# matching `<base>_mean` column.
last_cols = [col for col in train.columns if 'last' in col and 'round' not in col]
for last_col in last_cols:
    base = last_col[:-5]  # strip the trailing '_last'
    try:
        train[f'{base}_last_mean_diff'] = train[f'{base}_last'] - train[f'{base}_mean']
    except (KeyError, TypeError):
        # KeyError: no matching `_last` / `_mean` pair for this base name.
        # TypeError: the column is not numeric (e.g. a categorical `_last`).
        continue

# Downcast float32/float64 columns to float16 to save memory.
float_cols = list(train.select_dtypes(include=['float32', 'float64']).columns)
for col in tqdm(float_cols):
    train[col] = train[col].astype(np.float16)

# errors='ignore' keeps this cell re-runnable after the columns were already removed.
train = train.drop(columns=IGNORE_COL, errors='ignore')
# Every remaining column except the ID and the target is a model feature.
features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]

  0%|          | 0/1519 [00:00<?, ?it/s]

In [8]:
from catboost import Pool
import optuna
from catboost import CatBoostClassifier, Pool, MetricVisualizer
from sklearn import metrics

# Optuna

In [17]:
# Gate for the Optuna search cell: the study is only created and run when this is True.
OPTUNA_FLAG = True

In [18]:
# Build one train/validation split for the Optuna search: the first fold of the
# stratified K-fold layout configured in CFG.
kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)

# Take fold 0 directly. Looping and breaking on the second iteration would leave
# `fold`, `trn_ind` and `val_ind` pointing at fold 1 while the pools hold fold 0.
fold = 0
trn_ind, val_ind = next(kfold.split(train, train[CFG.target]))

print(' ')
print('-'*50)
print(f'Training fold {fold} with {len(features)} features...')

# x_* hold the feature columns, y_* the target.
feature_frame = train[features]
x_train, x_val = feature_frame.iloc[trn_ind], feature_frame.iloc[val_ind]
y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
del feature_frame

# Positional indices of the `category`-dtype columns, passed to CatBoost as cat_features.
categorical_features_indices = np.where(x_train.dtypes == 'category')[0]

train_pool = Pool(x_train, y_train, cat_features=categorical_features_indices)
valid_pool = Pool(x_val, y_val, cat_features=categorical_features_indices)

 
--------------------------------------------------
Training fold 0 with 1868 features...
 
--------------------------------------------------


In [19]:
def objective(trial):
    """Optuna objective: train CatBoost on the prepared fold and return ``1 - best validation AUC``.

    Reads the notebook-level ``train_pool`` and ``valid_pool``.

    Tuned parameters: ``depth``, ``random_strength``, ``bagging_temperature``,
    ``od_type`` and ``od_wait``. ``iterations`` (1000) and ``learning_rate`` (0.05)
    are held fixed.

    Args:
        trial: the Optuna trial used to sample hyper-parameters.

    Returns:
        float: ``1.0 - AUC`` at the best validation iteration (lower is better).
    """
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'iterations': 1000,
        'depth': trial.suggest_int('depth', 4, 10),
        'learning_rate': 0.05,
        'random_strength': trial.suggest_int('random_strength', 0, 100),
        # suggest_float(..., log=True) replaces the deprecated suggest_loguniform.
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0.01, 100.00, log=True),
        # NOTE(review): `early_stopping_rounds` in fit() below presumably overrides the
        # overfitting-detector settings, which would make these two parameters no-ops.
        # Confirm against the CatBoost docs before trusting their tuned values.
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'od_wait': trial.suggest_int('od_wait', 10, 50),
        'task_type': 'GPU',
    }

    model = CatBoostClassifier(**params)
    # plot is disabled: one interactive widget per trial floods the notebook output.
    model.fit(
        train_pool,
        eval_set=valid_pool,
        plot=False,
        use_best_model=True,
        early_stopping_rounds=100,
    )
    return 1.0 - model.get_best_score()['validation']['AUC']

In [20]:
# Search for the best hyper-parameters with Optuna.
if OPTUNA_FLAG:
    # The objective returns 1 - AUC, so it is minimized. Seeding the sampler makes the
    # sequence of suggestions repeatable for identical objective values.
    study = optuna.create_study(
        direction='minimize',
        sampler=optuna.samplers.TPESampler(seed=CFG.seed),
    )
    study.optimize(objective, n_trials=100)

Output hidden; open in https://colab.research.google.com to view.

In [21]:
# Show the best hyper-parameters found by the study.
# NOTE: `study` only exists if the search cell ran with OPTUNA_FLAG set to True.
study.best_params

{'depth': 9,
 'random_strength': 18,
 'bagging_temperature': 0.16252920431328538,
 'od_type': 'Iter',
 'od_wait': 17}

# Training

In [8]:
#train = train.head(10000)
#test = test.head(10000)

In [9]:
# ====================================================
# Train & Evaluate
# ====================================================
def _predict_proba_in_chunks(model, frame, columns=None, n_chunks=5, label='pred'):
    """Predict positive-class probabilities for ``frame`` in consecutive row chunks.

    Splitting the rows keeps the peak memory of prediction low on large frames.

    Args:
        model: fitted classifier exposing ``predict_proba``.
        frame: DataFrame to score.
        columns: optional list of columns selected from each chunk right before
            predicting, so the column subset is never materialized for all rows at once.
        n_chunks: upper bound on the number of chunks.
        label: prefix of the progress message printed for each chunk.

    Returns:
        numpy.ndarray: one probability per row of ``frame`` (empty when ``frame`` has no rows).
    """
    n_rows = frame.shape[0]
    chunk_size = n_rows // n_chunks + 1
    parts = []
    # Stepping over the real row range never yields an empty trailing chunk.
    for i, start in enumerate(range(0, n_rows, chunk_size)):
        print(f'{label} i=', i)
        chunk = frame.iloc[start:start + chunk_size]
        if columns is not None:
            chunk = chunk[columns]
        parts.append(model.predict_proba(chunk)[:, 1])
    return np.concatenate(parts) if parts else np.zeros(0)


def train_and_evaluate(train, test):
    """Run stratified K-fold CatBoost training and write models, OOF and test predictions.

    Steps:
      1. Cast the categorical ``*_last`` columns to ``category`` (missing -> 'nanstr').
      2. Add ``<base>_last_mean_diff`` = last - mean where both columns exist in train and test.
      3. Downcast float32/float64 columns to float16.
      4. Drop the ``IGNORE_COL`` columns and train one model per fold
         (``CFG.n_folds`` folds, shuffled with ``CFG.seed``).
      5. Save each fold's model, the out-of-fold predictions and the fold-averaged
         test predictions below ``CFG.input_dir``.

    Side effects: steps 1-3 assign columns on the frames passed in, so the caller's
    ``train`` / ``test`` objects are modified. The column drop in step 4 is local.

    Args:
        train: training frame containing 'customer_ID', the target column and the features.
        test: test frame containing 'customer_ID' and the same feature columns.

    Raises:
        KeyError: if ``test`` lacks any feature column derived from ``train``.
    """
    # Base names of the categorical raw features.
    cat_features = [
        "B_30",
        "B_38",
        "D_114",
        "D_116",
        "D_117",
        "D_120",
        "D_126",
        "D_63",
        "D_64",
        "D_66",
        "D_68"
    ]
    # The `_last` aggregate of a categorical feature is itself categorical.
    cat_features = [f"{cf}_last" for cf in cat_features]
    for cat_col in cat_features:
        # Missing values become their own 'nanstr' level. No label encoding is applied.
        train[cat_col] = train[cat_col].fillna('nanstr').astype('category')
        test[cat_col] = test[cat_col].fillna('nanstr').astype('category')

    # Rounded (`_round2`) copies of the `_last` features are intentionally not generated.

    # Difference between the last value and the mean, added as an extra feature.
    last_cols = [col for col in train.columns if 'last' in col and 'round' not in col]
    for last_col in last_cols:
        base = last_col[:-5]  # strip the trailing '_last'
        try:
            train_diff = train[f'{base}_last'] - train[f'{base}_mean']
            test_diff = test[f'{base}_last'] - test[f'{base}_mean']
        except (KeyError, TypeError):
            # KeyError: no matching `_last` / `_mean` pair; TypeError: non-numeric column.
            continue
        # Assign only after both frames succeeded so train and test keep identical features.
        train[f'{base}_last_mean_diff'] = train_diff
        test[f'{base}_last_mean_diff'] = test_diff

    # Downcast float32/float64 to float16 to save memory. Each frame is inspected on its
    # own, so the test frame is still converted when train was already downcast elsewhere.
    for frame in (train, test):
        float_cols = list(frame.select_dtypes(include=['float32', 'float64']).columns)
        for col in tqdm(float_cols):
            frame[col] = frame[col].astype(np.float16)

    # errors='ignore': the columns may already have been removed by an earlier cell.
    train = train.drop(columns=IGNORE_COL, errors='ignore')
    test = test.drop(columns=IGNORE_COL, errors='ignore')

    # Every remaining column except the ID and the target is a model feature.
    features = [col for col in train.columns if col not in ['customer_ID', CFG.target]]
    # Fail before the long training run rather than after the first fold.
    missing_in_test = [col for col in features if col not in test.columns]
    if missing_in_test:
        raise KeyError(
            f'test is missing {len(missing_in_test)} feature column(s), e.g. {missing_in_test[:5]}'
        )

    # NOTE(review): 'random_seed' is fixed at 0 rather than CFG.seed, and
    # `early_stopping_rounds` in fit() presumably supersedes 'od_wait' — confirm.
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        'iterations': 10000,
        'depth': 9,
        'learning_rate': 0.01,
        'random_strength': 18,
        'bagging_temperature': 0.16252920431328538,
        'od_type': 'Iter',
        'od_wait': 17,
        'random_seed': 0,
        'task_type': 'GPU',
    }

    # Accumulators for the fold-averaged test predictions and the out-of-fold predictions.
    test_predictions = np.zeros(len(test))
    oof_predictions = np.zeros(len(train))

    kfold = StratifiedKFold(n_splits=CFG.n_folds, shuffle=True, random_state=CFG.seed)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train, train[CFG.target])):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold} with {len(features)} features...')
        # x_* hold the feature columns, y_* the target.
        x_train, x_val = train[features].iloc[trn_ind], train[features].iloc[val_ind]
        y_train, y_val = train[CFG.target].iloc[trn_ind], train[CFG.target].iloc[val_ind]
        categorical_features_indices = np.where(x_train.dtypes == 'category')[0]

        train_pool = Pool(x_train, y_train, cat_features=categorical_features_indices)
        valid_pool = Pool(x_val, y_val, cat_features=categorical_features_indices)
        del x_train, y_train
        gc.collect()

        model = CatBoostClassifier(**params)
        model.fit(train_pool, eval_set=valid_pool, plot=True, use_best_model=True, early_stopping_rounds=150)
        del train_pool, valid_pool
        gc.collect()

        # Persist the fold model. It is only written here; nothing in this function reloads it.
        joblib.dump(
            model,
            os.path.join(
                CFG.input_dir,
                'Models',
                f'catboost_{CFG.boosting_type}_fold{fold}_seed{CFG.seed}.pkl',
            ),
        )

        # Predict the validation rows in chunks because the frame is large.
        val_pred = _predict_proba_in_chunks(model, x_val, label='train pred')
        del x_val
        gc.collect()

        # Each row index is filled exactly once over the whole CV.
        oof_predictions[val_ind] = val_pred

        # Predict the test set in chunks. Columns are selected per chunk, so the full
        # `test[features]` copy is never built.
        test_pred = _predict_proba_in_chunks(model, test, columns=features, label='test pred')

        # The model is no longer needed once both predictions exist.
        del model
        gc.collect()

        # Test predictions are averaged over the folds.
        test_predictions += test_pred / CFG.n_folds

        score = amex_metric(y_val, val_pred)
        print(f'Our fold {fold} CV score is {score}')
        del y_val
        gc.collect()

    # Metric over all out-of-fold predictions.
    score = amex_metric(train[CFG.target], oof_predictions)
    print(f'Our out of folds CV score is {score}')

    # Out-of-fold predictions from the CV on the training set.
    oof_df = pd.DataFrame({'customer_ID': train['customer_ID'], 'target': train[CFG.target], 'prediction': oof_predictions})
    oof_df.to_csv(
        os.path.join(
            CFG.input_dir,
            'OOF',
            f'oof_catboost_{CFG.boosting_type}_baseline_{CFG.n_folds}fold_seed{CFG.seed}_tmp2.csv',
        ),
        index=False,
    )

    # Test predictions: this is the file used for submission.
    test_df = pd.DataFrame({'customer_ID': test['customer_ID'], 'prediction': test_predictions})
    test_df.to_csv(
        os.path.join(
            CFG.input_dir,
            'Predictions',
            f'test_catboost_{CFG.boosting_type}_baseline_{CFG.n_folds}fold_seed{CFG.seed}_tmp2.csv',
        ),
        index=False,
    )
    
# Run the full cross-validated training on the loaded frames.
train_and_evaluate(train, test)

  0%|          | 0/1519 [00:00<?, ?it/s]

 
--------------------------------------------------
Training fold 0 with 1868 features...


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
4468:	total: 7m 56s	remaining: 9m 50s
4469:	total: 7m 57s	remaining: 9m 50s
4470:	test: 0.9625475	best: 0.9625475 (4470)	total: 7m 57s	remaining: 9m 50s
4471:	total: 7m 57s	remaining: 9m 49s
4472:	total: 7m 57s	remaining: 9m 49s
4473:	total: 7m 57s	remaining: 9m 49s
4474:	total: 7m 57s	remaining: 9m 49s
4475:	test: 0.9625503	best: 0.9625503 (4475)	total: 7m 57s	remaining: 9m 49s
4476:	total: 7m 57s	remaining: 9m 49s
4477:	total: 7m 57s	remaining: 9m 49s
4478:	total: 7m 57s	remaining: 9m 49s
4479:	total: 7m 58s	remaining: 9m 49s
4480:	test: 0.9625507	best: 0.9625507 (4480)	total: 7m 58s	remaining: 9m 48s
4481:	total: 7m 58s	remaining: 9m 48s
4482:	total: 7m 58s	remaining: 9m 48s
4483:	total: 7m 58s	remaining: 9m 48s
4484:	total: 7m 58s	remaining: 9m 48s
4485:	test: 0.9625503	best: 0.9625509 (4483)	total: 7m 58s	remaining: 9m 48s
4486:	total: 7m 58s	remaining: 9m 48s
4487:	total: 7m 58s	remaining: 9m 48s
4488:	total: 7m 59s	remaining: 9m 48s


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
3241:	total: 5m 47s	remaining: 12m 4s
3242:	total: 5m 47s	remaining: 12m 4s
3243:	total: 5m 47s	remaining: 12m 3s
3244:	total: 5m 47s	remaining: 12m 3s
3245:	test: 0.9611346	best: 0.9611346 (3245)	total: 5m 47s	remaining: 12m 3s
3246:	total: 5m 47s	remaining: 12m 3s
3247:	total: 5m 48s	remaining: 12m 3s
3248:	total: 5m 48s	remaining: 12m 3s
3249:	total: 5m 48s	remaining: 12m 3s
3250:	test: 0.9611353	best: 0.9611353 (3250)	total: 5m 48s	remaining: 12m 3s
3251:	total: 5m 48s	remaining: 12m 3s
3252:	total: 5m 48s	remaining: 12m 3s
3253:	total: 5m 48s	remaining: 12m 2s
3254:	total: 5m 48s	remaining: 12m 2s
3255:	test: 0.9611382	best: 0.9611382 (3255)	total: 5m 48s	remaining: 12m 2s
3256:	total: 5m 49s	remaining: 12m 2s
3257:	total: 5m 49s	remaining: 12m 2s
3258:	total: 5m 49s	remaining: 12m 2s
3259:	total: 5m 49s	remaining: 12m 2s
3260:	test: 0.9611412	best: 0.9611412 (3260)	total: 5m 49s	remaining: 12m 2s
3261:	total: 5m 49s	remaining: 12m 2s


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
5017:	total: 8m 54s	remaining: 8m 50s
5018:	total: 8m 54s	remaining: 8m 50s
5019:	total: 8m 54s	remaining: 8m 50s
5020:	test: 0.9617156	best: 0.9617159 (5019)	total: 8m 54s	remaining: 8m 50s
5021:	total: 8m 55s	remaining: 8m 50s
5022:	total: 8m 55s	remaining: 8m 50s
5023:	total: 8m 55s	remaining: 8m 50s
5024:	total: 8m 55s	remaining: 8m 50s
5025:	test: 0.9617156	best: 0.9617159 (5019)	total: 8m 55s	remaining: 8m 49s
5026:	total: 8m 55s	remaining: 8m 49s
5027:	total: 8m 55s	remaining: 8m 49s
5028:	total: 8m 55s	remaining: 8m 49s
5029:	total: 8m 55s	remaining: 8m 49s
5030:	test: 0.9617153	best: 0.9617160 (5026)	total: 8m 56s	remaining: 8m 49s
5031:	total: 8m 56s	remaining: 8m 49s
5032:	total: 8m 56s	remaining: 8m 49s
5033:	total: 8m 56s	remaining: 8m 49s
5034:	total: 8m 56s	remaining: 8m 49s
5035:	test: 0.9617146	best: 0.9617160 (5026)	total: 8m 56s	remaining: 8m 48s
5036:	total: 8m 56s	remaining: 8m 48s
5037:	total: 8m 56s	remaining: 8m 48s


MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
5017:	total: 8m 58s	remaining: 8m 54s
5018:	total: 8m 58s	remaining: 8m 54s
5019:	total: 8m 58s	remaining: 8m 54s
5020:	test: 0.9612774	best: 0.9612777 (5018)	total: 8m 58s	remaining: 8m 54s
5021:	total: 8m 59s	remaining: 8m 54s
5022:	total: 8m 59s	remaining: 8m 54s
5023:	total: 8m 59s	remaining: 8m 54s
5024:	total: 8m 59s	remaining: 8m 53s
5025:	test: 0.9612796	best: 0.9612797 (5024)	total: 8m 59s	remaining: 8m 53s
5026:	total: 8m 59s	remaining: 8m 53s
5027:	total: 8m 59s	remaining: 8m 53s
5028:	total: 8m 59s	remaining: 8m 53s
5029:	total: 8m 59s	remaining: 8m 53s
5030:	test: 0.9612796	best: 0.9612799 (5026)	total: 8m 59s	remaining: 8m 53s
5031:	total: 9m	remaining: 8m 53s
5032:	total: 9m	remaining: 8m 53s
5033:	total: 9m	remaining: 8m 52s
5034:	total: 9m	remaining: 8m 52s
5035:	test: 0.9612794	best: 0.9612802 (5032)	total: 9m	remaining: 8m 52s
5036:	total: 9m	remaining: 8m 52s
5037:	total: 9m	remaining: 8m 52s
5038:	total: 9m	remaining: 8

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Default metric period is 5 because AUC is/are not implemented for GPU


[1;30;43mストリーミング出力は最後の 5000 行に切り捨てられました。[0m
5015:	test: 0.9623122	best: 0.9623122 (5015)	total: 9m 1s	remaining: 8m 57s
5016:	total: 9m 1s	remaining: 8m 57s
5017:	total: 9m 1s	remaining: 8m 57s
5018:	total: 9m 1s	remaining: 8m 57s
5019:	total: 9m 1s	remaining: 8m 57s
5020:	test: 0.9623137	best: 0.9623137 (5020)	total: 9m 1s	remaining: 8m 57s
5021:	total: 9m 1s	remaining: 8m 56s
5022:	total: 9m 1s	remaining: 8m 56s
5023:	total: 9m 1s	remaining: 8m 56s
5024:	total: 9m 1s	remaining: 8m 56s
5025:	test: 0.9623145	best: 0.9623145 (5025)	total: 9m 2s	remaining: 8m 56s
5026:	total: 9m 2s	remaining: 8m 56s
5027:	total: 9m 2s	remaining: 8m 56s
5028:	total: 9m 2s	remaining: 8m 56s
5029:	total: 9m 2s	remaining: 8m 56s
5030:	test: 0.9623157	best: 0.9623157 (5030)	total: 9m 2s	remaining: 8m 55s
5031:	total: 9m 2s	remaining: 8m 55s
5032:	total: 9m 2s	remaining: 8m 55s
5033:	total: 9m 2s	remaining: 8m 55s
5034:	total: 9m 3s	remaining: 8m 55s
5035:	test: 0.9623173	best: 0.9623173 (5035)	total: 9m 3s	