In [1]:
# Runtime switch for where the notebook is executed.
#   True:  Google Colab notebook (data is read from the mounted Google Drive)
#   False: local PC (data is read from ../data)
colab = False
if colab: 
    # Mount Google Drive under /content/drive.
    from google.colab import drive
    drive.mount('/content/drive')
    # List the otto output directory on the mounted drive.
    !ls /content/drive/MyDrive/output/otto/
    base_path = '/content/drive/MyDrive'
    # optuna is installed here on Colab only; it is imported later when OPTUNA_FLAG is set.
    !pip3 install optuna
else:
    base_path = '../data'

# Preprocessing

In [2]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Load the candidate table used for training.
# The commented-out lines are alternative candidate files that can be swapped in.
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train.parquet')
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20.parquet')
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_50.parquet')
train = pd.read_parquet(f'{base_path}/output/otto/train_50_tmp.parquet')

#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20_old.parquet')

#train20 = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20.parquet')

In [4]:
# ----------------------------------------------------------------------
# Run configuration
# ----------------------------------------------------------------------
# DEBUG_MODE: train on a small prefix of the candidate table for quick checks.
DEBUG_MODE = False
#DEBUG_MODE = True

# OPTUNA_FLAG: run the Optuna-integrated LightGBM tuning cell.
OPTUNA_FLAG = False
#OPTUNA_FLAG = True

if DEBUG_MODE:
    train = train.head(100000)

# Event type whose label is predicted in this run.
TYPE_MODE = 'clicks'
#TYPE_MODE = 'carts'
#TYPE_MODE = 'orders'

# Per event type: (label column, pos/neg ratio passed to negative_sampling).
# The trailing notes are the positive rate before -> after under-sampling.
TYPE_SETTINGS = {
    'clicks': ('y_clicks', 1/39),  # under sampling 1.3 -> 2.5%
    'carts': ('y_carts', 1/39),    # under sampling 1.6 -> 2.5%
    'orders': ('y_orders', 1/39),  # under sampling 2.1 -> 2.5%
}

# Fail fast on a mistyped TYPE_MODE. Without this check `target` and
# `pos_neg_ratio` would stay undefined (or keep the values of a previous run
# in a live kernel) and the wrong label could be trained silently.
if TYPE_MODE not in TYPE_SETTINGS:
    raise ValueError(
        f'Unknown TYPE_MODE {TYPE_MODE!r}; expected one of {sorted(TYPE_SETTINGS)}'
    )
target, pos_neg_ratio = TYPE_SETTINGS[TYPE_MODE]

# Columns that must never be fed to the model: identifiers and all label columns.
IGNORE_COL = ['session', 'aid']
IGNORE_COL += ['y_clicks', 'y_carts', 'y_orders']

In [5]:
def reduce_memory(df):
    """Downcast the known columns of the candidate table to compact dtypes.

    Every listed column is overwritten on ``df`` itself, and the same object
    is returned. A missing column raises ``KeyError``.
    """
    # Column -> target dtype, in the order the casts are applied.
    compact_dtypes = {
        'session': 'int32',
        'aid': 'int32',
        'score_click': 'float32',
        'score_cart': 'float32',
        'score_buy': 'float32',
        'score_click_only': 'float32',
        'score_cart_only': 'float32',
        'score_buy_only': 'float32',
        'session_action_count': 'int16',
        'session_click_count': 'int16',
        'session_cart_count': 'int16',
        'session_order_count': 'int16',
        'session_type_mean': 'float32',
        'n_clicks_10': 'int8',
        'n_clicks_20': 'int8',
        'n_carts': 'int8',
        'n_buys': 'int8',
        'clicks_count': 'int32',
        'carts_count': 'int16',
        'orders_count': 'int16',
    }
    for column, dtype in compact_dtypes.items():
        df[column] = df[column].astype(dtype)
    return df

# Keep only the top-n candidates.
def use_top_n(n, df):
    """Filter candidate rows.

    A row is kept when any of its scores (click / cart / buy) is >= -1, or
    when any of its rank columns (n_clicks_20 / n_carts / n_buys) lies
    strictly between -1 and ``n``.
    """
    score_terms = [f'{col} >= -1' for col in ('score_click', 'score_cart', 'score_buy')]
    rank_terms = [f'(-1 < {col} and {col}<{n})' for col in ('n_clicks_20', 'n_carts', 'n_buys')]
    condition = ' or '.join(score_terms + rank_terms)
    return df.query(condition)

# Sessions that contain only negatives cannot be used for training, so drop them (training only).
def remove_negative_session(df):
    """Keep only rows whose session has at least one positive ``target`` label.

    ``target`` is the module-level label column name. The result comes from an
    inner merge on ``session``, so its index is a fresh default index.
    """
    has_positive = df.groupby('session')[target].sum().gt(0)
    positive_sessions = has_positive[has_positive].reset_index()['session']
    return df.merge(positive_sessions, how='inner', on='session')

# Under-sample the negatives when there are too many of them.
# ratio = pos/neg
def negative_sampling(df_x, df_y, ratio, random_state=42):
    """Randomly under-sample the majority class, chunk by chunk.

    The rows are cut into 5 consecutive chunks and each chunk is resampled
    separately with ``RandomUnderSampler``, so that a single resampling call
    never has to handle the whole frame at once.

    Parameters
    ----------
    df_x : pandas.DataFrame
        Feature rows.
    df_y : pandas.Series
        Labels aligned row-by-row with ``df_x``.
    ratio : float
        Passed as ``sampling_strategy``: desired positives / negatives ratio
        after resampling.
    random_state : int or None, default 42
        Seed handed to every ``RandomUnderSampler``. The sampler used to be
        unseeded, which made the training set (and every metric derived from
        it) change from run to run. Pass ``None`` to get the old
        non-deterministic behaviour back.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The resampled features and labels, chunks concatenated in order.
    """
    print('before mean:', df_y.mean())

    Nrow = df_x.shape[0]
    Ndiv = 5
    n = int(Nrow // Ndiv) + 1

    df_x_list = [df_x.iloc[i*n : (i+1)*n, :] for i in range(Ndiv)]
    df_y_list = [df_y.iloc[i*n : (i+1)*n] for i in range(Ndiv)]
    # Drop the local references to the full inputs; only the chunks are needed from here on.
    del df_x, df_y
    gc.collect()

    for i in range(Ndiv):
        print('under sampling.......',i + 1 , '/', Ndiv)
        sampler = RandomUnderSampler(sampling_strategy=ratio, random_state=random_state)
        tmpx, tmpy = sampler.fit_resample(df_x_list[i], df_y_list[i])
        # Overwrite each chunk with its resampled version so the original chunk can be freed.
        df_x_list[i] = tmpx
        df_y_list[i] = tmpy
        del tmpx, tmpy
        gc.collect()
    print('under sampling end')
    after_x = pd.concat(df_x_list)
    del df_x_list
    gc.collect()
    print('post proccess1')
    after_y = pd.concat(df_y_list)
    del df_y_list
    gc.collect()

    print('after mean:', after_y.mean())
    return after_x, after_y

In [12]:
def join_session_features(df):
    """Swap the session-level columns of ``df`` for the ones stored on disk.

    Reads ``valid_session_features.parquet`` under ``base_path``, downcasts
    its columns, drops the five session columns already present in ``df`` and
    left-joins the loaded table on ``session``.
    """
    session_df = pd.read_parquet(f'{base_path}/output/otto/valid_session_features.parquet')

    # Column -> compact dtype for the loaded session feature table.
    session_dtypes = {
        'session': 'int32',
        'session_action_count': 'int16',
        'session_click_count': 'int16',
        'session_cart_count': 'int16',
        'session_order_count': 'int16',
        'session_type_mean': 'float32',
        'session_click_rate': 'float32',
        'session_cart_rate': 'float32',
        'session_order_rate': 'float32',
    }
    for column, dtype in session_dtypes.items():
        session_df[column] = session_df[column].astype(dtype)

    # These columns are re-supplied by session_df, so remove the existing copies first.
    replaced_cols = [
        'session_action_count',
        'session_click_count',
        'session_cart_count',
        'session_order_count',
        'session_type_mean',
    ]
    df = df.drop(columns=replaced_cols)
    df = df.merge(session_df, how='left', on='session')

    del session_df
    gc.collect()

    return df

In [7]:
def join_aid_features(df):
    """Left-join the per-item (aid) feature table onto ``df``.

    Reads ``valid_aid_features.parquet`` under ``base_path``, downcasts its
    columns and merges it on ``aid``. No column of ``df`` is dropped before
    the merge.
    """
    aid_df = pd.read_parquet(f'{base_path}/output/otto/valid_aid_features.parquet')
    # The '3weeks' window is currently disabled:
    #week_list = ['4weeks', '3weeks', '2weeks', '1week']
    week_list = ['4weeks', '2weeks', '1week']
    aid_df['aid'] = aid_df['aid'].astype('int32')
    for week in week_list:
        aid_df[f'clicks_count_{week}'] = aid_df[f'clicks_count_{week}'].astype('int32')
        aid_df[f'carts_count_{week}'] = aid_df[f'carts_count_{week}'].astype('int16')
        aid_df[f'orders_count_{week}'] = aid_df[f'orders_count_{week}'].astype('int16')
        aid_df[f'clicks_rank_{week}'] = aid_df[f'clicks_rank_{week}'].astype('int32')
        aid_df[f'carts_rank_{week}'] = aid_df[f'carts_rank_{week}'].astype('int32')
        aid_df[f'orders_rank_{week}'] = aid_df[f'orders_rank_{week}'].astype('int32')

    # The rate column names do not contain the week window, so cast them once
    # here instead of repeating the same casts on every pass of the week loop.
    for event in ['clicks', 'carts', 'orders']:
        #for k in [2,3,4]:
        for k in [2,4]:
            aid_df[f'aid_{event}_count_rate_1_{k}'] = aid_df[f'aid_{event}_count_rate_1_{k}'].astype('float32')

    df = df.merge(aid_df, 'left', 'aid')
    del aid_df
    gc.collect()

    return df

In [8]:
# Preprocess the candidate table: downcast dtypes, apply the top-50 candidate
# filter, then drop sessions that have no positive label for `target`.
train = reduce_memory(train)
train = use_top_n(50, train)
train = remove_negative_session(train)

In [9]:
# Number of positive rows for the selected target.
train[target].sum()

1012735

In [10]:
# Share of positive rows for the selected target (before under-sampling).
train[target].mean()

0.013401377389554573

In [13]:
# Attach the session-level and item-level (aid) feature tables.
train = join_session_features(train)
train = join_aid_features(train)

In [14]:
# WIP: switch for a second-stage training run.
TRAIN_SECOND = False
if TRAIN_SECOND:
    # Load the predictions for the types other than the target.
    # NOTE(review): this replaces the whole `train` frame with the CSV contents
    # instead of adding columns to it — confirm this is intended.
    train = pd.read_csv(f'{base_path}/otto/oof_lgbm_{TYPE_MODE}.csv')

In [15]:
# Inspect the column dtypes after the joins.
train.dtypes

session                        int32
aid                            int32
score_click                  float32
score_cart                   float32
score_buy                    float32
score_click_only             float32
score_cart_only              float32
score_buy_only               float32
n_clicks_10                     int8
n_clicks_20                     int8
n_carts                         int8
n_buys                          int8
clicks_rank                    int32
carts_rank                     int32
orders_rank                    int32
clicks_count                   int32
carts_count                    int16
orders_count                   int16
y_clicks                        bool
y_carts                         bool
y_orders                        bool
session_action_count           int16
session_click_count            int16
session_cart_count             int16
session_order_count            int16
session_type_mean            float32
session_click_rate           float32
s

# Training & Inference

In [16]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
# optuna
if OPTUNA_FLAG:
    import optuna.integration.lightgbm as lgb
else:
    import lightgbm as lgb

from itertools import combinations

In [17]:
# Hyper-parameter search; runs only when OPTUNA_FLAG is set, in which case
# `lgb` is optuna.integration.lightgbm (see the import cell).
# Author's notes on earlier results:
# old 0.0382316
# new 0.03822381002014941.
# new num=1000
if OPTUNA_FLAG:
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',  # might be better to set this to None?
        'boosting': 'gbdt',
        'seed': 42,        
        'n_jobs': -1,
        'learning_rate': 0.05
        }
    # 5-fold splitter; only the first fold is actually trained (see the `break` below).
    # NOTE(review): this splits by row, whereas the main CV loop splits by session,
    # so rows of one session can land on both sides here — confirm this is acceptable for tuning.
    kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold}...')

        y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
        # Feature matrix = everything except identifiers and label columns.
        train_tmp = train.drop(IGNORE_COL , axis=1)
        x_train, x_val = train_tmp.iloc[trn_ind], train_tmp.iloc[val_ind]
        del train_tmp
        gc.collect()

        # under sampling (training part only; the validation part keeps its original distribution)
        x_train, y_train = negative_sampling(x_train, y_train, pos_neg_ratio)

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)
        del x_train, y_train
        gc.collect()

        #lgb_valid = lgb.Dataset(x_val, y_val)
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            #num_boost_round = 10500,
            num_boost_round = 200,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 20,
            verbose_eval = 10,
            )
        del lgb_train, lgb_valid
        gc.collect()
        # One fold is enough for the search.
        break
    # Parameters of the model trained above.
    model.params

In [18]:
if OPTUNA_FLAG:
    print("Optuna results: ",model.params)

# Fixed LightGBM parameters used by the cross-validation loop.
# They are assigned unconditionally, so they are what gets used even when
# OPTUNA_FLAG is True.
# NOTE(review): the regularisation / leaf / fraction values look like they were
# copied from an earlier Optuna run — confirm their origin.
params = {'objective': 'binary',
          'metric': 'binary_logloss',
          'boosting': 'gbdt',
          'seed': 42,
          'n_jobs': -1,
          'learning_rate': 0.05,
          'feature_pre_filter': False,
          'lambda_l1': 7.777864227173249,
          'lambda_l2': 0.000181104589355317,
          'num_leaves': 202,
          'feature_fraction': 0.8999999999999999,
          'bagging_fraction': 1.0,
          'bagging_freq': 0,
          'min_child_samples': 25
          }


In [19]:
# 5-fold cross-validation grouped by session: train one LightGBM model per
# fold, save it, and collect out-of-fold predictions for every row of `train`.

# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(train))
session = train['session']
unique_session = session.unique()

kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
for fold, (trn_group_ind, val_group_ind) in enumerate(kfold.split(unique_session)):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold}...')
    # Split by session so that all rows of a session fall on the same side of the fold.
    tr_groups, va_groups = unique_session[trn_group_ind], unique_session[val_group_ind]
    is_tr, is_va = session.isin(tr_groups), session.isin(va_groups)
    del tr_groups, va_groups
    gc.collect()
    # Row *positions* where the masks are True. They are used with .iloc and to
    # index the plain numpy array oof_predictions, both of which are positional.
    # Taking the masks' index labels instead would only be correct while `train`
    # happens to carry a default 0..n-1 index.
    trn_ind, val_ind = np.flatnonzero(is_tr.to_numpy()), np.flatnonzero(is_va.to_numpy())
    del is_tr, is_va
    gc.collect()

    y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
    # Feature matrix = everything except identifiers and label columns.
    train_tmp = train.drop(IGNORE_COL , axis=1)
    x_train, x_val = train_tmp.iloc[trn_ind], train_tmp.iloc[val_ind]
    
    del train_tmp
    gc.collect()

    # under sampling (training part only; the validation part keeps its original distribution)
    x_train, y_train = negative_sampling(x_train, y_train, pos_neg_ratio)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_val, y_val)
    del x_train, y_train
    gc.collect()

    #lgb_valid = lgb.Dataset(x_val, y_val)
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        #num_boost_round = 10500,
        num_boost_round = 2000,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 20,
        verbose_eval = 10
        )
    del lgb_train, lgb_valid
    gc.collect()


    # Save best model
    if TRAIN_SECOND:
        joblib.dump(model, f'{base_path}/otto/otto_lgbm_fold{fold}_{TYPE_MODE}_second.pkl')
    else:
        joblib.dump(model, f'{base_path}/otto/otto_lgbm_fold{fold}_{TYPE_MODE}.pkl')
    # Predict validation
    # x_val is large, so split it into chunks and predict chunk by chunk.
    Nrow = x_val.shape[0]
    Ndiv = 5
    n = int(Nrow // Ndiv) + 1
    x_val_list = []
    for i in range(Ndiv):
        tmp = x_val.iloc[i*n : (i+1)*n, :]
        x_val_list.append(tmp)
    del x_val
    gc.collect()

    val_pred_list = []
    for i, v in enumerate(x_val_list):
        print('train pred i=', i)
        tmp = model.predict(v)
        val_pred_list.append(tmp)
    del x_val_list
    gc.collect()
    # The chunks are consecutive slices of x_val, so concatenating them restores
    # the row order of val_ind.
    val_pred = np.concatenate(val_pred_list)
    del val_pred_list
    gc.collect()

    # Add to out of folds array
    # Once all folds are done, every row has been predicted exactly once.
    oof_predictions[val_ind] = val_pred

    # Delete the model as soon as it is no longer needed.
    del model, y_val
    gc.collect()


 
--------------------------------------------------
Training fold 0...
before mean: 0.013400967040807624
under sampling....... 1 / 5
under sampling....... 2 / 5
under sampling....... 3 / 5
under sampling....... 4 / 5
under sampling....... 5 / 5
under sampling end
post proccess1
after mean: 0.025
[LightGBM] [Info] Number of positive: 810188, number of negative: 31597332
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 9818
[LightGBM] [Info] Number of data points in the train set: 32407520, number of used features: 48
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.025000 -> initscore=-3.663562
[LightGBM] [Info] Start training from score -3.663562
Training until validation scores don't improve for 20 rounds
[10]	training's binary_logloss: 0.0806423	valid_1's binary_logloss: 0.0538225
[20]	training's binary_logloss: 0.0732785	valid_1's binary_logloss: 0.049239
[30]	training's binary_lo

[940]	training's binary_logloss: 0.0643953	valid_1's binary_logloss: 0.0445133
[950]	training's binary_logloss: 0.0643826	valid_1's binary_logloss: 0.044513
[960]	training's binary_logloss: 0.0643677	valid_1's binary_logloss: 0.044512
[970]	training's binary_logloss: 0.0643523	valid_1's binary_logloss: 0.044511
[980]	training's binary_logloss: 0.0643365	valid_1's binary_logloss: 0.0445107
[990]	training's binary_logloss: 0.0643206	valid_1's binary_logloss: 0.0445098
[1000]	training's binary_logloss: 0.0643048	valid_1's binary_logloss: 0.0445092
Did not meet early stopping. Best iteration is:
[1000]	training's binary_logloss: 0.0643048	valid_1's binary_logloss: 0.0445092
train pred i= 0
train pred i= 1
train pred i= 2
train pred i= 3
train pred i= 4
 
--------------------------------------------------
Training fold 1...
before mean: 0.013401513897263122
under sampling....... 1 / 5
under sampling....... 2 / 5
under sampling....... 3 / 5
under sampling....... 4 / 5
under sampling....... 5

[840]	training's binary_logloss: 0.0645548	valid_1's binary_logloss: 0.0445396
[850]	training's binary_logloss: 0.0645401	valid_1's binary_logloss: 0.0445385
[860]	training's binary_logloss: 0.0645246	valid_1's binary_logloss: 0.0445378
[870]	training's binary_logloss: 0.0645087	valid_1's binary_logloss: 0.0445367
[880]	training's binary_logloss: 0.0644925	valid_1's binary_logloss: 0.044536
[890]	training's binary_logloss: 0.064477	valid_1's binary_logloss: 0.044535
[900]	training's binary_logloss: 0.0644608	valid_1's binary_logloss: 0.0445346
[910]	training's binary_logloss: 0.0644448	valid_1's binary_logloss: 0.044534
[920]	training's binary_logloss: 0.064428	valid_1's binary_logloss: 0.044533
[930]	training's binary_logloss: 0.0644129	valid_1's binary_logloss: 0.0445323
[940]	training's binary_logloss: 0.0643968	valid_1's binary_logloss: 0.0445317
[950]	training's binary_logloss: 0.0643803	valid_1's binary_logloss: 0.0445314
[960]	training's binary_logloss: 0.0643644	valid_1's binar

[750]	training's binary_logloss: 0.0646638	valid_1's binary_logloss: 0.044632
[760]	training's binary_logloss: 0.064647	valid_1's binary_logloss: 0.0446306
[770]	training's binary_logloss: 0.0646307	valid_1's binary_logloss: 0.0446293
[780]	training's binary_logloss: 0.0646175	valid_1's binary_logloss: 0.044629
[790]	training's binary_logloss: 0.0646033	valid_1's binary_logloss: 0.0446284
[800]	training's binary_logloss: 0.0645885	valid_1's binary_logloss: 0.0446271
[810]	training's binary_logloss: 0.0645733	valid_1's binary_logloss: 0.0446265
[820]	training's binary_logloss: 0.0645574	valid_1's binary_logloss: 0.0446255
[830]	training's binary_logloss: 0.0645421	valid_1's binary_logloss: 0.0446253
[840]	training's binary_logloss: 0.0645284	valid_1's binary_logloss: 0.044625
[850]	training's binary_logloss: 0.0645141	valid_1's binary_logloss: 0.0446244
[860]	training's binary_logloss: 0.064498	valid_1's binary_logloss: 0.0446239
[870]	training's binary_logloss: 0.0644843	valid_1's bina

[660]	training's binary_logloss: 0.0647836	valid_1's binary_logloss: 0.044644
[670]	training's binary_logloss: 0.064768	valid_1's binary_logloss: 0.0446436
[680]	training's binary_logloss: 0.0647526	valid_1's binary_logloss: 0.0446419
[690]	training's binary_logloss: 0.0647345	valid_1's binary_logloss: 0.0446414
[700]	training's binary_logloss: 0.0647172	valid_1's binary_logloss: 0.0446405
[710]	training's binary_logloss: 0.0647006	valid_1's binary_logloss: 0.0446397
[720]	training's binary_logloss: 0.0646843	valid_1's binary_logloss: 0.0446384
[730]	training's binary_logloss: 0.0646686	valid_1's binary_logloss: 0.044638
[740]	training's binary_logloss: 0.064655	valid_1's binary_logloss: 0.0446378
[750]	training's binary_logloss: 0.0646397	valid_1's binary_logloss: 0.0446371
[760]	training's binary_logloss: 0.0646227	valid_1's binary_logloss: 0.0446367
[770]	training's binary_logloss: 0.064606	valid_1's binary_logloss: 0.0446355
[780]	training's binary_logloss: 0.0645889	valid_1's bina

[560]	training's binary_logloss: 0.0649455	valid_1's binary_logloss: 0.0446769
[570]	training's binary_logloss: 0.0649287	valid_1's binary_logloss: 0.0446758
[580]	training's binary_logloss: 0.06491	valid_1's binary_logloss: 0.0446741
[590]	training's binary_logloss: 0.0648924	valid_1's binary_logloss: 0.044673
[600]	training's binary_logloss: 0.064875	valid_1's binary_logloss: 0.0446718
[610]	training's binary_logloss: 0.0648565	valid_1's binary_logloss: 0.0446702
[620]	training's binary_logloss: 0.0648384	valid_1's binary_logloss: 0.0446685
[630]	training's binary_logloss: 0.0648212	valid_1's binary_logloss: 0.0446671
[640]	training's binary_logloss: 0.0648043	valid_1's binary_logloss: 0.0446655
[650]	training's binary_logloss: 0.0647886	valid_1's binary_logloss: 0.0446637
[660]	training's binary_logloss: 0.0647726	valid_1's binary_logloss: 0.0446632
[670]	training's binary_logloss: 0.0647573	valid_1's binary_logloss: 0.044663
[680]	training's binary_logloss: 0.0647429	valid_1's bina

In [20]:
# Persist the out-of-fold scores (first-stage runs only).
df = pd.DataFrame({"score": oof_predictions})
if not TRAIN_SECOND:
    df.to_csv(f'{base_path}/otto/train_oof_lgbm_{TYPE_MODE}.csv', index = False)

# Attach the scores to their (session, aid) pairs and tag each session with the event type.
pred_df = pd.concat([train[['session', 'aid']], df], axis=1)
pred_df['session_type'] = pred_df['session'].astype(str) + f'_{TYPE_MODE}'

# Rank the candidates inside each session by descending score and keep the first 20.
pred_df = pred_df.sort_values(['session_type','score'],ascending=[True, False]).reset_index(drop=True)
pred_df = pred_df.groupby('session_type').head(20).drop(['score','session'],axis=1)
pred_df['aid'] = pred_df['aid'].astype('int32')

# One row per session: the kept aids joined into a space-separated string.
pred_df = pred_df.groupby('session_type')['aid'].apply(list).reset_index()
pred_df['labels'] = pred_df['aid'].map(lambda aids: ' '.join(map(str, aids)))
pred_df = pred_df.drop(['aid'],axis=1)
pred_df

Unnamed: 0,session_type,labels
0,11098529_clicks,1105029 459126 217742 295362 1544564 1694360 6...
1,11098534_clicks,223062 908024 1342293 1607945 530377 1300062 1...
2,11098535_clicks,745365 1750442 803918 767201 236461 896972 132...
3,11098538_clicks,1263747 1550143 1711586 703265 1172033 717871 ...
4,11098539_clicks,631008 1408458 1658802 1057728 1251433 617897 ...
...,...,...
1012730,12899773_clicks,1311526 1484665 1578804 337571 184006 104552 1...
1012731,12899774_clicks,33035 1539309 819288 771913 270852 218795 9548...
1012732,12899775_clicks,1743151 1760714 1163166 1255910 1022572 832192...
1012733,12899777_clicks,384045 1308634 1688215 703474 395762 364190 14...


In [21]:
# Recall@20 of the out-of-fold predictions against the validation labels.

# Turn the prediction table back into (session id, list of predicted aids).
sub = pred_df.loc[pred_df.session_type.str.contains(TYPE_MODE)].copy()
sub['session'] = sub['session_type'].apply(lambda x: int(x.split('_')[0]))
sub['labels'] = sub['labels'].apply(lambda x: [int(i) for i in x.split(' ')[:20]])

test_labels = pd.read_parquet(f'{base_path}/input/otto/otto-validation/test_labels.parquet')
test_labels = test_labels.loc[test_labels['type']==TYPE_MODE]
test_labels = test_labels.merge(sub, how='left', on=['session'])
# Sessions without a prediction come out of the left join as NaN. Give them a
# real empty list: filling with the string '[]' would make its characters
# '[' and ']' be treated as labels in the set intersection below.
test_labels['labels'] = test_labels['labels'].apply(lambda x: x if isinstance(x, list) else [])
test_labels['hits'] = test_labels.apply(lambda row: len(set(row.ground_truth).intersection(set(row.labels))), axis=1)
# At most 20 ground-truth items per session can be hit with 20 predictions.
test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
print(f'{TYPE_MODE} recall =',recall)

clicks recall = 0.5350195439108556


In [None]:
# click total: 1,755,534
# A recall of 0.52 requires 912,877 correct predictions

In [None]:
# clicks recall = 0.5271239406357268 おためしtop20, , PB = 0.579

# baseline: candidates generated with the top-20 item2item, transform, duplicates reduced, negative-only sessions removed
# clicks recall = 0.5279590141803007 num=100 きた！
# 既存データ + 50までbackfill, 20位まで num=100 clicks recall = 0.5289963053976738 きた！
#                                               orders recall = 0.6531281219777659
# 既存データ + 50までbackfill, 30位まで num=100 orders recall = 0.6533100544839979
# 既存データ + 50までbackfill, 50位まで num=100 orders recall = 0.6536483851096223
# 既存データ + 50までbackfill, 50位まで num=1000(137) orders recall = 0.6536451933112674

# under samplingなしだと上位50で2.1%がpositive
# 既存データ + 50までbackfill under sampling pos:neg = 1:2 33% pos, orders recall = 0.6190460991436405
#                                            pos:neg = 1:9 10% pos, orders recall = 0.6536036999326531
#                                            pos:neg = 1:19 5% pos, orders recall = 0.6536388097145575 ちょい下がるけどそんなに問題なさそう
#                                            pos:neg = 1:39 2.5% pos,orders recall= 0.6536930702865916 これくらいの比率で固定しよう, PB = 0.580
#                                                                    carts recall = 0.41731398378440265
#                                                                    clicks recall = 0.5295727681719636
# feature増版、click i2i, top10,20 pos:neg = 1:39 2.5%, num=100 orders recall = 0.6538813863895334
#                                                      num=1000 orders recall = 0.654031400912216 , PB = 0.581
#                                                      num=1000 carts recall = 0.41827325050912256 
#                                                      num=1000 clicks recall = 0.5309427217017728
# aid feature追加 2weeks, 4weeks                                orders recall = 0.6575391873043028 ほぼ変わらんのでこっち
# 2,3,4 weeks                                                   orders recall = 0.6575551462960776
# 2,4 under sampling のsplitだけ変えた                          orders recall = 0.6576381330533062, PB = 0.585
#                                                               carts recall = 0.4219628713472407
#                                                               clicks_recall = 0.5343416874865425
# binary_logloss -> auc, orders recall = 0.6573189532178115 -> binary_loglossのままで良さそう
# lr 0.1 -> 0.05, orders recall = 0.6577275034072447 ちょびっとだけ上がった
# optuna again (lr=0.05, num=200, order) orders recall = 0.6578519835430877 
# (other target leak 0.6839768530783299)

# kfold古
# session feature bug fix orders recall = 0.6576892018269854 logloss下がったのにrecall下がった。。, num=402 valid_1's binary_logloss: 0.0378125
# 古いsession素性残し num=335, valid_1's binary_logloss: 0.0378245

# kfold sessionごとに変更してsessionのleak修正, 元々のsession素性, orders recall = 0.6577498459957294 [248] valid_1's binary_logloss: 0.0388062
# session bug fix orders recall = 0.6579477374937361 [332] valid_1's binary_logloss: 0.0384341 -> binary_loglossがそろそろ信用できない。。
#                 carts recall = 0.4225023504636745
# session bug fix + 古い素性残 orders recall = 0.6577562295924393 [272] valid_1's binary_logloss: 0.0384143

# recallをmetricsに変更 (otto_lgb_train_rank.ipynb)
# session bug fix,orders recall = 0.6577945311726986 [283] valid_1's ndcg@20: 0.843099	valid_1's ndcg@50: 0.85066, 時間かかるけどそんなに変わらん
# session bug fix,orders recall = 0.657772188584214 map [205] valid_1's map@20: 0.790326	valid_1's map@50: 0.792477

# cart予測値をorderに加えてみたい