In [1]:
# Runtime switch: selects where the data lives and whether Colab-only setup runs.
# True: Google Colab Notebook
# False: My local PC
colab = False
if colab: 
    from google.colab import drive
    # Mount Google Drive so the files under MyDrive become visible at /content/drive.
    drive.mount('/content/drive')
    !ls /content/drive/MyDrive/output/otto/
    # base_path is the prefix used by every parquet read in this notebook.
    base_path = '/content/drive/MyDrive'
    # optuna is installed only on Colab. NOTE(review): the version is not pinned.
    !pip3 install optuna
else:
    # Local run: data is resolved relative to the notebook's working directory.
    base_path = '../data'

# Preprocessing

In [2]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Load the candidate table used for training. The commented-out lines are
# alternative candidate files that were used in earlier experiments.
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train.parquet')
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20.parquet')
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_50.parquet')
# Active input: read relative to base_path so the same cell works on Colab and locally.
train = pd.read_parquet(f'{base_path}/output/otto/train_50_tmp.parquet')

#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20_old.parquet')

#train20 = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20.parquet')

In [4]:
# ====================================================
# Run configuration
# ====================================================
# DEBUG_MODE: when True, only the first 100,000 rows of `train` are kept.
DEBUG_MODE = False
#DEBUG_MODE = True

# OPTUNA_FLAG: when True, the Optuna LightGBM integration is imported as `lgb`
# and the single-fold tuning cell runs.
OPTUNA_FLAG = False
#OPTUNA_FLAG = True

if DEBUG_MODE:
    train = train.head(100000)

# Identifier columns: kept for grouping/joining, dropped before fitting.
IGNORE_COL_ID = ['session', 'aid']
# All label columns; every one of them is dropped from the feature matrix.
IGNORE_COL_TARGET = ['y_clicks', 'y_carts', 'y_orders']

# Which event type this run trains a ranker for.
TYPE_MODE = 'clicks'
#TYPE_MODE = 'carts'
#TYPE_MODE = 'orders'

# type -> (label column, pos/neg ratio used for under-sampling).
# 1/39 positives per negative gives a positive rate of 1/40 = 2.5%.
_TYPE_SETTINGS = {
    'clicks': ('y_clicks', 1/39),  # under sampling 1.3 -> 2.5%
    'carts': ('y_carts', 1/39),    # under sampling 1.6 -> 2.5%
    'orders': ('y_orders', 1/39),  # under sampling 2.1 -> 2.5%
}

# Fail fast on a typo instead of silently leaving `target` / `pos_neg_ratio`
# undefined (or stale from a previous run of this cell in the same kernel).
if TYPE_MODE not in _TYPE_SETTINGS:
    raise ValueError(
        f'Unknown TYPE_MODE {TYPE_MODE!r}; expected one of {sorted(_TYPE_SETTINGS)}'
    )
target, pos_neg_ratio = _TYPE_SETTINGS[TYPE_MODE]

In [5]:
def reduce_memory(df):
    """Downcast the known columns of the candidate table to narrower dtypes.

    The frame is modified in place (each column is reassigned after
    ``astype``) and the same object is returned for convenience.

    Args:
        df: candidate DataFrame containing every column named below.

    Returns:
        The same DataFrame object, with the listed columns downcast.

    Raises:
        KeyError: if one of the expected columns is missing.
    """
    # Insertion order mirrors the order in which the columns are converted.
    narrow_dtypes = {
        'session': 'int32',
        'aid': 'int32',
        'score_click': 'float32',
        'score_cart': 'float32',
        'score_buy': 'float32',
        'score_click_only': 'float32',
        'score_cart_only': 'float32',
        'score_buy_only': 'float32',
        'session_action_count': 'int16',
        'session_click_count': 'int16',
        'session_cart_count': 'int16',
        'session_order_count': 'int16',
        'session_type_mean': 'float32',
    }
    # One click-rank column exists per top-n candidate list size.
    for top_n in (10, 20):
        narrow_dtypes[f'n_clicks_{top_n}'] = 'int8'
    narrow_dtypes.update({
        'n_carts': 'int8',
        'n_buys': 'int8',
        'clicks_count': 'int32',
        'carts_count': 'int16',
        'orders_count': 'int16',
    })

    for column, dtype in narrow_dtypes.items():
        df[column] = df[column].astype(dtype)
    return df

# Keep only the top-n candidates.
def use_top_n(n, df):
    """Filter candidate rows with a single ``DataFrame.query`` expression.

    A row is kept when any score column is >= -1, or when any of the rank
    columns (``n_clicks_20``, ``n_carts``, ``n_buys``) lies strictly between
    -1 and ``n``.

    NOTE(review): the score conditions do not depend on ``n``; confirm that
    this is the intended meaning of "top n".

    Args:
        n: exclusive upper bound applied to the rank columns.
        df: candidate DataFrame holding the score and rank columns.

    Returns:
        The filtered DataFrame returned by ``df.query``.
    """
    keep_condition = f'score_click >= -1 or score_cart >= -1 or score_buy >= -1 or (-1 < n_clicks_20 and n_clicks_20<{n}) or (-1 < n_carts and n_carts<{n}) or (-1 < n_buys and n_buys<{n})'
    return df.query(keep_condition)

# Sessions that contain only negatives cannot be used for training, so drop them (training only).
def remove_negative_session(df):
    """Keep only the rows of sessions that have at least one positive label.

    The label column is taken from the module-level ``target``. The result is
    produced by an inner merge on ``session``, so it comes back with a fresh
    default index.

    Args:
        df: candidate DataFrame with a ``session`` column and the ``target`` column.

    Returns:
        DataFrame restricted to sessions whose ``target`` sum is > 0.
    """
    has_positive = df.groupby('session')[target].sum().gt(0)
    # One-column frame ('session') listing the sessions to keep.
    kept_sessions = has_positive[has_positive].index.to_frame(index=False)
    return df.merge(kept_sessions, how='inner', on='session')

# Under-sample when there are too many negatives.
# ratio = pos/neg
def negative_sampling(df_x, df_y, ratio):
    """Randomly under-sample the rows of (df_x, df_y) in five row-wise chunks.

    The inputs are cut into ``Ndiv`` contiguous row chunks, each chunk is
    resampled independently with ``RandomUnderSampler`` (fixed
    ``random_state=0``), and the chunks are concatenated and re-sorted by
    ``session``. Progress and the label mean before/after are printed.

    Relies on the module-level ``target``: the label column is looked up by
    that name after x and y are concatenated, so ``df_y`` must be a Series
    named ``target``.

    Args:
        df_x: feature DataFrame; must contain a ``session`` column.
        df_y: label Series aligned row-for-row with ``df_x``.
        ratio: value passed as ``sampling_strategy`` to RandomUnderSampler
            (per the comment above: positives / negatives).

    Returns:
        (after_x, after_y): the resampled features and labels, sorted by session.
    """
    print('before mean:', df_y.mean())

    Nrow = df_x.shape[0]
    Ndiv = 5
    # Chunk length; the +1 makes Ndiv chunks cover all rows when Nrow is not divisible.
    n = int(Nrow // Ndiv) + 1

    # Chunks are cut by row position, so a session can straddle a chunk boundary.
    df_x_list = [df_x.iloc[i*n : (i+1)*n, :] for i in range(Ndiv)]
    df_y_list = [df_y.iloc[i*n : (i+1)*n] for i in range(Ndiv)]
    # Drop the local references to the full inputs before resampling.
    del df_x, df_y
    gc.collect()

    for i in range(Ndiv):
        print('under sampling.......',i + 1 , '/', Ndiv)
        # Each chunk is resampled on its own, so the ratio is enforced per chunk.
        tmpx, tmpy = RandomUnderSampler(sampling_strategy=ratio, random_state=0).fit_resample(df_x_list[i], df_y_list[i])
        df_x_list[i] = tmpx
        df_y_list[i] = tmpy
        del tmpx, tmpy
        gc.collect()
    print('under sampling end')
    after_x = pd.concat(df_x_list)
    del df_x_list
    gc.collect()
    print('post proccess1')
    after_y = pd.concat(df_y_list)
    del df_y_list
    gc.collect()
    # Resampling scrambles the session order, so sort by session again.
    # x and y are joined first so that the labels follow their rows through the sort.
    tmp = pd.concat([after_x, after_y], axis=1).sort_values('session')
    after_y = tmp[target]
    after_x = tmp.drop(target , axis=1)

    print('after mean:', after_y.mean())
    return after_x, after_y

In [6]:
def join_session_features(df):
    """Left-join the per-session feature table onto ``df``.

    Reads ``valid_session_features.parquet`` under the module-level
    ``base_path``, downcasts its columns, drops the session columns that
    ``df`` already carries (they are replaced by the joined versions), and
    merges on ``session``.

    Args:
        df: candidate DataFrame containing ``session`` and the five columns
            listed in ``remove_col``.

    Returns:
        A new DataFrame: ``df`` without ``remove_col``, left-merged with the
        session features.

    Raises:
        KeyError: if a column of the fixed dtype map is missing from the
            parquet file, or a ``remove_col`` column is missing from ``df``.
    """
    session_df = pd.read_parquet(f'{base_path}/output/otto/valid_session_features.parquet')

    # Fixed-name columns and the dtype each one is stored as.
    session_dtypes = {
        'session': 'int32',
        'session_action_count': 'int16',
        'session_click_count': 'int16',
        'session_cart_count': 'int16',
        'session_order_count': 'int16',
        'session_type_mean': 'float32',
        'session_click_rate': 'float32',
        'session_cart_rate': 'float32',
        'session_order_rate': 'float32',
        'session_last_type': 'int8',
        'session_first_action_ts_diff': 'int32',
        'session_last_action_ts_diff': 'int32',
        'session_ts_period': 'int32',
        'session_mean_action_ts_diff': 'float32',
        'session_unique_aid_action_count': 'int16',
        'session_unique_aid_click_count': 'int16',
        'session_unique_aid_cart_count': 'int16',
        'session_unique_aid_order_count': 'int16',
        'session_unique_aid_click_rate': 'float32',
        'session_unique_aid_cart_rate': 'float32',
        'session_unique_aid_order_rate': 'float32',
        'session_total_uu_action_ratio': 'float32',
        'session_clicks_uu_action_ratio': 'float32',
        'session_carts_uu_action_ratio': 'float32',
        'session_orders_uu_action_ratio': 'float32',
    }
    session_df = session_df.astype(session_dtypes)

    # Session-level means of the per-aid statistics, one set per time window.
    # The cast now runs for every window; previously the inner loop sat outside
    # the window loop, so only the last window ('1week') was downcast.
    week_list = ['4weeks', '2weeks', '1week']
    aid_stats = ['aid_clicks_count', 'aid_carts_count', 'aid_orders_count',
                 'aid_total_count', 'aid_total_uu', 'aid_clicks_uu',
                 'aid_carts_uu', 'aid_orders_uu',
                 'aid_total_uu_action_ratio', 'aid_clicks_uu_action_ratio',
                 'aid_carts_uu_action_ratio', 'aid_orders_uu_action_ratio']
    for week in week_list:
        for stat in aid_stats:
            column = f'session_mean_{stat}_{week}'
            # Guarded: the 4weeks/2weeks columns were never touched before,
            # so their presence in the file is not guaranteed by past runs.
            if column in session_df.columns:
                session_df[column] = session_df[column].astype('float32')

    # These columns already exist in df; drop them so the merge does not
    # produce _x/_y duplicates.
    remove_col = ['session_action_count', 'session_click_count', 'session_cart_count', 'session_order_count', 'session_type_mean']

    df = df.drop(remove_col , axis=1)
    df = df.merge(session_df, 'left', 'session')
    del session_df
    gc.collect()

    return df

In [7]:
def join_aid_features(df):
    """Left-join the per-aid (item) feature table onto ``df``.

    Reads ``valid_aid_features.parquet`` under the module-level ``base_path``,
    downcasts its columns, drops the rank/count columns that ``df`` already
    carries, and merges on ``aid``.

    Args:
        df: candidate DataFrame containing ``aid`` and the six columns listed
            in ``remove_col``.

    Returns:
        A new DataFrame: ``df`` without ``remove_col``, left-merged with the
        aid features.

    Raises:
        KeyError: if a column of the dtype map is missing from the parquet
            file, or a ``remove_col`` column is missing from ``df``.
    """
    aid_df = pd.read_parquet(f'{base_path}/output/otto/valid_aid_features.parquet')
    #week_list = ['4weeks', '3weeks', '2weeks', '1week']
    week_list = ['4weeks', '2weeks', '1week']
    # Windows the 1-week statistics are compared against (previously [2,3,4]).
    rate_windows = [2, 4]

    # Per-window statistic suffix -> storage dtype.
    per_week_dtypes = {
        'total_count': 'int32',
        'clicks_count': 'int32',
        'carts_count': 'int16',
        'orders_count': 'int16',
        'clicks_rank': 'int32',
        'carts_rank': 'int32',
        'orders_rank': 'int32',
        'total_uu': 'int32',
        'clicks_uu': 'int32',
        'carts_uu': 'int32',
        'orders_uu': 'int32',
        'total_uu_action_ratio': 'float32',
        'clicks_uu_action_ratio': 'float32',
        'carts_uu_action_ratio': 'float32',
        'orders_uu_action_ratio': 'float32',
        'mean_session_action_count': 'float32',
        'mean_session_click_count': 'float32',
        'mean_session_cart_count': 'float32',
        'mean_session_order_count': 'float32',
        'mean_session_type_mean': 'float32',
    }

    aid_dtypes = {'aid': 'int32'}
    for week in week_list:
        for stat, dtype in per_week_dtypes.items():
            aid_dtypes[f'aid_{stat}_{week}'] = dtype
    for action in ['clicks', 'carts', 'orders']:
        for k in rate_windows:
            aid_dtypes[f'aid_{action}_count_rate_1_{k}'] = 'float32'
            aid_dtypes[f'aid_{action}_uu_rate_1_{k}'] = 'float32'
    aid_df = aid_df.astype(aid_dtypes)

    # The total-uu rate exists once per window, not per action type. The old
    # code guarded this with `if i == 'clicks'`, which compared the stale week
    # loop variable and therefore never ran.
    for k in rate_windows:
        column = f'aid_total_uu_rate_1_{k}'
        # Guarded: this cast never executed before, so the column's presence
        # in the file is not confirmed by past runs.
        if column in aid_df.columns:
            aid_df[column] = aid_df[column].astype('float32')

    remove_col = ['clicks_rank', 'carts_rank', 'orders_rank', 'clicks_count', 'carts_count', 'orders_count']
    df = df.drop(remove_col , axis=1)
    df = df.merge(aid_df, 'left', 'aid')
    del aid_df
    gc.collect()

    return df

In [8]:
def join_interactive_features(df):
    """Add session-vs-aid interaction features to ``df`` in place.

    For each time window (currently only ``4weeks``) this adds:
      * ``i_ratio_<aid stat>``: the session-level mean of an aid statistic
        divided by that statistic for the row's own aid;
      * ``i_ratio_<session stat>_<window>``: the aid-level mean of a session
        statistic divided by that statistic for the row's own session;
      * ``i_diff_session_type_mean_<window>``: aid-level mean of
        ``session_type_mean`` minus the row's ``session_type_mean``.

    Ratio columns are stored as float32. A small epsilon is added to every
    denominator to avoid division by zero.

    Args:
        df: DataFrame already holding the joined session and aid features.

    Returns:
        The same DataFrame object with the new columns appended.
    """
    eps = 0.000001
    # Comparison of per-session mean aid features against the aid's own features.
    #week_list = ['4weeks', '2weeks', '1week']
    windows = ['4weeks']

    for week in windows:
        aid_columns = [f'aid_clicks_count_{week}',
                       f'aid_carts_count_{week}',
                       f'aid_orders_count_{week}',
                       f'aid_total_count_{week}',
                       f'aid_total_uu_{week}',
                       f'aid_clicks_uu_{week}',
                       f'aid_carts_uu_{week}',
                       f'aid_orders_uu_{week}']
        for aid_col in aid_columns:
            ratio = df[f'session_mean_{aid_col}'] / (df[aid_col] + eps)
            df[f'i_ratio_{aid_col}'] = ratio.astype('float32')

        # Comparison of per-aid mean session features against the session's own features.
        session_columns = ['session_action_count',
                           'session_click_count',
                           'session_cart_count',
                           'session_order_count']
        for session_col in session_columns:
            ratio = df[f'aid_mean_{session_col}_{week}'] / (df[session_col] + eps)
            df[f'i_ratio_{session_col}_{week}'] = ratio.astype('float32')

        type_mean_gap = df[f'aid_mean_session_type_mean_{week}'] - df['session_type_mean']
        df[f'i_diff_session_type_mean_{week}'] = type_mean_gap
    return df

In [9]:
# Drop features whose importance is extremely low.
def remove_features(df):
    """Return ``df`` without the low-importance feature columns.

    The drop list currently holds a single column, ``session_type_mean``.

    Args:
        df: feature DataFrame containing the columns to drop.

    Returns:
        A new DataFrame without those columns.

    Raises:
        KeyError: if a listed column is not present in ``df``.
    """
    low_importance_columns = ['session_type_mean']
    return df.drop(columns=low_importance_columns)

In [10]:
# Downcast dtypes first so the later groupby/merge work on a smaller frame.
train = reduce_memory(train)
# Optional candidate pruning, currently disabled.
#train = use_top_n(50, train)
# Drop sessions that have no positive label for `target`.
train = remove_negative_session(train)

In [11]:
# Label statistics of the training table. `train` was already filtered by
# remove_negative_session in the previous cell; the filter is idempotent, so
# repeating it here only re-ran a groupby and merge over the whole frame.
print('target sum:', train[target].sum())
print('target mean:', train[target].mean())

target sum: 1016195
target mean: 0.012007926783190924


In [12]:
#train = join_session_features(train)
#train = join_aid_features(train)
#train = join_interactive_features(train)
#train = remove_features(train)

# Training & Inference

In [13]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
# optuna
if OPTUNA_FLAG:
    import optuna.integration.lightgbm as lgb
else:
    import lightgbm as lgb

from itertools import combinations

In [14]:
# Hyperparameter search: runs only when OPTUNA_FLAG is set, in which case `lgb`
# is optuna.integration.lightgbm. Only the first fold is trained (see `break`).
if OPTUNA_FLAG:
    session = train['session']
    unique_session = session.unique()
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'ndcg_eval_at': [20],
        'boosting': 'gbdt',
        'seed': 42,        
        'n_jobs': -1,
        'learning_rate': 0.1
        }
    # Create a numpy array to store out of folds predictions
    N_splits = 5
    kfold = KFold(n_splits = N_splits, shuffle = True, random_state = 42)
    for fold, (trn_group_ind, val_group_ind) in enumerate(kfold.split(unique_session)):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold}/{N_splits}....')
        # Split by session so that all rows of one session land in the same fold.
        tr_groups, va_groups = unique_session[trn_group_ind], unique_session[val_group_ind]
        is_tr, is_va = session.isin(tr_groups), session.isin(va_groups)
        del tr_groups, va_groups
        gc.collect()
        # Collect the index labels of the rows where is_tr / is_va is True.
        # NOTE(review): these labels are used with .iloc below, which assumes
        # `train` has a default 0..n-1 index — confirm.
        trn_ind, val_ind = is_tr[is_tr].index, is_va[is_va].index
        del is_tr, is_va
        gc.collect()

        y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
        # Remove every label column so none of them leaks into the features.
        train_tmp = train.drop(IGNORE_COL_TARGET , axis=1)
        x_train, x_val = train_tmp.iloc[trn_ind], train_tmp.iloc[val_ind]
        del train_tmp
        gc.collect()

        # under sampling
        x_train, y_train = negative_sampling(x_train, y_train, pos_neg_ratio)

        # Prepare the query (group) sizes: rows per session, ordered by session id.
        # Needed when using ranking objectives/metrics in LightGBM.
        query_list_train = x_train['session'].value_counts()
        #x_train = x_train.drop('session' , axis=1)
        query_list_train = query_list_train.sort_index()

        # NOTE(review): x_val is not re-sorted here; the group sizes match its rows
        # only if they are already grouped in ascending session order — confirm.
        query_list_valid = x_val['session'].value_counts()
        #x_val = x_val.drop('session' , axis=1)
        query_list_valid = query_list_valid.sort_index()

        # To save memory, features are added after under-sampling.
        # (All feature joins are disabled in this tuning cell.)
        print('add session features....')
        #x_train, x_val = join_session_features(x_train), join_session_features(x_val)
        print('add aid features....')
        #x_train, x_val = join_aid_features(x_train), join_aid_features(x_val)
        print('add interactive features....')
        #x_train, x_val = join_interactive_features(x_train), join_interactive_features(x_val)
        print('remove features....')
        #x_train, x_val = remove_features(x_train),  remove_features(x_val)
        print('remove id from features....')
        x_train, x_val = x_train.drop(IGNORE_COL_ID, axis=1), x_val.drop(IGNORE_COL_ID, axis=1)
        print('x_train shape:', x_train.shape)

        lgb_train = lgb.Dataset(x_train, y_train, group=query_list_train)
        lgb_valid = lgb.Dataset(x_val, y_val, group=query_list_valid)

        del x_train, y_train
        gc.collect()

        #lgb_valid = lgb.Dataset(x_val, y_val)
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            #num_boost_round = 10500,
            num_boost_round = 100,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 20,
            verbose_eval = 10,
            )
        del lgb_train, lgb_valid
        gc.collect()
        # Tune on the first fold only.
        break
    # NOTE(review): as an expression nested inside the `if`, this likely produces
    # no cell output; the next cell prints model.params explicitly.
    model.params

In [15]:
if OPTUNA_FLAG:
    print("Optuna results: ",model.params)

# Previous parameter set, kept for reference only (not used).
# params = {'objective': 'lambdarank',
#           'metric': 'ndcg',
#           #'metric': 'map',
#           #'ndcg_eval_at': [10, 20, 50],
#           'map_eval_at': [10, 20, 50],
#           'boosting': 'gbdt',
#           'seed': 42,
#           'n_jobs': -1,
#           #'learning_rate': 0.05,
#           'learning_rate': 0.1,
#           'feature_pre_filter': False,
#           'lambda_l1': 7.777864227173249,
#           'lambda_l2': 0.000181104589355317,
#           'num_leaves': 202,
#           'feature_fraction': 0.8999999999999999,
#           'bagging_fraction': 1.0,
#           'bagging_freq': 0,
#           'min_child_samples': 25,
#           'random_state': 0
#           }

# Latest parameter set, used by the cross-validation cell below.
params = {'objective': 'lambdarank',
          'metric': 'ndcg',
          # Evaluation cutoff for the ndcg metric. `ndcg_eval_at` and
          # `map_eval_at` are both aliases of LightGBM's `eval_at`; the
          # ndcg-named alias matches the metric and the Optuna cell above.
          'ndcg_eval_at': [20],
          'boosting': 'gbdt',
          'seed': 42,
          'n_jobs': -1,
          'learning_rate': 0.05,
          #'learning_rate': 0.1,
          'feature_pre_filter': False,
          'lambda_l1': 0.0,
          'lambda_l2': 0.0,
          'num_leaves': 222,
          'feature_fraction': 0.62,
          'bagging_fraction': 1.0,
          'bagging_freq': 0,
          'min_child_samples': 25
}

In [16]:

# Session-grouped 5-fold cross-validation: train one LightGBM ranker per fold,
# save it, collect out-of-fold predictions and print a per-fold recall@20.
# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(train))
session = train['session']
unique_session = session.unique()

N_splits = 5
kfold = KFold(n_splits = N_splits, shuffle = True, random_state = 42)
for fold, (trn_group_ind, val_group_ind) in enumerate(kfold.split(unique_session)):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold}/{N_splits}....')
    # Split by session so that all rows of one session land in the same fold.
    tr_groups, va_groups = unique_session[trn_group_ind], unique_session[val_group_ind]
    is_tr, is_va = session.isin(tr_groups), session.isin(va_groups)
    del tr_groups, va_groups
    gc.collect()
    # Collect the index labels of the rows where is_tr / is_va is True.
    # NOTE(review): these labels are used with .iloc and as positions into
    # oof_predictions below, which assumes `train` has a default 0..n-1 index — confirm.
    trn_ind, val_ind = is_tr[is_tr].index, is_va[is_va].index
    del is_tr, is_va
    gc.collect()

    y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
    # Remove every label column so none of them leaks into the features.
    train_tmp = train.drop(IGNORE_COL_TARGET , axis=1)
    x_train, x_val = train_tmp.iloc[trn_ind], train_tmp.iloc[val_ind]
    del train_tmp
    gc.collect()
    # under sampling
    x_train, y_train = negative_sampling(x_train, y_train, pos_neg_ratio)

    # Prepare the query (group) sizes: rows per session, ordered by session id.
    # Needed when using ranking objectives/metrics in LightGBM.
    query_list_train = x_train['session'].value_counts()
    #x_train = x_train.drop('session' , axis=1)
    query_list_train = query_list_train.sort_index()

    # NOTE(review): x_val is not re-sorted here; the group sizes match its rows
    # only if they are already grouped in ascending session order — confirm.
    query_list_valid = x_val['session'].value_counts()
    #x_val = x_val.drop('session' , axis=1)
    query_list_valid = query_list_valid.sort_index()

    # To save memory, features are added after under-sampling.
    print('add session features....')
    x_train, x_val = join_session_features(x_train), join_session_features(x_val)
    print('add aid features....')
    x_train, x_val = join_aid_features(x_train), join_aid_features(x_val)
    print('add interactive features....')
    x_train, x_val = join_interactive_features(x_train), join_interactive_features(x_val)
    print('remove features....')
    x_train, x_val = remove_features(x_train),  remove_features(x_val)
    print('remove id from features....')
    x_train, x_val = x_train.drop(IGNORE_COL_ID, axis=1), x_val.drop(IGNORE_COL_ID, axis=1)
    print('x_train shape:', x_train.shape)

    lgb_train = lgb.Dataset(x_train, y_train, group=query_list_train)
    lgb_valid = lgb.Dataset(x_val, y_val, group=query_list_valid)

    # x_val is kept: it is still needed for the validation predictions below.
    del x_train, y_train
    gc.collect()

    model = lgb.train(
        params = params,
        train_set = lgb_train,
        #num_boost_round = 10500,
        num_boost_round = 2000,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 20,
        verbose_eval = 10,
        )
    del lgb_train, lgb_valid
    gc.collect()


    # Save best model
    # NOTE(review): written under {base_path}/otto/, while inputs are read from
    # {base_path}/output/otto/ — confirm this directory is the intended one.
    joblib.dump(model, f'{base_path}/otto/otto_lgbm_fold{fold}_{TYPE_MODE}.pkl')
    # Predict validation
    # x_val is large, so split it into Ndiv row chunks and predict chunk by chunk.
    Nrow = x_val.shape[0]
    Ndiv = 5
    n = int(Nrow // Ndiv) + 1
    x_val_list = []
    for i in range(Ndiv):
        tmp = x_val.iloc[i*n : (i+1)*n, :]
        x_val_list.append(tmp)
    del x_val
    gc.collect()

    val_pred_list = []
    for i, v in enumerate(x_val_list):
        print('train pred i=', i)
        tmp = model.predict(v)
        val_pred_list.append(tmp)
    del x_val_list
    gc.collect()
    # Chunks were cut in row order, so concatenation restores x_val's row order.
    val_pred = np.concatenate(val_pred_list)
    del val_pred_list
    gc.collect()

    # Add to out of folds array
    # Once CV has finished, every row index has been predicted exactly once.
    oof_predictions[val_ind] = val_pred

    # Delete the model as soon as it is no longer needed.
    del model, y_val
    gc.collect()

    # tmp recall for each fold
    df = pd.DataFrame(val_pred, columns=["score"])
    tmp = train[['session', 'aid']].iloc[val_ind].reset_index(drop=True)
    pred_df = pd.concat([tmp, df], axis=1)
    del tmp
    gc.collect()

    # Key of the form "<session>_<type>".
    pred_df['session_type'] = pred_df['session'].apply(lambda x: str(x) + f'_{TYPE_MODE}')
    pred_df = pred_df.sort_values(['session_type','score'],ascending=[True, False]).reset_index(drop=True)

    # Keep the 20 highest-scored aids per session.
    pred_df['n'] = pred_df.groupby('session_type').cumcount()
    pred_df = pred_df.loc[pred_df.n<20].drop(['n','score','session'],axis=1)
    pred_df['aid'] = pred_df['aid'].astype('int32')
    pred_df = pred_df.groupby('session_type')['aid'].apply(list).reset_index()
    # Turn the list repr "[a, b, c]" into the space-separated string "a b c".
    pred_df['labels'] = pred_df['aid'].map(lambda x: ''.join(str(x)[1:-1].split(',')))
    pred_df = pred_df.drop(['aid'],axis=1)

    sub = pred_df.loc[pred_df.session_type.str.contains(TYPE_MODE)].copy()
    sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
    sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])

    # The ground-truth file is re-read from disk on every fold.
    test_labels = pd.read_parquet(f'{base_path}/input/otto/otto-validation/test_labels.parquet')
    test_labels = test_labels.loc[test_labels['type']==TYPE_MODE]
    # Per-fold recall, so use an inner join: only sessions predicted in this fold count.
    test_labels = test_labels.merge(sub, how='inner', on=['session']) 
    test_labels['labels'] = test_labels['labels'].fillna('[]')
    test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
    # At most 20 items can be hit per session, so the denominator is capped at 20.
    test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
    recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
    print(f'fold {fold} {TYPE_MODE} recall =',recall)


 
--------------------------------------------------
Training fold 0/5....
before mean: 0.012008089074769792
under sampling....... 1 / 5
under sampling....... 2 / 5
under sampling....... 3 / 5
under sampling....... 4 / 5
under sampling....... 5 / 5
under sampling end
post proccess1
after mean: 0.025
add session features....
add aid features....
add interactive features....
remove features....
remove id from features....
x_train shape: (32518240, 156)
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35329
[LightGBM] [Info] Number of data points in the train set: 32518240, number of used features: 156
Training until validation scores don't improve for 20 rounds
[10]	training's ndcg@20: 0.730664	valid_1's ndcg@20: 0.640021
[20]	training's ndcg@20: 0.731556	valid_1's ndcg@20: 0.640864
[30]	training's ndcg@20: 0.732465	valid_1's ndcg@20: 0.641415
[40]	training's ndcg@20: 0.733116	valid_1's ndcg@20: 0.641803
[50]	training's ndcg@20: 0.733904	valid_1's nd

under sampling....... 2 / 5
under sampling....... 3 / 5
under sampling....... 4 / 5
under sampling....... 5 / 5
under sampling end
post proccess1
after mean: 0.025
add session features....
add aid features....
add interactive features....
remove features....
remove id from features....
x_train shape: (32518240, 156)
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 35313
[LightGBM] [Info] Number of data points in the train set: 32518240, number of used features: 156
Training until validation scores don't improve for 20 rounds
[10]	training's ndcg@20: 0.731142	valid_1's ndcg@20: 0.637306
[20]	training's ndcg@20: 0.732149	valid_1's ndcg@20: 0.638265
[30]	training's ndcg@20: 0.732904	valid_1's ndcg@20: 0.63877
[40]	training's ndcg@20: 0.733651	valid_1's ndcg@20: 0.639458
[50]	training's ndcg@20: 0.734419	valid_1's ndcg@20: 0.640132
[60]	training's ndcg@20: 0.735073	valid_1's ndcg@20: 0.640584
[70]	training's ndcg@20: 0.7357	valid_1's ndcg@20: 0.641082


In [17]:
# Turn the out-of-fold scores into one row per session holding its 20
# best-scored aids as a space-separated string.
df = pd.DataFrame(oof_predictions, columns=["score"])
#df.to_csv(f'{base_path}/otto/oof_lgbm_{TYPE_MODE}.csv', index = False)

# Attach the scores to their (session, aid) pairs; rows align on the index.
pred_df = pd.concat([train[['session', 'aid']], df], axis=1)
# Key of the form "<session>_<type>".
pred_df['session_type'] = pred_df['session'].map(lambda sid: f'{sid}_{TYPE_MODE}')
pred_df = pred_df.sort_values(['session_type', 'score'], ascending=[True, False]).reset_index(drop=True)

# Rank inside each session after the descending score sort; keep ranks 0..19.
rank_in_session = pred_df.groupby('session_type').cumcount()
pred_df = pred_df.loc[rank_in_session < 20, ['aid', 'session_type']].astype({'aid': 'int32'})
del rank_in_session

# Collect the kept aids of each session into a list, then render it as text.
pred_df = pred_df.groupby('session_type')['aid'].apply(list).reset_index()
pred_df['labels'] = pred_df.pop('aid').map(lambda aids: ' '.join(map(str, aids)))
pred_df

Unnamed: 0,session_type,labels
0,11098529_clicks,1105029 459126 295362 217742 1544564 1694360 1...
1,11098534_clicks,223062 908024 1342293 1607945 530377 1300062 1...
2,11098535_clicks,745365 803918 1750442 767201 896972 236461 273...
3,11098537_clicks,358965 1409748 336024 1723620 1405280 1092681 ...
4,11098538_clicks,1263747 1550143 1711586 703265 717871 1289587 ...
...,...,...
1016190,12899773_clicks,1311526 1484665 1578804 184006 337571 139669 9...
1016191,12899774_clicks,33035 1539309 270852 819288 95488 771913 21879...
1016192,12899775_clicks,1743151 1760714 1255910 1163166 1022572 310546...
1016193,12899777_clicks,384045 1308634 1688215 395762 703474 1486067 3...


In [18]:
# Overall recall@20 of the out-of-fold predictions for TYPE_MODE.
sub = pred_df.loc[pred_df.session_type.str.contains(TYPE_MODE)].copy()
# Recover the integer session id from the "<session>_<type>" key.
sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
# Parse the space-separated label string back into a list of at most 20 aids.
sub['labels'] = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])

test_labels = pd.read_parquet(f'{base_path}/input/otto/otto-validation/test_labels.parquet')
test_labels = test_labels.loc[test_labels['type']==TYPE_MODE]
# Left join: sessions without any prediction stay in the denominator.
test_labels = test_labels.merge(sub, how='left', on=['session'])
# Sessions without predictions come out of the merge as NaN. Give them an empty
# list (zero hits). The previous fill value was the string '[]', which was then
# treated as the character set {'[', ']'} and only matched nothing by accident.
test_labels['labels'] = test_labels['labels'].apply(lambda x: x if isinstance(x, list) else [])
test_labels['hits'] = test_labels.apply(lambda row: len(set(row['ground_truth']).intersection(row['labels'])), axis=1)
# At most 20 items can be hit per session, so the denominator is capped at 20.
test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
print(f'{TYPE_MODE} recall =',recall)

clicks recall = 0.5369716564874277


In [19]:
# click total: 1,755,534
# A recall of 0.52 requires 912,877 correct predictions

In [20]:
# ranker model: dividing the fold-0 recall by factor=1.111369 gives roughly the overall recall; each per-fold recall comes out higher because it uses an inner join on session
# num=100, lr=0.1, no feature add, valid_1's ndcg@50: 0.843586, fold 0 orders recall = 0.7263385039885385, orders recall = 0.6535526311589739
# add aid feature, valid_1's ndcg@50: 0.84962, fold 0 orders recall = 0.7305304490864389, (orders recall = 0.657324?)
# add aid + session feature, valid_1's ndcg@50: 0.849762, fold 0 orders recall = 0.7302651361055592, orders recall = 0.6575806806829172
# all valid_1's ndcg@50: 0.848664, fold 0 orders recall = 0.730990324919964, orders recall = 0.6579892308723504


# Hyperparameters re-tuned, regularization params changed substantially; num=100, lr=0.1
# no add: valid_1's ndcg@20: 0.835401, fold 0 orders recall = 0.7261793162000106, orders recall = 0.6535877409408783  -> order,carts差し替えPB = 0.581
#                                                                                 carts recall = 0.41837386076234817
# sessionのみ: valid_1's ndcg@20: 0.836164, fold 0 orders recall = 0.7268514424182394, orders recall = 0.6545069788671031 -> order,carts差し替えPB = 0.583
#                                                                                      carts recall = 0.4196106730132077
# aidのみ: valid_1's ndcg@20: 0.842205, fold 0 orders recall = 0.7302828236376179, orders recall = 0.6575838724812721 -> order,carts差し替えPB = 0.586
#                                                                                  carts recall = 0.42261163401459195                              
# aid+session: valid_1's ndcg@20: 0.843169, fold 0 orders recall = 0.7309018872596706, orders recall = 0.6582030813621319 -> csv明日提出
#                                                                                      carts recall = 0.42347896378377814
# all: valid_1's ndcg@20: 0.843514, fold 0 orders recall = 0.731184887772609, orders recall = 0.6584137400535583 -> 計算中
#                                                                             carts recall = 0.42349804503870025


# num=1000, lr=0.05 [281] valid_1's ndcg@20: 0.844737, fold 0 orders recall = 0.7315386384137821, orders recall = 0.6586627003252442

