In [1]:
# Execution-environment switch; it decides where `base_path` points.
# True: Google Colab Notebook
# False: My local PC
colab = False
if colab: 
    # Mount Google Drive so files under MyDrive are reachable from the Colab VM.
    from google.colab import drive
    drive.mount('/content/drive')
    # List the otto output directory (presumably a sanity check that the mount worked).
    !ls /content/drive/MyDrive/output/otto/
    base_path = '/content/drive/MyDrive'
    # optuna is installed on demand, without a version pin.
    # NOTE(review): consider pinning the version for reproducibility.
    !pip3 install optuna
else:
    # Local runs resolve data paths relative to ../data.
    base_path = '../data'

# Preprocessing

In [2]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
from imblearn.under_sampling import RandomUnderSampler

In [3]:
# Load the candidate table used for training.
# The commented-out lines are alternative candidate files that were tried before.
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train.parquet')
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20.parquet')
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_50.parquet')
# Active input: resolved through base_path so it works both on Colab and locally.
train = pd.read_parquet(f'{base_path}/output/otto/train_50_tmp.parquet')

#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20_old.parquet')

#train20 = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20.parquet')

In [4]:
# ====================================================
# Run configuration
# ====================================================
# DEBUG_MODE: when True, only the first 100,000 rows of `train` are kept.
DEBUG_MODE = False
#DEBUG_MODE = True

# OPTUNA_FLAG: when True, the optuna LightGBM integration is imported as `lgb`
# and the tuning cell is executed.
OPTUNA_FLAG = False
if DEBUG_MODE:
    train = train.head(100000)

# Event type to model.
TYPE_MODE = 'clicks'
#TYPE_MODE = 'carts'
#TYPE_MODE = 'orders'

# Columns that are never used as features: the row identifiers and all label columns.
IGNORE_COL = ['session', 'aid', 'y_clicks', 'y_carts', 'y_orders']

# Per-type settings: (label column, pos/neg ratio used for under-sampling).
# 1/39 corresponds to 2.5% positives after under-sampling; the original notes give
# the starting positive rates as clicks 1.3%, carts 1.6%, orders 2.1%.
_TYPE_SETTINGS = {
    'clicks': ('y_clicks', 1/39),
    'carts': ('y_carts', 1/39),
    'orders': ('y_orders', 1/39),
}
if TYPE_MODE not in _TYPE_SETTINGS:
    # Fail here instead of leaving `target` / `pos_neg_ratio` undefined for later cells.
    raise ValueError(
        f'Unknown TYPE_MODE {TYPE_MODE!r}; expected one of {sorted(_TYPE_SETTINGS)}'
    )
target, pos_neg_ratio = _TYPE_SETTINGS[TYPE_MODE]

In [6]:
def reduce_memory(df):
    """Downcast the candidate-table columns to compact dtypes.

    The casts are written back into ``df`` column by column, so the frame that
    was passed in is modified; the same object is returned for convenience.

    Args:
        df: DataFrame containing every column listed in the dtype table below.

    Returns:
        The same DataFrame, with the listed columns cast.

    Raises:
        KeyError: if one of the listed columns is missing.
    """
    # Insertion order matters only for which missing column is reported first.
    compact_dtypes = {
        'session': 'int32',
        'aid': 'int32',
        'score_click': 'float32',
        'score_cart': 'float32',
        'score_buy': 'float32',
        'score_click_only': 'float32',
        'score_cart_only': 'float32',
        'score_buy_only': 'float32',
        'session_action_count': 'int16',
        'session_click_count': 'int16',
        'session_cart_count': 'int16',
        'session_order_count': 'int16',
        'session_type_mean': 'float32',
    }
    # Rank-style columns produced for each click top-n candidate list.
    compact_dtypes.update({f'n_clicks_{top_n}': 'int8' for top_n in (10, 20)})
    compact_dtypes.update({
        'n_carts': 'int8',
        'n_buys': 'int8',
        'clicks_count': 'int32',
        'carts_count': 'int16',
        'orders_count': 'int16',
    })

    for column, dtype in compact_dtypes.items():
        df[column] = df[column].astype(dtype)
    return df

# Keep only the top-n candidates
def use_top_n(n, df):
    """Filter the candidate rows down to scored rows and top-``n`` ranked rows.

    A row is kept when any of the score columns is >= -1, or when any of the
    rank columns (n_clicks_20, n_carts, n_buys) lies strictly between -1 and n.

    Args:
        n: exclusive upper bound applied to the rank columns.
        df: candidate DataFrame with the score and rank columns.

    Returns:
        The filtered DataFrame (original index labels are preserved).
    """
    score_terms = [f'{col} >= -1' for col in ('score_click', 'score_cart', 'score_buy')]
    rank_terms = [f'(-1 < {col} and {col}<{n})' for col in ('n_clicks_20', 'n_carts', 'n_buys')]
    condition = ' or '.join(score_terms + rank_terms)
    return df.query(condition)

# Sessions that contain only negatives cannot be used for training, so drop them (training only)
def remove_negative_session(df):
    """Keep only the rows of sessions that have at least one positive label.

    The label column is taken from the module-level ``target``.

    Args:
        df: candidate DataFrame with a 'session' column and the ``target`` column.

    Returns:
        A new DataFrame (result of an inner merge on 'session') restricted to
        sessions whose ``target`` sum is greater than zero.
    """
    positives_per_session = df.groupby('session')[target].sum()
    has_positive = positives_per_session > 0
    # One-column frame ('session') listing the sessions to keep.
    kept_sessions = has_positive[has_positive].index.to_frame(index=False)
    return df.merge(kept_sessions, how='inner', on='session')

# Under-sample the negatives when they heavily outnumber the positives.
# ratio = pos/neg
def negative_sampling(df_x, df_y, ratio, n_div=5, random_state=42):
    """Randomly under-sample the majority class, chunk by chunk.

    The rows are split into up to ``n_div`` contiguous chunks and each chunk is
    resampled independently with ``RandomUnderSampler``, so only one chunk is
    resampled at a time. Because every chunk is resampled on its own, the
    requested ratio is applied per chunk.

    Args:
        df_x: feature DataFrame.
        df_y: label Series aligned row-by-row with ``df_x``.
        ratio: value passed to ``RandomUnderSampler(sampling_strategy=...)``
            (pos/neg after resampling).
        n_div: number of chunks to split the data into (default 5).
        random_state: base seed; chunk ``i`` uses ``random_state + i`` so runs are
            reproducible. Pass ``None`` for unseeded sampling.

    Returns:
        ``(after_x, after_y)``: the concatenated resampled features and labels.
        Empty input is returned unchanged.

    NOTE(review): a chunk that contains a single class, or whose ratio is already
    above ``ratio``, is expected to make ``fit_resample`` raise — confirm against
    the installed imblearn version.
    """
    print('before mean:', df_y.mean())

    n_rows = df_x.shape[0]
    if n_rows == 0:
        # Nothing to resample, and pd.concat would fail on an empty list.
        return df_x, df_y

    chunk_size = n_rows // n_div + 1
    # Iterating by start offset never produces an empty trailing chunk.
    starts = range(0, n_rows, chunk_size)
    n_chunks = len(starts)

    sampled_x, sampled_y = [], []
    for i, start in enumerate(starts):
        print('under sampling.......', i + 1, '/', n_chunks)
        seed = None if random_state is None else random_state + i
        sampler = RandomUnderSampler(sampling_strategy=ratio, random_state=seed)
        chunk_x, chunk_y = sampler.fit_resample(
            df_x.iloc[start:start + chunk_size, :],
            df_y.iloc[start:start + chunk_size],
        )
        sampled_x.append(chunk_x)
        sampled_y.append(chunk_y)
        del chunk_x, chunk_y
        gc.collect()
    print('under sampling end')

    after_x = pd.concat(sampled_x)
    del sampled_x
    gc.collect()
    print('post proccess1')
    after_y = pd.concat(sampled_y)
    del sampled_y
    gc.collect()

    print('after mean:', after_y.mean())
    return after_x, after_y

In [7]:
def join_session_features(df):
    """Left-join the per-session features onto the candidate table.

    Reads ``valid_session_features.parquet`` from under the module-level
    ``base_path`` (the same location scheme as ``join_aid_features``), downcasts
    its columns, and merges it onto ``df`` by 'session'.

    Args:
        df: candidate DataFrame with a 'session' column.

    Returns:
        A new DataFrame with the session feature columns appended.
    """
    session_df = pd.read_parquet(f'{base_path}/output/otto/valid_session_features.parquet')
    session_df['session'] = session_df['session'].astype('int32')

    int16_prefixes = [
        'session_action_count',
        'session_click_count',
        'session_cart_count',
        'session_order_count',
    ]
    float32_prefixes = [
        'session_type_mean',
        'session_click_rate',
        'session_cart_rate',
        'session_order_rate',
    ]
    for week in ['4weeks', '2weeks', '1week']:
        for prefix in int16_prefixes:
            session_df[f'{prefix}_{week}'] = session_df[f'{prefix}_{week}'].astype('int16')
        for prefix in float32_prefixes:
            session_df[f'{prefix}_{week}'] = session_df[f'{prefix}_{week}'].astype('float32')

    # The un-suffixed session_* columns already present in df are kept
    # (dropping them was tried earlier and is disabled).
    df = df.merge(session_df, how='left', on='session')
    del session_df
    gc.collect()

    return df

In [8]:
def join_aid_features(df):
    """Left-join the per-item (aid) features onto the candidate table.

    Reads ``valid_aid_features.parquet`` from under the module-level
    ``base_path``, downcasts its columns, and merges it onto ``df`` by 'aid'.

    Args:
        df: candidate DataFrame with an 'aid' column.

    Returns:
        A new DataFrame with the aid feature columns appended.
    """
    aid_df = pd.read_parquet(f'{base_path}/output/otto/valid_aid_features.parquet')
    aid_df['aid'] = aid_df['aid'].astype('int32')

    # Per-window count / rank columns. A '3weeks' window was also tried earlier.
    window_dtypes = {
        'clicks_count': 'int32',
        'carts_count': 'int16',
        'orders_count': 'int16',
        'clicks_rank': 'int32',
        'carts_rank': 'int32',
        'orders_rank': 'int32',
    }
    for week in ['4weeks', '2weeks', '1week']:
        for prefix, dtype in window_dtypes.items():
            aid_df[f'{prefix}_{week}'] = aid_df[f'{prefix}_{week}'].astype(dtype)

    # The rate columns do not depend on the window, so cast them once.
    # The denominators are 2 and 4 (3 was also tried earlier).
    for event in ['clicks', 'carts', 'orders']:
        for k in [2, 4]:
            rate_col = f'aid_{event}_count_rate_1_{k}'
            aid_df[rate_col] = aid_df[rate_col].astype('float32')

    # The un-suffixed rank/count columns already present in df are kept
    # (dropping them was tried earlier and is disabled).
    df = df.merge(aid_df, how='left', on='aid')
    del aid_df
    gc.collect()

    return df

In [9]:
# Preprocess the candidate table: downcast dtypes, apply the top-50 candidate
# filter, then drop sessions that have no positive label.
train = (
    train
    .pipe(reduce_memory)
    .pipe(lambda candidates: use_top_n(50, candidates))
    .pipe(remove_negative_session)
)

In [12]:
# Number of positive rows for the selected target after preprocessing.
train[target].sum()

1012735

In [13]:
# Fraction of positive rows for the selected target (before under-sampling).
train[target].mean()

0.013401377389554573

In [14]:
# Attach the item (aid) features; the session-feature join is currently disabled.
#train = join_session_features(train)
train = join_aid_features(train)

In [15]:
# Display the joined candidate table (pandas truncates the rendering).
train

Unnamed: 0,session,aid,score_click,score_cart,score_buy,score_click_only,score_cart_only,score_buy_only,session_action_count,session_click_count,session_cart_count,session_order_count,session_type_mean,n_clicks_10,n_clicks_20,n_carts,n_buys,clicks_rank,carts_rank,orders_rank,clicks_count,carts_count,orders_count,y_clicks,y_carts,y_orders,clicks_count_4weeks,carts_count_4weeks,orders_count_4weeks,clicks_rank_4weeks,carts_rank_4weeks,orders_rank_4weeks,clicks_count_2weeks,carts_count_2weeks,orders_count_2weeks,clicks_rank_2weeks,carts_rank_2weeks,orders_rank_2weeks,clicks_count_1week,carts_count_1week,orders_count_1week,clicks_rank_1week,carts_rank_1week,orders_rank_1week,aid_clicks_count_rate_1_2,aid_clicks_count_rate_1_4,aid_carts_count_rate_1_2,aid_carts_count_rate_1_4,aid_orders_count_rate_1_2,aid_orders_count_rate_1_4
0,11098529,1105029,0.071773,0.071773,0.0,0.071773,0.0,0.0,1,1,0,0,0.0,-1,-1,-1,-1,207743,-1,-1,5,0,0,True,False,False,200,5,1,126128,344989,527328,76,2,1,122647,393631,331822,5,0,0,231744,-1,-1,0.065789,0.025000,0.000000,0.000000,0.000000,0.000000
1,11098529,459126,,,,,,,0,0,0,0,,0,0,0,3,72021,123811,-1,16,1,0,False,False,False,455,14,1,56855,159598,539614,199,6,0,47159,141798,-1,16,1,0,71369,125761,-1,0.080402,0.035165,0.166667,0.071429,0.000000,0.000000
2,11098529,1339838,,,,,,,0,0,0,0,,1,1,1,7,-1,-1,-1,0,0,0,False,False,False,1645,52,0,12332,43351,-1,631,19,0,11754,46034,-1,0,0,0,-1,-1,-1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,11098529,1544564,,,,,,,0,0,0,0,,2,2,4,-1,34053,57613,-1,32,2,0,False,False,False,820,39,0,29691,59118,-1,287,13,0,31556,69451,-1,32,2,0,34807,60893,-1,0.111498,0.039024,0.153846,0.051282,0.000000,0.000000
4,11098529,217742,,,,,,,0,0,0,0,,3,3,5,-1,131217,-1,-1,9,0,0,False,False,False,184,7,2,135883,288970,292434,107,2,1,89687,378158,280473,9,0,0,126394,-1,-1,0.084112,0.048913,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75569466,12899778,162064,,,,,,,0,0,0,0,,-1,-1,-1,45,901,46,30,455,192,31,False,False,False,3169,2768,1309,4965,85,36,1950,1220,523,2230,50,17,455,192,31,896,46,30,0.233333,0.143578,0.157377,0.069364,0.059273,0.023682
75569467,12899778,631899,,,,,,,0,0,0,0,,-1,-1,-1,46,108,83,31,1355,145,31,False,False,False,17258,2216,973,266,140,83,9260,1127,495,124,64,21,1355,145,31,108,84,31,0.146328,0.078514,0.128660,0.065433,0.062626,0.031860
75569468,12899778,1436280,,,,,,,0,0,0,0,,-1,-1,-1,47,151,82,32,1219,145,30,False,False,False,25897,2999,536,110,74,273,12329,1559,313,52,31,82,1219,145,30,151,82,32,0.098873,0.047071,0.093008,0.048349,0.095847,0.055970
75569469,12899778,954951,,,,,,,0,0,0,0,,-1,-1,-1,48,109,117,33,1353,123,30,False,False,False,17585,1660,606,254,232,221,8811,879,315,144,113,81,1353,123,30,109,117,33,0.153558,0.076941,0.139932,0.074096,0.095238,0.049505


In [16]:
# Check the column dtypes after downcasting and the feature join.
train.dtypes

session                        int32
aid                            int32
score_click                  float32
score_cart                   float32
score_buy                    float32
score_click_only             float32
score_cart_only              float32
score_buy_only               float32
session_action_count           int16
session_click_count            int16
session_cart_count             int16
session_order_count            int16
session_type_mean            float32
n_clicks_10                     int8
n_clicks_20                     int8
n_carts                         int8
n_buys                          int8
clicks_rank                    int32
carts_rank                     int32
orders_rank                    int32
clicks_count                   int32
carts_count                    int16
orders_count                   int16
y_clicks                        bool
y_carts                         bool
y_orders                        bool
clicks_count_4weeks            int32
c

# Training & Inference

In [17]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
# optuna
if OPTUNA_FLAG:
    import optuna.integration.lightgbm as lgb
else:
    import lightgbm as lgb

from itertools import combinations

In [18]:
if OPTUNA_FLAG:
    # Base parameters; with OPTUNA_FLAG set, `lgb` is the optuna LightGBM
    # integration, which presumably tunes the remaining hyper-parameters.
    params = dict(
        objective='binary',
        metric='binary_logloss',  # might be better set to None?
        boosting='gbdt',
        seed=42,
        n_jobs=-1,
    )

    # Only the first of the 5 folds is used for tuning.
    kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
    fold, (trn_ind, val_ind) = next(enumerate(kfold.split(train)))
    print(' ')
    print('-'*50)
    print(f'Training fold {fold}...')

    labels = train[target]
    y_train, y_val = labels.iloc[trn_ind], labels.iloc[val_ind]
    features = train.drop(IGNORE_COL , axis=1)
    x_train, x_val = features.iloc[trn_ind], features.iloc[val_ind]
    del labels, features
    gc.collect()

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_val, y_val)
    del x_train, y_train
    gc.collect()

    model = lgb.train(
        params = params,
        train_set = lgb_train,
        #num_boost_round = 10500,
        num_boost_round = 100,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 100,
        verbose_eval = 10,
        )
    del lgb_train, lgb_valid
    gc.collect()
    model.params

In [19]:
if OPTUNA_FLAG:
    print("Optuna results: ",model.params)

# Fixed LightGBM parameters used for the cross-validated training below
# (presumably copied from an earlier Optuna run — verify).
params = dict(
    objective='binary',
    metric='binary_logloss',
    boosting='gbdt',
    seed=42,
    n_jobs=-1,
    feature_pre_filter=False,
    lambda_l1=6.595370151657238,
    lambda_l2=1.0592737233474818e-08,
    num_leaves=255,
    feature_fraction=1.0,
    bagging_fraction=0.9703737428957173,
    bagging_freq=2,
    min_child_samples=20,
    learning_rate=0.1,
)

In [20]:
# 5-fold cross-validated training: for each fold, under-sample the training rows,
# fit a LightGBM model, save it, and write the validation predictions into the
# out-of-fold array.
# Create a numpy array to store out of folds predictions
oof_predictions = np.zeros(len(train))
# NOTE(review): KFold splits individual rows, so candidates of the same session can
# land in both the training and validation parts — consider a grouped split.
kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
    print(' ')
    print('-'*50)
    print(f'Training fold {fold}...')

    y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
    # The feature frame is rebuilt per fold and deleted right after slicing.
    train_tmp = train.drop(IGNORE_COL , axis=1)
    x_train, x_val = train_tmp.iloc[trn_ind], train_tmp.iloc[val_ind]
    del train_tmp
    gc.collect()

    # under sampling
    # Only the training part is resampled; the validation part keeps its original class balance.
    x_train, y_train = negative_sampling(x_train, y_train, pos_neg_ratio)

    lgb_train = lgb.Dataset(x_train, y_train)
    lgb_valid = lgb.Dataset(x_val, y_val)
    del x_train, y_train
    gc.collect()

    #lgb_valid = lgb.Dataset(x_val, y_val)
    # NOTE(review): the early_stopping_rounds / verbose_eval keyword arguments were
    # removed from lgb.train in LightGBM 4.0 (callbacks replace them) — confirm the
    # installed version before re-running.
    model = lgb.train(
        params = params,
        train_set = lgb_train,
        #num_boost_round = 10500,
        num_boost_round = 1000,
        valid_sets = [lgb_train, lgb_valid],
        early_stopping_rounds = 10,
        verbose_eval = 5,
        )
    del lgb_train, lgb_valid
    gc.collect()


    # Save best model
    # NOTE(review): models go to {base_path}/otto/ while the data files live under
    # {base_path}/output/otto/ — confirm this directory is intended.
    joblib.dump(model, f'{base_path}/otto/otto_lgbm_fold{fold}_{TYPE_MODE}.pkl')
    # Predict validation
    # The validation set is large, so split it and predict chunk by chunk
    Nrow = x_val.shape[0]
    Ndiv = 5
    n = int(Nrow // Ndiv) + 1
    x_val_list = []
    for i in range(Ndiv):
        tmp = x_val.iloc[i*n : (i+1)*n, :]
        x_val_list.append(tmp)
    del x_val
    gc.collect()

    val_pred_list = []
    for i, v in enumerate(x_val_list):
        print('train pred i=', i)
        tmp = model.predict(v)
        val_pred_list.append(tmp)
    del x_val_list
    gc.collect()
    # Chunks are contiguous and in order, so the concatenation lines up with val_ind.
    val_pred = np.concatenate(val_pred_list)
    del val_pred_list
    gc.collect()

    # Add to out of folds array
    # Once all CV folds have run, every index has been predicted exactly once
    oof_predictions[val_ind] = val_pred

    # Delete the model as soon as it is no longer needed
    del model, y_val
    gc.collect()


 
--------------------------------------------------
Training fold 0...
before mean: 0.013403676775819653
under sampling....... 1 / 5
under sampling....... 2 / 5
under sampling....... 3 / 5
under sampling....... 4 / 5
under sampling....... 5 / 5
under sampling end
post proccess1
after mean: 0.025
[LightGBM] [Info] Number of positive: 810327, number of negative: 31602753
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 9186
[LightGBM] [Info] Number of data points in the train set: 32413080, number of used features: 45
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.025000 -> initscore=-3.663562
[LightGBM] [Info] Start training from score -3.663562
Training until validation scores don't improve for 10 rounds
[5]	training's binary_logloss: 0.0785937	valid_1's binary_logloss: 0.0531982
[10]	training's binary_logloss: 0.0726076	valid_1's binary_logloss: 0.0492853
[15]	training's binary_logloss: 0.0697347	valid_1's binary_logloss: 0.047359
[20]	trainin

[195]	training's binary_logloss: 0.0654366	valid_1's binary_logloss: 0.0448724
[200]	training's binary_logloss: 0.0654181	valid_1's binary_logloss: 0.0448714
[205]	training's binary_logloss: 0.0653947	valid_1's binary_logloss: 0.0448715
[210]	training's binary_logloss: 0.0653706	valid_1's binary_logloss: 0.0448706
[215]	training's binary_logloss: 0.0653445	valid_1's binary_logloss: 0.0448689
[220]	training's binary_logloss: 0.0653216	valid_1's binary_logloss: 0.044867
[225]	training's binary_logloss: 0.0652959	valid_1's binary_logloss: 0.0448645
[230]	training's binary_logloss: 0.0652757	valid_1's binary_logloss: 0.0448617
[235]	training's binary_logloss: 0.0652513	valid_1's binary_logloss: 0.04486
[240]	training's binary_logloss: 0.0652266	valid_1's binary_logloss: 0.0448591
[245]	training's binary_logloss: 0.0652062	valid_1's binary_logloss: 0.0448583
[250]	training's binary_logloss: 0.0651853	valid_1's binary_logloss: 0.0448563
[255]	training's binary_logloss: 0.065163	valid_1's bin

[300]	training's binary_logloss: 0.0650479	valid_1's binary_logloss: 0.0446373
[305]	training's binary_logloss: 0.0650244	valid_1's binary_logloss: 0.0446364
[310]	training's binary_logloss: 0.0650083	valid_1's binary_logloss: 0.0446354
[315]	training's binary_logloss: 0.0649871	valid_1's binary_logloss: 0.0446337
[320]	training's binary_logloss: 0.0649646	valid_1's binary_logloss: 0.0446313
[325]	training's binary_logloss: 0.064944	valid_1's binary_logloss: 0.0446306
[330]	training's binary_logloss: 0.064923	valid_1's binary_logloss: 0.0446287
[335]	training's binary_logloss: 0.0649016	valid_1's binary_logloss: 0.0446276
[340]	training's binary_logloss: 0.0648796	valid_1's binary_logloss: 0.0446281
Early stopping, best iteration is:
[334]	training's binary_logloss: 0.0649056	valid_1's binary_logloss: 0.0446273
train pred i= 0
train pred i= 1
train pred i= 2
train pred i= 3
train pred i= 4
 
--------------------------------------------------
Training fold 3...
before mean: 0.0133951744

Training until validation scores don't improve for 10 rounds
[5]	training's binary_logloss: 0.0785848	valid_1's binary_logloss: 0.0533342
[10]	training's binary_logloss: 0.0726019	valid_1's binary_logloss: 0.0494165
[15]	training's binary_logloss: 0.0697295	valid_1's binary_logloss: 0.0474823
[20]	training's binary_logloss: 0.0682307	valid_1's binary_logloss: 0.0464571
[25]	training's binary_logloss: 0.0674094	valid_1's binary_logloss: 0.0458917
[30]	training's binary_logloss: 0.0669437	valid_1's binary_logloss: 0.0455702
[35]	training's binary_logloss: 0.0666677	valid_1's binary_logloss: 0.0453829
[40]	training's binary_logloss: 0.0664965	valid_1's binary_logloss: 0.0452701
[45]	training's binary_logloss: 0.0663859	valid_1's binary_logloss: 0.0452046
[50]	training's binary_logloss: 0.0663068	valid_1's binary_logloss: 0.0451586
[55]	training's binary_logloss: 0.0662452	valid_1's binary_logloss: 0.0451277
[60]	training's binary_logloss: 0.0661937	valid_1's binary_logloss: 0.0451056
[65]

In [21]:
# Attach the out-of-fold scores to their (session, aid) rows and keep, per session,
# the 20 highest-scoring aids as one space-separated string.
df = pd.DataFrame(oof_predictions, columns=["score"])
pred_df = pd.concat([train[['session', 'aid']], df], axis=1)
pred_df['session_type'] = pred_df['session'].astype(str) + f'_{TYPE_MODE}'

# Sort by score within each session, then take the first 20 rows of every group.
pred_df = (
    pred_df
    .sort_values(['session_type', 'score'], ascending=[True, False])
    .reset_index(drop=True)
    .groupby('session_type')
    .head(20)
    .drop(['score', 'session'], axis=1)
)
pred_df['aid'] = pred_df['aid'].astype('int32')

# Collapse each session's aids into a list, then into the "aid aid aid ..." label string.
pred_df = pred_df.groupby('session_type')['aid'].apply(list).reset_index()
pred_df['labels'] = pred_df['aid'].map(lambda aids: ' '.join(str(aid) for aid in aids))
pred_df = pred_df.drop(['aid'], axis=1)
pred_df

Unnamed: 0,session_type,labels
0,11098529_clicks,1105029 459126 295362 1544564 217742 1694360 1...
1,11098534_clicks,223062 908024 530377 1342293 1607945 1649004 1...
2,11098535_clicks,745365 1750442 803918 236461 1320151 226839 89...
3,11098538_clicks,1263747 52785 1570378 1550143 703265 351587 17...
4,11098539_clicks,631008 1408458 1658802 1057728 1251433 617897 ...
...,...,...
1012730,12899773_clicks,1311526 1484665 1578804 184006 337571 946627 1...
1012731,12899774_clicks,33035 1539309 771913 819288 270852 218795 9548...
1012732,12899775_clicks,1743151 1760714 1163166 1255910 1022572 832192...
1012733,12899777_clicks,384045 1308634 1688215 703474 395762 364190 14...


In [23]:
# Evaluate recall@20 of the out-of-fold predictions against the validation labels.

# Parse the prediction strings back into per-session lists of (at most 20) aids.
sub = pred_df.loc[pred_df.session_type.str.contains(TYPE_MODE)].copy()
sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
sub['labels'] = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])

# Ground truth for the selected event type, with the predictions joined per session.
test_labels = pd.read_parquet(f'{base_path}/input/otto/otto-validation/test_labels.parquet')
test_labels = test_labels.loc[test_labels['type']==TYPE_MODE]
test_labels = test_labels.merge(sub, how='left', on=['session'])

# Sessions without any prediction get a real empty list. (Filling with the string
# '[]' would later be turned by set() into the characters '[' and ']'.)
test_labels['labels'] = test_labels['labels'].apply(
    lambda labels: labels if isinstance(labels, list) else []
)

# hits: number of ground-truth aids that appear among the predicted aids.
test_labels['hits'] = [
    len(set(ground_truth).intersection(labels))
    for ground_truth, labels in zip(test_labels['ground_truth'], test_labels['labels'])
]
# At most 20 ground-truth items per session count towards the denominator.
test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
print(f'{TYPE_MODE} recall =',recall)

clicks recall = 0.5343416874865425


In [None]:
# click total: 1,755,534
# a recall of 0.52 requires 912,877 correct predictions

In [None]:
# clicks recall = 0.5271239406357268 trial run with top20, PB = 0.579

# baseline: candidates generated with the top-20 item2item, transform, duplicates reduced, negative-only sessions removed
# clicks recall = 0.5279590141803007 num=100 improved!
# existing data + backfill up to 50, top 20 ranks num=100 clicks recall = 0.5289963053976738 improved!
#                                               orders recall = 0.6531281219777659
# existing data + backfill up to 50, top 30 ranks num=100 orders recall = 0.6533100544839979
# existing data + backfill up to 50, top 50 ranks num=100 orders recall = 0.6536483851096223
# existing data + backfill up to 50, top 50 ranks num=1000(137) orders recall = 0.6536451933112674

# without under sampling, 2.1% of the top 50 are positive
# existing data + backfill up to 50, under sampling pos:neg = 1:2 33% pos, orders recall = 0.6190460991436405
#                                            pos:neg = 1:9 10% pos, orders recall = 0.6536036999326531
#                                            pos:neg = 1:19 5% pos, orders recall = 0.6536388097145575 drops slightly, but does not look like a real problem
#                                            pos:neg = 1:39 2.5% pos,orders recall= 0.6536930702865916 fix the ratio at about this level, PB = 0.580
#                                                                    carts recall = 0.41731398378440265
#                                                                    clicks recall = 0.5295727681719636
# version with more features, click i2i, top10,20 pos:neg = 1:39 2.5%, num=100 orders recall = 0.6538813863895334
#                                                      num=1000 orders recall = 0.654031400912216 , PB = 0.581
#                                                      num=1000 carts recall = 0.41827325050912256 
#                                                      num=1000 clicks recall = 0.5309427217017728
# aid features added (2weeks, 4weeks)                           orders recall = 0.6575391873043028 almost no difference, so use this one
# 2,3,4 weeks                                                   orders recall = 0.6575551462960776
#                                                               
		
		
