In [107]:
from google.colab import drive

In [108]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [109]:
!ls /content/drive/MyDrive/output/otto/

test.parquet	train_20_old2.parquet  train_20.parquet  train.parquet
test_preds.csv	train_20_old.parquet   train_50.parquet


In [4]:
!pip3 install optuna

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting optuna
  Downloading optuna-3.0.5-py3-none-any.whl (348 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.5/348.5 KB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting importlib-metadata<5.0.0
  Downloading importlib_metadata-4.13.0-py3-none-any.whl (23 kB)
Collecting cmaes>=0.8.2
  Downloading cmaes-0.9.1-py3-none-any.whl (21 kB)
Collecting cliff
  Downloading cliff-4.1.0-py3-none-any.whl (81 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m81.0/81.0 KB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
Collecting alembic>=1.5.0
  Downloading alembic-1.9.1-py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.4/210.4 KB[0m [31m17.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting Mako
  Downloading Mako-1.2.4-py3-none-any.

# Preprocessing

In [110]:
# ====================================================
# Library
# ====================================================
import gc
import warnings
warnings.filterwarnings('ignore')
import scipy as sp
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
import itertools
from imblearn.under_sampling import RandomUnderSampler

In [138]:
DEBUG_MODE = False
OPTUNA_FLAG = False
if DEBUG_MODE:
    train = train.head(100000)
IGNORE_COL = ['session','aid']

TYPE_MODE = 'clicks'
#TYPE_MODE = 'carts'
#TYPE_MODE = 'orders'
IGNORE_COL += ['y_clicks', 'y_carts', 'y_orders']

if TYPE_MODE == 'clicks':
    target = 'y_clicks'
    # under sampling 1.3 -> 2.5%
    pos_neg_ratio = 1/39
elif TYPE_MODE == 'carts':
    target = 'y_carts'
    # under sampling 1.6 -> 2.5%
    pos_neg_ratio = 1/39
elif TYPE_MODE == 'orders':
    target = 'y_orders'
    # under sampling 2.1 -> 2.5%
    pos_neg_ratio = 1/39

In [129]:
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train.parquet')
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20.parquet')
train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_50.parquet')
#train = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20_old.parquet')

#train20 = pd.read_parquet('/content/drive/MyDrive/output/otto/train_20.parquet')

In [130]:
def reduce_memory(df):
    df['session'] = df['session'].astype('int32')
    df['aid'] = df['aid'].astype('int32')
    df['session_action_count'] = df['session_action_count'].astype('int16')
    df['session_click_count'] = df['session_click_count'].astype('int16')
    df['session_cart_count'] = df['session_cart_count'].astype('int16')
    df['session_order_count'] = df['session_order_count'].astype('int16')
    df['n_clicks'] = df['n_clicks'].astype('int8')
    df['n_carts'] = df['n_carts'].astype('int8')
    df['n_buys'] = df['n_buys'].astype('int8')
    return df

# topn件だけを使う
def use_top_n(n, df):
    df = df.query(f'score_click >= -1 or score_cart >= -1 or score_buy >= -1 or (-1 < n_clicks and n_clicks<{n}) or (-1 < n_carts and n_carts<{n}) or (-1 < n_buys and n_buys<{n})')
    return df

# 負例しかないものは学習に使えないので削る（学習のみ）
def remove_negative_session(df):
    true_df = df.groupby('session')[target].agg('sum') > 0
    session = pd.DataFrame(true_df[true_df]).reset_index()['session']
    df = df.merge(session, how = 'inner', on = 'session')
    return df

# 負例が多すぎる場合にunder samplingする
# ratio = pos/neg
def negative_sampling(df_x, df_y, ratio):
    print('before mean:', df_y.mean())
    after_x, after_y = RandomUnderSampler(sampling_strategy=ratio).fit_resample(df_x, df_y)
    print('after mean:', after_y.mean())
    return after_x, after_y

In [131]:
train = reduce_memory(train)
#train20 = reduce_memory(train20)

In [132]:
train = use_top_n(50, train)
#train20 = use_top_n(20, train20)

In [133]:
train = remove_negative_session(train)
#train20 = remove_negative_session(train20)

In [134]:
train[train[['session','aid']].duplicated()]

Unnamed: 0,session,aid,score_click,score_cart,score_buy,session_action_count,session_click_count,session_cart_count,session_order_count,n_clicks,n_carts,n_buys,clicks_rank,carts_rank,orders_rank,y_clicks,y_carts,y_orders


In [135]:
train[target].sum()

1012697

In [136]:
train[target].mean()

0.013402186393136853

In [137]:
train

Unnamed: 0,session,aid,score_click,score_cart,score_buy,session_action_count,session_click_count,session_cart_count,session_order_count,n_clicks,n_carts,n_buys,clicks_rank,carts_rank,orders_rank,y_clicks,y_carts,y_orders
0,11098529,1105029,0.071773,0.071773,0.0,1,1,0,0,-1,-1,-1,207743,-1,-1,True,False,False
1,11098529,459126,,,,0,0,0,0,0,0,3,72021,123811,-1,False,False,False
2,11098529,1339838,,,,0,0,0,0,1,1,7,-1,-1,-1,False,False,False
3,11098529,1544564,,,,0,0,0,0,2,4,-1,34053,57613,-1,False,False,False
4,11098529,217742,,,,0,0,0,0,3,5,-1,131217,-1,-1,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75562069,12899778,162064,,,,0,0,0,0,-1,-1,45,901,46,30,False,False,False
75562070,12899778,631899,,,,0,0,0,0,-1,-1,46,108,83,31,False,False,False
75562071,12899778,1436280,,,,0,0,0,0,-1,-1,47,151,82,32,False,False,False
75562072,12899778,954951,,,,0,0,0,0,-1,-1,48,109,117,33,False,False,False


# Training & Inference

In [122]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import joblib
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.preprocessing import LabelEncoder
# optuna
if OPTUNA_FLAG:
    import optuna.integration.lightgbm as lgb
else:
    import lightgbm as lgb

from itertools import combinations

In [123]:
if OPTUNA_FLAG:
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',  # Noneにした方がよさそう？
        'boosting': 'gbdt',
        'seed': 42,        
        'n_jobs': -1,
        }
    # Create a numpy array to store out of folds predictions
    kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold}...')

        y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
        train_tmp = train.drop(IGNORE_COL , axis=1)
        x_train, x_val = train_tmp.iloc[trn_ind], train_tmp.iloc[val_ind]

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)
        del x_train, y_train
        gc.collect()

        #lgb_valid = lgb.Dataset(x_val, y_val)
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            #num_boost_round = 10500,
            num_boost_round = 100,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 100,
            verbose_eval = 10,
            )
        del lgb_train, lgb_valid
        gc.collect()
        break
    model.params

In [139]:
if OPTUNA_FLAG:
    print("Optuna results: ",model.params)

params =  {'objective': 'binary',
 'metric': 'binary_logloss',
 'boosting': 'gbdt',
 'seed': 42,
 'n_jobs': -1,
 'feature_pre_filter': False,
 'lambda_l1': 6.595370151657238,
 'lambda_l2': 1.0592737233474818e-08,
 'num_leaves': 255,
 'feature_fraction': 1.0,
 'bagging_fraction': 0.9703737428957173,
 'bagging_freq': 2,
 'min_child_samples': 20,
 'learning_rate': 0.1}

In [140]:
    # Create a numpy array to store out of folds predictions
    oof_predictions = np.zeros(len(train))
    kfold = KFold(n_splits = 5, shuffle = True, random_state = 42)
    for fold, (trn_ind, val_ind) in enumerate(kfold.split(train)):
        print(' ')
        print('-'*50)
        print(f'Training fold {fold}...')

        y_train, y_val = train[target].iloc[trn_ind], train[target].iloc[val_ind]
        train_tmp = train.drop(IGNORE_COL , axis=1)
        x_train, x_val = train_tmp.iloc[trn_ind], train_tmp.iloc[val_ind]
        
        # under sampling
        x_train, y_train = negative_sampling(x_train, y_train, pos_neg_ratio)

        lgb_train = lgb.Dataset(x_train, y_train)
        lgb_valid = lgb.Dataset(x_val, y_val)
        del x_train, y_train
        gc.collect()

        #lgb_valid = lgb.Dataset(x_val, y_val)
        model = lgb.train(
            params = params,
            train_set = lgb_train,
            #num_boost_round = 10500,
            num_boost_round = 1000,
            valid_sets = [lgb_train, lgb_valid],
            early_stopping_rounds = 10,
            verbose_eval = 5,
            )
        del lgb_train, lgb_valid
        gc.collect()

        
        # Save best model
        joblib.dump(model, f'/content/drive/MyDrive/otto/otto_lgbm_fold{fold}_{TYPE_MODE}.pkl')
        # Predict validation
        # でかいので分割してpredict
        Nrow = x_val.shape[0]
        Ndiv = 5
        n = int(Nrow // Ndiv) + 1
        x_val_list = []
        for i in range(Ndiv):
            tmp = x_val.iloc[i*n : (i+1)*n, :]
            x_val_list.append(tmp)
        del x_val
        gc.collect()

        val_pred_list = [] 
        for i, v in enumerate(x_val_list):
            print('train pred i=', i)
            tmp = model.predict(v)
            val_pred_list.append(tmp) 
        del x_val_list
        gc.collect()
        val_pred = np.concatenate(val_pred_list)
        del val_pred_list
        gc.collect()

        # Add to out of folds array
        # CVを終えれば全部のindexが1回ずつ計算されることになる
        oof_predictions[val_ind] = val_pred

        # 不要になった時点でモデル削除
        del model, y_val
        gc.collect()


 
--------------------------------------------------
Training fold 0...
before mean: 0.013397445302379622
after mean: 0.025
Training until validation scores don't improve for 10 rounds.
[5]	training's binary_logloss: 0.0790101	valid_1's binary_logloss: 0.0534971
[10]	training's binary_logloss: 0.0731758	valid_1's binary_logloss: 0.0496607
[15]	training's binary_logloss: 0.0704008	valid_1's binary_logloss: 0.0477772
[20]	training's binary_logloss: 0.0689644	valid_1's binary_logloss: 0.0467849
[25]	training's binary_logloss: 0.0681849	valid_1's binary_logloss: 0.0462381
[30]	training's binary_logloss: 0.0677479	valid_1's binary_logloss: 0.0459283
[35]	training's binary_logloss: 0.0674998	valid_1's binary_logloss: 0.0457532
[40]	training's binary_logloss: 0.0673526	valid_1's binary_logloss: 0.0456501
[45]	training's binary_logloss: 0.0672548	valid_1's binary_logloss: 0.0455851
[50]	training's binary_logloss: 0.0671913	valid_1's binary_logloss: 0.0455462
[55]	training's binary_logloss: 0.0

In [141]:
df = pd.DataFrame(oof_predictions, columns=["score"])
pred_df = pd.concat([train[['session', 'aid']], df], axis=1)
pred_df['session_type'] = pred_df['session'].apply(lambda x: str(x) + f'_{TYPE_MODE}')
pred_df = pred_df.sort_values(['session_type','score'],ascending=[True, False]).reset_index(drop=True)

pred_df['n'] = pred_df.groupby('session_type').cumcount()
pred_df = pred_df.loc[pred_df.n<20].drop(['n','score','session'],axis=1)
pred_df['aid'] = pred_df['aid'].astype('int32')
pred_df = pred_df.groupby('session_type')['aid'].apply(list).reset_index()
pred_df['labels'] = pred_df['aid'].map(lambda x: ''.join(str(x)[1:-1].split(',')))
pred_df = pred_df.drop(['aid'],axis=1)
pred_df

Unnamed: 0,session_type,labels
0,11098529_clicks,1105029 459126 1339838 217742 1544564 295362 1...
1,11098534_clicks,223062 908024 1342293 530377 1607945 1039538 1...
2,11098535_clicks,745365 1750442 803918 1423032 896972 767201 23...
3,11098538_clicks,1263747 1550143 1570378 52785 703265 351587 13...
4,11098539_clicks,1408458 631008 1658802 1057728 1251433 1005087...
...,...,...
1012692,12899773_clicks,1311526 1578804 1484665 946627 184006 337571 2...
1012693,12899774_clicks,33035 1539309 819288 771913 95488 270852 18125...
1012694,12899775_clicks,1743151 1760714 1163166 1255910 1022572 783827...
1012695,12899777_clicks,384045 1308634 703474 395762 1688215 1486067 3...


In [142]:
sub = pred_df.loc[pred_df.session_type.str.contains(TYPE_MODE)].copy()
sub['session'] = sub.session_type.apply(lambda x: int(x.split('_')[0]))
sub.labels = sub.labels.apply(lambda x: [int(i) for i in x.split(' ')[:20]])

test_labels = pd.read_parquet('/content/drive/MyDrive/input/otto/otto-validation/test_labels.parquet')
test_labels = test_labels.loc[test_labels['type']==TYPE_MODE]
test_labels = test_labels.merge(sub, how='left', on=['session'])
test_labels['labels'] = test_labels['labels'].fillna('[]')
test_labels['hits'] = test_labels.apply(lambda df: len(set(df.ground_truth).intersection(set(df.labels))), axis=1)
test_labels['gt_count'] = test_labels.ground_truth.str.len().clip(0,20)
recall = test_labels['hits'].sum() / test_labels['gt_count'].sum()
print(f'{TYPE_MODE} recall =',recall)

clicks recall = 0.5295727681719636


In [93]:
# click total: 1,755,534
# 0.52なら912,877の正解が必要

In [70]:
# clicks recall = 0.5271239406357268 おためしtop20

# baseline top20のitem2itemを使ってgenerateしたもの, trainsform, duplicate削減、negativeのみremove
# clicks recall = 0.5279590141803007 num=100 きた！
# 既存データ + 50までbackfill, 20位まで num=100 clicks recall = 0.5289963053976738 きた！
#                                               orders recall = 0.6531281219777659
# 既存データ + 50までbackfill, 30位まで num=100 orders recall = 0.6533100544839979
# 既存データ + 50までbackfill, 50位まで num=100 orders recall = 0.6536483851096223
# 既存データ + 50までbackfill, 50位まで num=1000(137) orders recall = 0.6536451933112674

# under samplingなしだと上位50で2.1%がpositive
# 既存データ + 50までbackfill under sampling pos:neg = 1:2 33% pos, orders recall = 0.6190460991436405
#                                            pos:neg = 1:9 10% pos, orders recall = 0.6536036999326531
#                                            pos:neg = 1:19 5% pos, orders recall = 0.6536388097145575 ちょい下がるけどそんなに問題なさそう
#                                            pos:neg = 1:39 2.5% pos,orders recall= 0.6536930702865916 これくらいの比率で固定しよう
#                                                                    carts recall = 0.41731398378440265
#                                                                    clicks recall = 0.5295727681719636


