In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp -r /content/drive/MyDrive/datasets_for_colab_temp/urop/* /content

### lightgbm

In [3]:
!git clone --recursive https://github.com/Microsoft/LightGBM
!cd LightGBM && rm -rf build && mkdir build && cd build && cmake -DUSE_GPU=1 ../../LightGBM && make -j4 && cd ../python-package && python3 setup.py install --precompile --gpu;

Cloning into 'LightGBM'...
remote: Enumerating objects: 27743, done.[K
remote: Counting objects: 100% (343/343), done.[K
remote: Compressing objects: 100% (197/197), done.[K
remote: Total 27743 (delta 179), reused 248 (delta 137), pack-reused 27400[K
Receiving objects: 100% (27743/27743), 19.92 MiB | 33.88 MiB/s, done.
Resolving deltas: 100% (20483/20483), done.
Submodule 'include/boost/compute' (https://github.com/boostorg/compute) registered for path 'external_libs/compute'
Submodule 'eigen' (https://gitlab.com/libeigen/eigen.git) registered for path 'external_libs/eigen'
Submodule 'external_libs/fast_double_parser' (https://github.com/lemire/fast_double_parser.git) registered for path 'external_libs/fast_double_parser'
Submodule 'external_libs/fmt' (https://github.com/fmtlib/fmt.git) registered for path 'external_libs/fmt'
Cloning into '/content/LightGBM/external_libs/compute'...
remote: Enumerating objects: 21733, done.        
remote: Counting objects: 100% (5/5), done.       

# data loading

In [1]:
import os
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from lightgbm.sklearn import LGBMRanker

from tqdm import tqdm

In [2]:
def get_df_log_of(df_log, dname):
    df_log = df_log.copy()
    if dname == 'CV':
        df_log = df_log[(df_log['t_dat'] < '2019-10-02')]
        df_log['target'] = 'train'
        df_log.loc[(df_log['t_dat'] >= '2019-09-18'), 'target'] = 'valid'
        df_log.loc[(df_log['t_dat'] >= '2019-09-25'), 'target'] = 'test'
    elif dname == 'LB':
        df_log = df_log[(df_log['t_dat'] >= '2019-09-19')]
        df_log['target'] = 'train'
        df_log.loc[(df_log['t_dat'] >= '2020-09-16'), 'target'] = 'valid'
    else:
        raise Exception("dname = CV or LB only")
    return df_log

def customer_hex_id_to_int(series):
    return series.str[-16:].apply(hex_id_to_int)

def hex_id_to_int(str):
    return int(str[-16:], 16)

def article_id_str_to_int(series):
    return series.astype('int32')

def article_id_int_to_str(series):
    return '0' + series.astype('str')

from typing import List

def calc_ap(answerset: set, predictions: List[str], top_k: int = 12) -> float:
    if not answerset:
        return 0.0
    if len(predictions) > top_k:
        predictions = predictions[:top_k]
    score = 0.0
    hit_count = 0.0
    seenset = set()
    for index, prediction in enumerate(predictions):
        if prediction in answerset and prediction not in seenset:
            hit_count += 1.0
            score += hit_count / (index + 1.0)
            seenset.add(prediction)
    return score / min(len(answerset), top_k)

In [3]:
df_sub_raw = pd.read_parquet('raw/sample_submission.pq')

df_user = pd.read_parquet('data/df_user_preprocessed.pq')
df_log_all = pd.read_parquet('data/df_log_preprocessed.pq')
df_log_CV = get_df_log_of(df_log_all, 'CV')
df_log_LB = get_df_log_of(df_log_all, 'LB')

del df_log_all

In [4]:
df_user.reset_index(inplace=True)
df_user.drop(columns=['Active', 'age'], inplace=True)

In [5]:
df_log_CV['customer_id'] = customer_hex_id_to_int(df_log_CV['customer_id'])
df_log_LB['customer_id'] = customer_hex_id_to_int(df_log_LB['customer_id'])
df_log_CV['article_id'] = article_id_str_to_int(df_log_CV['article_id'])
df_log_LB['article_id'] = article_id_str_to_int(df_log_LB['article_id'])
df_log_CV.drop(columns='t_dat', inplace=True)
df_log_LB.drop(columns='t_dat', inplace=True)
df_user['customer_id'] = customer_hex_id_to_int(df_user['customer_id'])

# CV

In [9]:
df_log_CV_test = df_log_CV[df_log_CV['target'] == 'test']
df_log_CV_train_valid = df_log_CV[df_log_CV['target'] != 'test']

In [10]:
uid2aiidset_CV = {}
for row in df_log_CV_test[['customer_id', 'article_id']].itertuples():
    _, uid, iid = row
    if uid not in uid2aiidset_CV:
        uid2aiidset_CV[uid] = set()
    uid2aiidset_CV[uid].add('0'+str(iid))

### generating candidates

In [11]:
week_cut = 8

val_week = df_log_CV_train_valid['week'].iloc[-1]
c2weeks = df_log_CV_train_valid.groupby('customer_id')['week'].unique()

candidates_lastpurchase = []
lastpurchase_valweek = {}

c2weeks2shiftedweeks = {}
for id, weeks in c2weeks.items():
    c2weeks2shiftedweeks[id] = {}
    for week in weeks:
        c2weeks2shiftedweeks[id][week] = week

for j in range(week_cut):
    for id, weeks in c2weeks.items():
        for week in weeks:
            c2weeks2shiftedweeks[id][week] = min(c2weeks2shiftedweeks[id][week]+1, val_week)

    df_copy_temp = df_log_CV_train_valid.copy()

    weeks = []
    for id, aid, week in zip(df_log_CV_train_valid['customer_id'], df_log_CV_train_valid['article_id'], df_log_CV_train_valid['week']):
        weeks.append(c2weeks2shiftedweeks[id][week])
        if c2weeks2shiftedweeks[id][week] == val_week:
            if id in lastpurchase_valweek:
                lastpurchase_valweek[id].append('0'+str(aid))
            else:
                lastpurchase_valweek[id] = ['0'+str(aid)]
    df_copy_temp.week = weeks

    candidates_lastpurchase.append(df_copy_temp.sample(frac=0.125, random_state=24+j).copy())

    del df_copy_temp

del c2weeks

In [12]:
local_week_cut = 4

df_log_temp = df_log_CV_train_valid[df_log_CV_train_valid['week']>val_week-local_week_cut]
df_iid_count_temp = df_log_temp[['article_id', 'customer_id']].groupby('article_id').count()
df_iid_count_temp.columns = ['count']
local_popular_iids = article_id_int_to_str(df_iid_count_temp.sort_values('count', ascending=False).iloc[:12].index).tolist()

del df_iid_count_temp
del df_log_temp

In [13]:
df_log_CV_train_valid['purchased'] = 1

data = pd.concat([df_log_CV_train_valid, *candidates_lastpurchase])
data.purchased.fillna(0, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [14]:
data['customer_id'] = data['customer_id'].astype('int64')

data = data.merge(df_user, on='customer_id', how='left', copy=False)

data = data.sort_values(['week', 'customer_id'])
data = data.reset_index(drop=True)

In [None]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33921359 entries, 0 to 33921358
Data columns (total 19 columns):
 #   Column            Dtype         
---  ------            -----         
 0   customer_id       int64         
 1   article_id        int32         
 2   price             float64       
 3   sales_channel_id  int64         
 4   timestamp         int64         
 5   dto               datetime64[ns]
 6   week              int64         
 7   log_price         float64       
 8   target            object        
 9   purchased         float64       
 10  FN                int32         
 11  ACTIVE            uint8         
 12  LEFT CLUB         uint8         
 13  PRE-CREATE        uint8         
 14  Monthly           uint8         
 15  NONE              uint8         
 16  Regularly         uint8         
 17  age_gmm_1         float64       
 18  age_gmm_2         float64       
dtypes: datetime64[ns](1), float64(5), int32(2), int64(4), object(1), uint8(6)
memory u

### training

In [15]:
train = data[data.week != val_week]
val = data[data.week==val_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])

train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

In [16]:
columns_to_use = ['article_id', 'sales_channel_id',
                  'week', 'log_price', 'FN',
                  'ACTIVE', 'LEFT CLUB', 'PRE-CREATE',
                  'Monthly', 'NONE', 'Regularly',
                  'age_gmm_1', 'age_gmm_2']

train_X = train[columns_to_use]
train_y = train['purchased']

val_X = val[columns_to_use]

### one model

In [None]:
ranker = LGBMRanker(
    objective='lambdarank',
    metric='ndcg',
    boosting_type='goss',
    n_estimators=30,
    importance_type='gain',
    verbose=10,
    max_depth=4,
    device='gpu'
)

In [None]:
ranker = ranker.fit(
    train_X, train_y,
    group=train_baskets
)

In [None]:
for i in ranker.feature_importances_.argsort()[::-1]:
    print(columns_to_use[i], ranker.feature_importances_[i]/ranker.feature_importances_.sum())

article_id 0.5006277692668195
week 0.2554346641132712
sales_channel_id 0.15664731193244236
log_price 0.08187873944975718
age_gmm_1 0.004467460309324328
age_gmm_2 0.0009440549283854498
Regularly 0.0
NONE 0.0
Monthly 0.0
PRE-CREATE 0.0
LEFT CLUB 0.0
ACTIVE 0.0
FN 0.0


### validation

In [None]:
val['preds'] = ranker.predict(val_X)

c_id2predicted_article_ids = val \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

In [None]:
aps = []

for uid, aiidset in uid2aiidset_CV.items():
    preds = []
    if uid in c_id2predicted_article_ids:
        preds.extend(['0'+str(p) for p in c_id2predicted_article_ids[uid]])
    if uid in lastpurchase_valweek:
        for iid in lastpurchase_valweek[uid]:
            if iid in preds: continue
            preds.append(iid)
    for iid in local_popular_iids:
        if iid in preds: continue
        preds.append(iid)
    aps.append(calc_ap(aiidset, preds[:12]))

mean_ap = np.array(aps).mean()

In [None]:
mean_ap

0.01885726045994695

### searching for good hyperparameters

In [19]:
n_estimators_list = [1, 3, 30, 50, 100, 200, 300]
max_depth_list = [3, 4, 6, 8, 9, 10]
hyp_list = [(n, m) for n in n_estimators_list for m in max_depth_list]

rankers = []
for n, m in hyp_list:
    rankers.append(LGBMRanker(
        objective='lambdarank',
        metric='ndcg',
        boosting_type='goss',
        n_estimators=n,
        importance_type='gain',
        verbose=-1,
        max_depth=m,
        device='gpu'))

In [None]:
fit_rankers = []
for r in tqdm(rankers):
    fit_rankers.append(r.fit(train_X, train_y, group=train_baskets))

In [22]:
val_copy = val.copy(deep=False)
best_ap = 0
best_hyp = (0, 0)

for i in tqdm(range(len(hyp_list))):
    r = fit_rankers[i]
    n, m = hyp_list[i]
    val_copy['preds'] = r.predict(val_X)
    c_id2predicted_article_ids = val_copy \
        .sort_values(['customer_id', 'preds'], ascending=False) \
        .groupby('customer_id')['article_id'].apply(list).to_dict()

    aps = []
    for uid, aiidset in uid2aiidset_CV.items():
        preds = []
        if uid in c_id2predicted_article_ids:
            preds.extend(['0'+str(p) for p in c_id2predicted_article_ids[uid]])
        if uid in lastpurchase_valweek:
            for iid in lastpurchase_valweek[uid]:
                if iid in preds: continue
                preds.append(iid)
        for iid in local_popular_iids:
            if iid in preds: continue
            preds.append(iid)
        aps.append(calc_ap(aiidset, preds[:12]))
    mean_ap = np.array(aps).mean()

    if mean_ap > best_ap:
        best_ap = mean_ap
        best_hyp = (n, m)

print(best_hyp, best_ap)

100%|██████████| 42/42 [05:59<00:00,  8.55s/it]

(200, 8) 0.01910618398681305





### clearing up space

In [24]:
del uid2aiidset_CV
del lastpurchase_valweek
del local_popular_iids
del c_id2predicted_article_ids
del candidates_lastpurchase

In [25]:
del train_X
del train_y
del val_X
del train
del val
del data
del df_log_CV_test
del df_log_CV_train_valid
del df_log_CV

# LB

### generating candidates

In [7]:
week_cut = 8

test_week = df_log_LB['week'].iloc[-1]
c2weeks = df_log_LB.groupby('customer_id')['week'].unique()

candidates_lastpurchase = []
lastpurchase_testweek = {}

c2weeks2shiftedweeks = {}
for id, weeks in c2weeks.items():
    c2weeks2shiftedweeks[id] = {}
    for week in weeks:
        c2weeks2shiftedweeks[id][week] = week

for j in range(week_cut):
    for id, weeks in c2weeks.items():
        for week in weeks:
            c2weeks2shiftedweeks[id][week] = min(c2weeks2shiftedweeks[id][week]+1, test_week)

    df_copy_temp = df_log_LB.copy()

    weeks = []
    for id, aid, week in zip(df_log_LB['customer_id'], df_log_LB['article_id'], df_log_LB['week']):
        weeks.append(c2weeks2shiftedweeks[id][week])
        if c2weeks2shiftedweeks[id][week] == test_week:
            if id in lastpurchase_testweek:
                lastpurchase_testweek[id].append('0'+str(aid))
            else:
                lastpurchase_testweek[id] = ['0'+str(aid)]
    df_copy_temp.week = weeks

    candidates_lastpurchase.append(df_copy_temp.sample(frac=0.125, random_state=24+j).copy())

    del df_copy_temp

del c2weeks

local_week_cut = 4

df_log_temp = df_log_LB[df_log_LB['week']>test_week-local_week_cut]
df_iid_count_temp = df_log_temp[['article_id', 'customer_id']].groupby('article_id').count()
df_iid_count_temp.columns = ['count']
local_popular_iids = article_id_int_to_str(df_iid_count_temp.sort_values('count', ascending=False).iloc[:12].index).tolist()

del df_iid_count_temp
del df_log_temp

df_log_LB['purchased'] = 1

data = pd.concat([df_log_LB, *candidates_lastpurchase])
data.purchased.fillna(0, inplace=True)

data['customer_id'] = data['customer_id'].astype('int64')

data = data.merge(df_user, on='customer_id', how='left', copy=False)

data = data.sort_values(['week', 'customer_id'])
data = data.reset_index(drop=True)

In [9]:
data.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30041826 entries, 0 to 30041825
Data columns (total 19 columns):
 #   Column            Dtype         
---  ------            -----         
 0   customer_id       int64         
 1   article_id        int32         
 2   price             float64       
 3   sales_channel_id  int64         
 4   timestamp         int64         
 5   dto               datetime64[ns]
 6   week              int64         
 7   log_price         float64       
 8   target            object        
 9   purchased         float64       
 10  FN                int32         
 11  ACTIVE            uint8         
 12  LEFT CLUB         uint8         
 13  PRE-CREATE        uint8         
 14  Monthly           uint8         
 15  NONE              uint8         
 16  Regularly         uint8         
 17  age_gmm_1         float64       
 18  age_gmm_2         float64       
dtypes: datetime64[ns](1), float64(5), int32(2), int64(4), object(1), uint8(6)
memory u

### training

In [11]:
train = data[data.week != test_week]
test = data[data.week==test_week].drop_duplicates(['customer_id', 'article_id', 'sales_channel_id'])

train_baskets = train.groupby(['week', 'customer_id'])['article_id'].count().values

columns_to_use = ['article_id', 'sales_channel_id',
                  'week', 'log_price', 'FN',
                  'ACTIVE', 'LEFT CLUB', 'PRE-CREATE',
                  'Monthly', 'NONE', 'Regularly',
                  'age_gmm_1', 'age_gmm_2']

train_X = train[columns_to_use]
train_y = train['purchased']

test_X = test[columns_to_use]

In [12]:
ranker = LGBMRanker(
    objective='lambdarank',
    metric='ndcg',
    boosting_type='goss',
    n_estimators=200,
    importance_type='gain',
    verbose=10,
    max_depth=8,
    device='gpu'
)

ranker = ranker.fit(
    train_X, train_y,
    group=train_baskets
)

[LightGBM] [Info] This is the GPU trainer!!
[LightGBM] [Debug] Dataset::GetMultiBinFromSparseFeatures: sparse rate 0.937535
[LightGBM] [Info] Total Bins 731
[LightGBM] [Info] Number of data points in the train set: 28341931, number of used features: 13
[LightGBM] [Info] Using GPU Device: A100-SXM4-40GB, Vendor: NVIDIA Corporation
[LightGBM] [Info] Compiling OpenCL Kernel with 256 bins...
[LightGBM] [Info] GPU programs have been built
[LightGBM] [Info] Size of histogram bin entry: 8
[LightGBM] [Info] 7 dense feature groups (216.23 MB) transferred to GPU in 0.240359 secs. 1 sparse feature groups
[LightGBM] [Info] Using GOSS
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[LightGBM] [Debug] Trained a tree with leaves

### prediction

In [14]:
test['preds'] = ranker.predict(test_X)

c_id2predicted_article_ids = test \
    .sort_values(['customer_id', 'preds'], ascending=False) \
    .groupby('customer_id')['article_id'].apply(list).to_dict()

predslist = []
for uid in customer_hex_id_to_int(df_sub_raw.customer_id):
    preds = []
    if uid in c_id2predicted_article_ids:
        preds.extend(['0'+str(p) for p in c_id2predicted_article_ids[uid]])
    if uid in lastpurchase_testweek:
        for iid in lastpurchase_testweek[uid]:
            if iid in preds: continue
            preds.append(iid)
    for iid in local_popular_iids:
        if iid in preds: continue
        preds.append(iid)
    predslist.append(preds[:12])

predslist = [' '.join(ps) for ps in predslist]
df_sub_raw.prediction = predslist

In [15]:
try:
    os.mkdir('data/subs')
except FileExistsError:
    print("folder already exists")

In [16]:
df_sub_raw.to_csv('data/subs/lgbm_4.csv.gz', index=False)