In [13]:
import lightgbm as lgb
import pandas as pd
import pickle
import numpy as np
import gc
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [14]:
# Root directory that the dataset paths in this notebook are built from.
# Alternative root kept for reference (looks like a Colab / Google Drive location):
# project_root_path = '/content/drive/MyDrive/NCCU1102/WSM/proj3/RecSys-Dressipi/'
project_root_path = '../..'
# Number of consecutive feature rows that form one query group: used as the
# LightGBM group size in training and as the session size when splitting
# chunks for validation.
n_train_sample = 500

In [15]:
def train_model( i ):
    """Train an LGBMRanker on feature chunk ``i`` and save it as checkpoint ``lgbm_<i>``.

    Reads ``train_X_<i>.pickle`` and ``train_y_<i>.pickle`` from
    ``<project_root_path>/dataset/train_features``. For ``i > 0`` training is
    continued from the previously saved checkpoint ``lgbm_<i-1>`` (passed as
    ``init_model``), so that file must already exist.

    Args:
        i: Index of the training chunk to fit.

    Returns:
        The fitted ``lgb.LGBMRanker``.

    Raises:
        ValueError: If X and y differ in length, or the number of rows is not
            a positive multiple of ``n_train_sample``.
    """
    feature_dir = project_root_path + '/dataset/train_features'
    # Checkpoints live under the same configurable root as the data.
    model_prefix = project_root_path + '/model/lgbm/lgbm_'

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open( feature_dir + '/train_X_' + str(i) + '.pickle', 'rb') as f:
        X = pickle.load(f)
    with open( feature_dir + '/train_y_' + str(i) + '.pickle', 'rb') as f:
        y = pickle.load(f)

    # Every query group has exactly n_train_sample rows. Check this up front:
    # floor division would otherwise drop the remainder silently and the
    # mismatch would only surface inside LightGBM.
    n_rows = len(X)
    if n_rows != len(y):
        raise ValueError(
            "train chunk {}: X has {} rows but y has {}".format(i, n_rows, len(y)))
    if n_rows == 0 or n_rows % n_train_sample != 0:
        raise ValueError(
            "train chunk {}: {} rows is not a positive multiple of n_train_sample={}".format(
                i, n_rows, n_train_sample))
    query_train = [n_train_sample] * (n_rows // n_train_sample)

    print("Starting training... train_" + str(i) )
    start_time = time.time()
    gbm = lgb.LGBMRanker(device = 'cpu')
    if i == 0:
        gbm.fit(X, y, group=query_train)
    else:
        # Continue boosting from the checkpoint written for the previous chunk.
        gbm.fit(X, y, group=query_train, init_model=model_prefix + str(i-1))
    gbm.booster_.save_model(model_prefix + str(i))
    print("Training finished " + str(time.time() - start_time))

    return gbm


In [16]:
def MRR( X_sessions, y_sessions, y_preds, k=100 ):
    """Mean reciprocal rank of the first relevant item in each session's top-``k``.

    For every session the candidates are ordered by descending predicted
    score and cut to the best ``k``. The session scores ``1 / rank`` of the
    first candidate whose label equals 1, or 0 if none of the top ``k`` is
    labelled 1.

    Args:
        X_sessions: Per-session feature blocks. Only used to align the
            iteration with the other two arguments; the contents are not read.
        y_sessions: Per-session label sequences (a label of 1 marks a hit).
        y_preds: Per-session predicted scores, aligned with ``y_sessions``.
        k: Number of top-scored candidates considered per session.

    Returns:
        The mean of the per-session scores, or 0.0 when there are no sessions.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError("k must be at least 1")

    score = []
    for _, y, pred in zip( X_sessions, y_sessions, y_preds ):
        # Indices of the k highest scores, best first.
        ranked = np.argsort(pred)[-k:][::-1]
        # np.asarray forces positional lookup of the labels.
        hits = np.flatnonzero(np.asarray(y)[ranked] == 1)
        score.append(1/(hits[0]+1) if hits.size else 0)

    if not score:
        # np.mean([]) would give nan plus a RuntimeWarning.
        return 0.0
    return np.mean(score)

def pred2rank_result( session_id, X, pred ):
    """Turn one session's scores into ``session_id,item,rank`` text rows.

    The (at most) 100 highest-scored rows of ``X`` are listed best first.
    Each output row joins the session id, the integer value of that row's
    first column, and the 1-based rank with commas.

    Args:
        session_id: Identifier written into the first field of every row.
        X: Row-indexable features; ``X[row][0]`` supplies the second field.
        pred: Scores aligned with the rows of ``X``.

    Returns:
        A list of comma-separated strings, ordered from rank 1 upwards.
    """
    best_first = np.argsort(pred)[-100:][::-1]
    sid = str(session_id)
    return [
        ','.join((sid, str(int(X[row][0])), str(position)))
        for position, row in enumerate(best_first, start=1)
    ]

def predict_session( gbm, session_id ):
    """Score one test session and return its ranked result rows.

    Loads ``test_X_<session_id>.pickle`` from
    ``<project_root_path>/dataset/test_features``, scores it with ``gbm`` and
    formats the result with ``pred2rank_result``.

    Args:
        gbm: Model object exposing ``predict(X)``.
        session_id: Identifier of the test session; selects the feature file.

    Returns:
        The list of strings produced by ``pred2rank_result``.
    """
    # Built from the configurable root, like the training feature paths.
    X_path = project_root_path + '/dataset/test_features/test_X_' + str(session_id) + '.pickle'
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(X_path, 'rb') as f:
        X = pickle.load(f)
    pred = gbm.predict(X)
    return pred2rank_result( session_id, X, pred )



## Train

In [19]:
# Incrementally train on chunks 8..31; each call saves a checkpoint lgbm_<i>
# and `model` ends up holding the ranker fitted on the last chunk.
# NOTE: train_model(8) resumes from the lgbm_7 checkpoint, so that file must
# already exist on disk (presumably from an earlier run over chunks 0..7).
for i in range(8,32):
    model = train_model( i )

Starting training... train_8
Training finished 24.394206285476685
Starting training... train_9
Training finished 25.47046160697937
Starting training... train_10
Training finished 28.998876571655273
Starting training... train_11
Training finished 31.131080150604248
Starting training... train_12
Training finished 34.38035750389099
Starting training... train_13
Training finished 37.90410280227661
Starting training... train_14
Training finished 40.752618074417114
Starting training... train_15
Training finished 43.408037424087524
Starting training... train_16
Training finished 47.432823181152344
Starting training... train_17
Training finished 49.918471336364746
Starting training... train_18
Training finished 53.49655604362488
Starting training... train_19
Training finished 55.99709963798523
Starting training... train_20
Training finished 60.53599667549133
Starting training... train_21
Training finished 63.06070685386658
Starting training... train_22
Training finished 67.45344948768616
Start

## Validate

In [20]:
%%time

# Score the final checkpoint (lgbm_31) on feature chunks 8 and 9 and print the
# MRR of each chunk.
# NOTE(review): chunks 8 and 9 are inside the range(8, 32) that the training
# cell fits on, so these numbers are not a held-out estimate. Consider
# validating on chunks the model never saw.
bst = lgb.Booster(model_file=project_root_path + '/model/lgbm/lgbm_31')
for validate_id in range(8,10):
    # Load the chunk's features and labels (pickle.load: trusted files only).
    X_path = project_root_path + '/dataset/train_features/train_X_' + str(validate_id) + '.pickle'
    y_path = project_root_path + '/dataset/train_features/train_y_' + str(validate_id) + '.pickle'
    with open( X_path, 'rb') as f:
        X = pickle.load(f)
    with open( y_path, 'rb') as f:
        y = pickle.load(f)

    # Sessions are fixed-size runs of n_train_sample rows. Refuse chunks that
    # cannot be cut that way, because array_split would otherwise produce
    # unequal pieces that no longer line up with sessions.
    if len(X) != len(y) or len(X) == 0 or len(X) % n_train_sample != 0:
        raise ValueError(
            "validation chunk {}: {} feature rows / {} labels cannot be split into "
            "sessions of {} rows".format(validate_id, len(X), len(y), n_train_sample))
    n_sessions = len(X) // n_train_sample

    # Rows are scored independently, so one predict call for the whole chunk
    # followed by a split gives the same per-session scores as one call per
    # session, without the per-call overhead.
    y_preds = np.array_split(bst.predict(X), n_sessions)
    X_sessions = np.array_split(X, n_sessions)
    y_sessions = np.array_split(y, n_sessions)

    del X
    del y
    gc.collect()

    print(MRR( X_sessions, y_sessions, y_preds))
    del X_sessions
    del y_sessions
    del y_preds
    gc.collect()

0.46994645260165346
0.4689435812922673
CPU times: user 42min 2s, sys: 4.21 s, total: 42min 6s
Wall time: 3min 55s


## Predict

In [22]:
# Load the final checkpoint (lgbm_31) from the configurable project root for
# prediction; the bare `bst` on the last line displays the object's repr.
bst = lgb.Booster(model_file=project_root_path + '/model/lgbm/lgbm_31')
bst

<lightgbm.basic.Booster at 0x7f258727c160>

In [23]:
# Build the result rows for every session id listed in the session file.
results = []
with open(project_root_path + '/dataset/test_leaderboard_uniq_sessions') as f:
    # The first line is skipped (presumably a header). Blank lines, such as a
    # trailing newline at the end of the file, are dropped so int() cannot
    # fail on an empty string.
    session_lines = [line for line in f.readlines()[1:] if line.strip()]

# The file is already closed here, so it is not held open during the long loop.
for line in tqdm(session_lines):
    session_id = int(line)
    results.extend(predict_session(bst, session_id ))

100%|██████████| 50000/50000 [09:42<00:00, 85.90it/s]


In [24]:
# Write the collected rows as a CSV under the configurable project root:
# a header line followed by one `session_id,item_id,rank` row per line
# (no newline after the last row).
with open(project_root_path + '/result/lgbm_train31_2021.csv', 'w') as f:
    f.write('session_id,item_id,rank\n')
    f.write('\n'.join(results))