In [1]:
import csv
import os
import pickle
import time

import lightgbm as lgb
import numpy as np
import pandas as pd
from tqdm import tqdm

In [2]:
# Root directory of the project; the training-feature paths below are built from it.
# Alternative root when running on Google Colab:
# project_root_path = '/content/drive/MyDrive/NCCU1102/WSM/proj3/RecSys-Dressipi/'
project_root_path = '../..'
# Number of rows per session: used as the size of every LGBMRanker query group
# and as the chunk size when a shard is split into sessions for validation.
n_train_sample = 500

In [3]:
def train_model( i ):
    """Fit an LGBMRanker on training shard ``i`` and save the resulting booster.

    Shard 0 is trained from scratch. Every later shard continues training from
    the model file written for shard ``i - 1``, so shards have to be trained in
    ascending order.

    Parameters
    ----------
    i : int
        Shard index; selects ``train_X_<i>.pickle`` and ``train_y_<i>.pickle``
        under ``<project_root_path>/dataset/train_features``.

    Returns
    -------
    lgb.LGBMRanker
        The fitted ranker. Its booster is also saved to
        ``<project_root_path>/model/lgbm_gpu/lgbm_<i>``.
    """
    # Every location is derived from project_root_path so that switching the
    # root (e.g. to the Colab path) moves data and models together.
    feature_dir = os.path.join(project_root_path, 'dataset', 'train_features')
    model_dir = os.path.join(project_root_path, 'model', 'lgbm_gpu')
    # Create the output directory up front so a missing folder fails before
    # training starts rather than after it has finished.
    os.makedirs(model_dir, exist_ok=True)

    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open(os.path.join(feature_dir, 'train_X_' + str(i) + '.pickle'), 'rb') as f:
        X = pickle.load(f)
    with open(os.path.join(feature_dir, 'train_y_' + str(i) + '.pickle'), 'rb') as f:
        y = pickle.load(f)

    # One query group of n_train_sample rows per session.
    # NOTE(review): if len(X) is not a multiple of n_train_sample the group
    # sizes will not add up to len(X) -- confirm the shards are always aligned.
    query_train = [n_train_sample] * (len(X)//n_train_sample)

    print("Starting training... train_" + str(i) )
    start_time = time.time()
    gbm = lgb.LGBMRanker(device = 'gpu')
    if i == 0:
        gbm.fit(X, y, group=query_train)
    else:
        # Continue from the booster saved for the previous shard.
        previous_model = os.path.join(model_dir, 'lgbm_' + str(i-1))
        gbm.fit(X, y, group=query_train, init_model=previous_model)
    gbm.booster_.save_model(os.path.join(model_dir, 'lgbm_' + str(i)))
    print("Training finished " + str(time.time() - start_time))

    return gbm


In [4]:
def MRR( X_sessions, y_sessions, bst, top_k=100 ):
    """Mean reciprocal rank of the first relevant item in each session's top predictions.

    For every session the rows of ``X`` are scored with ``bst.predict`` and
    ordered from highest to lowest score. The session contributes ``1 / rank``
    of the first row whose label equals 1 within the best ``top_k`` rows, or 0
    when no such row exists.

    Parameters
    ----------
    X_sessions : iterable
        One feature matrix per session, each accepted by ``bst.predict``.
    y_sessions : iterable
        One label sequence per session, indexable by row position; a label
        equal to 1 marks a relevant row.
    bst : object
        Model exposing ``predict(X)`` that returns one score per row.
    top_k : int, optional
        Number of best-scored rows inspected per session (default 100).

    Returns
    -------
    float
        Mean of the per-session reciprocal ranks; 0.0 when there are no sessions.
    """
    score = []
    for X, y in zip( X_sessions, y_sessions ):
        pred = bst.predict(X)
        # argsort is ascending: flip it for best-first order, then cut.
        # Slicing after the flip keeps top_k=0 meaning "no rows".
        ranked = np.argsort(pred)[::-1][:top_k]
        reciprocal_rank = next(
            (1/rank for rank, index in enumerate(ranked, start=1) if y[index] == 1),
            0,
        )
        score.append(reciprocal_rank)
    if not score:
        # np.mean([]) would return nan and emit a RuntimeWarning.
        return 0.0
    return np.mean(score)

def pred2rank_result( session_id, X, pred ):
    """Turn the scores of one session into ``session_id,item_id,rank`` CSV rows.

    The 100 best-scored rows (or all rows when there are fewer) are emitted
    best first, with ranks starting at 1. The item id is read from the first
    column of the matching row in ``X`` and cast to ``int``.

    Returns a list of comma-joined strings, one per ranked row.
    """
    # argsort is ascending, so take the tail and flip it for best-first order.
    best_first = np.argsort(pred)[-100:][::-1]
    return [
        ','.join(map(str, (session_id, int(X[row][0]), position)))
        for position, row in enumerate(best_first, start=1)
    ]

def predict_session( gbm, session_id ):
    """Load the features of one test session, score them and return ranked rows.

    Parameters
    ----------
    gbm : object
        Model exposing ``predict(X)``.
    session_id : int
        Session whose features are stored in
        ``<project_root_path>/dataset/test_features/test_X_<session_id>.pickle``.

    Returns
    -------
    list of str
        Rows produced by ``pred2rank_result`` for this session.
    """
    # Built from project_root_path, like the training-feature paths, so the
    # notebook keeps working when the root is changed.
    X_path = project_root_path + '/dataset/test_features/test_X_' + str(session_id) + '.pickle'
    # NOTE(review): pickle.load can execute arbitrary code; only load trusted files.
    with open( X_path, 'rb') as f:
        X = pickle.load(f)
    pred = gbm.predict(X)
    return pred2rank_result( session_id, X, pred )



## Train

In [10]:
# Train on the nine shards in ascending order: train_model() resumes every
# shard after the first from the model file saved for the previous one.
for i in range(9):
    model = train_model(i)

Starting training... train_0
Training finished 10.134701251983643
Starting training... train_1
Training finished 10.276913404464722
Starting training... train_2
Training finished 11.178255558013916
Starting training... train_3
Training finished 12.348575115203857
Starting training... train_4
Training finished 13.534945726394653
Starting training... train_5
Training finished 14.866919755935669
Starting training... train_6
Training finished 16.920180320739746
Starting training... train_7
Training finished 19.29195523262024
Starting training... train_8
Training finished 3.858412981033325


## Validate

In [5]:
%%time
# NOTE(review): train_test_split is not used in this cell; it would be needed
# to carve out a real held-out set (see the caveat below).
from sklearn.model_selection import train_test_split

# Score the final model (saved after shard 8) on every shard.
# Caveat: these are the same shards the model was trained on, so the printed
# MRR values are optimistic and are not a held-out estimate.
bst = lgb.Booster(model_file=project_root_path + '/model/lgbm_gpu/lgbm_8')
for validate_id in range(9):
    # Load the features and labels of this shard.
    X_path = project_root_path + '/dataset/train_features/train_X_' + str(validate_id) + '.pickle'
    y_path = project_root_path + '/dataset/train_features/train_y_' + str(validate_id) + '.pickle'
    with open( X_path, 'rb') as f:
        X = pickle.load(f)
    with open( y_path, 'rb') as f:
        y = pickle.load(f)

    # Split the shard back into per-session chunks of n_train_sample rows,
    # mirroring the query groups used for training.
    X_sessions = np.array_split(X, len(X)//n_train_sample)
    y_sessions = np.array_split(y, len(y)//n_train_sample)
    print(MRR( X_sessions, y_sessions, bst))

0.39531558829031727
0.4150499200580192
0.4007683544416681
0.406037735229362
0.4029493822656246
0.40126911153904177
0.4019283576651242
0.4072011974154905
0.513792690531054
CPU times: user 31min 43s, sys: 3.95 s, total: 31min 47s
Wall time: 3min 40s


## Predict

In [6]:
# Reload the final booster (saved after shard 8) for inference. The path is
# built from project_root_path so it follows the configured project root.
bst = lgb.Booster(model_file=project_root_path + '/model/lgbm_gpu/lgbm_8')
bst

<lightgbm.basic.Booster at 0x7f073b11ca30>

In [7]:
# Collect ranked rows for every session listed in the leaderboard file.
results = []
with open(project_root_path + '/dataset/test_leaderboard_uniq_sessions') as f:
    # The first line is skipped as a header. Blank lines (e.g. a trailing
    # newline at the end of the file) are dropped so int() cannot fail on them.
    session_lines = [line for line in f.readlines()[1:] if line.strip()]

# The file is already closed here; it does not need to stay open while predicting.
for line in tqdm(session_lines):
    session_id = int(line)
    results.extend(predict_session(bst, session_id ))

100%|██████████| 50000/50000 [02:07<00:00, 392.81it/s]


In [8]:
# Write the submission file: one header line followed by the ranked rows.
result_dir = project_root_path + '/result'
# Create the directory if needed so the write cannot fail after all
# predictions have already been computed.
os.makedirs(result_dir, exist_ok=True)
with open(result_dir + '/lgbm_train_gpu_8.csv', 'w') as f:
    f.write('session_id,item_id,rank\n')
    f.write('\n'.join(results))