In [55]:
import lightgbm as lgb
import pandas as pd
import pickle
import numpy as np
import csv
import time
from tqdm import tqdm

In [60]:
# Root directory of the project; used to locate the training feature pickles.
# Alternative root for a Google Drive / Colab checkout (kept for reference):
# project_root_path = '/content/drive/MyDrive/NCCU1102/WSM/proj3/RecSys-Dressipi/'
project_root_path = '../..'
# Number of consecutive rows that make up one ranking group (query) in the
# feature files -- presumably the candidates of a single session; TODO confirm.
n_train_sample = 500

In [57]:
def train_model( i ):
    """Fit an ``LGBMRanker`` on training chunk ``i`` and save its booster.

    Features and labels are read from
    ``<project_root_path>/dataset/train_features/train_X_<i>.pickle`` and
    ``train_y_<i>.pickle``.  For ``i > 0`` the fit is warm-started from the
    booster previously saved for chunk ``i - 1``, so chunks have to be trained
    in increasing order.  The fitted booster is written to
    ``<project_root_path>/model/lgbm_<i>``.

    Args:
        i: Zero-based index of the training chunk.

    Returns:
        The fitted ``lgb.LGBMRanker``.

    Raises:
        ValueError: If the number of feature rows is not a multiple of
            ``n_train_sample``; the fixed-size groups would then not cover
            every row, which LightGBM rejects.
    """
    feature_dir = project_root_path + '/dataset/train_features/'
    # Build model paths from the configured root so that changing
    # project_root_path moves the models together with the features.
    model_prefix = project_root_path + '/model/lgbm_'

    # NOTE(review): pickle.load can execute arbitrary code -- only load
    # feature files produced by this project.
    with open( feature_dir + 'train_X_' + str(i) + '.pickle', 'rb') as f:
        X = pickle.load(f)
    with open( feature_dir + 'train_y_' + str(i) + '.pickle', 'rb') as f:
        y = pickle.load(f)

    # Every ranking group has exactly n_train_sample consecutive rows.
    n_groups, leftover = divmod(len(X), n_train_sample)
    if leftover:
        raise ValueError(
            'train_X_' + str(i) + ' has ' + str(len(X)) + ' rows, which is not a '
            'multiple of n_train_sample=' + str(n_train_sample)
        )
    query_train = [n_train_sample] * n_groups

    print("Starting training... train_" + str(i) )
    start_time = time.time()
    gbm = lgb.LGBMRanker()
    # init_model=None (the default) trains from scratch; later chunks continue
    # from the booster saved by the previous chunk.
    init_model = None if i == 0 else model_prefix + str(i-1)
    gbm.fit(X, y, group=query_train, init_model=init_model)
    gbm.booster_.save_model(model_prefix + str(i))
    print("Training finished " + str(time.time() - start_time))

    return gbm


In [58]:
def MRR( X_sessions, y_sessions, bst, top_k=100 ):
    """Mean reciprocal rank of the first relevant item over a set of sessions.

    For every session the candidates are scored with ``bst.predict`` and
    ordered by descending score.  The session contributes ``1 / rank`` of the
    first candidate whose label equals 1 within the ``top_k`` best candidates,
    or 0 if there is none.

    Args:
        X_sessions: Iterable of per-session feature matrices.
        y_sessions: Iterable of per-session label sequences, aligned row by
            row with the matching entry of ``X_sessions``.
        bst: Any object exposing ``predict(X)`` that returns one score per row.
        top_k: Ranking cut-off; only the ``top_k`` highest-scored candidates
            are inspected.  Defaults to 100.

    Returns:
        The mean of the per-session reciprocal ranks, or ``0.0`` when no
        sessions are given.
    """
    reciprocal_ranks = []
    for X, y in zip( X_sessions, y_sessions ):
        pred = bst.predict(X)
        # Best-scored candidates first, truncated at the cut-off.
        ranked_index = np.argsort(pred)[::-1][:top_k]
        # Force positional lookup: a pandas Series would otherwise be indexed
        # by label, which is wrong for chunks that do not start at index 0.
        labels = np.asarray(y)
        hit_positions = np.flatnonzero(labels[ranked_index] == 1)
        if hit_positions.size:
            reciprocal_ranks.append(1/(hit_positions[0]+1))
        else:
            reciprocal_ranks.append(0)
    if not reciprocal_ranks:
        # np.mean([]) would return nan and emit a RuntimeWarning.
        return 0.0
    return np.mean(reciprocal_ranks)

def pred2rank_result( session_id, X, pred ):
    """Turn the scores of one session into ranked submission rows.

    The (at most) 100 highest-scored rows of ``X`` are listed from best to
    worst.  Each output string has the form ``"<session_id>,<item>,<rank>"``
    where ``<item>`` is the first column of the row cast to ``int`` and
    ``<rank>`` starts at 1.

    Args:
        session_id: Identifier written in the first field of every row.
        X: Row-indexable feature matrix; column 0 is emitted as the item field.
        pred: One score per row of ``X``.

    Returns:
        A list of comma-separated strings, best-ranked first.
    """
    best_first = np.argsort(pred)[-100:][::-1]
    session_field = str(session_id)
    return [
        ','.join((session_field, str(int(X[row][0])), str(position)))
        for position, row in enumerate(best_first, start=1)
    ]

def predict_session( gbm, session_id ):
    """Score the candidates of one test session and return its ranked rows.

    The features are read from
    ``<project_root_path>/dataset/test_features/test_X_<session_id>.pickle``,
    scored with ``gbm.predict`` and formatted by ``pred2rank_result``.

    Args:
        gbm: Model exposing ``predict(X)``.
        session_id: Identifier of the test session; selects the feature file
            and is passed on to ``pred2rank_result``.

    Returns:
        Whatever ``pred2rank_result`` returns for this session.
    """
    # Build the path from the configured root, like the training features,
    # so that changing project_root_path relocates every dataset path.
    X_path = project_root_path + '/dataset/test_features/test_X_' + str(session_id) + '.pickle'
    # NOTE(review): pickle.load can execute arbitrary code -- only load
    # feature files produced by this project.
    with open(X_path, 'rb') as f:
        X = pickle.load(f)
    pred = gbm.predict(X)
    return pred2rank_result( session_id, X, pred )



## Train

In [61]:
# Train the nine chunks strictly in order: train_model(i) warm-starts from the
# booster that train_model(i-1) saved.  `model` ends up holding the last ranker.
for i in range(9):
    model = train_model( i )

Starting training... train_0
Training finished 17.73865509033203
Starting training... train_1
Training finished 20.211987257003784
Starting training... train_2
Training finished 23.832744121551514
Starting training... train_3
Training finished 31.876275777816772
Starting training... train_4
Training finished 32.7593719959259
Starting training... train_5
Training finished 37.41308784484863
Starting training... train_6
Training finished 40.627307176589966
Starting training... train_7
Training finished 1118.8098690509796
Starting training... train_8
Training finished 8.071751832962036


## Validate

In [44]:
%%time
from sklearn.model_selection import train_test_split

for validate_id in range(9):
    # fit training data
    X_path = project_root_path + '/dataset/train_features/train_X_' + str(validate_id) + '.pickle'
    y_path = project_root_path + '/dataset/train_features/train_y_' + str(validate_id) + '.pickle'
    with open( X_path, 'rb') as f:
        X = pickle.load(f)
    with open( y_path, 'rb') as f:
        y = pickle.load(f)

    X_sessions = np.array_split(X, len(X)//n_train_sample)
    y_sessions = np.array_split(y, len(y)//n_train_sample)
    bst = lgb.Booster(model_file='../../model/lgbm_7')
    print(MRR( X_sessions, y_sessions, bst))
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

0.5379260422831995
0.5544787382727367
0.5480938037545099
0.5544980090490688
0.5559050126736478
0.5583576956894686
0.5484790388984
0.5628604026118849
0.5408063583296453
CPU times: user 14min 33s, sys: 3min 6s, total: 17min 39s
Wall time: 3min 57s


## Predict

In [62]:
# Load the booster saved after the last training chunk (index 8); the path is
# built from project_root_path so it follows the configured project root.
bst = lgb.Booster(model_file=project_root_path + '/model/lgbm_8')
bst

<lightgbm.basic.Booster at 0x2b56e4460>

In [63]:
# Read every session id first so the file is closed before the long
# prediction loop starts.  The first line is skipped (presumably a header);
# blank lines, e.g. a trailing newline at the end of the file, are ignored
# instead of crashing int().
with open(project_root_path + '/dataset/test_leaderboard_uniq_sessions') as f:
    session_ids = [int(line) for line in f.readlines()[1:] if line.strip()]

results = []
for session_id in tqdm(session_ids):
    results.extend(predict_session(bst, session_id ))

100%|██████████| 50000/50000 [04:42<00:00, 177.30it/s]


In [65]:
# Write the submission: a header line followed by the ranked rows, separated
# by newlines (no newline after the last row).
# NOTE(review): the file name says "train7" while the predictions come from the
# lgbm_8 booster -- confirm that the name is intended.
with open('../../result/lgbm_train7.csv', 'w') as f:
    f.writelines(['session_id,item_id,rank\n', '\n'.join(results)])