In [1]:
import lightgbm as lgb
import pandas as pd
import pickle
import numpy as np
import csv
import time

In [2]:
# Root directory that the training cells prefix onto their dataset paths.
# The commented-out value is an alternative root (presumably a Google Drive
# mount used when running on Colab -- confirm before re-enabling).
# project_root_path = '/content/drive/MyDrive/NCCU1102/WSM/proj3/RecSys-Dressipi/'
project_root_path = '../..'

In [None]:
%%time
# Exploratory cell: load training chunk 0 and carve hold-out splits from it.
# NOTE(review): X_train / X_val / X_test and their labels are not referenced
# by any later cell visible in this notebook -- confirm whether this cell is
# still needed.
from sklearn.model_selection import train_test_split

# fit training data
X_path = project_root_path + '/dataset/train_features/train_X_0.pickle'
y_path = project_root_path + '/dataset/train_features/train_y_0.pickle'
# pickle.load can execute arbitrary code; only open feature files you trust.
with open( X_path, 'rb') as f:
    X = pickle.load(f)
with open( y_path, 'rb') as f:
    y = pickle.load(f)

# First split holds out 20% of the rows as a test set; the second split takes
# 20% of the remaining rows as validation (roughly 64% / 16% / 20% overall).
# Both use random_state=1 so the partition is reproducible.
# NOTE(review): these are row-wise random splits; train_model below groups
# rows into consecutive blocks of 150, so a split like this would cut through
# those blocks -- confirm before using the splits for ranking evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=1)


In [3]:
def train_model( i, group_size=150 ):
    """Train an ``LGBMRanker`` on training chunk ``i`` and save its booster.

    The features and labels are unpickled from
    ``<project_root_path>/dataset/train_features/train_{X,y}_<i>.pickle``.
    Chunk 0 is trained from scratch; every later chunk continues training
    from the model file written for chunk ``i - 1``. The resulting booster
    is saved to ``<project_root_path>/model/lgbm_<i>``.

    Parameters
    ----------
    i : int
        Index of the training chunk to fit.
    group_size : int, default 150
        Number of consecutive rows that form one ranking query group.

    Returns
    -------
    lightgbm.LGBMRanker
        The fitted ranker.

    Raises
    ------
    ValueError
        If ``group_size`` is not positive, or if the number of rows in the
        chunk is not an exact multiple of ``group_size`` (LightGBM requires
        the group sizes to sum to the number of rows).
    """
    if group_size <= 0:
        raise ValueError("group_size must be positive, got " + str(group_size))

    # Build every path from project_root_path so that changing the root moves
    # the features and the saved models together.
    feature_dir = project_root_path + '/dataset/train_features/'
    model_prefix = project_root_path + '/model/lgbm_'

    # fit training data
    # pickle.load can execute arbitrary code; only open feature files you trust.
    with open( feature_dir + 'train_X_' + str(i) + '.pickle', 'rb') as f:
        X = pickle.load(f)
    with open( feature_dir + 'train_y_' + str(i) + '.pickle', 'rb') as f:
        y = pickle.load(f)

    # Fail early with a readable message instead of letting LightGBM reject
    # a group list whose sizes do not add up to the number of rows.
    n_groups, leftover = divmod(len(X), group_size)
    if leftover:
        raise ValueError(
            "train_X_" + str(i) + " has " + str(len(X)) + " rows, which is not a "
            "multiple of the query group size " + str(group_size)
        )
    query_train = [group_size] * n_groups

    print("Starting training... train_" + str(i) )
    start_time = time.time()
    gbm = lgb.LGBMRanker()
    # init_model=None (the fit default) trains from scratch; otherwise
    # training continues from the previous chunk's saved model.
    init_model = None if i == 0 else model_prefix + str(i-1)
    gbm.fit(X, y, group=query_train, init_model=init_model)
    gbm.booster_.save_model(model_prefix + str(i))
    print("Training finished " + str(time.time() - start_time))

    return gbm


In [8]:
def pred2rank_result( session_id, X, pred ):
    """Turn per-row scores into ``"session_id,item_id,rank"`` CSV rows.

    The (at most) 100 highest-scoring rows are kept and listed from best to
    worst, with ranks starting at 1. The value written in the second field
    is the first element of the corresponding row of ``X``, cast to ``int``.

    Parameters
    ----------
    session_id
        Identifier written as the first field of every row.
    X
        Row-indexable feature container; ``X[k][0]`` must be int-convertible.
    pred
        One score per row of ``X``; a higher score means a better rank.

    Returns
    -------
    list of str
        One comma-separated line per ranked row, best first.
    """
    # argsort is ascending, so take the tail and flip it to get best-first.
    best_first = np.argsort(pred)[-100:][::-1]
    return [
        ','.join([str(session_id), str(int(X[row_idx][0])), str(position)])
        for position, row_idx in enumerate(best_first, start=1)
    ]

def predict_session( gbm, session_id ):
    """Score one test session and return its ranked CSV rows.

    The session's features are unpickled from
    ``<project_root_path>/dataset/test_features/test_X_<session_id>.pickle``,
    scored with ``gbm.predict`` and converted by ``pred2rank_result``.

    Parameters
    ----------
    gbm
        Fitted model exposing ``predict(X)``.
    session_id
        Identifier of the session; used both in the file name and in the
        emitted rows.

    Returns
    -------
    list of str
        The rows produced by ``pred2rank_result`` for this session.
    """
    # Built from project_root_path (as the training feature paths are) so
    # that changing the root relocates the test features too.
    features_path = (project_root_path + '/dataset/test_features/test_X_'
                     + str(session_id) + '.pickle')
    # pickle.load can execute arbitrary code; only open feature files you trust.
    with open(features_path, 'rb') as f:
        X = pickle.load(f)
    pred = gbm.predict(X)
    return pred2rank_result( session_id, X, pred )



In [None]:
# Train on chunks 0..8 in order. The order matters: for i > 0, train_model
# continues from the model file saved by the previous iteration.
# `model` ends up holding the ranker returned for the last chunk.
for i in range(0,9):
    model = train_model( i )

In [9]:
# Load the booster saved after the last training chunk (index 8). The path is
# built from project_root_path, like the dataset paths, so that changing the
# root relocates the model directory as well.
bst = lgb.Booster(model_file=project_root_path + '/model/lgbm_8')
bst

<lightgbm.basic.Booster at 0x17fd27e20>

In [10]:
# Read the leaderboard session ids. The first line of the file is skipped
# (treated as a header) and blank lines are ignored so that a trailing empty
# line does not make int() raise.
with open(project_root_path + '/dataset/test_leaderboard_uniq_sessions') as f:
    next(f, None)  # skip the header line; harmless if the file is empty
    sessions = [ int(line) for line in f if line.strip() ]

# Collect the ranked CSV rows of every session into one flat list.
results = []
for session_id in sessions:
    results.extend(predict_session(bst, session_id ))

In [11]:
# Write the submission file: a header line followed by one row per ranked
# item. Rows are newline-joined, so the file has no trailing newline.
# The path is built from project_root_path, like the dataset paths.
with open(project_root_path + '/result/lgbm_train8.csv', 'w') as f:
    f.write('session_id,item_id,rank\n')
    f.write('\n'.join(results))