In [None]:
import lightgbm as lgb
import pandas as pd
import pickle
import numpy as np
import gc
import time
from tqdm import tqdm
from sklearn.model_selection import train_test_split

In [None]:
# Root of the project tree; the training/validation feature paths are built
# relative to it. Alternative root for a Colab / Google Drive checkout:
# project_root_path = '/content/drive/MyDrive/NCCU1102/WSM/proj3/RecSys-Dressipi/'
project_root_path = '../..'
# Fixed number of rows per ranking group: used as the LightGBM query-group size
# during training and to split validation rows into per-session chunks.
n_train_sample = 500

In [None]:
def select_features( X, select_cols=None ):
    """Return a dense float matrix holding only the chosen feature columns.

    Parameters
    ----------
    X : 2-D numpy array, or sequence of 1-D numpy arrays
        One feature vector per row.
    select_cols : sequence of int, optional
        Column indices to keep, in output order. Defaults to columns 0-10 and
        14-16 (columns 11-13 are dropped).

    Returns
    -------
    numpy.ndarray
        Array of shape (len(X), len(select_cols)) with numpy's default float
        dtype. An empty X yields an array with zero rows.
    """
    if select_cols is None:
        select_cols = [0,1,2,3,4,5,6,7,8,9,10,14,15,16]
        # select_cols = list(range(17))
    else:
        select_cols = list(select_cols)

    n_rows = len(X)
    new_X = np.zeros((n_rows, len(select_cols)))
    if isinstance(X, np.ndarray) and X.ndim == 2 and X.dtype != object:
        # Fast path: a single fancy-indexing call instead of a Python loop.
        new_X[:] = X[:, select_cols]
    else:
        # Fallback for lists / object arrays of per-row numpy vectors.
        for i in range(n_rows):
            new_X[i] = X[i][select_cols]
    return new_X

        

In [None]:
def train_model( i ):
    """Train (or continue training) the LightGBM ranker on training chunk ``i``.

    Loads ``train_X_<i>.pickle`` / ``train_y_<i>.pickle`` from the project's
    ``dataset/train_features`` directory, fits an ``LGBMRanker`` with
    fixed-size query groups of ``n_train_sample`` rows, and saves the booster
    as ``model/lgbm/lgbm_<i>`` under ``project_root_path``. For ``i > 0``
    training resumes from the checkpoint saved for chunk ``i - 1``.

    Parameters
    ----------
    i : int
        Index of the training chunk.

    Returns
    -------
    lightgbm.LGBMRanker
        The fitted ranker.

    Raises
    ------
    ValueError
        If the number of rows is not a multiple of ``n_train_sample``.
    """
    feature_dir = project_root_path + '/dataset/train_features/'
    # Checkpoints live under the same root as the data, so switching
    # project_root_path moves both consistently.
    model_dir = project_root_path + '/model/lgbm/'

    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open( feature_dir + 'train_X_' + str(i) + '.pickle', 'rb') as f:
        X = pickle.load(f)
    with open( feature_dir + 'train_y_' + str(i) + '.pickle', 'rb') as f:
        y = pickle.load(f)

    # Every query group is assumed to hold exactly n_train_sample rows.
    # LightGBM rejects groups whose sizes do not sum to the row count, so
    # fail early with a clear message instead of after feature selection.
    if len(X) % n_train_sample != 0:
        raise ValueError(
            "train_" + str(i) + " has " + str(len(X)) + " rows, which is not a "
            "multiple of n_train_sample=" + str(n_train_sample)
        )
    query_train = [n_train_sample] * (len(X)//n_train_sample)

    print("Starting training... train_" + str(i) )
    start_time = time.time()
    gbm = lgb.LGBMRanker(device = 'cpu')
    # The first chunk trains from scratch; later chunks continue from the
    # previous chunk's checkpoint.
    init_model = None if i == 0 else model_dir + 'lgbm_' + str(i-1)
    gbm.fit(select_features(X), y, group=query_train, init_model=init_model)
    gbm.booster_.save_model(model_dir + 'lgbm_' + str(i))
    print("Training finished " + str(time.time() - start_time))

    return gbm


In [None]:
def MRR( X_sessions, y_sessions, y_preds, k=100 ):
    """Mean reciprocal rank of the first positive item within the top ``k``.

    For each session the candidates are ranked by descending predicted score
    and only the ``k`` best are considered. The session contributes
    ``1 / rank`` of the first candidate whose label equals 1, or 0 when no
    such candidate is in the top ``k``.

    Parameters
    ----------
    X_sessions : sequence
        Per-session features. Not read, but it takes part in the ``zip`` and
        therefore bounds the number of sessions evaluated.
    y_sessions : sequence of array-like
        Per-session labels, aligned with the predictions.
    y_preds : sequence of array-like
        Per-session predicted scores.
    k : int, optional
        Rank cutoff (default 100).

    Returns
    -------
    float
        Mean of the per-session scores; NaN when there are no sessions.

    Raises
    ------
    ValueError
        If ``k`` is smaller than 1.
    """
    if k < 1:
        # A slice of [-0:] would silently select *all* candidates.
        raise ValueError("k must be >= 1, got " + str(k))

    score = []
    for _X, y, pred in zip( X_sessions, y_sessions, y_preds ):
        # argsort is ascending: take the last k and reverse for best-first.
        ranked = np.argsort(pred)[-k:][::-1]
        for count, index in enumerate(ranked):
            if y[index] == 1:
                score.append(1/(count+1))
                break
        else:
            # No positive item made it into the top k.
            score.append(0)

    if not score:
        return np.nan
    return np.mean(score)

def pred2rank_result( session_id, X, pred ):
    """Format the 100 highest-scored rows of a session as ranking lines.

    Rows are ordered by descending ``pred``. Each returned string is
    ``"<session_id>,<int(X[row][0])>,<rank>"`` with ranks starting at 1.
    Column 0 of ``X`` is presumably the item id (TODO confirm against the
    feature builder).
    """
    # argsort is ascending, so keep the tail and flip it to get best-first.
    best_first = np.argsort(pred)[-100:][::-1]
    session_field = str(session_id)
    return [
        ','.join([session_field, str(int(X[row][0])), str(position)])
        for position, row in enumerate(best_first, start=1)
    ]

def predict_session( gbm, session_id ):
    """Score one test session and return its formatted ranking lines.

    Loads ``test_X_<session_id>.pickle`` from the project's
    ``dataset/test_features`` directory, predicts with ``gbm`` on the selected
    feature columns, and formats the result with ``pred2rank_result``.

    Parameters
    ----------
    gbm : object with a ``predict`` method
        Trained model, e.g. a ``lightgbm.Booster``.
    session_id : int
        Session whose feature file should be scored.

    Returns
    -------
    list of str
        Ranking lines as produced by ``pred2rank_result``.
    """
    # Build the path from project_root_path so that test features are read
    # from the same tree as the training features.
    X_path = project_root_path + '/dataset/test_features/test_X_' + str(session_id) + '.pickle'
    # NOTE: pickle.load can execute arbitrary code; only load trusted files.
    with open(X_path, 'rb') as f:
        X = pickle.load(f)
    pred = gbm.predict(select_features(X))
    return pred2rank_result( session_id, X, pred )



## Train

In [None]:
# Train incrementally over chunks 0..8: per train_model, every chunk after the
# first resumes from the checkpoint saved for the previous chunk.
# NOTE(review): chunk 8 is also read by the validation cell below -- confirm
# that scoring on a chunk seen during training is intended.
for i in range(9):
    model = train_model( i )

## Validate

In [None]:
%%time

# NOTE(review): the training loop in this notebook only writes lgbm_0..lgbm_8,
# so lgbm_31 must come from another run -- confirm the intended checkpoint.
bst = lgb.Booster(model_file=project_root_path + '/model/lgbm/lgbm_31')
for validate_id in range(8,10):
    # load the validation chunk
    X_path = project_root_path + '/dataset/train_features/train_X_' + str(validate_id) + '.pickle'
    y_path = project_root_path + '/dataset/train_features/train_y_' + str(validate_id) + '.pickle'
    with open( X_path, 'rb') as f:
        X = pickle.load(f)
    with open( y_path, 'rb') as f:
        y = pickle.load(f)

    # Split rows into per-session chunks of n_train_sample rows each.
    # NOTE(review): array_split only yields equal chunks when len(X) is a
    # multiple of n_train_sample -- confirm that holds for every chunk.
    X_sessions = np.array_split(X, len(X)//n_train_sample)
    y_sessions = np.array_split(y, len(y)//n_train_sample)

    # Drop the full arrays before predicting to keep peak memory down.
    del X
    del y
    gc.collect()

    y_preds = [bst.predict(select_features(X_session)) for X_session in X_sessions]
    print(MRR( X_sessions, y_sessions, y_preds))
    del X_sessions
    del y_sessions
    del y_preds
    gc.collect()
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

## Predict

In [None]:
# Load the checkpoint written after the last training chunk (lgbm_8), using the
# same project root as the dataset paths so both move together.
bst = lgb.Booster(model_file=project_root_path + '/model/lgbm/lgbm_8')
bst

In [None]:
# Score every leaderboard session and collect the formatted ranking lines.
results = []
with open(project_root_path + '/dataset/test_leaderboard_uniq_sessions') as f:
    # The first line is skipped (presumably a header). Blank lines, such as a
    # trailing newline at the end of the file, are ignored so int() cannot
    # fail on an empty string.
    session_lines = [line for line in f.readlines()[1:] if line.strip()]
for line in tqdm(session_lines):
    session_id = int(line)
    results.extend(predict_session(bst, session_id ))

In [None]:
# Write the submission file: a header row followed by one ranking line per
# (session, item). The path is rooted at project_root_path like the inputs.
with open(project_root_path + '/result/lgbm_s500_2021-5_f0-1014-16.csv', 'w') as f:
    f.write('session_id,item_id,rank\n')
    f.write('\n'.join(results))