# Module 

In [1]:
import os 
import pandas as pd
import lightgbm as lgb
from tqdm import tqdm

from lightgbm import LGBMRanker

This means that in case of installing LightGBM from PyPI via the ``pip install lightgbm`` command, you don't need to install the gcc compiler anymore.
Instead of that, you need to install the OpenMP library, which is required for running LightGBM on the system with the Apple Clang compiler.
You can install the OpenMP library by the following command: ``brew install libomp``.


# Data Process 

code from [LightGBM Benchmark Repository](https://github.com/guolinke/boosting_tree_benchmarks)

In [3]:
def convert(input_filename, out_data_filename, out_query_filename, out_query_filename2):
	input = open(input_filename,"r")
	output_feature = open(out_data_filename,"w")
	output_query = open(out_query_filename,"w")
	output_query2 = open(out_query_filename2,"w")
	cur_cnt = 0
	cur_doc_cnt = 0
	last_qid = -1
	while True:
		line = input.readline()
		if not line:
			break
		tokens = line.split(' ')
		tokens[-1] = tokens[-1].strip()
		label = tokens[0]
		qid = int(tokens[1].split(':')[1])
		if qid != last_qid:
			if cur_doc_cnt > 0:
				output_query.write(str(cur_doc_cnt) + '\n')
				output_query2.write(str(cur_doc_cnt) + '\n')
				cur_cnt += 1
			cur_doc_cnt = 0
			last_qid = qid
		cur_doc_cnt += 1
		output_feature.write(label+' ')
		output_feature.write(' '.join(tokens[2:]) + '\n')
	output_query.write(str(cur_doc_cnt) + '\n')
	output_query2.write(str(cur_doc_cnt) + '\n')
	
	input.close()
	output_query.close()
	output_feature.close()
	output_query2.close()

# convert("train.txt","msltr.train","msltr.train.query","msltr.train.group")
# convert("test.txt","msltr.test","msltr.test.query","msltr.test.group")

#  Config 

In [18]:
data_dir = '../data/'
debug = False

debug_group = 100


In [19]:
! ls {data_dir}

dataexpo.txt       msltr.test.group   msltr.train.query  train.txt
dataexpo2libsvm.py msltr.test.query   msltr2libsvm.py    vali.txt
higgs2libsvm.py    msltr.train        readme.md          yahoo2libsvm.py
msltr.test         msltr.train.group  test.txt


In [20]:
train_data_dir = os.path.join(data_dir, 'msltr.train')
train_group_dir = os.path.join(data_dir, 'msltr.train.group')

test_data_dir = os.path.join(data_dir, 'msltr.test')
test_group_dir = os.path.join(data_dir, 'msltr.test.group')

# Data Info 

In [21]:
df = pd.read_csv(train_data_dir, header=None, sep=' ', nrows=100)

In [22]:
df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,128,129,130,131,132,133,134,135,136,137
0,2,1:3,2:3,3:0,4:0,5:3,6:1,7:1,8:0,9:0,...,128:11089534,129:2,130:116,131:64034,132:13,133:3,134:0,135:0,136:0,
1,2,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,9:0,...,128:11089534,129:2,130:124,131:64034,132:1,133:2,134:0,135:0,136:0,
2,0,1:3,2:0,3:2,4:0,5:3,6:1,7:0,8:0.666667,9:0,...,128:3,129:1,130:124,131:3344,132:14,133:67,134:0,135:0,136:0,
3,2,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,9:0,...,128:11089534,129:13,130:123,131:63933,132:1,133:3,134:0,135:0,136:0,
4,1,1:3,2:0,3:3,4:0,5:3,6:1,7:0,8:1,9:0,...,128:5,129:7,130:256,131:49697,132:1,133:13,134:0,135:0,136:0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,0,1:5,2:0,3:0,4:2,5:6,6:0.714286,7:0,8:0,9:0.285714,...,128:10,129:0,130:5338,131:39632,132:2,133:5,134:0,135:2,136:57,
96,0,1:7,2:0,3:2,4:0,5:7,6:1,7:0,8:0.285714,9:0,...,128:0,129:2,130:5885,131:51991,132:1,133:1,134:0,135:0,136:0,
97,0,1:6,2:2,3:2,4:0,5:6,6:0.857143,7:0.285714,8:0.285714,9:0,...,128:240,129:1,130:50712,131:51991,132:1,133:1,134:0,135:0,136:0,
98,0,1:3,2:0,3:3,4:3,5:3,6:0.428571,7:0,8:0.428571,9:0.428571,...,128:27,129:0,130:1284,131:65535,132:7,133:1,134:0,135:0,136:0,


In [23]:
# data_path = train_data_dir
# group_path = train_group_dir

def load_data(data_path: str, group_path: str, debug=False, debug_num=3):
#     data = lgb.Dataset(data_path)
    group_lst = list(pd.read_csv(group_path, header=None).T.values[0])
    if debug:
        group_lst = group_lst[:debug_num]
        df = pd.read_csv(data_path, header=None, sep=' '
                     , nrows=sum(group_lst)
                    )
    else:
        df = pd.read_csv(data_path, header=None, sep=' '
                    )
    label = df[0]
    df = df.drop(137, axis=1)
    df = df.drop(0, axis=1)

    def map_col(row):
        lst = row.split(":")
        return float(lst[-1])

    for col in tqdm(df.columns):
        df[col] = df[col].apply(map_col)
    
    return df, label, group_lst


In [24]:
train_X, train_y, train_group = load_data(train_data_dir, train_group_dir, debug=debug, debug_num=debug_group)
test_X, test_y, test_group = load_data(test_data_dir, test_group_dir, debug=debug, debug_num=debug_group)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 136/136 [02:16<00:00,  1.00s/it]
100%|███████████████████████████████████████████████████████████████████████████████████████████████████| 136/136 [00:41<00:00,  3.30it/s]


In [25]:
assert train_X.shape[0] == sum(train_group)
assert test_X.shape[0] == sum(test_group)

In [26]:
# train_X.info()

In [27]:
# train_group

In [28]:
# test_y

# Model Train 

In [35]:
param = {
    "task": "train",
    # "num_leaves": 255,
    # "min_data_in_leaf": 1,
    # "min_sum_hessian_in_leaf": 100,
    "objective": "lambdarank",
    "metric": "ndcg",
    # "ndcg_eval_at": [
    #     1,
    #                  3, 5, 10
    #                  ],
    "learning_rate": 0.1, 
    'num_iterations': 1000
}


# In[16]:


ranker = LGBMRanker()



In [36]:
# In[1]:

print('Train begin...')
ranker.fit(train_X, train_y, group=train_group,
            # eval_set=[(train_X, train_y)], eval_group=[train_group],

        eval_set=[(train_X, train_y), (test_X, test_y)], eval_group=[train_group, test_group],
        eval_at=[5, 
#                  10, 20
                ], 
           early_stopping_rounds=50
           , verbose=True
          )

Train begin...
[1]	training's ndcg@5: 0.381274	valid_1's ndcg@5: 0.382798
Training until validation scores don't improve for 50 rounds.
[2]	training's ndcg@5: 0.403636	valid_1's ndcg@5: 0.404708
[3]	training's ndcg@5: 0.410479	valid_1's ndcg@5: 0.412599
[4]	training's ndcg@5: 0.414463	valid_1's ndcg@5: 0.412992
[5]	training's ndcg@5: 0.416583	valid_1's ndcg@5: 0.415575
[6]	training's ndcg@5: 0.421209	valid_1's ndcg@5: 0.420117
[7]	training's ndcg@5: 0.421468	valid_1's ndcg@5: 0.419927
[8]	training's ndcg@5: 0.423462	valid_1's ndcg@5: 0.42044
[9]	training's ndcg@5: 0.426848	valid_1's ndcg@5: 0.422858
[10]	training's ndcg@5: 0.428781	valid_1's ndcg@5: 0.424683
[11]	training's ndcg@5: 0.430061	valid_1's ndcg@5: 0.426438
[12]	training's ndcg@5: 0.434252	valid_1's ndcg@5: 0.430423
[13]	training's ndcg@5: 0.435003	valid_1's ndcg@5: 0.430947
[14]	training's ndcg@5: 0.436777	valid_1's ndcg@5: 0.43102
[15]	training's ndcg@5: 0.43756	valid_1's ndcg@5: 0.432384
[16]	training's ndcg@5: 0.43852	val

LGBMRanker()

# Model Evaluation

In [1]:
# ranker.predict(tra)

# Reference 

1. [Introduction to Learning to Rank](https://everdark.github.io/k9/notebooks/ml/learning_to_rank/learning_to_rank.html#Introduction-to-Learning-to-Rank)
2. [Learning-to-rank with LightGBM (Code example in python)](https://tamaracucumides.medium.com/learning-to-rank-with-lightgbm-code-example-in-python-843bd7b44574)
2. [LightGBM Benchmark Repository](https://github.com/guolinke/boosting_tree_benchmarks)
4. [Microsoft Learning to Rank Datasets](https://www.microsoft.com/en-us/research/project/mslr/?from=http%3A%2F%2Fresearch.microsoft.com%2Fen-us%2Fprojects%2Fmslr%2F)
5. [2019 RecSys Challenge First Solution](https://github.com/logicai-io/recsys2019)