# LGBM Prediction



In [1]:
from google.colab import drive
drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/RecSys2024/'
# 3.6.0以降だとLightGBMTunerが動かない
#!pip3 install optuna==3.5.0
!pip3 install polars lightgbm pyarrow

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# ====================================================
# Library
# ====================================================
import os
import gc
import warnings
# Silence every warning category. This runs before the remaining imports,
# so the filter is already in effect while they load.
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import polars as pl
import joblib
import pyarrow
import itertools
# Widen pandas' display limits (rows, columns, total line width) for notebook output.
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm

# Ensemble

In [3]:
# Blend weight: sub1's score is multiplied by `weight`, sub2's by (1 - weight).
# The value is also embedded in the name of the prediction file that gets written.
weight = 0.7

In [4]:
# Per-candidate scores of the two LightGBM runs that are blended below.
# os.path.join yields a clean path whether or not base_path ends with a slash
# (plain f-string concatenation produced ".../RecSys2024//output/...").
sub1 = pl.read_parquet(os.path.join(base_path, "output", "score_lgbm_valid_frac1.0.parquet"))  # trained on valid, Public LB=0.73
sub2 = pl.read_parquet(os.path.join(base_path, "output", "score_lgbm_train_frac1.0.parquet"))  # trained on train, Public LB=0.72

In [5]:
# Preview the scores loaded into sub1.
sub1

impression_id,user_id,to_article_id,score
i32,i32,i32,f64
6451339,35982,9796527,-0.062023
6451339,35982,7851321,-0.650809
6451339,35982,9798805,0.025555
6451339,35982,9795150,-0.412612
6451339,35982,9531110,-0.42637
…,…,…,…
0,1225161,9792362,-0.641075
0,1225161,9788041,-0.719298
0,1225161,9790135,-0.632726
0,1225161,9792408,-0.667543


In [6]:
# Preview the scores loaded into sub2.
sub2

impression_id,user_id,to_article_id,score
i32,i32,i32,f64
6451339,35982,9796527,-0.284338
6451339,35982,7851321,-0.854356
6451339,35982,9798805,0.130775
6451339,35982,9795150,-0.527686
6451339,35982,9531110,-0.621288
…,…,…,…
0,1225161,9792362,-0.610987
0,1225161,9788041,-0.626389
0,1225161,9790135,-0.637639
0,1225161,9792408,-0.62608


In [7]:
# Blend the two score sets row by row: `weight` goes to sub1's score and the
# remainder (1 - weight) to sub2's, which the join exposes as `score_right`.
join_keys = ["impression_id", "user_id", "to_article_id"]
sub_en = sub1.join(sub2, how='left', on=join_keys).with_columns(
    (pl.col('score') * weight + pl.col('score_right') * (1 - weight)).alias('score_ensemble')
)

# A left join hides mismatches between the two files, so check them explicitly:
# duplicate keys multiply rows, and keys missing from sub2 leave a null
# `score_right`, which would silently turn the blended score into null.
if sub_en.height != sub1.height:
    raise ValueError(
        f"join changed the row count ({sub1.height} -> {sub_en.height}); "
        "the join keys are not unique"
    )
n_unmatched = sub_en['score_right'].null_count()
if n_unmatched:
    raise ValueError(f"{n_unmatched} rows of sub1 have no score in sub2")
sub_en

impression_id,user_id,to_article_id,score,score_right,score_ensemble
i32,i32,i32,f64,f64,f64
6451339,35982,9796527,-0.062023,-0.284338,-0.128717
6451339,35982,7851321,-0.650809,-0.854356,-0.711873
6451339,35982,9798805,0.025555,0.130775,0.057121
6451339,35982,9795150,-0.412612,-0.527686,-0.447134
6451339,35982,9531110,-0.42637,-0.621288,-0.484845
…,…,…,…,…,…
0,1225161,9792362,-0.641075,-0.610987,-0.632048
0,1225161,9788041,-0.719298,-0.626389,-0.691425
0,1225161,9790135,-0.632726,-0.637639,-0.6342
0,1225161,9792408,-0.667543,-0.62608,-0.655104


In [8]:
# Keep the three key columns plus the blended score, exposed under the name 'score'.
sub_en = sub_en.select(
    'impression_id',
    'user_id',
    'to_article_id',
    pl.col('score_ensemble').alias('score'),
)
sub_en

impression_id,user_id,to_article_id,score
i32,i32,i32,f64
6451339,35982,9796527,-0.128717
6451339,35982,7851321,-0.711873
6451339,35982,9798805,0.057121
6451339,35982,9795150,-0.447134
6451339,35982,9531110,-0.484845
…,…,…,…
0,1225161,9792362,-0.632048
0,1225161,9788041,-0.691425
0,1225161,9790135,-0.6342
0,1225161,9792408,-0.655104


# Output

In [9]:
# Directory holding the feature-stage artifacts; os.path.join works whether or
# not base_path ends with a slash.
out_path = os.path.join(base_path, 'feature_output')
group_keys = ['impression_id', 'user_id']

# Rank every candidate article inside its (impression_id, user_id) group,
# 1 = highest score. `over` evaluates the rank per group while leaving the rows
# where they are, so no pre-sort and no self-join back onto sub_en is needed.
# Equal scores are ranked in row order ("ordinal").
ranked = sub_en.with_columns(
    pl.col('score').rank(method='ordinal', descending=True).over(group_keys).alias('rank')
)

# One row per group with its ranks collected into a list; rows keep their
# relative order inside each group, so the list follows sub_en's candidate order.
out = ranked.group_by(group_keys).agg(pl.col('rank'))

# Attach the rank lists to the test impression index. Keys are cast to Int32 to
# match the score frames. The left join keeps every test impression; one
# without predictions ends up with a null `rank`.
# NOTE(review): the final row order relies on the left join preserving the
# left frame's order; newer Polars releases expose `maintain_order` on join -
# consider passing maintain_order='left' once the Polars version is pinned.
test_ids = pl.read_parquet(os.path.join(out_path, 'test_impression.parquet')).select([
    pl.col("impression_id").cast(pl.Int32),
    pl.col("user_id").cast(pl.Int32)
])
out = test_ids.join(out, on=group_keys, how='left')
out

impression_id,user_id,rank
i32,i32,list[u32]
6451339,35982,"[4, 8, … 9]"
6451363,36012,"[1, 6, … 5]"
6451382,36162,"[2, 5, … 4]"
6451383,36162,"[1, 6, … 4]"
6451385,36162,"[4, 3, … 7]"
…,…,…
0,1589163,"[101, 140, … 217]"
0,1699456,"[89, 136, … 217]"
0,635479,"[123, 137, … 217]"
0,251030,"[99, 129, … 217]"


In [10]:
# check beyond accuracy impression
out.filter(pl.col('impression_id') == 0)

impression_id,user_id,rank
i32,i32,list[u32]
0,1049297,"[100, 156, … 217]"
0,231624,"[90, 141, … 217]"
0,716356,"[87, 149, … 217]"
0,1440307,"[103, 155, … 217]"
0,1822406,"[118, 133, … 217]"
…,…,…
0,1589163,"[101, 140, … 217]"
0,1699456,"[89, 136, … 217]"
0,635479,"[123, 137, … 217]"
0,251030,"[99, 129, … 217]"


In [11]:
# Destination of the prediction file; the blend weight is part of the file name.
file_name = os.path.join(base_path, 'output', f'pred_ensemble_weight_{weight}.txt')

# Fail before touching the file: an impression without predictions carries a
# null rank list, which would otherwise raise midway and leave a truncated file.
n_unranked = out['rank'].null_count()
if n_unranked:
    raise ValueError(f"{n_unranked} test impressions have no predictions to rank")

with open(file_name, 'w') as f:
    # iter_rows streams the rows in batches instead of materialising one dict
    # per row for the whole frame up front.
    for impression_id, ranks in out.select('impression_id', 'rank').iter_rows():
        # Join the ranks as strings: formatting the list directly would put a
        # space after every comma.
        rank_str = ','.join(map(str, ranks))
        f.write(f"{impression_id} [{rank_str}]\n")

### Check output

In [12]:
!head {file_name}

6451339 [4,8,2,6,7,3,1,5,9]
6451363 [1,6,3,7,8,2,4,5]
6451382 [2,5,1,3,4]
6451383 [1,6,10,2,3,7,11,5,9,8,4]
6451385 [4,3,2,6,5,1,7]
6451411 [9,1,8,4,2,5,6,7,3]
6451412 [3,4,2,6,8,7,1,5]
6451423 [24,29,32,7,2,23,16,18,28,30,8,15,1,33,11,27,31,6,22,20,12,10,17,14,9,25,21,19,26,13,5,3,4]
6451425 [4,3,1,6,2,5]
6451426 [2,5,1,3,4]


In [13]:
!tail {file_name}

0 [107,147,250,83,175,240,59,12,138,178,157,181,208,186,212,167,54,145,68,94,144,84,131,130,239,238,6,8,98,79,69,64,230,123,180,204,206,233,132,104,173,170,116,28,49,244,23,205,95,33,241,89,211,65,148,171,155,209,177,232,30,162,1,245,9,103,133,193,67,126,72,55,115,66,121,201,25,153,156,81,43,56,48,90,41,75,96,176,73,5,38,86,246,161,85,19,142,159,120,46,196,122,191,78,87,102,172,109,198,210,188,202,101,168,179,226,231,7,61,24,112,39,242,152,223,110,88,150,234,106,174,200,119,129,13,14,216,160,124,134,135,117,189,139,225,60,228,27,214,47,195,219,137,118,58,77,3,80,221,34,151,76,207,235,45,199,158,113,215,187,197,243,164,227,52,99,35,185,21,154,108,237,44,165,22,220,125,29,213,17,218,194,149,166,222,127,42,57,146,182,203,100,16,224,97,247,18,15,93,82,184,74,91,71,192,111,63,31,40,32,37,51,62,236,26,128,114,20,249,4,11,2,105,183,10,136,229,163,169,36,70,190,140,141,143,50,248,53,92,217]
0 [107,139,250,83,175,240,59,12,138,182,166,178,208,186,212,155,54,146,68,116,151,84,145,143,239,238,6,8

In [14]:
!wc -l {file_name}

13536710 /content/drive/MyDrive/RecSys2024//output/pred_ensemble_weight_0.7.txt
