# LGBM Prediction



In [1]:
# Colab setup: mount Google Drive so that the files under base_path are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Root folder for every input and output of this notebook. It ends with a slash
# because some paths below are built by plain string concatenation.
base_path = '/content/drive/MyDrive/RecSys2024/'
# NOTE(review): the packages are installed unpinned (the recorded run printed
# lightgbm 4.4.0); consider pinning versions and using %pip so the install
# targets the kernel's environment.
!pip3 install polars lightgbm pyarrow optuna optuna-integration

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Selects which trained model is loaded; the value is embedded in the model
# file name and in the names of the output files written below.
#train_type = 'train'
train_type = 'valid' # use validation data as training data (original comment is truncated; presumably "for the final submission")

In [3]:
# ==================================================== # Library # ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import polars as pl
import joblib
import pyarrow
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, log_loss
import lightgbm as lgb
print(lgb.__version__)

4.4.0


# Reading Input File

In [4]:
N_split = 2 # run inference in N chunks of the test impressions
out_path = base_path + 'feature_output'  # folder holding the pre-computed test_*.parquet feature tables
model_path = base_path + 'models'  # folder holding the pickled LightGBM models
# TRAIN_FRAC is only used to build the model / output file names in this notebook.
#TRAIN_FRAC = '1.0'
TRAIN_FRAC = '0.6'

# RANDOM_SEED is likewise only used to pick the model file and name the outputs.
#RANDOM_SEED = 42 # seed used at training time
#RANDOM_SEED = 52 # seed used at training time
RANDOM_SEED = 62 # seed used at training time

# Window widths used by add_pop_count: impression_time is divided by each value
# to form time buckets (presumably seconds: 10 min, 1 h, 6 h - confirm the unit).
bins = [600, 3600, 3600 * 6]
#bins = [300, 600, 3600, 3600 * 6]

# Feature Engineering

In [5]:
def read_files(i_chunk, N_split=2):
    """Load the test feature tables and keep one chunk of the impression table.

    The impression table is cut into ``N_split`` contiguous row ranges and only
    range ``i_chunk`` is returned; every other table is returned in full.

    Args:
        i_chunk: index of the chunk to return (negative values index from the
            end, as with a list).
        N_split: number of chunks the impression table is divided into.

    Returns:
        Tuple ``(impression_article, impression, user, user_article, article,
        session)`` of polars DataFrames.

    Raises:
        IndexError: if ``i_chunk`` is outside ``[-N_split, N_split)``.
    """
    impression = pl.read_parquet(f"{out_path}/test_impression.parquet")
    # The "_w_embed" variant is read instead of test_impression_article.parquet.
    impression_article = pl.read_parquet(f"{out_path}/test_impression_article_w_embed.parquet")
    user = pl.read_parquet(f"{out_path}/test_user.parquet")
    user_article = pl.read_parquet(f"{out_path}/test_user_article.parquet")
    article = pl.read_parquet(f"{out_path}/test_article.parquet")
    session = pl.read_parquet(f"{out_path}/test_session.parquet")

    if i_chunk < 0:
        i_chunk += N_split
    if not 0 <= i_chunk < N_split:
        raise IndexError(f"i_chunk must be in [0, {N_split}), got {i_chunk}")

    n_rows = len(impression)
    rows_per_chunk = n_rows // N_split
    start = i_chunk * rows_per_chunk
    # The last chunk also takes the remainder rows; with a fixed chunk length the
    # trailing ``n_rows % N_split`` impressions would never be scored.
    if i_chunk == N_split - 1:
        length = n_rows - start
    else:
        length = rows_per_chunk
    impression = impression.slice(start, length)
    gc.collect()
    return impression_article, impression, user, user_article, article, session

def add_pop_count(df, out_col_name, bins):
    """Add in-view popularity counts and per-(user, article) impression-time stats.

    For every width in ``bins`` the rows are bucketed by
    ``impression_time / width`` and the number of rows per
    (bucket, article_id_inview) is stored in ``f"{out_col_name}{width}"``.
    Afterwards std / count of ``impression_time`` per
    (user_id, article_id_inview) are added, together with the differences
    between the row's impression_time and the group's mean / max / min.

    Args:
        df: polars DataFrame with ``impression_time``, ``article_id_inview``
            and ``user_id`` columns.
        out_col_name: prefix of the count columns.
        bins: iterable of bucket widths.

    Returns:
        The enriched DataFrame after ``reduce_mem_usage``.
    """
    for width in bins:
        bucket_col = f"time_bin_{width}"
        bucket_keys = [bucket_col, "article_id_inview"]

        df = df.with_columns(
            (pl.col("impression_time") / width).cast(pl.Int64).alias(bucket_col)
        )
        counts = df.group_by(bucket_keys).agg(
            pl.count().cast(pl.Int32).alias(out_col_name + str(width))
        )
        df = df.join(counts, on=bucket_keys, how="left").drop(bucket_col)

    # Per-user statistics over the articles shown to that user
    # (these include information from the future).
    pair_keys = ['user_id', 'article_id_inview']
    imp_time = pl.col('impression_time')
    pair_stats = df.group_by(pair_keys).agg([
        imp_time.mean().alias('imp_time_mean'),
        imp_time.std().alias('imp_time_std'),
        imp_time.max().alias('imp_time_max'),
        imp_time.min().alias('imp_time_min'),
        imp_time.count().alias('imp_time_cnt'),
    ])
    # NOTE(review): reduce_mem_usage downcasts the float mean to Float32 when it
    # fits the Float32 range. The recorded output shows diff_imp_time_mean values
    # of 25 / 60 on rows where imp_time_cnt == 1 (where 0 would be expected),
    # which looks like that rounding. The loaded model was presumably trained on
    # features built the same way, so do not change this without retraining.
    pair_stats = reduce_mem_usage(pair_stats)
    df = df.join(pair_stats, on=pair_keys, how='left')
    del pair_stats
    gc.collect()

    df = df.with_columns(
        (imp_time - pl.col('imp_time_mean')).cast(pl.Int64).alias('diff_imp_time_mean'),
        (pl.col('imp_time_max') - imp_time).alias('diff_imp_time_max'),
        (imp_time - pl.col('imp_time_min')).alias('diff_imp_time_min'),
    )
    # Only the differences (plus std and count) are kept as features.
    df = df.drop(['imp_time_mean', 'imp_time_max', 'imp_time_min'])
    return reduce_mem_usage(df)


def join_features(impression_article, impression, user, user_article, article, session, bins):
    """Assemble the candidate-level feature frame from the individual tables.

    Steps, in order: inner-join ``impression_article`` with ``impression`` on
    (impression_id, user_id); add view counts via ``add_pop_count``; left-join
    ``user`` on user_id; rename ``article_id_inview`` to ``article_id`` and
    left-join ``article``; left-join ``user_article`` on (user_id, article_id);
    left-join ``session`` on (impression_id, user_id, article_id); finally fill
    missing ``view_cnt{bin}`` values with 0 and cast them to Int32.
    ``reduce_mem_usage`` is applied after every step.

    Returns:
        The joined polars DataFrame.
    """
    print('join features....')
    feats = impression_article.join(impression, on=["impression_id", "user_id"], how="inner")
    del impression_article
    gc.collect()
    feats = reduce_mem_usage(feats)

    print('add view_cnt....')
    feats = reduce_mem_usage(add_pop_count(feats, "view_cnt", bins))

    print('add user features....')
    feats = feats.join(user, on="user_id", how="left")
    del user
    gc.collect()
    feats = reduce_mem_usage(feats)

    print('add article features....')
    # From here on the candidate article is simply called "article_id".
    feats = feats.rename({"article_id_inview": "article_id"})
    feats = feats.join(article, on="article_id", how="left")
    del article
    gc.collect()
    feats = reduce_mem_usage(feats)

    print('add user article features....')
    feats = feats.join(user_article, on=["user_id", "article_id"], how="left")
    del user_article
    gc.collect()
    feats = reduce_mem_usage(feats)

    print('add session features....')
    feats = feats.join(session, on=["impression_id", "user_id", "article_id"], how="left")
    del session
    gc.collect()
    feats = reduce_mem_usage(feats)

    print('cast features....')
    for width in bins:
        count_col = pl.col(f"view_cnt{width}")
        feats = feats.with_columns(count_col.fill_null(0).cast(pl.Int32))
    return reduce_mem_usage(feats)


def generate_unixtime_features(df, unixtime_list):
    """Replace timestamp columns by their difference to ``impression_time``.

    For each name in ``unixtime_list`` a column ``f"{name}_diff"`` holding
    ``impression_time - name`` (cast to Int64) is added and the source column
    is dropped. ``impression_time`` itself is dropped at the end.

    Returns:
        The transformed DataFrame after ``reduce_mem_usage``.
    """
    result = df
    for ts_col in unixtime_list:
        elapsed = pl.col("impression_time") - pl.col(ts_col)
        result = result.with_columns(
            elapsed.cast(pl.Int64).alias(f"{ts_col}_diff")
        ).drop(ts_col)
    return reduce_mem_usage(result.drop('impression_time'))


def to_pandas(df_data, cat_cols=None):
    """Convert a frame to pandas and mark categorical columns.

    Args:
        df_data: object exposing ``to_pandas()`` (a polars DataFrame here).
        cat_cols: columns to cast to ``category``; when ``None`` every
            object-dtype column of the converted frame is used.

    Returns:
        The pandas DataFrame with the selected columns cast to ``category``.
    """
    frame = df_data.to_pandas()
    categorical = (
        list(frame.select_dtypes("object").columns) if cat_cols is None else cat_cols
    )
    frame[categorical] = frame[categorical].astype("category")
    return frame


def reduce_mem_usage(df):
    """Downcast numeric columns of a polars DataFrame to reduce memory usage.

    * Columns of dtype Int8/Int16/Int32/Int64/UInt32 have their nulls replaced
      by 0 and are cast to the smallest of Int8/Int16/Int32/Int64 whose range
      strictly contains the column's min and max.
    * Float32/Float64 columns are cast to Float32 when min and max lie strictly
      inside the Float32 range (this can lose precision for large values),
      otherwise to Float64. Nulls are left untouched.
    * Every other column is passed through unchanged.

    Columns whose min/max cannot be determined (empty or all-null column, for
    which ``min()`` / ``max()`` return ``None``) keep their current dtype
    instead of raising ``TypeError`` on the range comparison.

    Args:
        df: polars DataFrame.

    Returns:
        A new polars DataFrame with the same column names and order.
    """
    int_dtypes = [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt32]
    float_dtypes = [pl.Float32, pl.Float64]
    # Candidate integer targets, tried from narrowest to widest.
    int_targets = [(np.int8, pl.Int8), (np.int16, pl.Int16), (np.int32, pl.Int32)]

    optimized_columns = []

    for col in df.columns:
        col_data = df[col]
        col_type = col_data.dtype

        if col_type in int_dtypes:
            # Integer nulls are turned into 0 here; callers rely on this fill.
            col_data = col_data.fill_null(0)
            c_min = col_data.min()
            c_max = col_data.max()
            if c_min is None or c_max is None:
                # Empty column: nothing to measure, keep the dtype as is.
                optimized_columns.append(col_data)
                continue
            target = pl.Int64
            for np_type, pl_type in int_targets:
                limits = np.iinfo(np_type)
                if c_min > limits.min and c_max < limits.max:
                    target = pl_type
                    break
            optimized_columns.append(col_data.cast(target))
        elif col_type in float_dtypes:
            c_min = col_data.min()
            c_max = col_data.max()
            if c_min is None or c_max is None:
                # Empty or all-null column: keep the dtype as is.
                optimized_columns.append(col_data)
                continue
            f32 = np.finfo(np.float32)
            if c_min > f32.min and c_max < f32.max:
                optimized_columns.append(col_data.cast(pl.Float32))
            else:
                optimized_columns.append(col_data.cast(pl.Float64))
        else:
            optimized_columns.append(col_data)

    # Rebuild the frame with the original column names and order.
    optimized_df = pl.DataFrame(dict(zip(df.columns, optimized_columns)))

    return optimized_df


def cast_float_to_int(df, columns):
    """Return ``df`` with each of the named columns cast to Int64."""
    converted = df
    for name in columns:
        as_int64 = pl.col(name).cast(pl.Int64)
        converted = converted.with_columns(as_int64)
    return converted


def add_features(df, cols, key, prefix, df_origin = None):
    """Add group means of ``cols`` and the row-minus-mean differences.

    The means of ``cols`` are computed on ``df`` grouped by ``key`` and named
    ``prefix + col + "_mean"``. They are left-joined on ``key`` onto ``df``
    (or onto ``df_origin`` when it is given), and for every column a
    ``"diff_" + <mean name>`` column holding ``col - mean`` is added.

    Args:
        df: frame the means are computed from.
        cols: columns to average.
        key: group-by / join key.
        prefix: prefix of the generated mean columns.
        df_origin: pass this when the frame to join onto differs from the
            frame the means are computed from.

    Returns:
        The enriched frame after ``reduce_mem_usage``.
    """
    mean_names = [name if name == key else prefix + name + "_mean" for name in cols]

    group_means = df.group_by(key).agg([pl.col(cols).mean()])
    group_means = group_means.rename(dict(zip(cols, mean_names)))
    group_means = reduce_mem_usage(group_means)

    target = df if df_origin is None else df_origin
    target = target.join(group_means, on=key, how='left')
    del group_means
    gc.collect()

    # Add the difference features.
    for raw_name, mean_name in zip(cols, mean_names):
        delta = pl.col(raw_name) - pl.col(mean_name)
        target = target.with_columns(delta.alias('diff_' + mean_name))
    return reduce_mem_usage(target)


def join_all_process(i_chunk, N_split):
    """Build the full feature frame for one chunk of the test impressions.

    Reads the tables with ``read_files``, casts selected float columns to
    integers, shrinks dtypes, joins everything with ``join_features`` and then
    adds three families of mean / diff features through ``add_features``.

    Args:
        i_chunk: index of the impression chunk to process.
        N_split: total number of chunks.

    Returns:
        polars DataFrame with one row per (impression, candidate article).
    """
    impression_article, impression, user, user_article, article, session = read_files(i_chunk, N_split)

    # Cast the float columns that hold integral values, then downcast each table.
    impression = reduce_mem_usage(
        cast_float_to_int(impression, ['read_time', 'scroll_percentage'])
    )
    impression_article = reduce_mem_usage(impression_article)
    user = reduce_mem_usage(
        cast_float_to_int(user, ['u_read_time_last', 'u_scroll_percentage_last', 'u_impression_time_mean'])
    )
    user_article = reduce_mem_usage(
        cast_float_to_int(user_article, ['ua_read_time_last', 'ua_scroll_percentage_last'])
    )
    article = reduce_mem_usage(cast_float_to_int(article, ['a_total_read_time']))
    session = reduce_mem_usage(session)

    df = join_features(impression_article, impression, user, user_article, article, session, bins)
    del impression_article, impression, user, session
    gc.collect()

    print('add user features group_by article impression basis, add diff features....')
    impression_cols = ['read_time', 'scroll_percentage','device_type', 'is_sso_user', 'gender', 'postcode', 'age', 'is_subscriber', 'view_num']
    df = add_features(df, cols=impression_cols, key='article_id', prefix='a_')

    print('add article features group_by user impression basis, add diff features....')
    article_cols = [
        'a_' + name
        for name in ["premium", "total_inviews", "total_pageviews", "total_read_time", "sentiment_score", "title_len", "subtitle_len", "body_len", "image_ids_len", "ner_clusters_len", "entity_groups_len", "topics_len", "subcategory_len"]
    ]
    df = add_features(df, cols=article_cols, key='user_id', prefix='u_')

    print('add article features group_by user click history basis, add diff features....')
    # Only the (user, article) pairs of user_article are needed from here on.
    user_hist = user_article.select("user_id", "article_id")
    del user_article
    gc.collect()

    user_hist = user_hist.join(article, on='article_id', how='left')
    del article
    gc.collect()

    user_hist = reduce_mem_usage(user_hist)
    # Means are computed on user_hist but joined onto the main frame.
    df = add_features(user_hist, cols=article_cols, key='user_id', prefix='u_hist_', df_origin=df)
    del user_hist
    gc.collect()
    return df


In [6]:
# Identifier columns removed before prediction.
# is_beyond_accuracy only exists in the test data, so it is dropped as well.
IGNORE_COL_ID = ["impression_id", "session_id", "is_beyond_accuracy", "user_id", "article_id"]
#LOW_IMP_COL = ["gender", "postcode", "u_a_title_len_mean", "u_read_time_last", "u_scroll_percentage_last", "ua_read_time_last"] # drop zero-importance features
LOW_IMP_COL = ["age", "gender", "postcode"] # drop zero-importance features
#LOW_IMP_COL = [] # drop zero-importance features

# Load the trained model selected by train_type / TRAIN_FRAC / RANDOM_SEED
# (the recorded output shows a lightgbm.basic.Booster).
# NOTE(review): np.load(..., allow_pickle=True) unpickles the file, which can
# execute arbitrary code - only load model files from a trusted source.
model = np.load(f'{model_path}/lgbm_{train_type}_frac{TRAIN_FRAC}_{RANDOM_SEED}.pkl', allow_pickle=True)
model

<lightgbm.basic.Booster at 0x7e926dafbdc0>

In [7]:
# Number of features stored in the loaded model; compare it with the width of
# the feature frame that is passed to model.predict in the inference loop.
print('feature num:', len(model.feature_name()))
# Debug helper (needs `test` from the inference loop): model features missing from the frame.
#set(model.feature_name()) - set(test.columns)

feature num: 130


In [8]:
# Score the test set chunk by chunk and collect one score frame per chunk.
sub_list = []

# Unixtime features that are converted into differences to impression_time.
unixtime_list = ['u_impression_time_last', 'u_impression_time_mean', 'ua_impression_time_mean', 'ua_impression_time_last', 'a_published_time', 'a_last_modified_time']

# tmp: columns present in the feature files that are removed before prediction.
tmp_drop_cols = [f'a_ner_clusters_{k}th' for k in range(2, 8)]
tmp_drop_cols += ['a_ner_clusters_last', 'a_ner_clusters_first', 'seconds_since_midnight']

for i in range(N_split):
    print(f'chunk {i}')
    test = join_all_process(i_chunk = i, N_split=N_split)

    test = generate_unixtime_features(test, unixtime_list)
    # Dropped in one call; the previous inner loop reused `i` and shadowed the chunk index.
    test = test.drop(tmp_drop_cols)

    display(test.shape)
    display(test.head())
    test = to_pandas(test)
    print('remove id from features....')
    test_id = test[['impression_id', 'user_id', 'article_id']]
    test = test.drop(IGNORE_COL_ID + LOW_IMP_COL, axis=1)
    # NOTE(review): the columns are passed positionally as they come out of the
    # joins; confirm that their order matches model.feature_name().
    pred = model.predict(test)
    # Release the wide feature frame now; otherwise it stays alive while the
    # next chunk is being built and after the loop has finished.
    del test
    gc.collect()

    test_id = test_id.reset_index(drop=True)
    pred = pd.Series(pred, name='score')
    sub = pd.concat([test_id, pred], axis=1)
    sub_list.append(sub)

    del test_id, pred, sub
    gc.collect()


chunk 0
join features....
add view_cnt....
add user features....
add article features....
add user article features....
add session features....
cast features....
add user features group_by article impression basis, add diff features....
add article features group_by user impression basis, add diff features....
add article features group_by user click history basis, add diff features....


(79043265, 138)

impression_id,user_id,article_id,sim_xlm-roberta-base_pca16_5hist,sim_title_vector_pca16_5hist,read_time,scroll_percentage,device_type,is_sso_user,gender,postcode,age,is_subscriber,session_id,is_beyond_accuracy,ndays,view_num,view_cnt600,view_cnt3600,view_cnt21600,imp_time_std,imp_time_cnt,diff_imp_time_mean,diff_imp_time_max,diff_imp_time_min,u_history_len,u_read_time_last,u_read_time_mean,u_scroll_percentage_last,u_scroll_percentage_mean,a_premium,a_article_type,a_category,a_category_str,a_total_inviews,a_total_pageviews,a_total_read_time,…,diff_u_a_image_ids_len_mean,diff_u_a_ner_clusters_len_mean,diff_u_a_entity_groups_len_mean,diff_u_a_topics_len_mean,diff_u_a_subcategory_len_mean,u_hist_a_premium_mean,u_hist_a_total_inviews_mean,u_hist_a_total_pageviews_mean,u_hist_a_total_read_time_mean,u_hist_a_sentiment_score_mean,u_hist_a_title_len_mean,u_hist_a_subtitle_len_mean,u_hist_a_body_len_mean,u_hist_a_image_ids_len_mean,u_hist_a_ner_clusters_len_mean,u_hist_a_entity_groups_len_mean,u_hist_a_topics_len_mean,u_hist_a_subcategory_len_mean,diff_u_hist_a_premium_mean,diff_u_hist_a_total_inviews_mean,diff_u_hist_a_total_pageviews_mean,diff_u_hist_a_total_read_time_mean,diff_u_hist_a_sentiment_score_mean,diff_u_hist_a_title_len_mean,diff_u_hist_a_subtitle_len_mean,diff_u_hist_a_body_len_mean,diff_u_hist_a_image_ids_len_mean,diff_u_hist_a_ner_clusters_len_mean,diff_u_hist_a_entity_groups_len_mean,diff_u_hist_a_topics_len_mean,diff_u_hist_a_subcategory_len_mean,u_impression_time_last_diff,u_impression_time_mean_diff,ua_impression_time_mean_diff,ua_impression_time_last_diff,a_published_time_diff,a_last_modified_time_diff
i32,i32,i32,f32,f32,i16,i8,i8,bool,i8,i8,i8,bool,i32,bool,i8,u8,i16,i32,i32,f32,i8,i32,i32,i32,i16,i16,f32,i8,f32,bool,str,i16,str,i32,i32,i32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i32,i32,i32,i32,i32,i32
6451339,35982,9796527,-0.407535,0.220448,8,0,2,False,0,0,0,False,388,False,4,9,1540,7849,36825,19886.917969,3,22297,0,38252,200,116,55.965,100,75.919357,True,"""article_standard_feature""",414,"""underholdning""",1674245,95347,3372986,…,40.789116,-3.034014,-3.034014,2.14966,-0.272109,0.038674,552658.5625,123278.398438,7420708.0,0.834342,43.812153,112.215469,1934.270752,1.801105,9.475138,9.475138,3.585635,0.933702,0.961326,1121586.5,-27931.398438,-4047722.0,-0.087743,-2.812153,12.784531,794.729248,43.198895,-2.475138,-2.475138,2.414365,0.066298,410423,1250701,0,1685977369,88394,-2043991
6451339,35982,7851321,0.020333,0.029117,8,0,2,False,0,0,0,False,388,False,4,9,1725,8504,24072,,1,25,0,0,200,116,55.965,100,75.919357,True,"""article_default""",457,"""forbrug""",0,0,0,…,-2.210885,-10.034014,-10.034014,0.14966,-0.272109,0.038674,552658.5625,123278.398438,7420708.0,0.834342,43.812153,112.215469,1934.270752,1.801105,9.475138,9.475138,3.585635,0.933702,0.961326,-552658.5625,-123278.398438,-7420708.0,-0.094543,3.187847,107.784531,4053.729248,0.198895,-9.475138,-9.475138,0.414365,0.066298,410423,1250701,0,1685977369,113110987,-2042686
6451339,35982,9798805,0.248214,-0.062174,8,0,2,False,0,0,0,False,388,False,4,9,1774,7831,52617,,1,25,0,0,200,116,55.965,100,75.919357,False,"""article_default""",142,"""sport""",340797,64344,4159228,…,-3.210885,0.965986,0.965986,0.14966,1.727891,0.038674,552658.5625,123278.398438,7420708.0,0.834342,43.812153,112.215469,1934.270752,1.801105,9.475138,9.475138,3.585635,0.933702,-0.038674,-211861.5625,-58934.398438,-3261480.0,-0.335042,-9.812153,-35.215469,-593.270752,-0.801105,1.524862,1.524862,0.414365,2.066298,410423,1250701,0,1685977369,10189,-2043993
6451339,35982,9795150,-0.350247,-0.038875,8,0,2,False,0,0,0,False,388,False,4,9,15,134,808,,1,25,0,0,200,116,55.965,100,75.919357,False,"""article_default""",498,"""musik""",560052,116804,4855463,…,0.789115,-5.034014,-5.034014,1.14966,-0.272109,0.038674,552658.5625,123278.398438,7420708.0,0.834342,43.812153,112.215469,1934.270752,1.801105,9.475138,9.475138,3.585635,0.933702,-0.038674,7393.4375,-6474.398438,-2565245.0,0.123258,-14.812153,-9.215469,-277.270752,3.198895,-4.475138,-4.475138,1.414365,0.066298,410423,1250701,0,1685977369,164005,-2043989
6451339,35982,9531110,-0.250325,-0.257405,8,0,2,False,0,0,0,False,388,False,4,9,518,2248,12566,255.265549,2,153,0,361,200,116,55.965,100,75.919357,True,"""article_default""",572,"""side9""",0,0,0,…,-2.210885,-3.034014,-3.034014,-1.85034,-0.272109,0.038674,552658.5625,123278.398438,7420708.0,0.834342,43.812153,112.215469,1934.270752,1.801105,9.475138,9.475138,3.585635,0.933702,0.961326,-552658.5625,-123278.398438,-7420708.0,-0.318542,-22.812153,-8.215469,-654.270752,0.198895,-2.475138,-2.475138,-1.585635,0.066298,410423,1250701,0,1685977369,16221769,-2043761


remove id from features....
chunk 1
join features....
add view_cnt....
add user features....
add article features....
add user article features....
add session features....
cast features....
add user features group_by article impression basis, add diff features....
add article features group_by user impression basis, add diff features....
add article features group_by user click history basis, add diff features....


(126882603, 138)

impression_id,user_id,article_id,sim_xlm-roberta-base_pca16_5hist,sim_title_vector_pca16_5hist,read_time,scroll_percentage,device_type,is_sso_user,gender,postcode,age,is_subscriber,session_id,is_beyond_accuracy,ndays,view_num,view_cnt600,view_cnt3600,view_cnt21600,imp_time_std,imp_time_cnt,diff_imp_time_mean,diff_imp_time_max,diff_imp_time_min,u_history_len,u_read_time_last,u_read_time_mean,u_scroll_percentage_last,u_scroll_percentage_mean,a_premium,a_article_type,a_category,a_category_str,a_total_inviews,a_total_pageviews,a_total_read_time,…,diff_u_a_image_ids_len_mean,diff_u_a_ner_clusters_len_mean,diff_u_a_entity_groups_len_mean,diff_u_a_topics_len_mean,diff_u_a_subcategory_len_mean,u_hist_a_premium_mean,u_hist_a_total_inviews_mean,u_hist_a_total_pageviews_mean,u_hist_a_total_read_time_mean,u_hist_a_sentiment_score_mean,u_hist_a_title_len_mean,u_hist_a_subtitle_len_mean,u_hist_a_body_len_mean,u_hist_a_image_ids_len_mean,u_hist_a_ner_clusters_len_mean,u_hist_a_entity_groups_len_mean,u_hist_a_topics_len_mean,u_hist_a_subcategory_len_mean,diff_u_hist_a_premium_mean,diff_u_hist_a_total_inviews_mean,diff_u_hist_a_total_pageviews_mean,diff_u_hist_a_total_read_time_mean,diff_u_hist_a_sentiment_score_mean,diff_u_hist_a_title_len_mean,diff_u_hist_a_subtitle_len_mean,diff_u_hist_a_body_len_mean,diff_u_hist_a_image_ids_len_mean,diff_u_hist_a_ner_clusters_len_mean,diff_u_hist_a_entity_groups_len_mean,diff_u_hist_a_topics_len_mean,diff_u_hist_a_subcategory_len_mean,u_impression_time_last_diff,u_impression_time_mean_diff,ua_impression_time_mean_diff,ua_impression_time_last_diff,a_published_time_diff,a_last_modified_time_diff
i32,i32,i32,f32,f32,i16,i8,i8,bool,i8,i8,i8,bool,i32,bool,i8,u8,i32,i32,i32,f32,i8,i32,i32,i32,i16,i16,f32,i8,f32,bool,str,i16,str,i32,i32,i32,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i32,i32,i32,i32,i32,i32
284044691,1518744,9799773,-0.238661,-0.42854,7,0,1,False,0,0,0,False,41620968,False,5,10,2530,14196,75134,,1,60,0,0,112,9,48.767857,55,76.590477,False,"""article_scribblelive""",142,"""sport""",611193,156021,13761650,…,-2.514894,-9.32766,-9.32766,1.217021,0.923404,0.011364,518933.96875,119858.492188,7411824.0,0.853848,42.988636,118.034088,1962.75,1.772727,8.772727,8.772727,3.920455,0.761364,-0.011364,92259.03125,36162.507812,6349826.0,0.104352,3.011364,119.965912,-1962.75,-0.772727,-7.772727,-7.772727,1.079545,1.238636,510801,1494180,0,1686049852,13669,-1971510
284044691,1518744,9800148,-0.115633,0.334225,7,0,1,False,0,0,0,False,41620968,False,5,10,1771,9112,33345,,1,60,0,0,112,9,48.767857,55,76.590477,False,"""article_default""",142,"""sport""",238901,29148,1594379,…,-2.514894,-4.32766,-4.32766,1.217021,2.923404,0.011364,518933.96875,119858.492188,7411824.0,0.853848,42.988636,118.034088,1962.75,1.772727,8.772727,8.772727,3.920455,0.761364,-0.011364,-280032.96875,-90710.492188,-5817445.0,-0.014148,-17.988636,-72.034088,-1194.75,-0.772727,-2.772727,-2.772727,1.079545,3.238636,510801,1494180,0,1686049852,5634,-1971511
284044691,1518744,9788021,-0.303386,-0.190818,7,0,1,False,0,0,0,False,41620968,False,5,10,2571,14480,95517,,1,60,0,0,112,9,48.767857,55,76.590477,True,"""article_default""",140,"""krimi""",1437233,78228,4593527,…,8.485106,-7.32766,-7.32766,-1.782979,-1.076596,0.011364,518933.96875,119858.492188,7411824.0,0.853848,42.988636,118.034088,1962.75,1.772727,8.772727,8.772727,3.920455,0.761364,0.988636,918299.0,-41630.492188,-2818297.0,0.139552,-10.988636,9.965912,564.25,10.227273,-5.772727,-5.772727,-1.920455,-0.761364,510801,1494180,0,1686049852,21372,-1971500
284044691,1518744,9498042,0.28698,-0.045946,7,0,1,False,0,0,0,False,41620968,False,5,10,136,766,4666,12359.258789,3,14268,0,21545,112,9,48.767857,55,76.590477,True,"""article_default""",457,"""forbrug""",0,0,0,…,-2.514894,-10.32766,-10.32766,1.217021,-0.076596,0.011364,518933.96875,119858.492188,7411824.0,0.853848,42.988636,118.034088,1962.75,1.772727,8.772727,8.772727,3.920455,0.761364,0.988636,-518933.96875,-119858.492188,-7411824.0,0.068752,-6.988636,17.965912,1373.25,-0.772727,-8.772727,-8.772727,1.079545,0.238636,510801,1494180,0,1686049852,18377646,-1971252
284044691,1518744,9799083,-0.061821,-0.489984,7,0,1,False,0,0,0,False,41620968,False,5,10,2418,14108,74332,,1,60,0,0,112,9,48.767857,55,76.590477,False,"""article_scribblelive""",140,"""krimi""",517319,85599,5726493,…,-2.514894,-9.32766,-9.32766,-1.782979,-1.076596,0.011364,518933.96875,119858.492188,7411824.0,0.853848,42.988636,118.034088,1962.75,1.772727,8.772727,8.772727,3.920455,0.761364,-0.011364,-1614.96875,-34259.492188,-1685331.0,0.143252,8.011364,43.965912,-1962.75,-0.772727,-7.772727,-7.772727,-1.920455,-0.761364,510801,1494180,0,1686049852,14267,-1971510


remove id from features....


In [9]:
# Stack the per-chunk score frames into one pandas frame with a fresh index,
# then drop the list so the per-chunk frames can be garbage collected.
sub = pd.concat(sub_list, ignore_index=True)
del sub_list
gc.collect()

0

## Output and Save

In [10]:
# Convert the scores to polars and persist them as parquet.
# NOTE(review): `sub` is rebound from a pandas to a polars frame, so this cell
# is not meant to be re-run on its own without re-running the concat cell.
sub = pl.from_pandas(sub)
# base_path already ends with '/', so the resulting path contains '//output'
# (visible in the recorded `wc -l` output).
sub.write_parquet(f'{base_path}/output/score_lgbm_{train_type}_frac{TRAIN_FRAC}_{RANDOM_SEED}.parquet')
sub

impression_id,user_id,article_id,score
i32,i32,i32,f64
6451339,35982,9796527,-1.176638
6451339,35982,7851321,-1.726971
6451339,35982,9798805,-1.250337
6451339,35982,9795150,0.350031
6451339,35982,9531110,-2.364827
…,…,…,…
0,1225161,9792362,-2.003431
0,1225161,9788041,-1.090464
0,1225161,9790135,-2.337952
0,1225161,9792408,-4.301114


In [11]:
# Rank the candidates of every (impression_id, user_id) group by descending
# score (1 = highest). The window expression returns one value per input row in
# the original row order, so no sort and no join back on article_id is needed;
# such a join would duplicate rows if an article appeared twice in a group.
group_keys = ["impression_id", "user_id"]
out = sub.with_columns(
    pl.col('score').rank(method="ordinal", descending=True).over(group_keys).alias('rank')
)
# Collect the ranks of each group into a list, in the row order of `sub`.
out = out.group_by(group_keys).agg(pl.col('rank'))

# Re-attach the rank lists to the impression table so that the output follows
# its row order (named impression_keys to avoid shadowing the builtin `id`).
impression_keys = pl.read_parquet(f"{out_path}/test_impression.parquet").select([
    pl.col("impression_id").cast(pl.Int32),
    pl.col("user_id").cast(pl.Int32)
])
out = impression_keys.join(out, on=group_keys, how='left')
out

impression_id,user_id,rank
i32,i32,list[u32]
6451339,35982,"[5, 7, … 9]"
6451363,36012,"[2, 7, … 1]"
6451382,36162,"[5, 3, … 4]"
6451383,36162,"[2, 9, … 5]"
6451385,36162,"[5, 2, … 7]"
…,…,…
0,1589163,"[231, 229, … 78]"
0,1699456,"[248, 206, … 54]"
0,635479,"[246, 248, … 72]"
0,251030,"[227, 198, … 70]"


In [12]:
# check beyond accuracy impression
# Sanity check: show the rows with impression_id == 0, which the notebook
# treats as the beyond-accuracy impressions.
out.filter(pl.col('impression_id') == 0)

impression_id,user_id,rank
i32,i32,list[u32]
0,1049297,"[244, 242, … 69]"
0,231624,"[241, 224, … 88]"
0,716356,"[221, 245, … 76]"
0,1440307,"[236, 222, … 78]"
0,1822406,"[238, 218, … 57]"
…,…,…
0,1589163,"[231, 229, … 78]"
0,1699456,"[248, 206, … 54]"
0,635479,"[246, 248, … 72]"
0,251030,"[227, 198, … 70]"


In [13]:
# Write one line per impression in the form "<impression_id> [r1,r2,...]".
with open(f'{base_path}/output/pred_lgbm_{train_type}_frac{TRAIN_FRAC}_{RANDOM_SEED}.txt', 'w') as f:
    # Stream the rows instead of calling out.to_dicts(), which would build a
    # Python dict (with its rank list) for every row before anything is written.
    for impression_id, rank in out.select('impression_id', 'rank').iter_rows():
        # Join the ranks manually: formatting the list directly would put a
        # space after every comma.
        rank_str = ','.join(map(str, rank))
        f.write(f"{impression_id} [{rank_str}]\n")

### Check output

In [14]:
# Show the first lines of the prediction file to check the "<impression_id> [ranks]" format.
!head {base_path}/output/pred_lgbm_{train_type}_frac{TRAIN_FRAC}_{RANDOM_SEED}.txt

6451339 [5,7,6,1,8,4,2,3,9]
6451363 [2,7,3,6,8,4,5,1]
6451382 [5,3,2,1,4]
6451383 [2,9,8,3,1,10,11,4,6,7,5]
6451385 [5,2,3,4,6,1,7]
6451411 [8,2,5,7,1,9,3,6,4]
6451412 [8,2,1,6,7,5,4,3]
6451423 [12,30,18,8,3,33,17,21,25,28,15,20,1,6,26,27,31,5,23,24,14,11,22,9,10,16,19,29,32,13,7,2,4]
6451425 [4,2,3,6,1,5]
6451426 [1,5,2,3,4]


In [15]:
# Show the last lines of the prediction file (the impression_id == 0 rows in the recorded run).
!tail {base_path}/output/pred_lgbm_{train_type}_frac{TRAIN_FRAC}_{RANDOM_SEED}.txt

0 [248,194,112,165,60,26,145,123,201,228,233,195,31,117,11,167,62,189,106,161,202,120,223,152,56,77,148,159,208,85,124,131,57,191,197,16,134,40,166,174,92,48,181,41,67,1,133,22,163,147,160,154,169,28,177,99,162,198,146,74,140,180,38,10,139,211,182,47,119,44,151,101,249,110,170,91,150,5,175,37,89,75,128,234,98,43,237,84,100,240,109,51,4,179,45,138,214,222,213,107,32,218,65,130,185,187,188,173,81,245,49,18,178,192,230,83,53,241,55,111,236,108,14,219,35,183,186,217,116,141,23,127,171,184,137,68,224,227,172,247,196,199,164,205,87,52,21,113,88,58,69,95,226,244,93,15,114,78,25,126,155,59,135,8,94,72,200,153,39,129,24,30,229,46,34,238,121,220,250,12,216,70,96,176,156,36,209,103,2,19,82,50,212,210,71,7,132,63,158,149,20,225,73,17,246,80,13,102,203,64,190,33,206,215,125,204,61,144,79,157,97,105,118,27,232,9,243,90,3,29,104,115,231,221,86,168,42,239,207,136,122,143,242,142,193,54,6,76,235,66]
0 [240,182,117,152,51,27,146,105,177,215,232,178,20,119,12,181,84,229,100,166,179,124,225,163,30,92,158,

In [16]:
# Count the lines of the prediction file; one line is written per row of `out`.
!wc -l {base_path}/output/pred_lgbm_{train_type}_frac{TRAIN_FRAC}_{RANDOM_SEED}.txt

13536710 /content/drive/MyDrive/RecSys2024//output/pred_lgbm_valid_frac0.6_62.txt
