# LGBM Train



In [1]:
from google.colab import drive
drive.mount('/content/drive')
base_path = '/content/drive/MyDrive/RecSys2024/'
!pip3 install polars lightgbm pyarrow optuna optuna-integration

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Run configuration. Commented-out lines are alternative settings kept for reference.
# DEBUG_MODE: read the "small_" parquet files and disable sub-sampling (see below).
#DEBUG_MODE = True
DEBUG_MODE = False
# OPTUNA_FLAG: import lgb from optuna.integration.lightgbm and run the tuning cell.
#OPTUNA_FLAG = True
OPTUNA_FLAG = False
#TRAIN_FRAC = 1.0 # sampling fraction for the train data
#TRAIN_FRAC = 0.8 # sampling fraction for the train data
TRAIN_FRAC = 0.6 # sampling fraction for the train data (NOTE: join_all_process applies it to whole impressions, not only to negative samples)
#TRAIN_FRAC = 0.5 # sampling fraction for the train data
VALID_FRAC = 0.25 # sampling fraction for the validation data
if DEBUG_MODE:
    TRAIN_FRAC = 1.0
    VALID_FRAC = 1.0

#train_type = 'train'
train_type = 'valid' # use validation data as training for final sub

In [3]:
# ==================================================== # Library # ====================================================
import os
import gc
import warnings
warnings.filterwarnings('ignore')
import random
import scipy as sp
import numpy as np
import pandas as pd
import polars as pl
import joblib
import pyarrow
import itertools
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)
from tqdm.auto import tqdm
from sklearn.model_selection import KFold, train_test_split
from sklearn.metrics import accuracy_score, log_loss
# optuna
if OPTUNA_FLAG:
    import optuna.integration.lightgbm as lgb
else:
    import lightgbm as lgb
    print(lgb.__version__)

4.3.0


# Reading Input File

In [4]:
# Folder that holds the pre-computed feature parquet files read by read_files().
out_path = base_path + 'feature_output'
# Divisors applied to impression_time when bucketing it for the view_cnt* features
# (600, 3600 and 21600; presumably seconds, i.e. 10 min / 1 h / 6 h - confirm the unit of impression_time).
bins = [600, 3600, 3600 * 6]

# Feature Engineering

In [5]:
def read_files(train_type = 'train'):
    """Load the five pre-computed feature tables of one data split.

    Args:
        train_type: split name used in the parquet file names (e.g. 'train', 'valid').

    Returns:
        Tuple ``(impression_article, impression, user, user_article, article)``
        of polars DataFrames read from ``out_path``.
    """
    # Debug runs read the "small_" variants of the very same files.
    file_prefix = f"small_{train_type}" if DEBUG_MODE else f"{train_type}"

    tables = {}
    for table in ("impression", "impression_article", "user", "user_article", "article"):
        tables[table] = pl.read_parquet(f"{out_path}/{file_prefix}_{table}.parquet")

    return (
        tables["impression_article"],
        tables["impression"],
        tables["user"],
        tables["user_article"],
        tables["article"],
    )

def add_pop_count(df, out_col_name, bins):
    """Add, per bin width, how often each in-view article appears in its time bucket.

    For every ``width`` in ``bins`` the impression time is divided by ``width`` and
    truncated to an integer bucket; rows are then counted per
    (bucket, article_id_inview) and the count is joined back as the Int32 column
    ``out_col_name + str(width)``. The temporary bucket column is removed again.

    Args:
        df: polars DataFrame with ``impression_time`` and ``article_id_inview``.
        out_col_name: prefix of the generated count columns.
        bins: iterable of bucket widths.

    Returns:
        The DataFrame with one extra count column per bin width.
    """
    for width in bins:
        bucket = f"time_bin_{width}"
        keys = [bucket, "article_id_inview"]

        df = df.with_columns(
            (pl.col("impression_time") / width).cast(pl.Int64).alias(bucket)
        )
        counts = df.group_by(keys).agg(
            pl.count().cast(pl.Int32).alias(out_col_name + str(width))
        )
        df = df.join(counts, on=keys, how="left").drop(bucket)
    return df


def join_features(impression_article, impression, user, user_article, article, bins):
    """Assemble the candidate-level feature frame from the five input tables.

    Steps, each followed by ``reduce_mem_usage``:
      1. inner-join the (impression_id, article_id_inview, clicked) candidates with
         the impression table;
      2. add the ``view_cnt<bin>`` popularity counts;
      3. left-join the user table on ``user_id``;
      4. rename ``article_id_inview`` to ``article_id`` and left-join the article table;
      5. left-join the user-article table on (``user_id``, ``article_id``);
      6. fill missing ``view_cnt<bin>`` values with 0 and cast them to Int32.

    Returns:
        The joined polars DataFrame (one row per impression/candidate pair).
    """
    print('join features....')
    candidates = impression_article.select("impression_id", "article_id_inview", "clicked")
    df = candidates.join(impression, on="impression_id", how="inner")
    # Drop the local references so the inputs can be collected as early as possible.
    del impression_article, candidates
    gc.collect()
    df = reduce_mem_usage(df)

    print('add view_cnt....')
    df = reduce_mem_usage(add_pop_count(df, "view_cnt", bins))

    print('add user features....')
    df = df.join(user, on="user_id", how="left")
    del user
    gc.collect()
    df = reduce_mem_usage(df)

    print('add article features....')
    df = df.rename({"article_id_inview": "article_id"}).join(
        article, on="article_id", how="left"
    )
    del article
    gc.collect()
    df = reduce_mem_usage(df)

    print('add user article features....')
    df = df.join(user_article, on=["user_id", "article_id"], how="left")
    del user_article
    gc.collect()
    df = reduce_mem_usage(df)

    print('cast features....')
    for width in bins:
        view_cnt = pl.col(f"view_cnt{width}")
        df = df.with_columns(view_cnt.fill_null(0).cast(pl.Int32))
    return reduce_mem_usage(df)


def generate_unixtime_features(df, unixtime_list):
    """Replace unix-time columns by their distance to the impression time.

    For every name in ``unixtime_list`` an Int64 column ``<name>_diff`` holding
    ``impression_time - <name>`` is added and the source column is removed.
    ``impression_time`` itself is removed afterwards and the frame is passed
    through ``reduce_mem_usage``.

    Args:
        df: polars DataFrame containing ``impression_time`` and the listed columns.
        unixtime_list: names of the columns to convert.

    Returns:
        The transformed polars DataFrame.
    """
    impression_time = pl.col("impression_time")
    diff_exprs = [
        (impression_time - pl.col(name)).cast(pl.Int64).alias(f"{name}_diff")
        for name in unixtime_list
    ]
    df = df.with_columns(diff_exprs)
    df = df.drop(list(unixtime_list) + ['impression_time'])
    return reduce_mem_usage(df)


def to_pandas(df_data, cat_cols=None):
    """Convert a frame to pandas and turn selected columns into ``category`` dtype.

    Args:
        df_data: object exposing ``to_pandas()`` (a polars DataFrame in this notebook).
        cat_cols: columns to convert; when None, every ``object``-dtype column
            of the converted frame is used.

    Returns:
        The pandas DataFrame with the chosen columns cast to ``category``.
    """
    frame = df_data.to_pandas()
    if cat_cols is None:
        cat_cols = frame.select_dtypes("object").columns.tolist()
    frame[cat_cols] = frame[cat_cols].astype("category")
    return frame


def reduce_mem_usage(df):
    """Return a new polars DataFrame whose numeric columns use narrower dtypes.

    * Int8/Int16/Int32/Int64/UInt32 columns: nulls are replaced by 0 (note that
      this changes the data, not only the dtype) and the column is cast to the
      first of Int8, Int16, Int32 whose limits strictly enclose the column's
      min and max, falling back to Int64.
    * Float32/Float64 columns: cast to Float32 when min and max lie strictly
      inside the float32 range, otherwise to Float64.
    * Every other dtype is passed through untouched.

    A column without a min/max (empty frame, or a float column containing only
    nulls) is kept as it is instead of failing on a ``None`` comparison.

    Args:
        df: polars DataFrame.

    Returns:
        A new polars DataFrame with the same column names and order.
    """
    int_inputs = [pl.Int8, pl.Int16, pl.Int32, pl.Int64, pl.UInt32]
    float_inputs = [pl.Float32, pl.Float64]
    # Candidate integer targets, narrowest first.
    int_targets = ((np.int8, pl.Int8), (np.int16, pl.Int16), (np.int32, pl.Int32))
    f32_limits = np.finfo(np.float32)

    optimized_columns = []
    for col in df.columns:
        col_data = df[col]
        col_type = col_data.dtype

        if col_type in int_inputs:
            col_data = col_data.fill_null(0)
            c_min, c_max = col_data.min(), col_data.max()
            if c_min is None or c_max is None:
                # Empty column: there is no value range to base a downcast on.
                optimized_columns.append(col_data)
                continue
            target = pl.Int64
            for np_type, pl_type in int_targets:
                limits = np.iinfo(np_type)
                if c_min > limits.min and c_max < limits.max:
                    target = pl_type
                    break
            optimized_columns.append(col_data.cast(target))
        elif col_type in float_inputs:
            c_min, c_max = col_data.min(), col_data.max()
            if c_min is None or c_max is None:
                # Empty or all-null column: keep the dtype as is.
                optimized_columns.append(col_data)
                continue
            if c_min > f32_limits.min and c_max < f32_limits.max:
                optimized_columns.append(col_data.cast(pl.Float32))
            else:
                optimized_columns.append(col_data.cast(pl.Float64))
        else:
            optimized_columns.append(col_data)

    # Rebuild the frame from the (possibly re-typed) columns, keeping names and order.
    return pl.DataFrame(dict(zip(df.columns, optimized_columns)))


def cast_float_to_int(df, columns):
    """Cast the given columns of a polars DataFrame to Int64.

    Args:
        df: polars DataFrame.
        columns: names of the columns to cast.

    Returns:
        The DataFrame with the listed columns cast to ``pl.Int64``.
    """
    # Casting is idempotent, so repeated names only need to be handled once.
    targets = list(dict.fromkeys(columns))
    return df.with_columns([pl.col(name).cast(pl.Int64) for name in targets])


def negative_sampling(df, frac=0.5, seed=42):
    """Keep all clicked rows and a random fraction of the non-clicked rows.

    Args:
        df: polars DataFrame with ``clicked`` and ``impression_id`` columns.
        frac: fraction of the non-clicked rows to keep.
        seed: seed passed to the polars sampler.

    Returns:
        Sampled negatives followed by all positives, sorted by ``impression_id``.
    """
    clicked = pl.col("clicked")
    negatives = df.filter(clicked == False)
    positives = df.filter(clicked == True)

    kept_negatives = negatives.sample(fraction=frac, seed=seed)
    combined = pl.concat([kept_negatives, positives])
    return combined.sort("impression_id")


def add_features(df, cols, key, prefix, df_origin = None):
    """Add per-``key`` means of ``cols`` and the row-wise differences to them.

    The means are computed on ``df`` grouped by ``key`` and named
    ``prefix + <col> + "_mean"``. They are left-joined on ``key`` to ``df`` or,
    when given, to ``df_origin``. For every column a ``diff_<mean column>``
    feature equal to ``<col> - <mean column>`` is then added.

    Args:
        df: polars DataFrame the means are computed from.
        cols: columns to aggregate; they must also exist in the frame that
            receives the join, because the differences are computed there.
        key: grouping / join column.
        prefix: prefix of the generated mean columns.
        df_origin: optional frame to attach the features to instead of ``df``.

    Returns:
        The target frame with mean and diff columns, after ``reduce_mem_usage``.
    """
    mean_names = [name if name == key else prefix + name + "_mean" for name in cols]

    means = df.group_by(key).agg([pl.col(cols).mean()])
    means = reduce_mem_usage(means.rename(dict(zip(cols, mean_names))))

    # Attach the aggregates to a different frame when the caller supplies one.
    target = df if df_origin is None else df_origin
    target = target.join(means, on=key, how='left')
    del means
    gc.collect()

    # Difference between each raw value and its group mean.
    target = target.with_columns([
        (pl.col(raw) - pl.col(avg)).alias('diff_' + avg)
        for raw, avg in zip(cols, mean_names)
    ])
    return reduce_mem_usage(target)

def join_all_process(train_type='train'):
    """Build the full feature frame for one data split.

    Reads the five feature tables, sub-samples the impressions, casts selected
    float columns to integers, joins everything with ``join_features`` and adds
    three families of mean/diff features through ``add_features``.

    Args:
        train_type: split name; 'train' is sampled with ``TRAIN_FRAC``, any
            other value with ``VALID_FRAC``.

    Returns:
        polars DataFrame with one row per impression/candidate pair.
    """
    impression_article, impression, user, user_article, article = read_files(train_type=train_type)

    # The data is large, so only a fraction of the impressions is kept.
    frac = TRAIN_FRAC if train_type == 'train' else VALID_FRAC
    impression = impression.sample(fraction=frac, seed=42)

    impression = reduce_mem_usage(
        cast_float_to_int(impression, ['read_time', 'scroll_percentage'])
    )
    impression_article = reduce_mem_usage(impression_article)
    user = reduce_mem_usage(
        cast_float_to_int(user, ['u_read_time_last', 'u_scroll_percentage_last', 'u_impression_time_mean'])
    )
    user_article = reduce_mem_usage(
        cast_float_to_int(user_article, ['ua_read_time_last', 'ua_scroll_percentage_last'])
    )
    article = reduce_mem_usage(cast_float_to_int(article, ['a_total_read_time']))

    df = join_features(impression_article, impression, user, user_article, article, bins)
    del impression_article, impression, user
    gc.collect()

    print('add user features group_by article impression basis, add diff features....')
    impression_cols = ['read_time', 'scroll_percentage','device_type', 'is_sso_user', 'gender', 'postcode', 'age', 'is_subscriber', 'view_num']
    df = add_features(df, cols=impression_cols, key='article_id', prefix='a_')

    print('add article features group_by user impression basis, add diff features....')
    article_cols = [
        'a_' + name
        for name in ["premium", "total_inviews", "total_pageviews", "total_read_time", "sentiment_score", "title_len", "subtitle_len", "body_len", "image_ids_len", "ner_clusters_len", "entity_groups_len", "topics_len", "subcategory_len"]
    ]
    df = add_features(df, cols=article_cols, key='user_id', prefix='u_')

    print('add article features group_by user click history basis, add diff features....')
    user_hist = user_article.select("user_id", "article_id")
    del user_article
    gc.collect()

    user_hist = user_hist.join(article, on='article_id', how='left')
    del article
    gc.collect()

    user_hist = reduce_mem_usage(user_hist)
    # Means come from the history frame but are attached to the main frame.
    df = add_features(user_hist, cols=article_cols, key='user_id', prefix='u_hist_', df_origin=df)
    del user_hist
    gc.collect()
    return df


In [6]:
# Build the joined feature frame for the 'valid' split (impressions are sampled with VALID_FRAC).
valid = join_all_process('valid')

join features....
add view_cnt....
add user features....
add article features....
add user article features....
cast features....
add user features group_by article impression basis, add diff features....
add article features group_by user impression basis, add diff features....
add article features group_by user click history basis, add diff features....


In [7]:
# Train on the 'train' split, or reuse the validation frame when train_type is not 'train'.
train = join_all_process('train') if train_type == 'train' else valid

In [8]:
# unixtimeのfeatureをdiffにする
unixtime_list = ['u_impression_time_last', 'u_impression_time_mean', 'ua_impression_time_mean', 'ua_impression_time_last', 'a_published_time', 'a_last_modified_time']
train = generate_unixtime_features(train, unixtime_list)
valid = generate_unixtime_features(valid, unixtime_list)

In [9]:
# Sanity check: shape and first rows of the training frame.
print(train.shape)
train.head()

(37536611, 124)


impression_id,article_id,clicked,read_time,scroll_percentage,device_type,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,view_num,view_cnt600,view_cnt3600,view_cnt21600,u_history_len,u_read_time_last,u_read_time_mean,u_scroll_percentage_last,u_scroll_percentage_mean,a_premium,a_article_type,a_category,a_category_str,a_total_inviews,a_total_pageviews,a_total_read_time,a_sentiment_score,a_sentiment_label,a_title_len,a_subtitle_len,a_body_len,a_ner_clusters_first,a_entity_groups_first,a_topics_first,…,diff_u_a_image_ids_len_mean,diff_u_a_ner_clusters_len_mean,diff_u_a_entity_groups_len_mean,diff_u_a_topics_len_mean,diff_u_a_subcategory_len_mean,u_hist_a_premium_mean,u_hist_a_total_inviews_mean,u_hist_a_total_pageviews_mean,u_hist_a_total_read_time_mean,u_hist_a_sentiment_score_mean,u_hist_a_title_len_mean,u_hist_a_subtitle_len_mean,u_hist_a_body_len_mean,u_hist_a_image_ids_len_mean,u_hist_a_ner_clusters_len_mean,u_hist_a_entity_groups_len_mean,u_hist_a_topics_len_mean,u_hist_a_subcategory_len_mean,diff_u_hist_a_premium_mean,diff_u_hist_a_total_inviews_mean,diff_u_hist_a_total_pageviews_mean,diff_u_hist_a_total_read_time_mean,diff_u_hist_a_sentiment_score_mean,diff_u_hist_a_title_len_mean,diff_u_hist_a_subtitle_len_mean,diff_u_hist_a_body_len_mean,diff_u_hist_a_image_ids_len_mean,diff_u_hist_a_ner_clusters_len_mean,diff_u_hist_a_entity_groups_len_mean,diff_u_hist_a_topics_len_mean,diff_u_hist_a_subcategory_len_mean,u_impression_time_last_diff,u_impression_time_mean_diff,ua_impression_time_mean_diff,ua_impression_time_last_diff,a_published_time_diff,a_last_modified_time_diff
i32,i32,bool,i16,i8,i8,i32,bool,i8,i8,i8,bool,i32,u8,i16,i16,i32,i16,i16,f32,i8,f32,bool,str,i16,str,i32,i32,i32,f32,str,u16,u16,u16,str,str,str,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i32,i32,i32,i32,i32,i32
96434,9782884,True,13,0,2,21822,False,0,0,0,False,110,5,1470,6547,24541,176,15,28.835228,28,57.40136,False,"""article_default""",512,"""nationen""",894710,177869,16980028,0.9475,"""Negative""",36,141,2763,"""Lenovo""","""ORG""","""Erhverv""",…,3.483871,-3.67742,-3.67742,0.290323,-1.290323,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,371584.15625,58139.3125,9869293.0,0.04688,-8.55357,30.416664,996.238037,5.416667,-0.648809,-0.648809,0.392857,-0.738095,259840,1118488,0,1685249636,29172,-2771710
96434,9783800,False,13,0,2,21822,False,0,0,0,False,110,5,1533,5105,16791,176,15,28.835228,28,57.40136,False,"""article_default""",142,"""sport""",505457,68468,4559120,0.971,"""Positive""",51,127,1335,"""Clara Tauson""","""PER""","""Kendt""",…,0.483871,-7.67742,-7.67742,1.290323,0.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-17668.84375,-51261.6875,-2551615.0,0.07038,6.44643,16.416664,-431.761963,2.416667,-4.648809,-4.648809,1.392857,1.261905,259840,1118488,0,1685249636,2601,-2771711
96434,9784793,False,13,0,2,21822,False,0,0,0,False,110,5,1520,5118,16527,176,15,28.835228,28,57.40136,False,"""article_default""",142,"""sport""",458077,80656,3816893,0.9194,"""Positive""",56,102,1682,"""amerikaneren""","""MISC""","""Kendt""",…,-2.516129,4.32258,4.32258,0.290323,-0.290323,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-65048.84375,-39073.6875,-3293842.0,0.01878,11.44643,-8.583336,-84.761963,-0.583333,7.351191,7.351191,0.392857,0.261905,259840,1118488,0,1685249636,2364,-2771713
96434,9784804,False,13,0,2,21822,False,0,0,0,False,110,5,1401,3464,14312,176,15,28.835228,28,57.40136,False,"""article_default""",118,"""nyheder""",350774,43066,3214526,0.9378,"""Neutral""",64,111,2499,"""Berlingske""","""ORG""","""Kendt""",…,-2.516129,-0.67742,-0.67742,-0.709677,0.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-172351.84375,-76663.6875,-3896209.0,0.03718,19.44643,0.416664,732.238037,-0.583333,2.351191,2.351191,-0.607143,1.261905,259840,1118488,0,1685249636,1743,-2771713
96434,9784702,False,13,0,2,21822,False,0,0,0,False,110,5,1389,6223,23481,176,15,28.835228,28,57.40136,False,"""article_default""",142,"""sport""",779910,107271,4991002,0.7025,"""Positive""",44,107,1301,"""Bundesligaen""","""EVENT""","""Begivenhed""",…,-2.516129,2.32258,2.32258,0.290323,1.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,256784.15625,-12458.6875,-2119733.0,-0.19812,-0.55357,-3.583336,-465.761963,-0.583333,5.351191,5.351191,0.392857,2.261905,259840,1118488,0,1685249636,29119,-2771713


In [10]:
# Mean of the boolean `clicked` column, i.e. the share of clicked rows in the training frame.
train.select('clicked').mean()

clicked
f64
0.084024


In [11]:
# Sanity check: shape and first rows of the validation frame.
print(valid.shape)
valid.head()

(37536611, 124)


impression_id,article_id,clicked,read_time,scroll_percentage,device_type,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,view_num,view_cnt600,view_cnt3600,view_cnt21600,u_history_len,u_read_time_last,u_read_time_mean,u_scroll_percentage_last,u_scroll_percentage_mean,a_premium,a_article_type,a_category,a_category_str,a_total_inviews,a_total_pageviews,a_total_read_time,a_sentiment_score,a_sentiment_label,a_title_len,a_subtitle_len,a_body_len,a_ner_clusters_first,a_entity_groups_first,a_topics_first,…,diff_u_a_image_ids_len_mean,diff_u_a_ner_clusters_len_mean,diff_u_a_entity_groups_len_mean,diff_u_a_topics_len_mean,diff_u_a_subcategory_len_mean,u_hist_a_premium_mean,u_hist_a_total_inviews_mean,u_hist_a_total_pageviews_mean,u_hist_a_total_read_time_mean,u_hist_a_sentiment_score_mean,u_hist_a_title_len_mean,u_hist_a_subtitle_len_mean,u_hist_a_body_len_mean,u_hist_a_image_ids_len_mean,u_hist_a_ner_clusters_len_mean,u_hist_a_entity_groups_len_mean,u_hist_a_topics_len_mean,u_hist_a_subcategory_len_mean,diff_u_hist_a_premium_mean,diff_u_hist_a_total_inviews_mean,diff_u_hist_a_total_pageviews_mean,diff_u_hist_a_total_read_time_mean,diff_u_hist_a_sentiment_score_mean,diff_u_hist_a_title_len_mean,diff_u_hist_a_subtitle_len_mean,diff_u_hist_a_body_len_mean,diff_u_hist_a_image_ids_len_mean,diff_u_hist_a_ner_clusters_len_mean,diff_u_hist_a_entity_groups_len_mean,diff_u_hist_a_topics_len_mean,diff_u_hist_a_subcategory_len_mean,u_impression_time_last_diff,u_impression_time_mean_diff,ua_impression_time_mean_diff,ua_impression_time_last_diff,a_published_time_diff,a_last_modified_time_diff
i32,i32,bool,i16,i8,i8,i32,bool,i8,i8,i8,bool,i32,u8,i16,i16,i32,i16,i16,f32,i8,f32,bool,str,i16,str,i32,i32,i32,f32,str,u16,u16,u16,str,str,str,…,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,f32,i32,i32,i32,i32,i32,i32
96434,9782884,True,13,0,2,21822,False,0,0,0,False,110,5,1470,6547,24541,176,15,28.835228,28,57.40136,False,"""article_default""",512,"""nationen""",894710,177869,16980028,0.9475,"""Negative""",36,141,2763,"""Lenovo""","""ORG""","""Erhverv""",…,3.483871,-3.67742,-3.67742,0.290323,-1.290323,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,371584.15625,58139.3125,9869293.0,0.04688,-8.55357,30.416664,996.238037,5.416667,-0.648809,-0.648809,0.392857,-0.738095,259840,1118488,0,1685249636,29172,-2771710
96434,9783800,False,13,0,2,21822,False,0,0,0,False,110,5,1533,5105,16791,176,15,28.835228,28,57.40136,False,"""article_default""",142,"""sport""",505457,68468,4559120,0.971,"""Positive""",51,127,1335,"""Clara Tauson""","""PER""","""Kendt""",…,0.483871,-7.67742,-7.67742,1.290323,0.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-17668.84375,-51261.6875,-2551615.0,0.07038,6.44643,16.416664,-431.761963,2.416667,-4.648809,-4.648809,1.392857,1.261905,259840,1118488,0,1685249636,2601,-2771711
96434,9784793,False,13,0,2,21822,False,0,0,0,False,110,5,1520,5118,16527,176,15,28.835228,28,57.40136,False,"""article_default""",142,"""sport""",458077,80656,3816893,0.9194,"""Positive""",56,102,1682,"""amerikaneren""","""MISC""","""Kendt""",…,-2.516129,4.32258,4.32258,0.290323,-0.290323,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-65048.84375,-39073.6875,-3293842.0,0.01878,11.44643,-8.583336,-84.761963,-0.583333,7.351191,7.351191,0.392857,0.261905,259840,1118488,0,1685249636,2364,-2771713
96434,9784804,False,13,0,2,21822,False,0,0,0,False,110,5,1401,3464,14312,176,15,28.835228,28,57.40136,False,"""article_default""",118,"""nyheder""",350774,43066,3214526,0.9378,"""Neutral""",64,111,2499,"""Berlingske""","""ORG""","""Kendt""",…,-2.516129,-0.67742,-0.67742,-0.709677,0.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-172351.84375,-76663.6875,-3896209.0,0.03718,19.44643,0.416664,732.238037,-0.583333,2.351191,2.351191,-0.607143,1.261905,259840,1118488,0,1685249636,1743,-2771713
96434,9784702,False,13,0,2,21822,False,0,0,0,False,110,5,1389,6223,23481,176,15,28.835228,28,57.40136,False,"""article_default""",142,"""sport""",779910,107271,4991002,0.7025,"""Positive""",44,107,1301,"""Bundesligaen""","""EVENT""","""Begivenhed""",…,-2.516129,2.32258,2.32258,0.290323,1.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,256784.15625,-12458.6875,-2119733.0,-0.19812,-0.55357,-3.583336,-465.761963,-0.583333,5.351191,5.351191,0.392857,2.261905,259840,1118488,0,1685249636,29119,-2771713


In [12]:
# Mean of the boolean `clicked` column, i.e. the share of clicked rows in the validation frame.
valid.select('clicked').mean()

clicked
f64
0.084024


## Preprocess

In [13]:
# Convert both polars frames to pandas; to_pandas() also casts object columns to `category`.
train = to_pandas(train)
valid = to_pandas(valid)

In [14]:
# Split each frame into the click label and the remaining feature columns,
# releasing the combined frame as soon as both parts exist.
train_y = train['clicked']
train_x = train.drop(columns=['clicked'])
del train
gc.collect()
valid_y = valid['clicked']
valid_x = valid.drop(columns=['clicked'])
del valid
gc.collect()

0

In [15]:
def _query_sizes(impression_ids):
    """Return the number of rows per impression_id, ordered by ascending id.

    The result is passed to LightGBM as the `group` sizes, which are consumed in
    row order. Counts sorted by id therefore only line up with the data when the
    rows themselves are sorted by impression_id, so this is checked explicitly
    instead of silently producing misaligned query groups.

    Raises:
        ValueError: if `impression_ids` is not in non-decreasing order.
    """
    if not impression_ids.is_monotonic_increasing:
        raise ValueError(
            "rows must be sorted by impression_id before building LightGBM query groups"
        )
    return impression_ids.value_counts().sort_index()


query_list_train = _query_sizes(train_x['impression_id'])
query_list_valid = _query_sizes(valid_x['impression_id'])

In [16]:
# Display the per-impression row counts used as LightGBM query groups.
query_list_train

impression_id
96434         5
96436        13
96473         8
96478         5
96553         6
             ..
579553853    15
579553911     7
579553913     7
579553939     7
579553955     9
Name: count, Length: 3141596, dtype: int64

In [17]:
# First rows of the training feature frame (ID columns are still present at this point).
train_x.head()

Unnamed: 0,impression_id,article_id,read_time,scroll_percentage,device_type,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,view_num,view_cnt600,view_cnt3600,view_cnt21600,u_history_len,u_read_time_last,u_read_time_mean,u_scroll_percentage_last,u_scroll_percentage_mean,a_premium,a_article_type,a_category,a_category_str,a_total_inviews,a_total_pageviews,a_total_read_time,a_sentiment_score,a_sentiment_label,a_title_len,a_subtitle_len,a_body_len,a_ner_clusters_first,a_entity_groups_first,a_topics_first,a_subcategory_first,a_image_ids_len,a_ner_clusters_len,a_entity_groups_len,a_topics_len,a_subcategory_len,ua_scroll_percentage_mean,ua_scroll_percentage_last,ua_read_time_mean,ua_read_time_last,ua_count,a_read_time_mean,a_scroll_percentage_mean,a_device_type_mean,a_is_sso_user_mean,a_gender_mean,a_postcode_mean,a_age_mean,a_is_subscriber_mean,a_view_num_mean,diff_a_read_time_mean,diff_a_scroll_percentage_mean,diff_a_device_type_mean,diff_a_is_sso_user_mean,diff_a_gender_mean,diff_a_postcode_mean,diff_a_age_mean,diff_a_is_subscriber_mean,diff_a_view_num_mean,u_a_premium_mean,u_a_total_inviews_mean,u_a_total_pageviews_mean,u_a_total_read_time_mean,u_a_sentiment_score_mean,u_a_title_len_mean,u_a_subtitle_len_mean,u_a_body_len_mean,u_a_image_ids_len_mean,u_a_ner_clusters_len_mean,u_a_entity_groups_len_mean,u_a_topics_len_mean,u_a_subcategory_len_mean,diff_u_a_premium_mean,diff_u_a_total_inviews_mean,diff_u_a_total_pageviews_mean,diff_u_a_total_read_time_mean,diff_u_a_sentiment_score_mean,diff_u_a_title_len_mean,diff_u_a_subtitle_len_mean,diff_u_a_body_len_mean,diff_u_a_image_ids_len_mean,diff_u_a_ner_clusters_len_mean,diff_u_a_entity_groups_len_mean,diff_u_a_topics_len_mean,diff_u_a_subcategory_len_mean,u_hist_a_premium_mean,u_hist_a_total_inviews_mean,u_hist_a_total_pageviews_mean,u_hist_a_total_read_time_mean,u_hist_a_sentiment_score_mean,u_hist_a_title_len_mean,u_hist_a_subtitle_len_mean,u_hist_a_body_len_mean,u_hist_a_image_ids_len_mean,u_hist_a_ner_clus
ters_len_mean,u_hist_a_entity_groups_len_mean,u_hist_a_topics_len_mean,u_hist_a_subcategory_len_mean,diff_u_hist_a_premium_mean,diff_u_hist_a_total_inviews_mean,diff_u_hist_a_total_pageviews_mean,diff_u_hist_a_total_read_time_mean,diff_u_hist_a_sentiment_score_mean,diff_u_hist_a_title_len_mean,diff_u_hist_a_subtitle_len_mean,diff_u_hist_a_body_len_mean,diff_u_hist_a_image_ids_len_mean,diff_u_hist_a_ner_clusters_len_mean,diff_u_hist_a_entity_groups_len_mean,diff_u_hist_a_topics_len_mean,diff_u_hist_a_subcategory_len_mean,u_impression_time_last_diff,u_impression_time_mean_diff,ua_impression_time_mean_diff,ua_impression_time_last_diff,a_published_time_diff,a_last_modified_time_diff
0,96434,9782884,13,0,2,21822,False,0,0,0,False,110,5,1470,6547,24541,176,15,28.835228,28,57.40136,False,article_default,512,nationen,894710,177869,16980028,0.9475,Negative,36,141,2763,Lenovo,ORG,Erhverv,0,7,8,8,4,0,,0,,0,0,67.987526,28.75205,1.688696,0.110824,0.009229,0.035531,1.683687,0.072823,19.293797,-54.987526,-28.75205,0.311304,-0.110824,-0.009229,-0.035531,-1.683687,-0.072823,-14.293797,0.225806,688963.625,71983.453125,4342223.0,0.874732,42.870968,122.0,2604.09668,3.516129,11.67742,11.67742,3.709677,1.290323,-0.225806,205746.375,105885.546875,12637805.0,0.072768,-6.870968,19.0,158.90332,3.483871,-3.67742,-3.67742,0.290323,-1.290323,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,371584.15625,58139.3125,9869293.0,0.04688,-8.55357,30.416664,996.238037,5.416667,-0.648809,-0.648809,0.392857,-0.738095,259840,1118488,0,1685249636,29172,-2771710
1,96434,9783800,13,0,2,21822,False,0,0,0,False,110,5,1533,5105,16791,176,15,28.835228,28,57.40136,False,article_default,142,sport,505457,68468,4559120,0.971,Positive,51,127,1335,Clara Tauson,PER,Kendt,327,4,4,4,5,2,,0,,0,0,64.107788,33.120197,1.742844,0.106519,0.008506,0.032895,1.592313,0.068648,17.104948,-51.107788,-33.120197,0.257156,-0.106519,-0.008506,-0.032895,-1.592313,-0.068648,-12.104948,0.225806,688963.625,71983.453125,4342223.0,0.874732,42.870968,122.0,2604.09668,3.516129,11.67742,11.67742,3.709677,1.290323,-0.225806,-183506.625,-3515.453125,216897.0,0.096268,8.129032,5.0,-1269.09668,0.483871,-7.67742,-7.67742,1.290323,0.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-17668.84375,-51261.6875,-2551615.0,0.07038,6.44643,16.416664,-431.761963,2.416667,-4.648809,-4.648809,1.392857,1.261905,259840,1118488,0,1685249636,2601,-2771711
2,96434,9784793,13,0,2,21822,False,0,0,0,False,110,5,1520,5118,16527,176,15,28.835228,28,57.40136,False,article_default,142,sport,458077,80656,3816893,0.9194,Positive,56,102,1682,amerikaneren,MISC,Kendt,327,1,16,16,4,1,,0,,0,0,63.213081,32.106697,1.754734,0.105353,0.008543,0.032984,1.590807,0.067641,17.172741,-50.213081,-32.106697,0.245266,-0.105353,-0.008543,-0.032984,-1.590807,-0.067641,-12.172741,0.225806,688963.625,71983.453125,4342223.0,0.874732,42.870968,122.0,2604.09668,3.516129,11.67742,11.67742,3.709677,1.290323,-0.225806,-230886.625,8672.546875,-525330.0,0.044668,13.129032,-20.0,-922.09668,-2.516129,4.32258,4.32258,0.290323,-0.290323,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-65048.84375,-39073.6875,-3293842.0,0.01878,11.44643,-8.583336,-84.761963,-0.583333,7.351191,7.351191,0.392857,0.261905,259840,1118488,0,1685249636,2364,-2771713
3,96434,9784804,13,0,2,21822,False,0,0,0,False,110,5,1401,3464,14312,176,15,28.835228,28,57.40136,False,article_default,118,nyheder,350774,43066,3214526,0.9378,Neutral,64,111,2499,Berlingske,ORG,Kendt,130,1,11,11,3,2,,0,,0,0,66.374649,29.961708,1.729108,0.106978,0.008661,0.034421,1.61743,0.069357,19.630522,-53.374649,-29.961708,0.270892,-0.106978,-0.008661,-0.034421,-1.61743,-0.069357,-14.630522,0.225806,688963.625,71983.453125,4342223.0,0.874732,42.870968,122.0,2604.09668,3.516129,11.67742,11.67742,3.709677,1.290323,-0.225806,-338189.625,-28917.453125,-1127697.0,0.063068,21.129032,-11.0,-105.09668,-2.516129,-0.67742,-0.67742,-0.709677,0.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,-172351.84375,-76663.6875,-3896209.0,0.03718,19.44643,0.416664,732.238037,-0.583333,2.351191,2.351191,-0.607143,1.261905,259840,1118488,0,1685249636,1743,-2771713
4,96434,9784702,13,0,2,21822,False,0,0,0,False,110,5,1389,6223,23481,176,15,28.835228,28,57.40136,False,article_default,142,sport,779910,107271,4991002,0.7025,Positive,44,107,1301,Bundesligaen,EVENT,Begivenhed,196,1,14,14,4,3,,0,,0,0,67.960045,33.918762,1.766495,0.1068,0.008324,0.03355,1.630496,0.069993,17.042997,-54.960045,-33.918762,0.233505,-0.1068,-0.008324,-0.03355,-1.630496,-0.069993,-12.042997,0.225806,688963.625,71983.453125,4342223.0,0.874732,42.870968,122.0,2604.09668,3.516129,11.67742,11.67742,3.709677,1.290323,-0.225806,90946.375,35287.546875,648779.0,-0.172232,1.129032,-15.0,-1303.09668,-2.516129,2.32258,2.32258,0.290323,1.709677,0.0,523125.84375,119729.6875,7110735.0,0.90062,44.55357,110.583336,1766.761963,1.583333,8.648809,8.648809,3.607143,0.738095,0.0,256784.15625,-12458.6875,-2119733.0,-0.19812,-0.55357,-3.583336,-465.761963,-0.583333,5.351191,5.351191,0.392857,2.261905,259840,1118488,0,1685249636,29119,-2771713


In [18]:
# Column dtypes of the training feature frame.
train_x.dtypes

impression_id                              int32
article_id                                 int32
read_time                                  int16
scroll_percentage                           int8
device_type                                 int8
user_id                                    int32
is_sso_user                                 bool
gender                                      int8
postcode                                    int8
age                                         int8
is_subscriber                               bool
session_id                                 int32
view_num                                   uint8
view_cnt600                                int16
view_cnt3600                               int16
view_cnt21600                              int32
u_history_len                              int16
u_read_time_last                           int16
u_read_time_mean                         float32
u_scroll_percentage_last                    int8
u_scroll_percentage_

In [19]:
# Identifier columns that must not be used as model features.
IGNORE_COL_ID = ["impression_id", "session_id", "user_id", 'article_id']
# Features to drop because their importance was zero (currently none).
#LOW_IMP_COL = ["age", "gender", "postcode", "u_a_title_len_mean", "u_read_time_last", "u_scroll_percentage_last", "ua_read_time_last"]
LOW_IMP_COL = []
print('remove id from features....')
excluded_cols = IGNORE_COL_ID + LOW_IMP_COL
train_x = train_x.drop(columns=excluded_cols)
valid_x = valid_x.drop(columns=excluded_cols)


remove id from features....


# Optuna

In [20]:
# Round budget and logging interval used for the (optional) Optuna search.
num_round = 100
verbose_eval = 20

if OPTUNA_FLAG:
    # Fixed base parameters for the search. When OPTUNA_FLAG is set, `lgb` is
    # optuna.integration.lightgbm (see the import cell), so lgb.train() below
    # runs the tuner rather than a single plain training job.
    params = {
        'objective': 'lambdarank',
        'metric': 'ndcg',
        'boosting_type': 'gbdt',
        'ndcg_eval_at': [5, 10],
        'learning_rate': 0.02,    # default = 0.1
        'random_state': 0,        # default = None
        'verbose': -1,
    }

    lgb_train = lgb.Dataset(train_x, train_y, group=query_list_train)
    lgb_eval = lgb.Dataset(valid_x, valid_y, group=query_list_valid)

    # NOTE(review): these names are deleted to save memory, but the training
    # cell further down rebuilds its datasets from them. With OPTUNA_FLAG=True
    # that cell will therefore fail with NameError; re-run the data-preparation
    # cells (or switch the flag off and restart) before final training.
    del train_x, train_y, query_list_train, query_list_valid
    gc.collect()
    model = lgb.train(
        params=params,
        train_set=lgb_train,
        num_boost_round=num_round,
        valid_sets=[lgb_train, lgb_eval],
        callbacks=[
            lgb.early_stopping(stopping_rounds=20, verbose=True),
            lgb.log_evaluation(verbose_eval),  # callback that prints metrics to the console
        ],
    )
    # Was `model.param`, which is not a Booster attribute and would raise
    # AttributeError after the search had already completed. The parameters
    # are printed by the next cell.
    model.params



In [21]:
if OPTUNA_FLAG:
    print("Optuna results: ", model.params)
# Final parameters, as determined by Optuna.
params = {
    'objective': 'lambdarank',
    'metric': 'ndcg',
    # The dict used to list 'random_state' twice (42, then 0); the later entry
    # silently won, so the effective value 0 is kept as the single entry.
    'random_state': 0,
    'boosting_type': 'gbdt',
    'ndcg_eval_at': [5, 10],
    'learning_rate': 0.05,
    'verbose': -1,
    'feature_pre_filter': False,
    'lambda_l1': 0.003202466714172952,
    'lambda_l2': 0.0006081573023395271,
    'num_leaves': 100,
    'feature_fraction': 0.6,
    'bagging_fraction': 0.9899028245960699,
    'bagging_freq': 6,
    'min_child_samples': 100,
    #'num_iterations': 300,
    # NOTE(review): these are positional column indices into the feature frame
    # after the ID / low-importance columns have been dropped. They must be
    # re-derived whenever LOW_IMP_COL or the feature order changes.
    'categorical_column': [19, 21, 26, 30, 31, 32]
}
# Interval (in boosting rounds) passed to lgb.log_evaluation in the training cell.
verbose_eval = 20

## Training

In [22]:
# Boosting-round budget. When training on the validation split the round count
# is fixed at 562, which appears to be the best iteration logged for the
# full-size run in the experiment notes at the end of this notebook
# (TODO confirm).
num_round = 562 if train_type == 'valid' else 1000

In [None]:
# Target scores: 0.62, 0.65
lgb_train = lgb.Dataset(train_x, train_y, group=query_list_train)
lgb_eval = lgb.Dataset(valid_x, valid_y, group=query_list_valid)

# Drop the notebook-level references to the raw training inputs to reduce memory pressure.
del train_x, train_y, query_list_train, query_list_valid
gc.collect()

# NOTE(review): in the logged run with train_type == 'valid' the training and
# valid_1 metrics are identical, which suggests the evaluation set is the same
# data as the training set; early stopping is then not a real hold-out check.
model = lgb.train(
    params=params,
    train_set=lgb_train,
    num_boost_round=num_round,
    valid_sets=[lgb_train, lgb_eval],
    callbacks=[
        lgb.early_stopping(stopping_rounds=20, verbose=True),
        lgb.log_evaluation(verbose_eval),  # callback that prints metrics to the console
    ],
)

# Save the model. Create the target directory first so that a missing folder
# cannot make the dump fail after the long training run.
model_path = base_path + 'models'
os.makedirs(model_path, exist_ok=True)
# Debug runs are written with a 'small_' prefix so they never overwrite full-size models.
model_prefix = 'small_' if DEBUG_MODE else ''
joblib.dump(model, f'{model_path}/{model_prefix}lgbm_{train_type}_frac{TRAIN_FRAC}.pkl')

Training until validation scores don't improve for 20 rounds
[20]	training's ndcg@5: 0.663797	training's ndcg@10: 0.689358	valid_1's ndcg@5: 0.663797	valid_1's ndcg@10: 0.689358
[40]	training's ndcg@5: 0.67661	training's ndcg@10: 0.700365	valid_1's ndcg@5: 0.67661	valid_1's ndcg@10: 0.700365
[60]	training's ndcg@5: 0.684781	training's ndcg@10: 0.70726	valid_1's ndcg@5: 0.684781	valid_1's ndcg@10: 0.70726
[80]	training's ndcg@5: 0.690704	training's ndcg@10: 0.712382	valid_1's ndcg@5: 0.690704	valid_1's ndcg@10: 0.712382
[100]	training's ndcg@5: 0.695976	training's ndcg@10: 0.716799	valid_1's ndcg@5: 0.695976	valid_1's ndcg@10: 0.716799
[120]	training's ndcg@5: 0.700176	training's ndcg@10: 0.720341	valid_1's ndcg@5: 0.700176	valid_1's ndcg@10: 0.720341
[140]	training's ndcg@5: 0.703576	training's ndcg@10: 0.723228	valid_1's ndcg@5: 0.703576	valid_1's ndcg@10: 0.723228
[160]	training's ndcg@5: 0.706723	training's ndcg@10: 0.725891	valid_1's ndcg@5: 0.706723	valid_1's ndcg@10: 0.725891


In [None]:
# Display the best iteration recorded on the trained model.
model.best_iteration

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# Split-count importance of every feature of the single trained model,
# stored ascending by (Value, Feature).
feature_imp_split = pd.DataFrame(
    sorted(zip(model.feature_importance(importance_type='split'), model.feature_name())),
    columns=['Value', 'Feature'],
)

fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(
    x="Value",
    y="Feature",
    data=feature_imp_split.sort_values(by="Value", ascending=False),
    ax=ax,
)
# The previous title said "avg over folds", but there is only one model and no
# folds; the title now names the importance type that is actually plotted.
ax.set_title('LightGBM Feature Importance (split)')
plt.tight_layout()
plt.show()

In [None]:
# Gain-based importance of every feature of the single trained model,
# stored ascending by (Value, Feature).
feature_imp = pd.DataFrame(
    sorted(zip(model.feature_importance(importance_type='gain'), model.feature_name())),
    columns=['Value', 'Feature'],
)

fig, ax = plt.subplots(figsize=(20, 15))
sns.barplot(
    x="Value",
    y="Feature",
    data=feature_imp.sort_values(by="Value", ascending=False),
    ax=ax,
)
# The previous title ("avg over folds") was the same as the split plot's and
# mentioned folds that do not exist; the title now names the gain importance.
ax.set_title('LightGBM Feature Importance (gain)')
plt.tight_layout()
plt.show()

In [None]:
# Display the gain-importance table (rows are stored in ascending order of Value).
feature_imp

In [None]:
# small data
# [68] training's ndcg@5: 0.68384	training's ndcg@10: 0.708506	valid_1's ndcg@5: 0.550625	valid_1's ndcg@10: 0.59301
# 90% negative sampling [96]	training's ndcg@5: 0.70648	training's ndcg@10: 0.727916	valid_1's ndcg@5: 0.55037	valid_1's ndcg@10: 0.592454
# 80% [84] training's ndcg@5: 0.724863	training's ndcg@10: 0.743467	valid_1's ndcg@5: 0.546675	valid_1's ndcg@10: 0.589225
# 50% [48] training's ndcg@5: 0.791471	training's ndcg@10: 0.801367	valid_1's ndcg@5: 0.549047	valid_1's ndcg@10: 0.590969
# full data
# full, valid_frac=0.5 [72]	training's ndcg@5: 0.676216	training's ndcg@10: 0.701085	valid_1's ndcg@5: 0.554916	valid_1's ndcg@10: 0.596697
#	full, valid_frac=0.25	[72]	training's ndcg@5: 0.676216	training's ndcg@10: 0.701085	valid_1's ndcg@5: 0.555081	valid_1's ndcg@10: 0.596856 -> 0.25 looks fine
# under sampling: 0.8 [84]	training's ndcg@5: 0.718681	training's ndcg@10: 0.737356	valid_1's ndcg@5: 0.554494	valid_1's ndcg@10: 0.596234 -> go with this for now
# under sampling: 0.5 [85]	training's ndcg@5: 0.791716	training's ndcg@10: 0.801159	valid_1's ndcg@5: 0.549173	valid_1's ndcg@10: 0.591309

# Fixed: features that are not available in the test set were being used; now a little over 40 features, plus a few small bug fixes
# small data
# [179]	training's ndcg@5: 0.727879	training's ndcg@10: 0.743617	valid_1's ndcg@5: 0.542041	valid_1's ndcg@10: 0.583972
# full data
# [71] valid_frac=0.25	training's ndcg@5: 0.666762	training's ndcg@10: 0.689943	valid_1's ndcg@5: 0.537423	valid_1's ndcg@10: 0.580449
# trained on valid, [71]	training's ndcg@5: 0.649776	training's ndcg@10: 0.676771	valid_1's ndcg@5: 0.649776	valid_1's ndcg@10: 0.676771
# small data add feature
# [21]	training's ndcg@5: 0.673465	training's ndcg@10: 0.697794	valid_1's ndcg@5: 0.585802	valid_1's ndcg@10: 0.621771

# Increased features from ~40 to 120 and ran Optuna for the first time; the data was too large, so frac_train=0.6, valid=0.2, lr=0.02; a fair number of features have importance=0 -> drops to a handful when iterations are increased
# small data again, lr=0.05, early stopping=40
# [293]	training's ndcg@5: 0.785899	training's ndcg@10: 0.797273	valid_1's ndcg@5: 0.624458	valid_1's ndcg@10: 0.653404
# drop the zero-importance features (7) -> looks slightly overfit? stop dropping them for now
# [497]	training's ndcg@5: 0.813702	training's ndcg@10: 0.823218	valid_1's ndcg@5: 0.62413	valid_1's ndcg@10: 0.652956

# The data size being too large is a problem, but I want to make a submission with whatever can be run for now.
# full size train frac=0.6, valid=0.25, n features=120, lr=0.05
# [562]	training's ndcg@5: 0.756472	training's ndcg@10: 0.767894	valid_1's ndcg@5: 0.6163	valid_1's ndcg@10: 0.645803
# full size, drop zero-importance features (7) -> shelved for now
# [475]	training's ndcg@5: 0.753473	training's ndcg@10: 0.765269	valid_1's ndcg@5: 0.614346	valid_1's ndcg@10: 0.644336


