# Feature Engineering


In [1]:
from google.colab import drive
drive.mount('/content/drive')
!ls /content/drive/MyDrive/RecSys2024/
base_path = '/content/drive/MyDrive/RecSys2024/'

Mounted at /content/drive
Exp_Recsys2024_LGBM_train.ipynb  output				     Recsys2024_ensemble.ipynb
Exp_Recsys2024_preprocess.ipynb  preprocess2_embed_similarity.ipynb  Recsys2024_LGBM_test.ipynb
feature_output			 preprocess2.ipynb		     Recsys2024_LGBM_train.ipynb
input				 preprocess_article.ipynb	     Recsys2024_preprocess.ipynb
models				 preprocess_create_embed.ipynb
old_Recsys2024_LGBM_train.ipynb  Recsys2024_EDA.ipynb


In [2]:
import pandas as pd
!pip install polars
import polars as pl
import numpy as np
import pickle
import gc

Collecting polars
  Downloading polars-0.20.31-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (28.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m28.8/28.8 MB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: polars
Successfully installed polars-0.20.31


In [3]:
#DEBUG_MODE = True
DEBUG_MODE = False
#DATA_TYPE = 'train'
#DATA_TYPE = 'valid'
DATA_TYPE = 'test'

# train, validにはあるがtestにはない特徴量
ignore_col_train = ['next_scroll_percentage', 'next_read_time', 'article_id']
out_path = base_path + 'feature_output'

# Preprocessing

In [4]:
if DEBUG_MODE:
  input_path = base_path + 'input/ebnerd_small/'
else:
  input_path = base_path + 'input/ebnerd_large/'

In [5]:
!ls {input_path}

articles.parquet  train  validation


In [6]:
if DATA_TYPE == 'test':
    input_path = base_path + 'input/ebnerd_testset/'

if DATA_TYPE == 'train':
    df_impression = pl.read_parquet(input_path + 'train/behaviors.parquet')
elif DATA_TYPE == 'valid':
    df_impression = pl.read_parquet(input_path + 'validation/behaviors.parquet')
else:
    df_impression = pl.read_parquet(input_path + 'test/behaviors.parquet')

# Feature Engineering

In [7]:
def show_df(df, n=5):
    print(df.shape)
    display(df.head(n))


def expand_behavior(df, data_type='train'):
    df = df.explode("article_ids_inview").rename({"article_ids_inview": "article_id_inview"})
    if data_type in ('train', 'valid'):
        df = df.with_columns(
            (pl.col("article_id_inview").is_in(pl.col("article_ids_clicked"))).alias("clicked")
        )
        df = df.drop("article_ids_clicked")
    return df


def datetime_to_unix(df, col_name):
    return df.with_columns(
        pl.col(col_name).map_elements(lambda lst: [int(dt.timestamp()) for dt in lst])
    )


def user_features(df):
  return df.select([
      pl.col("user_id"),
      pl.col("impression_time_fixed").list.len().alias("u_history_len"),
      pl.col("impression_time_fixed").list.last().alias("u_impression_time_last"),
      pl.col("impression_time_fixed").list.mean().alias("u_impression_time_mean"),
      pl.col("read_time_fixed").list.last().alias("u_read_time_last"),
      pl.col("read_time_fixed").list.mean().alias("u_read_time_mean"),
      pl.col("scroll_percentage_fixed").list.last().alias("u_scroll_percentage_last"),
      pl.col("scroll_percentage_fixed").list.mean().alias("u_scroll_percentage_mean"),
  ])


def exact_first(df, col_name):

    new_col_name = f"{col_name}_first"
    df = df.with_columns(
        pl.col(col_name).list.first().alias(new_col_name)
    )
    return df

def exact_last(df, col_name):

    new_col_name = f"{col_name}_last"
    df = df.with_columns(
        pl.col(col_name).list.last().alias(new_col_name)
    )
    return df


def exact_nth(df, col_name, n=2):

    new_col_name = f"{col_name}_{n}th"
    df = df.with_columns(
        pl.col(col_name).list.get(n).alias(new_col_name)
    )
    return df

def pop_count_list(df, out_col_name, bins):

    pop_list = []
    for bin in bins:
        tmp = df.with_columns(
            (df["impression_time"] / bin).cast(pl.Int64).alias(f"time_bin_{bin}")
        ).group_by([f"time_bin_{bin}", "article_id"]).agg(
            pl.count().alias(out_col_name)
        )
        pop_list.append(tmp)
    return pop_list


In [8]:
print('impression_time min:', df_impression.select(pl.col("impression_time")).min().item())
print('impression_time max:', df_impression.select(pl.col("impression_time")).max().item())

# 7時スタートかつ中途半端な曜日なので、0-6の日付になるように曜日から調整
df_impression = df_impression.with_columns(
    ((pl.col("impression_time") - pl.duration(hours=(7+24*3))).dt.weekday() - 1).alias("ndays")
)

# その日の0時からの経過秒数を計算
df_impression = df_impression.with_columns(
    (pl.col("impression_time").dt.hour() * 3600 +
     pl.col("impression_time").dt.minute() * 60 +
     pl.col("impression_time").dt.second()).alias("seconds_since_midnight")
)

df_impression = df_impression.with_columns(
    pl.col("article_ids_inview").list.len().alias("view_num").cast(pl.UInt8),
    (pl.col("impression_time").dt.timestamp("ms") / 1000).cast(pl.Int64),
)

if DATA_TYPE in ('train', 'valid'):
    df_impression = df_impression.drop(ignore_col_train)
show_df(df_impression, 3)

impression_time min: 2023-06-01 07:00:00
impression_time max: 2023-06-08 06:59:59
(13536710, 17)


impression_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,is_beyond_accuracy,ndays,seconds_since_midnight,view_num
u32,i64,f32,f32,i8,list[i32],u32,bool,i8,i8,i8,bool,u32,bool,i8,i16,u8
6451339,1685977369,8.0,,2,"[9796527, 7851321, … 9492777]",35982,False,,,,False,388,False,4,-11367,9
6451363,1685977436,20.0,,2,"[9798532, 9791602, … 9798958]",36012,False,,,,False,804,False,4,-11556,8
6451382,1685978753,9.0,,2,"[9798498, 9793856, … 9798724]",36162,False,,,,False,1528,False,4,-11519,5


In [None]:
df_session = df_impression.select(['session_id', 'impression_id', 'impression_time', 'article_ids_inview', 'user_id']).sort(by=["session_id", "impression_time"])
if DATA_TYPE == 'test':
    df_session = df_session.filter(pl.col("impression_id") != 0)
df_session = df_session.with_columns(
    pl.col("session_id").count().over("session_id").alias("session_size"),
    pl.arange(0, pl.count()).over("session_id").alias("impression_no").cast(pl.UInt32),
    (pl.col("impression_time") - pl.col("impression_time").shift(1)).over("session_id").alias("time_diff_prev").cast(pl.Int16),
    (pl.col("impression_time").shift(-1) - pl.col("impression_time")).over("session_id").alias("time_diff_next").cast(pl.Int16)
)
df_session = df_session.drop("impression_time")
show_df(df_session, 2)

  pl.arange(0, pl.count()).over("session_id").alias("impression_no").cast(pl.UInt32),


(13336710, 8)


session_id,impression_id,article_ids_inview,user_id,session_size,impression_no,time_diff_prev,time_diff_next
u32,u32,list[i32],u32,u32,u32,i16,i16
44,45742430,"[9791702, 9791638, … 9792071]",35949,1,0,,
46,189227445,"[9790744, 9794521, … 9794425]",35949,1,0,,


In [None]:
df_session = df_session.explode("article_ids_inview").rename({"article_ids_inview": "article_id"})
df_session = df_session.with_columns(
    pl.col("article_id").cum_count().over(["session_id", "user_id", "article_id"]).alias("cum_article_count"),
    pl.col("impression_no").first().over(["session_id", "user_id", "article_id"]).alias("first_imp_no"),
)
df_session = df_session.drop("session_id")
df_session = df_session.with_columns(
    (pl.col("impression_no") - pl.col("first_imp_no")).alias("diff_first_imp_no"),
)

original_names = df_session.columns
new_names = [f"s_{name}" if name not in ['impression_id', 'article_id', 'user_id'] else name for name in original_names]
df_session = df_session.rename(dict(zip(original_names, new_names)))
show_df(df_session, 2)

(155925868, 10)


impression_id,article_id,user_id,s_session_size,s_impression_no,s_time_diff_prev,s_time_diff_next,s_cum_article_count,s_first_imp_no,s_diff_first_imp_no
u32,i32,u32,u32,u32,i16,i16,u32,u32,u32
45742430,9791702,35949,1,0,,,1,0,0
45742430,9791638,35949,1,0,,,1,0,0


In [None]:
# save
if DEBUG_MODE:
    df_session.write_parquet(f"{out_path}/small_{DATA_TYPE}_session.parquet")
else:
    df_session.write_parquet(f"{out_path}/{DATA_TYPE}_session.parquet")
del df_session
gc.collect()

NameError: name 'df_session' is not defined

In [9]:
if DATA_TYPE == 'test':
  # testの場合はimpression_id=0が複数含まれており、user_idも入れないとuniqueにならないので入れる
  exp_col = ["article_ids_inview"]
  df_impression_article = df_impression.select(["impression_id", "user_id"] + exp_col)
else:
  exp_col = ["article_ids_inview", "article_ids_clicked"]
  #df_impression_article = df_impression.select(["impression_id"] + exp_col)
  df_impression_article = df_impression.select(["impression_id", "user_id"] + exp_col)

df_impression = df_impression.drop(exp_col)

In [10]:
# expand list
df_impression_article = expand_behavior(df_impression_article, DATA_TYPE)
show_df(df_impression_article, 2)

(205925868, 3)


impression_id,user_id,article_id_inview
u32,u32,i32
6451339,35982,9796527
6451339,35982,7851321


In [11]:
# save
if DEBUG_MODE:
    df_impression.write_parquet(f"{out_path}/small_{DATA_TYPE}_impression.parquet")
    df_impression_article.write_parquet(f"{out_path}/small_{DATA_TYPE}_impression_article.parquet")
else:
    df_impression.write_parquet(f"{out_path}/{DATA_TYPE}_impression.parquet")
    df_impression_article.write_parquet(f"{out_path}/{DATA_TYPE}_impression_article.parquet")
del df_impression, df_impression_article
gc.collect()

0

## History features (key: user_id)

In [None]:
if DATA_TYPE == 'train':
    df_user = pl.read_parquet(input_path + 'train/history.parquet')
elif DATA_TYPE == 'valid':
    df_user = pl.read_parquet(input_path +  'validation/history.parquet')
else:
    df_user = pl.read_parquet(input_path + 'test/history.parquet')

In [None]:
df_user = datetime_to_unix(df_user, "impression_time_fixed")
show_df(df_user, 3)

(807677, 5)


user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,list[i64],list[f32],list[i32],list[f32]
40107,"[1683791461, 1684008546, … 1685597898]","[15.0, 100.0, … 7.0]","[9676294, 9763942, … 9776147]","[25.0, 39.0, … 43.0]"
40254,"[1683794426, 1683794478, … 1685600334]","[100.0, 72.0, … null]","[9759284, 9759389, … 9789473]","[49.0, 16.0, … 0.0]"
40926,"[1683802119, 1683802717, … 1685598218]","[98.0, 45.0, … 65.0]","[9759355, 9759707, … 9789896]","[595.0, 2.0, … 20.0]"


In [None]:
# User * Article (過去に同一記事のclickがあった場合)
df_user_article = df_user.explode(["impression_time_fixed", "scroll_percentage_fixed", "article_id_fixed", "read_time_fixed"]).rename({"article_id_fixed": "article_id"})
df_user_article = df_user_article.group_by(["user_id", "article_id"]).agg([
    pl.col("impression_time_fixed").mean().alias("ua_impression_time_mean"),
    pl.col("impression_time_fixed").last().alias("ua_impression_time_last"),
    pl.col("scroll_percentage_fixed").mean().alias("ua_scroll_percentage_mean"),
    pl.col("scroll_percentage_fixed").last().alias("ua_scroll_percentage_last"),
    pl.col("read_time_fixed").mean().alias("ua_read_time_mean"),
    pl.col("read_time_fixed").last().alias("ua_read_time_last"),
    pl.count().alias("ua_count")
])
show_df(df_user_article, 3)

(99089504, 9)


user_id,article_id,ua_impression_time_mean,ua_impression_time_last,ua_scroll_percentage_mean,ua_scroll_percentage_last,ua_read_time_mean,ua_read_time_last,ua_count
u32,i32,f64,i64,f32,f32,f32,f32,u32
40107,9716537,1684000000.0,1684041840,59.0,59.0,49.0,49.0,1
40107,9765410,1684100000.0,1684116149,100.0,100.0,182.0,182.0,1
40107,9772045,1684500000.0,1684474209,100.0,100.0,14.0,8.0,2


In [None]:
df_user = user_features(df_user)
show_df(df_user, 2)

(807677, 8)


user_id,u_history_len,u_impression_time_last,u_impression_time_mean,u_read_time_last,u_read_time_mean,u_scroll_percentage_last,u_scroll_percentage_mean
u32,u32,i64,f64,f32,f32,f32,f32
40107,99,1685597898,1684600000.0,43.0,54.595959,7.0,82.934784
40254,226,1685600334,1684800000.0,0.0,51.181416,,63.786068


In [None]:
# save
if DEBUG_MODE:
    df_user.write_parquet(f"{out_path}/small_{DATA_TYPE}_user.parquet")
    df_user_article.write_parquet(f"{out_path}/small_{DATA_TYPE}_user_article.parquet")
else:
    df_user.write_parquet(f"{out_path}/{DATA_TYPE}_user.parquet")
    df_user_article.write_parquet(f"{out_path}/{DATA_TYPE}_user_article.parquet")
del df_user, df_user_article
gc.collect()

0

## Article Features (key: article_id)

In [None]:
# read file
df_article = pl.read_parquet(input_path + '/articles.parquet')
# 使い道がなさそうなものは一旦けずる
ignore_cols = ["url"]
df_article = df_article.drop(ignore_cols)

# setで残しておくfeature, 後でuser素性と突き合わせて使う
#set_cols = ["subcategory", "topics", "entity_groups", "ner_clusters"]
#for i in set_cols:
#    df_article = df_article.with_columns(pl.col(i).map_elements(lambda x: set(x)).alias(f"{i}_set"))

# 後段で処理しやすいようにunixtimeに直しておく
df_article = df_article.with_columns(
    (df_article["published_time"].dt.timestamp("ms") / 1000).cast(pl.Int64),
    (df_article["last_modified_time"].dt.timestamp("ms") / 1000).cast(pl.Int64),
)

# 文字数だけをfeatureとして加える
str_cols = ["title", "subtitle", "body"]
for i in str_cols:
  df_article = df_article.with_columns(pl.col(i).str.len_bytes().cast(pl.UInt16).alias(f'{i}_len'))
  df_article = df_article.drop(i)

# listの最初の要素
first_element_list = ["ner_clusters", "entity_groups", "topics", "subcategory"]
for i in first_element_list:
  df_article = exact_first(df_article, i)

# 大事そうなやつはlastも加える
important_element_list = ["ner_clusters"]
for i in important_element_list:
  df_article = exact_last(df_article, i)
  # 2nd to 7 要素数の平均が7くらいなので
  for j in range(2, 8):
    df_article = exact_nth(df_article, i, j)

# listの要素数をfeatureとして加える
list_cols = ["image_ids", "ner_clusters", "entity_groups", "topics", "subcategory"]
for i in list_cols:
    df_article = df_article.with_columns(pl.col(i).list.len().cast(pl.UInt8).alias(f'{i}_len'))
    df_article = df_article.drop(i)

# fill null
num_features = ["total_inviews", "total_pageviews", "total_read_time", "sentiment_score"]
df_article = df_article.with_columns(pl.col(num_features).fill_null(0))

original_names = df_article.columns
new_names = [f"a_{name}" if name != 'article_id' else name for name in original_names]
df_article = df_article.rename(dict(zip(original_names, new_names)))
show_df(df_article, 3)

In [None]:
# save
if DEBUG_MODE:
    df_article.write_parquet(f"{out_path}/small_{DATA_TYPE}_article.parquet")
else:
    df_article.write_parquet(f"{out_path}/{DATA_TYPE}_article.parquet")