# Feature Engineering


In [1]:
from google.colab import drive
drive.mount('/content/drive')
!ls /content/drive/MyDrive/RecSys2024/
base_path = '/content/drive/MyDrive/RecSys2024/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
feature_output	models	Recsys2024_EDA.ipynb	    Recsys2024_LGBM_train.ipynb
input		output	Recsys2024_LGBM_test.ipynb  Recsys2024_preprocess.ipynb


In [2]:
import pandas as pd
!pip install polars
import polars as pl
import numpy as np
import pickle
import gc



In [3]:
# Run configuration. DEBUG_MODE=True makes the notebook read input/ebnerd_small/
# instead of input/ebnerd_large/ and prefixes the saved files with "small_".
#DEBUG_MODE = True
DEBUG_MODE = False
# Split to build features for; leave exactly one of the assignments uncommented.
#DATA_TYPE = 'train'
#DATA_TYPE = 'valid'
DATA_TYPE = 'test'

# Columns that exist in the train/valid behaviors but not in the test behaviors;
# they are dropped from the train/valid impressions further down.
ignore_col_train = ['next_scroll_percentage', 'next_read_time', 'article_id']

# Preprocessing

In [4]:
# Debug runs read the small dataset folder; full runs read the large one.
input_path = base_path + ('input/ebnerd_small/' if DEBUG_MODE else 'input/ebnerd_large/')

In [5]:
!ls {input_path}

articles.parquet  train  validation


In [6]:
# For the test split the dataset root is redirected to the test-set folder
# (regardless of DEBUG_MODE), so the articles file is read from there as well.
if DATA_TYPE == 'test':
    input_path = base_path + 'input/ebnerd_testset/'

# input_path already ends with '/', so the file name is appended without a
# second separator (the previous '/articles.parquet' produced a '//' path).
df_article = pl.read_parquet(input_path + 'articles.parquet')

In [7]:
# Sub-folder of the dataset root that holds the selected split. Any DATA_TYPE
# other than 'train' or 'valid' falls back to the 'test' folder.
split_dir = {'train': 'train', 'valid': 'validation'}.get(DATA_TYPE, 'test')

# Impression log and per-user click history of the selected split.
df_impression = pl.read_parquet(input_path + f'{split_dir}/behaviors.parquet')
df_history = pl.read_parquet(input_path + f'{split_dir}/history.parquet')

# Feature Engineering

In [8]:
def show_df(df, n=5):
    """Print the shape of ``df`` and render its first ``n`` rows.

    Args:
        df: any object exposing ``.shape`` and ``.head(n)`` (a DataFrame here).
        n: number of leading rows to display (default 5).
    """
    preview = df.head(n)
    print(df.shape)
    # `display` is the notebook's rich renderer (provided by IPython).
    display(preview)


def expand_behavior(df, data_type='train'):
    """Explode impressions into one row per in-view article.

    Args:
        df: DataFrame with the list columns "article_ids_inview" and "view_pos"
            (plus "article_ids_clicked" when ``data_type`` is 'train' or 'valid').
        data_type: 'train' / 'valid' add a click label; any other value does not.

    Returns:
        The exploded DataFrame with "article_ids_inview" renamed to
        "article_id_inview". For 'train' / 'valid' a boolean "clicked" column is
        added and "article_ids_clicked" is dropped.
    """
    exploded = df.explode(["article_ids_inview", "view_pos"]).rename(
        {"article_ids_inview": "article_id_inview"}
    )

    # Only the train/valid branch builds a label from the clicked list.
    if data_type not in ('train', 'valid'):
        return exploded

    # A row is a positive when its article appears in that impression's clicked list.
    is_clicked = pl.col("article_id_inview").is_in(pl.col("article_ids_clicked"))
    return exploded.with_columns(is_clicked.alias("clicked")).drop("article_ids_clicked")


def datetime_to_unix(df, col_name):
    """Convert a list-of-datetimes column into lists of integer Unix seconds.

    Args:
        df: polars DataFrame that contains ``col_name``.
        col_name: name of a list column whose elements are datetimes.

    Returns:
        A new DataFrame in which ``col_name`` holds, per row, the list of whole
        seconds since the Unix epoch for each datetime.
    """
    # The conversion runs natively inside polars instead of through a per-row
    # Python lambda. Null elements stay null rather than raising, and the result
    # no longer depends on the machine's local timezone the way
    # `datetime.timestamp()` on naive datetimes does. This matches the
    # `.dt.timestamp(...)` conversions used for the other time columns.
    return df.with_columns(
        pl.col(col_name).list.eval(pl.element().dt.epoch("s"))
    )


def user_features(df):
    """Summarise each user's history lists into scalar features.

    Args:
        df: history DataFrame with "user_id" and the list columns
            "impression_time_fixed", "read_time_fixed", "scroll_percentage_fixed".

    Returns:
        DataFrame with "user_id", "u_history_len" (length of the impression-time
        list) and, for each of the three list columns, its last element and its
        mean as "u_<name>_last" / "u_<name>_mean".
    """
    exprs = [
        pl.col("user_id"),
        pl.col("impression_time_fixed").list.len().alias("u_history_len"),
    ]
    # (source list column, output feature stem), in the order the columns are emitted.
    stems = (
        ("impression_time_fixed", "u_impression_time"),
        ("read_time_fixed", "u_read_time"),
        ("scroll_percentage_fixed", "u_scroll_percentage"),
    )
    for source, stem in stems:
        exprs.append(pl.col(source).list.last().alias(f"{stem}_last"))
        exprs.append(pl.col(source).list.mean().alias(f"{stem}_mean"))
    return df.select(exprs)


def exact_first(df, col_name):
    """Add "<col_name>_first", holding the first element of list column ``col_name``.

    Args:
        df: DataFrame containing the list column ``col_name``.
        col_name: name of the list column to read from.

    Returns:
        A new DataFrame with the extra "<col_name>_first" column; the original
        list column is kept.
    """
    first_value = pl.col(col_name).list.first()
    return df.with_columns(first_value.alias(f"{col_name}_first"))

def pop_count_list(df, out_col_name, bins):
    """Count rows per (time bin, article) for every requested bin width.

    Args:
        df: DataFrame with a numeric "impression_time" column and an
            "article_id" column.
        out_col_name: name given to the count column in each result.
        bins: iterable of bin widths, in the same unit as "impression_time".

    Returns:
        A list with one DataFrame per width, in the order of ``bins``, each with
        the columns "time_bin_<width>", "article_id" and ``out_col_name``.
    """
    pop_list = []
    # `bin_size` instead of `bin`, so the builtin bin() is not shadowed.
    for bin_size in bins:
        bin_col = f"time_bin_{bin_size}"
        counts = (
            df.with_columns(
                # Integer floor division gives the bin index directly, without a
                # round-trip through floating point.
                (pl.col("impression_time") // bin_size).cast(pl.Int64).alias(bin_col)
            )
            .group_by([bin_col, "article_id"])
            # pl.len() is the non-deprecated replacement for pl.count().
            .agg(pl.len().alias(out_col_name))
        )
        pop_list.append(counts)
    return pop_list


In [None]:
# Number of articles shown in each impression.
df_impression = df_impression.with_columns(
    pl.col("article_ids_inview").list.len().cast(pl.UInt8).alias("view_num")
)
# 1-based position of every in-view article, as a list aligned with
# article_ids_inview. Built with a native range expression rather than a
# per-row Python lambda; view_num is widened first so that view_num + 1
# cannot overflow UInt8 and the result stays list[i64].
df_impression = df_impression.with_columns(
    pl.int_ranges(1, pl.col("view_num").cast(pl.Int64) + 1).alias("view_pos")
)

  df_impression = df_impression.with_columns(pl.col("view_num").map_elements(lambda x: list(range(1, x + 1))).alias("view_pos"))


In [None]:
# The columns listed in ignore_col_train are removed from the train/valid
# impressions; the test branch leaves df_impression untouched.
if DATA_TYPE in ('train', 'valid'):
    df_impression = df_impression.drop(ignore_col_train)

In [None]:
show_df(df_impression, 3)

(13536710, 16)


impression_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,is_beyond_accuracy,view_num,view_pos
u32,datetime[μs],f32,f32,i8,list[i32],u32,bool,i8,i8,i8,bool,u32,bool,u8,list[i64]
6451339,2023-06-05 15:02:49,8.0,,2,"[9796527, 7851321, … 9492777]",35982,False,,,,False,388,False,9,"[1, 2, … 9]"
6451363,2023-06-05 15:03:56,20.0,,2,"[9798532, 9791602, … 9798958]",36012,False,,,,False,804,False,8,"[1, 2, … 8]"
6451382,2023-06-05 15:25:53,9.0,,2,"[9798498, 9793856, … 9798724]",36162,False,,,,False,1528,False,5,"[1, 2, … 5]"


In [None]:
# Time span covered by the impressions: earliest first, then latest.
impression_times = df_impression.select(pl.col("impression_time"))
print(impression_times.min())
print(impression_times.max())

shape: (1, 1)
┌─────────────────────┐
│ impression_time     │
│ ---                 │
│ datetime[μs]        │
╞═════════════════════╡
│ 2023-06-01 07:00:00 │
└─────────────────────┘
shape: (1, 1)
┌─────────────────────┐
│ impression_time     │
│ ---                 │
│ datetime[μs]        │
╞═════════════════════╡
│ 2023-06-08 06:59:59 │
└─────────────────────┘


In [None]:
# Replace the datetime impression_time with integer Unix seconds
# (millisecond timestamp divided by 1000, then truncated to Int64).
df_impression = df_impression.with_columns(
    (pl.col("impression_time").dt.timestamp("ms") / 1000).cast(pl.Int64)
)

In [None]:
# In the test split several rows share impression_id=0, so impression_id alone
# is not unique there; user_id is carried along as an additional key.
if DATA_TYPE == 'test':
    key_cols = ["impression_id", "user_id"]
    exp_col = ["article_ids_inview", "view_pos"]
else:
    key_cols = ["impression_id"]
    exp_col = ["article_ids_inview", "article_ids_clicked", "view_pos"]

# Key column(s) plus the list columns that are expanded to one row per article.
df_impression_article = df_impression.select(key_cols + exp_col)

In [None]:
# expand list
df_impression_article = expand_behavior(df_impression_article, DATA_TYPE)

In [None]:
show_df(df_impression, 2)

(13536710, 16)


impression_id,impression_time,read_time,scroll_percentage,device_type,article_ids_inview,user_id,is_sso_user,gender,postcode,age,is_subscriber,session_id,is_beyond_accuracy,view_num,view_pos
u32,i64,f32,f32,i8,list[i32],u32,bool,i8,i8,i8,bool,u32,bool,u8,list[i64]
6451339,1685977369,8.0,,2,"[9796527, 7851321, … 9492777]",35982,False,,,,False,388,False,9,"[1, 2, … 9]"
6451363,1685977436,20.0,,2,"[9798532, 9791602, … 9798958]",36012,False,,,,False,804,False,8,"[1, 2, … 8]"


In [None]:
show_df(df_impression_article, 2)

(205925868, 4)


impression_id,user_id,article_id_inview,view_pos
u32,u32,i32,i64
6451339,35982,9796527,1
6451339,35982,7851321,2


## History features (key: user_id)

In [None]:
show_df(df_history, 2)

(807677, 5)


user_id,impression_time_fixed,scroll_percentage_fixed,article_id_fixed,read_time_fixed
u32,list[datetime[μs]],list[f32],list[i32],list[f32]
40107,"[2023-05-11 07:51:01, 2023-05-13 20:09:06, … 2023-06-01 05:38:18]","[15.0, 100.0, … 7.0]","[9676294, 9763942, … 9776147]","[25.0, 39.0, … 43.0]"
40254,"[2023-05-11 08:40:26, 2023-05-11 08:41:18, … 2023-06-01 06:18:54]","[100.0, 72.0, … null]","[9759284, 9759389, … 9789473]","[49.0, 16.0, … 0.0]"


In [None]:
df_history = datetime_to_unix(df_history, "impression_time_fixed")

  return df.with_columns(


In [None]:
df_user = user_features(df_history)

In [None]:
show_df(df_user, 2)

(807677, 8)


user_id,u_history_len,u_impression_time_last,u_impression_time_mean,u_read_time_last,u_read_time_mean,u_scroll_percentage_last,u_scroll_percentage_mean
u32,u32,i64,f64,f32,f32,f32,f32
40107,99,1685597898,1684600000.0,43.0,54.595959,7.0,82.934784
40254,226,1685600334,1684800000.0,0.0,51.181416,,63.786068


## History features (key: user_id, article_id)

In [None]:
# The four history lists are exploded together, giving one row per
# (user, history entry); the article column is renamed to the join key name.
history_list_cols = [
    "impression_time_fixed",
    "scroll_percentage_fixed",
    "article_id_fixed",
    "read_time_fixed",
]
df_user_article = (
    df_history
    .explode(history_list_cols)
    .rename({"article_id_fixed": "article_id"})
)
# The list-form history is not used after this point; release its memory.
del df_history
gc.collect()

0

In [None]:
# Aggregate the exploded history to one row per (user, article): mean and last
# value of impression time, scroll percentage and read time, plus the number of
# history entries for the pair.
df_user_article = df_user_article.group_by(["user_id", "article_id"]).agg([
    pl.col("impression_time_fixed").mean().alias("ua_impression_time_mean"),
    pl.col("impression_time_fixed").last().alias("ua_impression_time_last"),
    pl.col("scroll_percentage_fixed").mean().alias("ua_scroll_percentage_mean"),
    pl.col("scroll_percentage_fixed").last().alias("ua_scroll_percentage_last"),
    pl.col("read_time_fixed").mean().alias("ua_read_time_mean"),
    pl.col("read_time_fixed").last().alias("ua_read_time_last"),
    # pl.len() is the non-deprecated replacement for pl.count().
    pl.len().alias("ua_count")
])

  pl.count().alias("ua_count")


In [None]:
show_df(df_user_article, 3)

(99089504, 9)


user_id,article_id,ua_impression_time_mean,ua_impression_time_last,ua_scroll_percentage_mean,ua_scroll_percentage_last,ua_read_time_mean,ua_read_time_last,ua_count
u32,i32,f64,i64,f32,f32,f32,f32,u32
542962,9775703,1684800000.0,1684753313,39.0,39.0,13.0,13.0,1
850415,9768820,1684300000.0,1684299162,100.0,100.0,78.0,78.0,1
2149113,9787243,1685400000.0,1685422049,28.0,28.0,3.0,3.0,1


## Article Features (key: article_id)

In [None]:
show_df(df_article, 3)

(125541, 21)


article_id,title,subtitle,last_modified_time,premium,body,published_time,image_ids,article_type,url,ner_clusters,entity_groups,topics,category,subcategory,category_str,total_inviews,total_pageviews,total_read_time,sentiment_score,sentiment_label
i32,str,str,datetime[μs],bool,str,datetime[μs],list[i64],str,str,list[str],list[str],list[str],i16,list[i16],str,i32,i32,f32,f32,str
3000022,"""Hanks beskyldt for mishandling""","""Tom Hanks har angiveligt misha…",2023-06-29 06:20:32,False,"""Tom Hanks skulle angiveligt ha…",2006-09-20 09:24:18,[3518381],"""article_default""","""https://ekstrabladet.dk/underh…","[""David Gardner""]","[""PER""]","[""Kriminalitet"", ""Kendt"", … ""Litteratur""]",414,[432],"""underholdning""",,,,0.9911,"""Negative"""
3000063,"""Bostrups aske spredt i Furesøe…","""Studieværten blev mindet med g…",2023-06-29 06:20:32,False,"""Strålende sensommersol. Jazzed…",2006-09-24 07:45:30,"[3170935, 3170939]","""article_default""","""https://ekstrabladet.dk/nyhede…",[],[],"[""Kendt"", ""Underholdning"", … ""Personlig begivenhed""]",118,[133],"""nyheder""",,,,0.5155,"""Neutral"""
3000613,"""Jesper Olsen ramt af hjerneblø…","""Den tidligere danske landshold…",2023-06-29 06:20:33,False,"""Jesper Olsen, der er noteret f…",2006-05-09 11:29:00,[3164998],"""article_default""","""https://ekstrabladet.dk/sport/…","[""Frankrig"", ""Jesper Olsen"", … ""Jesper Olsen""]","[""LOC"", ""PER"", … ""PER""]","[""Kendt"", ""Sport"", … ""Sygdom og behandling""]",142,"[196, 271]","""sport""",,,,0.9876,"""Negative"""


In [None]:
# Convert both article datetime columns to integer Unix seconds, keeping their names.
df_article = df_article.with_columns(
    [
        (pl.col(time_col).dt.timestamp("ms") / 1000).cast(pl.Int64)
        for time_col in ("published_time", "last_modified_time")
    ]
)

In [None]:
# Each list-valued article column is reduced to its first element
# ("<col>_first") and the original list column is then discarded.
first_element_list = ["ner_clusters", "entity_groups", "topics", "subcategory"]
for list_col in first_element_list:
    df_article = exact_first(df_article, list_col).drop(list_col)

In [None]:
# NLP的な処理が必要なカラムは一旦けずる
ignore_cols = ["title", "subtitle", "body", "image_ids", "url"]
df_article = df_article.drop(ignore_cols)

# わかりやすいように、articleで決まるfeatureはa_のprefixをつける
original_names = df_article.columns
new_names = [f"a_{name}" if name != 'article_id' else name for name in original_names]
df_article = df_article.rename(dict(zip(original_names, new_names)))

In [None]:
show_df(df_article, 3)

(125541, 16)


article_id,a_last_modified_time,a_premium,a_published_time,a_article_type,a_category,a_category_str,a_total_inviews,a_total_pageviews,a_total_read_time,a_sentiment_score,a_sentiment_label,a_ner_clusters_first,a_entity_groups_first,a_topics_first,a_subcategory_first
i32,i64,bool,i64,str,i16,str,i32,i32,f32,f32,str,str,str,str,i16
3000022,1688019632,False,1158744258,"""article_default""",414,"""underholdning""",,,,0.9911,"""Negative""","""David Gardner""","""PER""","""Kriminalitet""",432
3000063,1688019632,False,1159083930,"""article_default""",118,"""nyheder""",,,,0.5155,"""Neutral""",,,"""Kendt""",133
3000613,1688019633,False,1147174140,"""article_default""",142,"""sport""",,,,0.9876,"""Negative""","""Frankrig""","""LOC""","""Kendt""",196


# Article Popularity Features (key: article_id, time)

In [None]:
# Per-article popularity is computed from view counts, click counts, etc.,
# per time window: 10 minutes, 1 hour and 6 hours (widths in seconds).
bins = [600, 3600, 3600 * 6]

In [None]:
# One row per (impression_time, article shown): the input for the view counts.
df = (
    df_impression
    .select("impression_time", "article_ids_inview")
    .explode("article_ids_inview")
    .rename({"article_ids_inview": "article_id"})
)
# The columns named in exp_col are no longer needed on the impression-level frame.
df_impression = df_impression.drop(exp_col)
# View counts per (time bin, article), one frame per bin width.
train_pop_view_list = pop_count_list(df, "view_cnt", bins)

  pl.count().alias(out_col_name)


In [None]:
train_pop_view_list

[shape: (1_202_613, 3)
 ┌──────────────┬────────────┬──────────┐
 │ time_bin_600 ┆ article_id ┆ view_cnt │
 │ ---          ┆ ---        ┆ ---      │
 │ i64          ┆ i32        ┆ u32      │
 ╞══════════════╪════════════╪══════════╡
 │ 2809490      ┆ 8392487    ┆ 1        │
 │ 2810137      ┆ 9794786    ┆ 97       │
 │ 2810143      ┆ 9778795    ┆ 1        │
 │ 2810080      ┆ 9794250    ┆ 8        │
 │ 2809632      ┆ 9786497    ┆ 2        │
 │ …            ┆ …          ┆ …        │
 │ 2809911      ┆ 9790691    ┆ 2        │
 │ 2810122      ┆ 9794932    ┆ 5        │
 │ 2809450      ┆ 9782131    ┆ 1        │
 │ 2809968      ┆ 9798094    ┆ 16       │
 │ 2809585      ┆ 9532638    ┆ 23       │
 └──────────────┴────────────┴──────────┘,
 shape: (356_367, 3)
 ┌───────────────┬────────────┬──────────┐
 │ time_bin_3600 ┆ article_id ┆ view_cnt │
 │ ---           ┆ ---        ┆ ---      │
 │ i64           ┆ i32        ┆ u32      │
 ╞═══════════════╪════════════╪══════════╡
 │ 468339        ┆ 9799002

# Save
- Save all output DataFrames that will be used for training.

In [None]:
out_path = base_path + 'feature_output'

# Debug runs get a "small_" prefix in front of the split name.
file_prefix = f"small_{DATA_TYPE}" if DEBUG_MODE else f"{DATA_TYPE}"

# File-name suffix -> frame to persist, written in this order.
outputs = {
    "impression": df_impression,
    "impression_article": df_impression_article,
    "user": df_user,
    "user_article": df_user_article,
    "article": df_article,
}
for suffix, frame in outputs.items():
    frame.write_parquet(f"{out_path}/{file_prefix}_{suffix}.parquet")

# The per-bin popularity tables are currently not written:
#for bin, df in zip(bins, train_pop_view_list):
#    df.write_parquet(f"{out_path}/{file_prefix}_pop_view_{bin}.parquet")