In [None]:
import os
import polars as pl

In [None]:
# Input locations: the parquet files live in a sibling "data" directory.
data_dir = os.path.join("..", "data")
train_parquet, test_parquet = (
    os.path.join(data_dir, filename)
    for filename in ("train.parquet", "test.parquet")
)

# Identical interactions -- same (user, adgroup, btag) or same
# (user, cate, brand, btag) -- that occur fewer than this many seconds apart
# are treated as duplicates.
min_timediff_unique = 30

# A user needs at least this many interactions (non-ad-click, browse,
# ad-click, favorite, add-to-cart, or purchase) to be included.
min_training_interactions = 5

# Number of positions in every fixed-length sequence window.
sequence_len = 100

# Step, in interactions, between the end positions of consecutive training
# windows.
slide_window_every = 100

In [None]:
# Per-user columns. "user" must stay first: the sequence-building cell slices
# it off with user_feats[1:] because it is already the group key there.
user_feats = [
    "user",
    "gender",
    "age",
    "shopping",
    "occupation",
]

# Per-interaction ad columns that are collected into per-user lists.
ad_feats = [
    "adgroup",
    "cate",
    "brand",
    "campaign",
    "customer",
]

In [None]:
# Keep a row when it has no recorded time difference, or when that difference
# is at least `min_timediff_unique` seconds (closer repeats count as duplicates).
not_a_duplicate = pl.col("timediff").is_null() | (pl.col("timediff") >= min_timediff_unique)

# Keep only users with enough rows. This predicate is applied as the second
# filter in the chain, i.e. after the duplicate filter above.
enough_history = pl.len().over("user") >= min_training_interactions

training_data = (
    pl.scan_parquet(train_parquet)
    .filter(not_a_duplicate)
    .filter(enough_history)
    .collect()
)
training_data

In [None]:
# Unique users that survived the training filters, as an explicit Series.
# `Expr.is_in` is documented to take an expression, a Series or a collection of
# values; handing it a one-column DataFrame only works through implicit
# coercion, so the column is extracted explicitly here.
train_users = training_data.get_column("user").unique()

# Keep only the test-file rows whose user also appears in the training data.
validation_data = (
    pl.scan_parquet(test_parquet)
    .filter(pl.col("user").is_in(train_users))
    .collect()
)
validation_data

In [None]:
# Stack the training rows and the validation rows into a single frame.
interactions: pl.DataFrame = pl.concat([training_data, validation_data], how="vertical")

# Only rows with adgroup > -1 take part in the frequency computation
# (NOTE(review): -1 presumably marks interactions without an ad -- confirm).
ad_rows = interactions.filter(pl.col("adgroup") > -1)

# Share of the remaining rows that belong to each adgroup.
share_of_ad_rows = pl.len().over("adgroup") / pl.count("adgroup")

# The window expression repeats the share on every row of an adgroup;
# `unique()` collapses that to one row per (adgroup, rel_ad_freq) pair.
rel_ad_freqs = ad_rows.select(
    "adgroup",
    rel_ad_freq=share_of_ad_rows.cast(pl.Float32),
).unique()

# The shares should add up to roughly 1 (up to Float32 rounding).
rel_ad_freq_sum = rel_ad_freqs.select(pl.col("rel_ad_freq").sum()).item()
print("Relative Ad Frequency Sanity Check Sum:", rel_ad_freq_sum)

In [None]:
# User-level columns: one value per user, taken from the first row of the group.
static_user_cols = user_feats[1:]
# Interaction-level columns: gathered into one timestamp-ordered list per user.
per_event_cols = [*ad_feats, "rel_ad_freq", "btag", "timestamp", "is_test"]

sequences = (
    interactions
    # Left join: rows whose adgroup has no entry in rel_ad_freqs get a null
    # rel_ad_freq instead of being dropped.
    .join(rel_ad_freqs, on="adgroup", how="left")
    .group_by("user")
    .agg(
        pl.col(static_user_cols).first(),
        pl.col(per_event_cols).sort_by("timestamp"),
        seq_len=pl.col("btag").len().cast(pl.Int32),
    )
    # Replace absolute timestamps by the difference to the previous element of
    # the list; the first element of every list therefore becomes null.
    .with_columns(pl.col("timestamp").list.diff())
)

# Shortest and longest per-user sequence, fetched with a single select.
min_seq_len, max_seq_len = sequences.select(
    shortest=pl.col("seq_len").min(),
    longest=pl.col("seq_len").max(),
).row(0)
print("Minimum sequence length:", min_seq_len)
print("Maximum sequence length:", max_seq_len)
sequences

In [None]:
def _window_before(end_idx):
    """Build one fixed-length training window per eligible user.

    `end_idx` is a negative list index. The window covers the `sequence_len`
    positions immediately before `end_idx`; the element at `end_idx` itself is
    not part of the window. Only users with more than `abs(end_idx)`
    interactions are kept.

    Positions that fall before the start of a user's list come back as null
    from the gather; the shift then moves the real values to the front so the
    null padding ends up at the tail. `seq_len` is the number of real
    (non-padding) positions in the window.
    """
    start_idx = end_idx - sequence_len
    eligible = sequences.filter(pl.col("seq_len") > abs(end_idx))
    return eligible.select(
        pl.col(user_feats),
        pl.col(*ad_feats, "rel_ad_freq", "btag", "timestamp", "is_test")
            .list.gather(range(start_idx, end_idx), null_on_oob=True)
            .list.shift(pl.min_horizontal(pl.col("seq_len") + start_idx, 0))
            .list.to_array(sequence_len),
        seq_len=pl.min_horizontal(pl.col("seq_len") + end_idx, sequence_len).cast(pl.Int32),
    )


# Window end positions: -1, then further back in steps of `slide_window_every`,
# stopping before the longest sequence would be exhausted.
window_ends = range(-1, -max_seq_len, -slide_window_every)

train_sequences = (
    pl.concat([_window_before(end_idx) for end_idx in window_ends], how="vertical")
    # Discard windows that contain too few real interactions.
    .filter(pl.col("seq_len") >= min_training_interactions)
)
train_sequences

In [None]:
# Negative indices of the last `sequence_len` positions of each list, the
# final element included.
tail_positions = range(-sequence_len, 0)

# Lists shorter than `sequence_len` get nulls at the front from the gather;
# this non-positive shift moves the real values to the front instead, leaving
# the null padding at the tail.
left_align = pl.min_horizontal(pl.col("seq_len") - sequence_len, 0)

# Number of real (non-padding) positions in the window.
window_len = pl.min_horizontal(pl.col("seq_len"), sequence_len).cast(pl.Int32)

test_sequences = sequences.select(
    pl.col(user_feats),
    pl.col(*ad_feats, "rel_ad_freq", "btag", "timestamp", "is_test")
        .list.gather(tail_positions, null_on_oob=True)
        .list.shift(left_align)
        .list.to_array(sequence_len),
    seq_len=window_len,
)
test_sequences

In [None]:
# Persist both sequence tables next to the input parquet files.
sequence_outputs = {
    "train_sequences.parquet": train_sequences,
    "test_sequences.parquet": test_sequences,
}
for out_name, out_frame in sequence_outputs.items():
    out_frame.write_parquet(os.path.join(data_dir, out_name))