In [None]:
import os
import polars as pl
import numpy as np

In [None]:
# --- Input locations -------------------------------------------------------
data_dir = os.path.join("..", "data")
train_parquet = os.path.join(data_dir, "train.parquet")
test_parquet = os.path.join(data_dir, "test.parquet")

# --- Filtering thresholds --------------------------------------------------
# Minimum number of seconds between identical interactions
# (user, adgroup, btag) or (user, cate, brand, btag); anything closer together
# is considered a duplicate.
min_timediff_unique = 30
# Minimum number of interactions (non-ad-click, browse, ad-click, favorite,
# add-to-cart, or purchase) required in a training sequence.
min_training_interactions = 5
# Whether behavior log interaction data is included.
augmented = False

# --- Sequence windowing ----------------------------------------------------
sequence_len = 128          # fixed length of every emitted sequence
slide_window_every = 64     # stride between consecutive training windows

In [None]:
# Suffix appended to both output file stems when behavior-log data is included.
_aug_suffix = "_aug" if augmented else ""

# File-name stems that encode the preprocessing settings; only the training
# stem carries the sliding-window stride.
train_sequence_params = (
    f"timediff{min_timediff_unique}_mintrain{min_training_interactions}_seqlen{sequence_len}_slide{slide_window_every}"
    + _aug_suffix
)
test_sequence_params = (
    f"timediff{min_timediff_unique}_mintrain{min_training_interactions}_seqlen{sequence_len}"
    + _aug_suffix
)

# Column groups used throughout the notebook.
user_feats = ["user", "gender", "age", "shopping", "occupation"]
ad_feats = ["adgroup", "cate", "brand", "campaign", "customer"]
# Ad columns plus the per-interaction columns that are carried along in each sequence.
full_ad_feats = [*ad_feats, "rel_ad_freq", "btag", "timestamp", "is_test"]
selected_feats = user_feats + full_ad_feats + ["seq_len"]

In [None]:
# Row-level predicate for the training parquet:
#   * keep rows whose `timediff` is missing or at least `min_timediff_unique`
#     (closer repeats are treated as duplicates);
#   * unless `augmented` is set, additionally keep only rows whose `btag` is
#     -1 or 1 (presumably the ad impression outcomes -- confirm against the
#     dataset documentation).
not_duplicate = pl.col("timediff").is_null() | (pl.col("timediff") >= min_timediff_unique)
keep_row = not_duplicate if augmented else not_duplicate & pl.col("btag").is_in([-1, 1])

training_data = (
    pl.scan_parquet(train_parquet)
    .filter(keep_row)
    # Drop users left with fewer than `min_training_interactions` rows.
    .filter(pl.len().over("user") >= min_training_interactions)
    .collect()
)
training_data

In [None]:
# Users that survived the training filters. `Expr.is_in` is documented for
# expressions, collections and Series, so pass the column as a Series rather
# than a one-column DataFrame (which relies on version-specific coercion).
train_users = training_data.get_column("user").unique()

# Keep only test-parquet rows belonging to users present in the training data.
validation_data = (
    pl.scan_parquet(test_parquet)
    .filter(pl.col("user").is_in(train_users))
    .collect()
)
validation_data

In [None]:
# Stack the training rows on top of the validation rows into a single,
# rechunked frame.
interactions: pl.DataFrame = pl.concat(
    [training_data, validation_data],
    how="vertical",
    rechunk=True,
)
# The two source frames are no longer needed; drop the names so their memory
# can be reclaimed. (Re-running this cell therefore requires re-running the
# two cells that build them.)
del training_data, validation_data
interactions

In [None]:
# Share of rows attributed to each adgroup: rows in the adgroup divided by the
# count of `adgroup` values, stored as Float32.
adgroup_share = (pl.len().over("adgroup") / pl.count("adgroup")).cast(pl.Float32)

rel_ad_freqs = (
    interactions
    # Only rows with an adgroup id above -1 take part in the frequency table.
    .filter(pl.col("adgroup") > -1)
    .select(pl.col("adgroup"), adgroup_share.alias("rel_ad_freq"))
    # The window expression repeats the share on every row; keep one row per adgroup.
    .unique()
)

# The shares should add up to (approximately) 1.
rel_ad_freq_sum = rel_ad_freqs.select("rel_ad_freq").sum().item()
print("Relative Ad Frequency Sanity Check Sum:", rel_ad_freq_sum)

In [None]:
# Per-user aggregations:
#   * every user feature except the `user` key itself: first value in the group;
#   * every column in `full_ad_feats`: a list ordered by `timestamp`;
#   * `seq_len`: number of interactions for the user, as Int32.
user_profile = pl.col(user_feats[1:]).first()
events_in_time_order = pl.col(full_ad_feats).sort_by("timestamp")
event_count = pl.col("btag").len().cast(pl.Int32).alias("seq_len")

# Replace each timestamp list by the differences between consecutive entries;
# the first entry has no predecessor and becomes 0.
timestamp_gaps = pl.col("timestamp").list.diff().list.eval(pl.element().fill_null(0))

sequences = (
    interactions
    # Attach the relative ad frequency; rows without a match get 0.0.
    .join(rel_ad_freqs, on="adgroup", how="left")
    .with_columns(pl.col("rel_ad_freq").fill_null(0.0))
    .group_by("user")
    .agg(user_profile, events_in_time_order, event_count)
    .with_columns(timestamp_gaps)
)
# Drop the names of the inputs so their memory can be reclaimed.
del interactions, rel_ad_freqs
sequences

In [None]:
# Shortest and longest per-user sequence, computed in a single pass.
seq_len_bounds = sequences.select(
    pl.col("seq_len").min().alias("shortest"),
    pl.col("seq_len").max().alias("longest"),
)
min_seq_len = seq_len_bounds.item(0, "shortest")
max_seq_len = seq_len_bounds.item(0, "longest")
print("Minimum sequence length:", min_seq_len)
print("Maximum sequence length:", max_seq_len)

In [None]:
# Build fixed-length training windows by sliding backwards over each user's
# time-ordered lists.
#
# `end_idx` takes the values -1, -1 - slide_window_every, ... (while greater
# than -max_seq_len). Each window gathers list positions
# [end_idx - sequence_len, end_idx), counted from the end of the list, so the
# very last element (index -1) is never part of a training window.
train_sequences = (pl
    .concat([
        (sequences
            # For every window after the first, keep only users that still have
            # more than (sequence_len - slide_window_every) elements before `end_idx`.
            .filter((pl.col("seq_len") + end_idx > sequence_len - slide_window_every) if end_idx < -1 else True)
            .select(
                pl.col(user_feats),
                pl.col(full_ad_feats)
                    # Positions that fall before the start of a short list come back as null ...
                    .list.gather(range(end_idx-sequence_len, end_idx), null_on_oob=True)
                    # ... and a non-positive shift moves the real elements to the front,
                    # leaving the nulls (padding) at the tail.
                    .list.shift(pl.min_horizontal(pl.col("seq_len") + (end_idx-sequence_len), 0)),
                # Number of real (non-padding) elements in this window, capped at sequence_len.
                seq_len = pl.min_horizontal(pl.col("seq_len") + end_idx, sequence_len).cast(pl.Int32)
            )
        ) for end_idx in range(-1, -max_seq_len, -slide_window_every)
    ], how="vertical")
    # Discard windows with too few real elements.
    .filter(pl.col("seq_len") >= min_training_interactions)
    # Replace the padding nulls with per-column sentinel values and convert each
    # list column into a fixed-width array of `sequence_len` entries.
    .with_columns(
        pl.col(ad_feats).list.eval(pl.element().fill_null(-1)).list.to_array(sequence_len),
        pl.col("rel_ad_freq").list.eval(pl.element().fill_null(0.0)).list.to_array(sequence_len),
        pl.col("btag").list.eval(pl.element().fill_null(-2)).list.to_array(sequence_len),
        pl.col("timestamp").list.eval(pl.element().fill_null(0)).list.to_array(sequence_len),
        # Padding positions are flagged True in `is_test`; real positions keep their original flag.
        pl.col("is_test").list.eval(pl.element().fill_null(True)).list.to_array(sequence_len),
    )
)
train_sequences

In [None]:
def _pad_to_array(columns, fill_value):
    """Fill nulls inside each list of `columns` with `fill_value` and convert
    the lists to fixed-width arrays of `sequence_len` entries."""
    return (
        pl.col(columns)
        .list.eval(pl.element().fill_null(fill_value))
        .list.to_array(sequence_len)
    )


# The last `sequence_len` positions of every list; positions before the start
# of a shorter list are returned as null.
tail_positions = range(-sequence_len, 0)
# Non-positive shift that moves the real elements of short lists to the front,
# leaving the null padding at the tail.
front_align = pl.min_horizontal(pl.col("seq_len") - sequence_len, 0)

test_sequences = (
    sequences
    .select(
        pl.col(user_feats),
        pl.col(full_ad_feats)
            .list.gather(tail_positions, null_on_oob=True)
            .list.shift(front_align),
        # Number of real (non-padding) elements, capped at sequence_len.
        pl.min_horizontal(pl.col("seq_len"), sequence_len).cast(pl.Int32).alias("seq_len"),
    )
    .with_columns(
        _pad_to_array(ad_feats, -1),
        _pad_to_array("rel_ad_freq", 0.0),
        _pad_to_array("btag", -2),
        _pad_to_array("timestamp", 0),
        # Every real position is marked False (True flags are rewritten to
        # False) and only the padding positions end up True.
        pl.col("is_test")
            .list.eval(pl.element().replace(True, False).fill_null(True))
            .list.to_array(sequence_len),
    )
)
test_sequences

In [None]:
# Write the training windows to a compressed NumPy archive in `data_dir`
# (NumPy appends ".npz" to the file name when it is missing).
train_npz_path = os.path.join(data_dir, f"train_data_{train_sequence_params}")
np.savez_compressed(
    train_npz_path,
    user_data=train_sequences.select(user_feats).to_numpy(),
    # One array per ad feature, stored under the feature's own name.
    **{name: train_sequences.get_column(name).to_numpy() for name in ad_feats},
    rel_ad_freqs=train_sequences.get_column("rel_ad_freq").to_numpy(),
    interaction_data=train_sequences.get_column("btag").to_numpy(),
    timestamps=train_sequences.get_column("timestamp").to_numpy(),
    padded_masks=train_sequences.get_column("is_test").to_numpy(),
    seq_lens=train_sequences.get_column("seq_len").to_numpy(),
)

In [None]:
# Write the test sequences to a compressed NumPy archive in `data_dir`
# (NumPy appends ".npz" to the file name when it is missing).
test_npz_path = os.path.join(data_dir, f"test_data_{test_sequence_params}")
np.savez_compressed(
    test_npz_path,
    user_data=test_sequences.select(user_feats).to_numpy(),
    # One array per ad feature, stored under the feature's own name.
    **{name: test_sequences.get_column(name).to_numpy() for name in ad_feats},
    rel_ad_freqs=test_sequences.get_column("rel_ad_freq").to_numpy(),
    interaction_data=test_sequences.get_column("btag").to_numpy(),
    timestamps=test_sequences.get_column("timestamp").to_numpy(),
    padded_masks=test_sequences.get_column("is_test").to_numpy(),
    seq_lens=test_sequences.get_column("seq_len").to_numpy(),
)