In [None]:
import os
import polars as pl
import numpy as np
from sklearn.preprocessing import OrdinalEncoder

In [None]:
# Notebook configuration: where the parquet files live and which columns
# belong to the user side versus the ad side of each interaction row.
data_dir = os.path.join(os.pardir, "data")

# Columns handed to the user encoder, in this order.
user_feats = [
    "user",
    "gender",
    "age",
    "shopping",
    "occupation",
]

# Columns handed to the ad encoder, in this order.
ad_feats = [
    "adgroup",
    "cate",
    "brand",
    "campaign",
    "customer",
]

In [None]:
# Read the four parquet inputs from `data_dir`: the two lookup tables
# (user profiles, ad features) followed by the raw train and test splits.
# The files are read in the order listed and bound positionally.
user_profile, ad_feature, train_data, test_data = (
    pl.read_parquet(os.path.join(data_dir, filename))
    for filename in (
        "user_profile.parquet",
        "ad_feature.parquet",
        "train_raw.parquet",
        "test_raw.parquet",
    )
)

In [None]:
# Fit one ordinal encoder per lookup table. Each encoder emits int32 codes,
# maps missing values to -1, and is configured to return polars frames from
# `transform`. `handle_unknown` is left at its library default.
# NOTE(review): the encoders are fitted on every column of the lookup
# tables, while later cells transform `user_feats` / `ad_feats` selections;
# presumably the tables contain exactly those columns in that order -- confirm.
user_encoder = (
    OrdinalEncoder(dtype=np.int32, encoded_missing_value=-1)
    .fit(user_profile)
    .set_output(transform="polars")
)
ad_encoder = (
    OrdinalEncoder(dtype=np.int32, encoded_missing_value=-1)
    .fit(ad_feature)
    .set_output(transform="polars")
)

In [None]:
# Assemble the training frame by placing three pieces side by side:
# encoded user columns, encoded ad columns, and the pass-through columns
# (`btag`, `timestamp` cast to Int32, `timediff`) plus an `is_test` flag.
# NOTE: this rebinds `train_data`, replacing the frame loaded from disk, so
# the cell is not idempotent -- reload the raw data before re-running it.
train_data: pl.DataFrame = pl.concat(
    [
        user_encoder.transform(train_data.select(user_feats)),
        ad_encoder.transform(train_data.select(ad_feats)),
        train_data.select(
            "btag",
            pl.col("timestamp").cast(pl.Int32),
            "timediff",
            pl.lit(False).alias("is_test"),  # every row here is training data
        ),
    ],
    how="horizontal",
)
train_data

In [None]:
# Assemble the test frame by placing three pieces side by side:
# encoded user columns, encoded ad columns, and the pass-through columns
# (`btag`, `timestamp` cast to Int32, `timediff`) plus an `is_test` flag.
# NOTE: this rebinds `test_data`, replacing the frame loaded from disk, so
# the cell is not idempotent -- reload the raw data before re-running it.
test_data: pl.DataFrame = pl.concat(
    [
        user_encoder.transform(test_data.select(user_feats)),
        ad_encoder.transform(test_data.select(ad_feats)),
        test_data.select(
            "btag",
            pl.col("timestamp").cast(pl.Int32),
            "timediff",
            pl.lit(True).alias("is_test"),  # every row here is test data
        ),
    ],
    how="horizontal",
)
test_data

In [None]:
# Persist both encoded splits next to the inputs in `data_dir`,
# training frame first, then the test frame.
for frame, filename in (
    (train_data, "train.parquet"),
    (test_data, "test.parquet"),
):
    frame.write_parquet(os.path.join(data_dir, filename))