In [None]:
import os
import sys
import polars as pl

sys.path.append("../")
from encoder.polars_ordinal_encoder import PolarsOrdinalEncoder

In [None]:
# Directory holding both the raw input parquet files and the encoded outputs,
# resolved relative to the notebook's working directory.
data_dir = os.path.join("..", "data")

# Column groups selected from the interaction tables and handed to the
# user-side and ad-side encoders respectively.
user_feats = [
    "user",
    "gender",
    "age",
    "shopping",
    "occupation",
]
ad_feats = [
    "adgroup",
    "cate",
    "brand",
    "campaign",
    "customer",
]

In [None]:
# Read the four input tables from `data_dir`, in this order:
# user profile, ad features, raw train split, raw test split.
user_profile, ad_feature, train_data, test_data = (
    pl.read_parquet(os.path.join(data_dir, parquet_name))
    for parquet_name in (
        "user_profile.parquet",
        "ad_feature.parquet",
        "train_raw.parquet",
        "test_raw.parquet",
    )
)

In [None]:
# One encoder per entity table: each is constructed with the full lookup table
# as `fit_data`, and is later used to transform the matching columns of the
# train and test splits.
# NOTE(review): presumably the encoder learns its category -> ordinal mapping
# at construction time; confirm against PolarsOrdinalEncoder.
user_encoder = PolarsOrdinalEncoder(fit_data=user_profile)
ad_encoder = PolarsOrdinalEncoder(fit_data=ad_feature)

In [None]:
# Build the encoded training table from the raw training frame.
#
# Three column groups are derived from the same raw frame and stacked side by side:
#   1. the `user_feats` columns, passed through `user_encoder`
#   2. the `ad_feats` columns, passed through `ad_encoder`
#   3. "btag" / "timestamp" / "timediff" cast to Int32, plus a constant
#      boolean `is_test = False` flag
#
# NOTE(review): this cell rebinds `train_data`, so it is not idempotent.
# Re-running it without first re-running the load cell feeds the already
# encoded frame back into the encoders.
train_parts = [
    user_encoder.transform(train_data.select(user_feats)),
    ad_encoder.transform(train_data.select(ad_feats)),
    train_data.select(
        pl.col("btag", "timestamp", "timediff").cast(pl.Int32),
        is_test=pl.lit(False, pl.Boolean),
    ),
]

# A horizontal concat pairs rows purely by position (and, depending on the
# polars version, may null-pad shorter frames), so an encoder output with a
# different row count would misalign features silently. Fail loudly instead.
# This checks row counts only; it assumes `transform` preserves row order.
assert all(part.height == train_data.height for part in train_parts), (
    "encoded train parts must keep the raw row count before horizontal concat"
)

train_data: pl.DataFrame = pl.concat(train_parts, how="horizontal")
train_data

In [None]:
# Build the encoded test table from the raw test frame.
#
# Three column groups are derived from the same raw frame and stacked side by side:
#   1. the `user_feats` columns, passed through `user_encoder`
#   2. the `ad_feats` columns, passed through `ad_encoder`
#   3. "btag" / "timestamp" / "timediff" cast to Int32, plus a constant
#      boolean `is_test = True` flag
#
# NOTE(review): this cell rebinds `test_data`, so it is not idempotent.
# Re-running it without first re-running the load cell feeds the already
# encoded frame back into the encoders.
test_parts = [
    user_encoder.transform(test_data.select(user_feats)),
    ad_encoder.transform(test_data.select(ad_feats)),
    test_data.select(
        pl.col("btag", "timestamp", "timediff").cast(pl.Int32),
        is_test=pl.lit(True, pl.Boolean),
    ),
]

# A horizontal concat pairs rows purely by position (and, depending on the
# polars version, may null-pad shorter frames), so an encoder output with a
# different row count would misalign features silently. Fail loudly instead.
# This checks row counts only; it assumes `transform` preserves row order.
assert all(part.height == test_data.height for part in test_parts), (
    "encoded test parts must keep the raw row count before horizontal concat"
)

test_data: pl.DataFrame = pl.concat(test_parts, how="horizontal")
test_data

In [None]:
# Persist the encoded splits next to the raw inputs in `data_dir`.
train_out_path = os.path.join(data_dir, "train.parquet")
test_out_path = os.path.join(data_dir, "test.parquet")

train_data.write_parquet(train_out_path)
test_data.write_parquet(test_out_path)