In [1]:
import polars as pl
from pathlib import Path

# Directory holding the input files, relative to the notebook's working directory.
data_dir = Path("../data")

# Set up lazy scans of the three inputs; the later cells call .collect()
# to materialise results.
transactions = pl.scan_parquet(data_dir.joinpath("transactions_train_3.parquet"))
lines = pl.scan_parquet(data_dir.joinpath("transaction_lines_train_3.parquet"))
products = pl.scan_csv(data_dir.joinpath("products.csv"))

In [2]:
# Attach product attributes to every transaction line. Product columns whose
# names clash with a line column receive the "_product" suffix.
lines_with_products = lines.join(
  products,
  left_on="product_id",
  right_on="id",
  how="left",
  suffix="_product",
)

# Drop transactions whose label is "UNKNOWN".
labelled_transactions = transactions.filter(pl.col("label") != "UNKNOWN")

# One row per (transaction, line) pair. The left join keeps transactions that
# have no matching line; right-hand columns whose names clash with a
# transaction column receive the "_line" suffix.
joined = labelled_transactions.join(
  lines_with_products,
  left_on="id",
  right_on="transaction_id",
  how="left",
  suffix="_line",
)

In [5]:
# Price ratio being searched for, and how far a row's ratio may deviate from
# it and still count as a match (strict inequality).
TARGET_PRICE_RATIO = 0.70
PRICE_RATIO_TOLERANCE = 0.01

# price_ratio = sales_price / (price * pieces_or_weight)
# NOTE(review): presumably `price` is the per-unit product price and
# `sales_price` the amount charged for the line, which would make 0.70 a
# roughly 30% discount ("Rabatt") -- confirm against the data dictionary.
price_ratio_expr = pl.col("sales_price") / (pl.col("price") * pl.col("pieces_or_weight"))

# Rows whose ratio is null (e.g. transactions left without a line by the left
# join) do not satisfy the predicate and are dropped by the filter.
rabatt_fraud = (
  joined
  .with_columns(price_ratio_expr.alias("price_ratio"))
  .filter((pl.col("price_ratio") - TARGET_PRICE_RATIO).abs() < PRICE_RATIO_TOLERANCE)
)

In [6]:
# Row count after the price-ratio filter. Rows are joined transaction lines,
# so a single transaction id can be counted more than once here.
(
  rabatt_fraud
  .select(pl.len())
  .collect()
)

len
u32
24025


In [9]:
# Collapse the matching rows to one row per transaction id.
# NOTE(review): assumes label and damage are constant within an id, so taking
# the first value loses nothing -- confirm.
rb = rabatt_fraud.group_by("id").agg(pl.col("label", "damage").first())

In [10]:
# Per label: how many flagged transactions there are and their summed damage.
(
  rb
  .group_by("label")
  .agg(
    count=pl.len(),
    total_damage=pl.col("damage").sum(),
  )
  .collect()
)

label,count,total_damage
str,u32,f64
"""NORMAL""",8394,0.0
"""FRAUD""",2713,18827.49
