In [2]:
import polars as pl
from pathlib import Path

In [3]:
# Folder holding the input files, relative to the notebook's working directory.
data_dir = Path("../data")

# Store master data (lazy scan; nothing is materialized until .collect()).
stores = pl.scan_csv(data_dir / "stores.csv")

# Training and test transactions, also as lazy scans.
transactions = pl.scan_parquet(data_dir / "transactions_train_3.parquet")
transactions_test = pl.scan_parquet(data_dir / "transactions_test_3.parquet")

# Training transactions whose label differs from "UNKNOWN".
# NOTE: rows with a null label are dropped too, because the comparison
# evaluates to null and filter() only keeps rows where it is true.
transactions_labeled = transactions.filter(pl.col("label").ne("UNKNOWN"))

# Stores in transactions

In [3]:
# Distinct store ids that occur in the training transactions (kept as a DataFrame).
transaction_store_ids = transactions.select(pl.col("store_id").unique()).collect()

# Compare the number of stores in the master data with the number of stores
# that actually appear in the transactions.
n_stores = stores.select(pl.col("id").unique()).collect().height
n_stores_in_transactions = transaction_store_ids.height

print(f"Number of stores: {n_stores}")
print(f"Number of stores in transactions: {n_stores_in_transactions}")

Number of stores: 18
Number of stores in transactions: 5


## Anzahl Transaktionen pro Store

Anteil der Transaktionen pro Store (auf der gesamten und auf der gelabelten Datenbasis)

In [8]:
# Per-store transaction count on the full training data, plus each store's
# share of all transactions; stores with the most transactions come first.
transactions_per_store = (
    transactions
    .group_by("store_id")
    .agg(count=pl.len())
    .with_columns(proportion=pl.col("count") / pl.col("count").sum())
    .sort(by="count", descending=True)
)
transactions_per_store.collect()

store_id,count,proportion
str,u32,f64
"""6a7406ec-4ebb-4df7-83ce-952866…",377817,0.254975
"""46e6da32-f4b0-40f3-ada7-fc6ca8…",377446,0.254724
"""cd833b6b-0666-42df-b538-6bb1f1…",298486,0.201437
"""3fffea06-686f-42bd-8362-818af8…",232882,0.157163
"""581831fc-6a03-4e38-9025-0a889b…",195152,0.131701


In [9]:
# Per-store transaction count and share, restricted to the labeled training
# transactions; stores with the most transactions come first.
(
    transactions_labeled
    .group_by("store_id")
    .agg(count=pl.len())
    .with_columns(proportion=pl.col("count") / pl.col("count").sum())
    .sort(by="count", descending=True)
    .collect()
)

store_id,count,proportion
str,u32,f64
"""46e6da32-f4b0-40f3-ada7-fc6ca8…",37921,0.25618
"""6a7406ec-4ebb-4df7-83ce-952866…",37378,0.252511
"""cd833b6b-0666-42df-b538-6bb1f1…",30061,0.203081
"""3fffea06-686f-42bd-8362-818af8…",23110,0.156122
"""581831fc-6a03-4e38-9025-0a889b…",19555,0.132106


Die Transaktionsdaten stammen aus nur 5 der 18 Filialen in `stores.csv`.

In [10]:
# Enrich the per-store counts with the store master data (location, state, ...).
# A left join keeps every store that has transactions, even one without a
# matching row in `stores`.
# Polars does not guarantee row order after a join unless explicitly requested,
# so the descending-by-count order is re-applied after joining.
transactions_per_store_joined = (
    transactions_per_store
    .join(stores, left_on="store_id", right_on="id", how="left")
    .sort("count", descending=True)
)
transactions_per_store_joined.collect()

store_id,count,proportion,opening_date,location,state,urbanization,sco_introduction
str,u32,f64,str,str,str,str,str
"""6a7406ec-4ebb-4df7-83ce-952866…",377817,0.254975,"""1993-08-16""","""Düsseldorf""","""Nordrhein-Westfalen""","""TOWNS""","""2022-03-18"""
"""46e6da32-f4b0-40f3-ada7-fc6ca8…",377446,0.254724,"""2004-10-05""","""Stuttgart""","""Baden-Württemberg""","""CITIES""","""2022-02-08"""
"""cd833b6b-0666-42df-b538-6bb1f1…",298486,0.201437,"""2000-10-05""","""Köln""","""Nordrhein-Westfalen""","""RURAL""","""2022-02-02"""
"""3fffea06-686f-42bd-8362-818af8…",232882,0.157163,"""1998-02-17""","""Bonn""","""Nordrhein-Westfalen""","""TOWNS""","""2022-02-15"""
"""581831fc-6a03-4e38-9025-0a889b…",195152,0.131701,"""1990-07-17""","""Dortmund""","""Nordrhein-Westfalen""","""CITIES""","""2022-02-22"""


# Testdaten

In den Testdaten sind Transaktionen aus den gleichen 5 Filialen mit ähnlichen Proportionen enthalten.

In [7]:
# Per-store transaction count and share on the test data, enriched with the
# store master data.
# The sort is applied AFTER the join: Polars does not guarantee that a join
# preserves the row order of its inputs, so sorting beforehand would not
# reliably yield a descending-by-count result.
transactions_per_store_test = (
    transactions_test
    .group_by("store_id")
    .agg(pl.len().alias("count"))
    .with_columns((pl.col("count") / pl.col("count").sum()).alias("proportion"))
    .join(stores, left_on="store_id", right_on="id", how="left")
    .sort("count", descending=True)
)
transactions_per_store_test.collect()


store_id,count,proportion,opening_date,location,state,urbanization,sco_introduction
str,u32,f64,str,str,str,str,str
"""6a7406ec-4ebb-4df7-83ce-952866…",208213,0.264396,"""1993-08-16""","""Düsseldorf""","""Nordrhein-Westfalen""","""TOWNS""","""2022-03-18"""
"""46e6da32-f4b0-40f3-ada7-fc6ca8…",197926,0.251334,"""2004-10-05""","""Stuttgart""","""Baden-Württemberg""","""CITIES""","""2022-02-08"""
"""cd833b6b-0666-42df-b538-6bb1f1…",154874,0.196665,"""2000-10-05""","""Köln""","""Nordrhein-Westfalen""","""RURAL""","""2022-02-02"""
"""3fffea06-686f-42bd-8362-818af8…",122673,0.155775,"""1998-02-17""","""Bonn""","""Nordrhein-Westfalen""","""TOWNS""","""2022-02-15"""
"""581831fc-6a03-4e38-9025-0a889b…",103817,0.131831,"""1990-07-17""","""Dortmund""","""Nordrhein-Westfalen""","""CITIES""","""2022-02-22"""


In [14]:
# Side-by-side comparison of per-store counts/shares in training vs. test data.
# Columns coming from the test frame that clash with training columns get the
# "_test" suffix; only the count/proportion pairs are kept.
# The result is sorted explicitly because the row order of a join output is
# not guaranteed by Polars.
(
    transactions_per_store
    .join(transactions_per_store_test, on="store_id", how="inner", suffix="_test")
    .select(
        [
            "store_id",
            "count",
            "proportion",
            "count_test",
            "proportion_test",
        ]
    )
    .sort("count", descending=True)
    .collect()
)

store_id,count,proportion,count_test,proportion_test
str,u32,f64,u32,f64
"""6a7406ec-4ebb-4df7-83ce-952866…",377817,0.254975,208213,0.264396
"""46e6da32-f4b0-40f3-ada7-fc6ca8…",377446,0.254724,197926,0.251334
"""cd833b6b-0666-42df-b538-6bb1f1…",298486,0.201437,154874,0.196665
"""3fffea06-686f-42bd-8362-818af8…",232882,0.157163,122673,0.155775
"""581831fc-6a03-4e38-9025-0a889b…",195152,0.131701,103817,0.131831
