In [3]:
import polars as pl
import numpy as np
from pathlib import Path

# Folder that holds the input files, relative to the notebook location.
data_dir = Path("../data")

# Lazy scan of the feature matrix: queries built on `joined` only read the
# parquet file once they are collected (or described).
joined = pl.scan_parquet(data_dir.joinpath("data_matrix.parquet"))


In [2]:

# Lazy scan of the store master data that lives next to the feature matrix.
store = pl.scan_csv(data_dir.joinpath("stores.csv"))

In [8]:
# Number of distinct store locations. The count is computed inside the lazy
# query, so only a single integer is materialised instead of collecting the
# unique rows and converting them to a numpy array just to take their length.
store.select(pl.col("location").n_unique()).collect().item()

15

In [4]:
# Build a small polars DataFrame with one row per (class, attribute) pair and
# a boolean column `is_important`.
# There are 3 classes: "store", "product" and "customer":
#   store    -> "location", "store_id", "store_name"        (False, False, False)
#   product  -> "product_id", "product_name", "category"    (False, True, True)
#   customer -> "customer_id", "customer_name", "age"       (True, True, True)
attributes_by_class = {
    "store": ["location", "store_id", "store_name"],
    "product": ["product_id", "product_name", "category"],
    "customer": ["customer_id", "customer_name", "age"],
}
# Flags are listed in the same order as the flattened attributes above.
importance_flags = [False, False, False, False, True, True, True, True, True]

df = pl.DataFrame(
    {
        # Repeat each class name once per attribute it owns.
        "class": [
            class_name
            for class_name, attribute_names in attributes_by_class.items()
            for _ in attribute_names
        ],
        "attribute": [
            attribute_name
            for attribute_names in attributes_by_class.values()
            for attribute_name in attribute_names
        ],
        "is_important": importance_flags,
    }
)

In [10]:
# Per class: does it contain at least one non-important / one important attribute?
# The minimum of a boolean column is False as soon as a single False is present,
# so negating it flags classes with at least one non-important attribute; the
# maximum is True as soon as a single True is present.
# Note: `group_by` is called without `maintain_order`, so the row order of the
# result is not guaranteed.
importance_flag = pl.col("is_important")
df.group_by("class").agg(
    has_not_important=~importance_flag.min(),
    has_important=importance_flag.max(),
)

class,has_not_important,has_important
str,bool,bool
"""customer""",False,True
"""store""",True,False
"""product""",True,True


In [3]:
# Summary statistics restricted to the timing-related columns.
timing_columns = [
    "transaction_duration_seconds",
    "mean_time_between_scans",
    "max_time_between_scans",
    "time_to_first_scan",
    "time_from_last_scan_to_end",
]
joined.select(timing_columns).describe()

statistic,transaction_duration_seconds,mean_time_between_scans,max_time_between_scans,time_to_first_scan,time_from_last_scan_to_end
str,f64,f64,f64,f64,f64
"""count""",1481783.0,1367669.0,1367669.0,1481770.0,1481770.0
"""null_count""",0.0,114114.0,114114.0,13.0,13.0
"""mean""",77.068517,6.20206,12.118309,6.206807,10.314386
"""std""",72.926844,2.517007,6.748936,4.57406,4.698093
"""min""",3.0,0.201087,0.201087,0.51685,2.513962
"""25%""",28.0,4.891213,9.102887,2.276961,7.497902
"""50%""",53.0,6.189657,12.03634,5.543029,9.801902
"""75%""",101.0,7.382909,14.334259,9.290475,12.045619
"""max""",1244.0,156.513264,293.990727,207.342886,135.892762


In [2]:
# Summary statistics (count, null_count, mean, std, min, quartiles, max) for
# every column of the full feature matrix.
joined.describe()

statistic,id,cash_desk,total_amount,n_lines,payment_medium,has_feedback,feedback_low,feedback_middle,feedback_high,feedback_top,daytime,hour,hour_categorical,day_of_week,month,transaction_duration_seconds,damage,label,store_id,location,urbanization,has_voided,n_voided,n_age_restricted,has_age_restricted,popularity_max,popularity_min,max_product_price,n_sold_by_weight,has_sold_by_weight,has_camera_detected_wrong_product,has_camera_detected_wrong_product_high_certainty,has_convenience,has_snacks,has_personal_care,has_limited_time_offers,has_bakery,has_long_shelf_life,has_fruits_vegetables,has_frozen_goods,has_beverages,has_dairy,has_fruits_vegetables_pieces,has_household,has_alcohol,has_tobacco,has_missing,mean_time_between_scans,max_time_between_scans,time_to_first_scan,time_from_last_scan_to_end,days_since_sco_introduction
str,str,str,f64,f64,str,f64,f64,f64,f64,f64,str,f64,str,str,str,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""1481783""","""1481783""",1481783.0,1481783.0,"""1481783""",1481783.0,1481783.0,1481783.0,1481783.0,1481783.0,"""1481783""",1481783.0,"""1481783""","""1481783""","""1481783""",1481783.0,148025.0,"""1481783""","""1481783""","""1481783""","""1481783""",1481770.0,1481770.0,1481770.0,1481768.0,1481768.0,1481768.0,1481768.0,1481770.0,1481768.0,1480694.0,1480694.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1481770.0,1367669.0,1367669.0,1481770.0,1481770.0,1481783.0
"""null_count""","""0""","""0""",0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,"""0""",0.0,"""0""","""0""","""0""",0.0,1333758.0,"""0""","""0""","""0""","""0""",13.0,13.0,13.0,15.0,15.0,15.0,15.0,13.0,15.0,1089.0,1089.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,13.0,114114.0,114114.0,13.0,13.0,0.0
"""mean""",,,98.423293,10.578223,,0.070672,0.001804,0.003683,0.008668,0.056517,,14.79277,,,,77.068517,0.237609,,,,,0.073656,0.080336,0.023253,0.021878,0.180433,0.116002,3.018227,0.026762,0.023664,5e-05,4.5e-05,0.180961,0.021829,0.0789,4e-06,0.013516,0.241284,0.023664,0.40896,0.693368,0.279618,0.847676,0.036736,0.016291,0.00576,0.000123,6.20206,12.118309,6.206807,10.314386,339.17909
"""std""",,,109.957256,11.106638,,,,,,,,3.796015,,,,72.926844,1.858836,,,,,,0.298604,0.160121,,0.016093,0.035104,1.885928,0.182311,,,,,,,,,,,,,,,,,,,2.517007,6.748936,4.57406,4.698093,197.26692
"""min""","""000010f4-1428-4dc1-8de5-6ddf86…",,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,8.0,,,,3.0,0.0,,,,,0.0,0.0,0.0,0.0,0.00168,0.001222,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.201087,0.201087,0.51685,2.513962,0.0
"""25%""",,,24.74,3.0,,,,,,,,12.0,,,,28.0,0.0,,,,,,0.0,0.0,,0.18133,0.090046,1.79,0.0,,,,,,,,,,,,,,,,,,,4.891213,9.102887,2.276961,7.497902,165.0
"""50%""",,,62.48,7.0,,,,,,,,15.0,,,,53.0,0.0,,,,,,0.0,0.0,,0.184993,0.114471,2.79,0.0,,,,,,,,,,,,,,,,,,,6.189657,12.03634,5.543029,9.801902,339.0
"""75%""",,,132.86,14.0,,,,,,,,18.0,,,,101.0,0.0,,,,,,0.0,0.0,,0.18873,0.139815,3.99,0.0,,,,,,,,,,,,,,,,,,,7.382909,14.334259,9.290475,12.045619,511.0
"""max""","""ffffdb48-c154-4ef8-a02d-a3f14a…",,1931.97,219.0,,1.0,1.0,1.0,1.0,1.0,,22.0,,,,1244.0,110.32,,,,,1.0,7.0,5.0,1.0,0.192543,0.192543,25.49,6.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,156.513264,293.990727,207.342886,135.892762,696.0


In [12]:
# Pull the rows whose `has_sold_by_weight` flag is null into pandas so they
# can be inspected by eye.
sold_by_weight_missing = pl.col("has_sold_by_weight").is_null()
w = joined.filter(sold_by_weight_missing).collect().to_pandas()

In [13]:
# Show the rows with a null `has_sold_by_weight` flag collected into `w`.
w

Unnamed: 0,id,cash_desk,total_amount,n_lines,payment_medium,has_feedback,feedback_low,feedback_middle,feedback_high,feedback_top,...,has_fruits_vegetables_pieces,has_household,has_alcohol,has_tobacco,has_missing,mean_time_between_scans,max_time_between_scans,time_to_first_scan,time_from_last_scan_to_end,days_since_sco_introduction
0,ce6ad684-ec59-46a2-9180-baf04322f477,2,0.0,0,CREDIT_CARD,False,False,False,False,False,...,,,,,,,,,,58
1,5890e8d9-606c-4f48-ac7b-d2182fcb22f8,3,0.0,0,CASH,False,False,False,False,False,...,,,,,,,,,,222
2,405865c2-86bc-4660-830c-87744bc7ada3,2,0.0,0,CREDIT_CARD,False,False,False,False,False,...,,,,,,,,,,438
3,f89eb906-4fcb-4b16-98a1-e129cc4279f8,2,0.0,0,CASH,False,False,False,False,False,...,,,,,,,,,,466
4,0a7c744f-db45-4446-a12d-29073304fcf5,3,0.0,0,CREDIT_CARD,False,False,False,False,False,...,,,,,,,,,,147
5,f05d0ef8-b3e7-4032-ab23-497a8581559b,0,0.0,0,CASH,False,False,False,False,False,...,False,False,False,False,True,,,29.829949,6.122794,125
6,7019ccc8-ba2a-4958-8c3d-9eba54b25c85,0,0.0,0,CASH,False,False,False,False,False,...,,,,,,,,,,115
7,fe0eb499-d2e6-4ea8-9ade-94ee3502c0ef,2,0.0,0,CREDIT_CARD,False,False,False,False,False,...,,,,,,,,,,296
8,cce5317f-e4b6-4890-b776-fd9838730ae2,3,0.0,0,CASH,False,False,False,False,False,...,,,,,,,,,,542
9,96e0f707-03f5-4ee0-b3e6-d46b234e6e75,3,0.0,0,CREDIT_CARD,False,False,False,False,False,...,,,,,,,,,,534


In [5]:
# Two-row preview of the feature matrix, converted to pandas.
# NOTE(review): this rebinds `df`, which earlier in the notebook holds the small
# polars class/attribute frame; cells that expect that frame (e.g. the
# `df.group_by("class")` cell) will not work after this cell has run.
preview_rows = joined.head(2).collect()
df = preview_rows.to_pandas()

In [6]:
# Show the two-row pandas preview of `joined`.
df

Unnamed: 0,id,cash_desk,total_amount,n_lines,payment_medium,has_feedback,feedback_low,feedback_middle,feedback_high,feedback_top,...,has_fruits_vegetables_pieces,has_household,has_alcohol,has_tobacco,has_missing,mean_time_between_scans,max_time_between_scans,time_to_first_scan,time_from_last_scan_to_end,days_since_sco_introduction
0,ec44460a-ba9a-43d0-b183-068681615500,0,20.41,6,CASH,False,False,False,False,False,...,True,False,False,False,False,9.794671,11.345046,1.543855,9.130059,479
1,c57e0405-469c-4716-bf4c-3f98b228c411,0,11.57,5,CREDIT_CARD,False,False,False,False,False,...,True,False,False,False,False,9.141796,11.948798,9.7375,7.331237,479


In [86]:
# How many transactions contain at most one line item?
at_most_one_line = pl.col("n_lines") <= 1
joined.filter(at_most_one_line).select(pl.len()).collect()

len
u32
115393


In [89]:
# Summary statistics for the labelled subset only (rows whose label is not "UNKNOWN").
has_known_label = pl.col("label") != "UNKNOWN"
joined.filter(has_known_label).describe()

statistic,id,cash_desk,total_amount,n_lines,payment_medium,has_feedback,feedback_low,feedback_middle,feedback_high,feedback_top,daytime,hour,hour_categorical,day_of_week,month,transaction_duration_seconds,damage,label,store_id,location,urbanization,has_voided,n_voided,n_age_restricted,has_age_restricted,popularity_max,popularity_min,max_product_price,n_sold_by_weight,has_sold_by_weight,has_camera_detected_wrong_product,has_camera_detected_wrong_product_high_certainty,has_bakery,has_beverages,has_convenience,has_tobacco,has_household,has_dairy,has_alcohol,has_snacks,has_personal_care,has_long_shelf_life,has_frozen_goods,has_limited_time_offers,has_fruits_vegetables,has_fruits_vegetables_pieces,has_missing,mean_time_between_scans,max_time_between_scans,time_to_first_scan,time_from_last_scan_to_end,days_since_sco_introduction
str,str,str,f64,f64,str,f64,f64,f64,f64,f64,str,f64,str,str,str,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""148025""","""148025""",148025.0,148025.0,"""148025""",148025.0,148025.0,148025.0,148025.0,148025.0,"""148025""",148025.0,"""148025""","""148025""","""148025""",148025.0,148025.0,"""148025""","""148025""","""148025""","""148025""",148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,147911.0,147911.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,148024.0,136546.0,136546.0,148024.0,148024.0,148025.0
"""null_count""","""0""","""0""",0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,"""0""",0.0,"""0""","""0""","""0""",0.0,0.0,"""0""","""0""","""0""","""0""",1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,114.0,114.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,11479.0,11479.0,1.0,1.0,0.0
"""mean""",,,98.50975,10.603607,,0.071907,0.00175,0.003702,0.008877,0.057578,,14.780949,,,,77.307191,0.237609,,,,,0.074319,0.080852,0.022969,0.021706,0.180423,0.116131,3.009293,0.025611,0.02255,6.8e-05,6.1e-05,0.013437,0.693698,0.181079,0.005499,0.036616,0.279056,0.016335,0.022699,0.078197,0.240704,0.409852,0.0,0.02255,0.848045,0.000108,6.201643,12.137966,6.229749,10.331285,339.287451
"""std""",,,110.079582,11.155176,,,,,,,,3.795821,,,,73.201987,1.858836,,,,,,0.298491,0.158782,,0.016138,0.035038,1.870487,0.179285,,,,,,,,,,,,,,,,,,,2.482892,6.792813,4.568019,4.691329,197.184415
"""min""","""00002a24-d845-436f-8102-aaad11…",,0.0,0.0,,0.0,0.0,0.0,0.0,0.0,,8.0,,,,3.0,0.0,,,,,0.0,0.0,0.0,0.0,0.004307,0.00168,0.19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.21266,0.21266,0.518004,2.633553,0.0
"""25%""",,,24.74,3.0,,,,,,,,12.0,,,,28.0,0.0,,,,,,0.0,0.0,,0.18133,0.090046,1.79,0.0,,,,,,,,,,,,,,,,,,,4.899114,9.121386,2.291389,7.512052,165.0
"""50%""",,,62.55,7.0,,,,,,,,15.0,,,,53.0,0.0,,,,,,0.0,0.0,,0.184993,0.114471,2.79,0.0,,,,,,,,,,,,,,,,,,,6.190756,12.045497,5.593395,9.811409,340.0
"""75%""",,,132.79,14.0,,,,,,,,18.0,,,,101.0,0.0,,,,,,0.0,0.0,,0.18873,0.139815,3.99,0.0,,,,,,,,,,,,,,,,,,,7.377009,14.343155,9.325711,12.085011,511.0
"""max""","""ffffdb48-c154-4ef8-a02d-a3f14a…",,1931.97,219.0,,1.0,1.0,1.0,1.0,1.0,,22.0,,,,1244.0,110.32,,,,,1.0,5.0,5.0,1.0,0.192543,0.192543,23.897569,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,79.916389,256.095394,158.612089,99.444539,696.0


In [99]:
# Per-column null counts for the labelled rows (label != "UNKNOWN"); the
# result is a single-row frame with one count per column.
missing_values = (
    joined
    .filter(pl.col("label") != "UNKNOWN")
    .select(pl.all().null_count())
    .collect()
)

# Reshape the single row into a long (column, missing_values) table, keep only
# the columns that actually have nulls and list the worst offenders first.
transposed = (
    pl.DataFrame(
        {
            "column": missing_values.columns,
            "missing_values": missing_values.row(0),
        }
    )
    .filter(pl.col("missing_values") > 0)
    .sort("missing_values", descending=True)
)
transposed

column,missing_values
str,i64
"""mean_time_between_scans""",11479
"""max_time_between_scans""",11479
"""has_camera_detected_wrong_prod…",114
"""has_camera_detected_wrong_prod…",114
"""has_voided""",1
…,…
"""has_fruits_vegetables""",1
"""has_fruits_vegetables_pieces""",1
"""has_missing""",1
"""time_to_first_scan""",1


In [73]:
# Materialise the first four rows of the feature matrix as an eager polars frame.
dfpl = joined.head(4).collect()

In [74]:
# Summary statistics for the four-row sample held in `dfpl`.
dfpl.describe()

statistic,id,cash_desk,total_amount,n_lines,payment_medium,has_feedback,feedback_low,feedback_middle,feedback_high,feedback_top,daytime,hour,hour_categorical,day_of_week,month,transaction_duration_seconds,damage,label,store_id,location,urbanization,has_voided,n_voided,n_age_restricted,has_age_restricted,popularity_max,popularity_min,max_product_price,n_sold_by_weight,has_sold_by_weight,has_camera_detected_wrong_product,has_camera_detected_wrong_product_high_certainty,has_bakery,has_beverages,has_convenience,has_tobacco,has_household,has_dairy,has_alcohol,has_snacks,has_personal_care,has_long_shelf_life,has_frozen_goods,has_limited_time_offers,has_fruits_vegetables,has_fruits_vegetables_pieces,has_missing,mean_time_between_scans,max_time_between_scans,time_to_first_scan,time_from_last_scan_to_end,days_since_sco_introduction
str,str,str,f64,f64,str,f64,f64,f64,f64,f64,str,f64,str,str,str,f64,f64,str,str,str,str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""","""4""","""4""",4.0,4.0,"""4""",4.0,4.0,4.0,4.0,4.0,"""4""",4.0,"""4""","""4""","""4""",4.0,1.0,"""4""","""4""","""4""","""4""",4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0
"""null_count""","""0""","""0""",0.0,0.0,"""0""",0.0,0.0,0.0,0.0,0.0,"""0""",0.0,"""0""","""0""","""0""",0.0,3.0,"""0""","""0""","""0""","""0""",0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
"""mean""",,,55.4175,4.75,,0.0,0.0,0.0,0.0,0.0,,15.25,,,,42.5,0.0,,,,,0.0,0.0,0.0,0.0,0.180716,0.090525,4.79,0.25,0.25,0.0,0.0,0.0,0.25,0.0,0.0,0.0,0.5,0.0,0.0,0.25,0.5,0.25,0.0,0.25,1.0,0.0,6.148326,9.121115,9.1064725,9.8851775,478.75
"""std""",,,65.874235,2.872281,,,,,,,,5.057997,,,,21.886069,,,,,,,0.0,0.0,,0.01163,0.055095,3.45157,0.5,,,,,,,,,,,,,,,,,,,1.740244,2.423411,1.452623,3.777059,0.5
"""min""","""03387f4f-aa69-4bd1-9dae-754040…",,16.15,3.0,,0.0,0.0,0.0,0.0,0.0,,9.0,,,,25.0,0.0,,,,,0.0,0.0,0.0,0.0,0.164074,0.034478,1.39,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.800602,6.312293,7.029285,6.150008,478.0
"""25%""",,,16.88,3.0,,,,,,,,14.0,,,,31.0,0.0,,,,,,0.0,0.0,,0.18133,0.054615,2.49,0.0,,,,,,,,,,,,,,,,,,,6.078169,8.023343,9.245956,8.26353,479.0
"""50%""",,,35.31,4.0,,,,,,,,17.0,,,,40.0,0.0,,,,,,0.0,0.0,,0.18873,0.121549,6.49,0.0,,,,,,,,,,,,,,,,,,,6.789134,10.406037,9.831578,10.127446,479.0
"""75%""",,,35.31,4.0,,,,,,,,17.0,,,,40.0,0.0,,,,,,0.0,0.0,,0.18873,0.121549,6.49,0.0,,,,,,,,,,,,,,,,,,,6.789134,10.406037,9.831578,10.127446,479.0
"""max""","""c8225339-f7a0-47e6-98be-b178c2…",,153.33,9.0,,0.0,0.0,0.0,0.0,0.0,,21.0,,,,74.0,0.0,,,,,0.0,0.0,0.0,0.0,0.18873,0.151459,8.79,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0,7.925398,11.742786,10.319071,14.999726,479.0


In [68]:
# Line count next to the timing columns for the sampled rows.
sample_columns = [
    "n_lines",
    "transaction_duration_seconds",
    "mean_time_between_scans",
    "max_time_between_scans",
    "time_to_first_scan",
    "time_from_last_scan_to_end",
]
dfpl.select(sample_columns)

n_lines,transaction_duration_seconds,mean_time_between_scans,max_time_between_scans,time_to_first_scan,time_from_last_scan_to_end
i64,i64,f64,f64,f64,f64
3,25,3.800602,6.312293,9.831578,8.26353
1,7,0.0,0.0,1.380731,6.571964
16,122,7.410429,15.346954,2.63362,8.560743
9,95,9.478357,14.544279,8.560001,11.00353


In [8]:
# Which data types does the frame contain?
# NOTE(review): `df` is expected to be the pandas preview built with
# `.to_pandas()` (the listing below shows pandas dtypes such as `object` and
# `category`), not the small polars frame that used the same name earlier.
df.dtypes

id                                                           object
cash_desk                                                  category
total_amount                                                float64
n_lines                                                       int64
payment_medium                                             category
has_feedback                                                   bool
feedback_low                                                   bool
feedback_middle                                                bool
feedback_high                                                  bool
feedback_top                                                   bool
daytime                                                    category
hour                                                           int8
day_of_week                                                category
month                                                      category
transaction_duration_seconds                    