In [2]:
import pandas as pd

# Load the two versions of the test split, parsing `date` as datetimes so the
# `.dt` accessor is available below.
t_exp = pd.read_csv("test_split_merged_expanded_data.csv", parse_dates=["date"])
t_upd = pd.read_csv("analysis/test_split_merged_data_updated.csv", parse_dates=["date"])

# Reduce each frame to its set of distinct calendar dates (time-of-day dropped).
# Building a set already de-duplicates, so no explicit `.unique()` is needed.
dates_exp = set(t_exp["date"].dt.date)
dates_upd = set(t_upd["date"].dt.date)

# Dates present in one file but absent from the other, in chronological order.
only_in_exp = sorted(dates_exp.difference(dates_upd))
only_in_upd = sorted(dates_upd.difference(dates_exp))

# Report the size of each one-sided difference ...
print("Dates only in EXPANDED:", len(only_in_exp))
print("Dates only in UPDATED: ", len(only_in_upd))

# ... followed by a short sample of the earliest offending dates.
print("\nFirst 10 dates only in EXPANDED:", only_in_exp[:10])
print("First 10 dates only in UPDATED: ", only_in_upd[:10])


Dates only in EXPANDED: 10
Dates only in UPDATED:  0

First 10 dates only in EXPANDED: [datetime.date(2018, 10, 31), datetime.date(2018, 12, 25), datetime.date(2018, 12, 26), datetime.date(2019, 1, 1), datetime.date(2019, 3, 20), datetime.date(2019, 4, 3), datetime.date(2019, 4, 19), datetime.date(2019, 5, 1), datetime.date(2019, 7, 10), datetime.date(2019, 7, 31)]
First 10 dates only in UPDATED:  []


In [3]:
# Distinct ids per file. `.unique()` is kept on purpose: it yields NumPy
# scalars, which is what shows up in the printed sample below.
ids_exp, ids_upd = (set(frame["id"].unique()) for frame in (t_exp, t_upd))

# "Extra" = ids found only in the expanded file;
# "missing" = ids found only in the updated file.
extra_ids = sorted(ids_exp.difference(ids_upd))
missing_ids = sorted(ids_upd.difference(ids_exp))

print("Extra IDs in EXPANDED:", len(extra_ids))
print("Missing IDs in EXPANDED:", len(missing_ids))

# Show the ten smallest extra ids as a sample.
print("\nExample extra IDs:", extra_ids[:10])


Extra IDs in EXPANDED: 56
Missing IDs in EXPANDED: 0

Example extra IDs: [np.int64(1810311), np.int64(1810312), np.int64(1810313), np.int64(1810314), np.int64(1810315), np.int64(1812251), np.int64(1812252), np.int64(1812253), np.int64(1812254), np.int64(1812255)]


In [4]:
# Ids that occur in the expanded file but not in the updated one.
# NOTE: this rebinds `extra_ids` to an (unordered) set.
extra_ids = set(t_exp["id"].unique()).difference(t_upd["id"].unique())

# Pull the identifying columns for those ids and order them by date, then by
# `warengruppe`, so rows belonging to the same day sit together.
extra_rows = (
    t_exp.loc[t_exp["id"].isin(extra_ids), ["id", "date", "warengruppe"]]
    .sort_values(["date", "warengruppe"])
)

# First 30 extra rows, as a plain-text table.
print("Extra rows example:", extra_rows.head(30), sep="\n")

# Number of extra rows in each `warengruppe`, ordered by group value.
print(
    "\nExtra rows per warengruppe:",
    extra_rows["warengruppe"].value_counts().sort_index(),
    sep="\n",
)

# The ten dates that contribute the most extra rows.
print(
    "\nExtra rows per date (top 10):",
    extra_rows["date"].value_counts().head(10),
    sep="\n",
)


Extra rows example:
           id       date  warengruppe
455   1810311 2018-10-31            1
456   1810312 2018-10-31            2
457   1810313 2018-10-31            3
458   1810314 2018-10-31            4
459   1810315 2018-10-31            5
784   1812251 2018-12-25            1
785   1812252 2018-12-25            2
786   1812253 2018-12-25            3
787   1812254 2018-12-25            4
788   1812255 2018-12-25            5
789   1812256 2018-12-25            6
790   1812261 2018-12-26            1
791   1812262 2018-12-26            2
792   1812263 2018-12-26            3
793   1812264 2018-12-26            4
794   1812265 2018-12-26            5
795   1812266 2018-12-26            6
813   1812296 2018-12-29            6
819   1812306 2018-12-30            6
823   1812314 2018-12-31            4
825   1812316 2018-12-31            6
826   1901011 2019-01-01            1
827   1901012 2019-01-01            2
828   1901013 2019-01-01            3
829   1901014 2019-01-01      

In [6]:
# Produce a filtered copy of the expanded test split that contains only the
# rows whose `id` also occurs in the updated split, validate it, then save it.
#
# NOTE: both files are re-read here WITHOUT `parse_dates`, so `t_exp` / `t_upd`
# are rebound and their `date` column is no longer converted to datetimes the
# way it was when the frames were first loaded with parse_dates=["date"].
t_upd = pd.read_csv("analysis/test_split_merged_data_updated.csv")
t_exp = pd.read_csv("test_split_merged_expanded_data.csv")

t_exp_filtered = t_exp[t_exp["id"].isin(t_upd["id"])].copy()

# Validate BEFORE writing, so a bad result never reaches disk. Comparing the
# row count with the number of unique expected ids (as the prints below do) is
# not sufficient on its own: duplicated ids could offset missing ones.
assert t_exp_filtered["id"].is_unique, (
    "filtered expanded test split contains duplicate ids"
)
ids_not_covered = set(t_upd["id"].unique()) - set(t_exp_filtered["id"].unique())
assert not ids_not_covered, (
    f"{len(ids_not_covered)} ids from the updated split are missing "
    "in the filtered expanded split"
)

t_exp_filtered.to_csv("test_split_merged_expanded_data_filtered.csv", index=False)

# Human-readable confirmation; the two numbers are expected to match.
print("Filtered expanded test rows:", len(t_exp_filtered))
print("Expected test rows:", t_upd["id"].nunique())


Filtered expanded test rows: 1830
Expected test rows: 1830
