In [38]:
from pathlib import Path

import pandas as pd

In [39]:
# Folder that holds the input data (a relative path).
data_path = Path("data_store")
# Location of the datasheet CSV inside that folder.
datasheet_csv = data_path.joinpath("datasheet.csv")

In [40]:
def get_datasheet(data_path: Path) -> pd.DataFrame:
    """Read the datasheet CSV, falling back to an empty frame on failure.

    Args:
        data_path: Path of the CSV file to read.

    Returns:
        The parsed CSV contents. If the file has nothing to parse, or any
        other exception is raised while reading, a message is printed and
        an empty ``DataFrame`` is returned instead of raising.
    """
    try:
        loaded = pd.read_csv(data_path)
    except Exception as err:
        # An empty file gets its own message; every other failure is
        # reported generically. Both cases yield the same empty fallback.
        if isinstance(err, pd.errors.EmptyDataError):
            print(f"Datasheet is empty: {err}")
        else:
            print(f"Error reading datasheet: {err}")
        loaded = pd.DataFrame()
    return loaded

In [41]:
# Load the datasheet from the configured CSV location.
datasheet_df = get_datasheet(datasheet_csv)

# Preview the first five rows.
print(datasheet_df.head(n=5))

# Row count per category.
print(datasheet_df.loc[:, "Cat Description"].value_counts())

# Missing values per column.
print(datasheet_df.isnull().sum())

# Number of rows that exactly repeat an earlier row.
print(datasheet_df.duplicated(keep="first").sum())

  STYLE Description Cat Description  USD $ - MSRP  GBP G - MSRP
0          90'S BAG    CLUTCH/POUCH           990           830
1          90'S BAG      SMALL BAGS           990           830
2     90S BAG COVER    CLUTCH/POUCH           260           210
3     90S BAG COVER    CLUTCH/POUCH           190           160
4      ABBETA BRIEF           KNITS           220           190
Cat Description
SWEATERS               201
PANTS                  120
KNITS                  113
TOPS                   100
COATS                   79
JACKETS                 78
DRESSES                 75
SKIRTS                  54
SMALL LEATHER GOODS     46
DENIM                   34
SHOULDER/HOBO           33
TOP HANDLE              28
CLUTCH/POUCH            22
ACCESSORIES             20
FLAT SANDAL             20
TOP HANDLE BAGS         18
SANDALS                 15
BOOTIE                  13
CROSSBODY BAGS          12
SLIPPER                 11
LOAFERS                 11
SNEAKER                 10
HEELED

In [42]:
# Give the two price columns short names first, then normalise every column
# label: trim surrounding whitespace, spaces -> underscores, lower-case.
datasheet_df = (
    datasheet_df
    .rename(columns={"USD $ - MSRP": "usd_msrp", "GBP G - MSRP": "gbp_msrp"})
    .rename(columns=lambda label: label.strip().replace(" ", "_").lower())
)

print(datasheet_df.head(n=5))

  style_description cat_description  usd_msrp  gbp_msrp
0          90'S BAG    CLUTCH/POUCH       990       830
1          90'S BAG      SMALL BAGS       990       830
2     90S BAG COVER    CLUTCH/POUCH       260       210
3     90S BAG COVER    CLUTCH/POUCH       190       160
4      ABBETA BRIEF           KNITS       220       190


In [43]:
# Rows whose (style_description, gbp_msrp) pair already appeared in an
# earlier row; the first occurrence of each pair is not listed.
print(
    datasheet_df.loc[
        datasheet_df.duplicated(
            subset=["style_description", "gbp_msrp"],
            keep="first",
        )
    ]
)

          style_description  cat_description  usd_msrp  gbp_msrp
1                  90'S BAG       SMALL BAGS       990       830
107                     AVA            FLATS       790       690
149             BARE SANDAL          SANDALS       850       740
246                  BOURSE       SMALL BAGS      1590      1330
339            CLASSIC BELT            BELTS       595       500
454          ELASTIC BALLET            FLATS       790       690
535               FISHERMAN          SANDALS       990       860
580                   GINZA          SANDALS       925       810
584                     GIO          DUFFLES      3900      3250
624           HOOK-AND-LOOP          SANDALS       990       860
641                    IOWA          DUFFLES      4150      3460
700             KITTEN PUMP            PUMPS       990       860
821   MARGAUX 17 INSIDE-OUT  TOP HANDLE BAGS      3950      3300
829                 MARIE H         SNEAKERS       790       690
912            NU MINI TW