In [7]:
import pandas as pd
from pathlib import Path

In [49]:
# set the file path and the sheet names to explore
base = Path(r"d:\yzy\CMU\26SPRING\95451 pm\project\VSP Vision Datasets")

sources = [
    ("AO (demand)", "AO-BI275 DEMAND KC KP LA LS KO KS 12.17.25.xlsx", ["Brand View"]),
    ("Calvin Klein", "Calvin Klein_Sept24 ATP.xlsx", ["SUN", "OPH"]),
    ("Lacoste", "LACOSTE_Sept24 ATP.xlsx", ["LACOSTE OPTICAL", "LACOSTE SUN"]),
    ("Nike", "Nike_Sept24 ATP.xlsm", ["Nike Sept 24 Optical", "Nike Sept 24 Sun"]),
]

## 1. Preprocess 'demand' Data

In [40]:
# load and clean the 'damand' data
df_ao_raw = pd.read_excel(
    base / "AO-BI275 DEMAND KC KP LA LS KO KS 12.17.25.xlsx",
    sheet_name="Brand View",
    header=None,
)

n_meta = 7
meta_names = ["Collection", "BrandLine", "Material", "StyleCode", "GridValue", "Style", "Region"]

month_cols = df_ao_raw.iloc[1, n_meta:].tolist()   
demand = df_ao_raw.iloc[2:].copy()                  
demand.columns = meta_names + month_cols

demand = demand[demand.iloc[:, 0].astype(str).str.upper() != "COLLECTION"].copy()

for i in range(n_meta, demand.shape[1]):
    demand.iloc[:, i] = pd.to_numeric(demand.iloc[:, i], errors="coerce")


def _parse_style(val):
    if pd.isna(val):
        return pd.Series({"Size": "", "Color": ""})
    txt = str(val)
    parts = txt.split("/")
    if len(parts) >= 3:
        size = parts[1].strip()
        color = parts[2].strip()
    elif len(parts) == 2:
        size = parts[0].strip()
        color = parts[1].strip()
    else:
        size = ""
        color = ""
    return pd.Series({"Size": size, "Color": color})

style_extras = demand["Style"].apply(_parse_style)
demand = pd.concat([demand, style_extras], axis=1)

print(demand.head())


  Collection         BrandLine Material StyleCode GridValue  \
2         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   
3         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   
4         KC  CALVIN KLEIN SUN    45073  CK20541S   5719235   
5         KC  CALVIN KLEIN SUN    45073  CK20541S   5719235   
6         KC  CALVIN KLEIN SUN    45073  CK20541S   5719605   

                  Style Region 09/2023 10/2023 11/2023  ... 02/2024 03/2024  \
2     CK20541S/57/BLACK   AMER    26.0    28.0    27.0  ...    24.0    21.0   
3     CK20541S/57/BLACK   EMEA    10.0    18.0    11.0  ...    36.0    84.0   
4  CK20541S/57/DARK TOR   AMER    33.0    39.0    14.0  ...    15.0    16.0   
5  CK20541S/57/DARK TOR   EMEA    21.0    15.0    11.0  ...    61.0    55.0   
6   CK20541S/57/CRYSTAL   AMER    11.0    21.0    15.0  ...     8.0    11.0   

  04/2024 05/2024 06/2024 07/2024 08/2024 Overall Result Size     Color  
2    35.0    21.0    20.0    32.0    16.0          285.0   57     BLACK 

In [41]:
print('demand shape:', demand.shape)

demand shape: (1866, 22)


In [42]:
# first drop rows without a style (they're empty/aggregate rows)
dropped = demand['Style'].isna().sum()
print(f"dropping {dropped} rows with missing Style")
demand = demand[demand['Style'].notna()].copy()

dropping 402 rows with missing Style


In [57]:
# reshape the demand data to long format
# first drop any styles that were missing (already done earlier)
demand_long = demand.melt(
    id_vars=meta_names + ["Size", "Color"],
    value_vars=month_cols,
    var_name="Month",
    value_name="Demand",
)

# drop any non-month labels such as "Overall Result" before converting
mask_valid = demand_long["Month"].astype(str).str.match(r"^\d{2}/\d{4}$")
demand_long = demand_long[mask_valid].copy()

# diagnostic prints
print("month_cols:", month_cols)
print("filtered demand_long shape:", demand_long.shape)
print(demand_long.head())

# align the date format to "YYYY-MM" using the correct pattern for MM/YYYY
# (the original strings are like '09/2023')
demand_long["Month"] = pd.to_datetime(
    demand_long["Month"],
    format="%m/%Y",         
    errors="coerce"
).dt.strftime("%Y-%m")

# check NaNs in the melted dataframe
nan_counts = demand_long.isna().sum()
print("NaN counts in demand_long:\n", nan_counts)


month_cols: ['09/2023', '10/2023', '11/2023', '12/2023', '01/2024', '02/2024', '03/2024', '04/2024', '05/2024', '06/2024', '07/2024', '08/2024', 'Overall Result']
filtered demand_long shape: (17568, 11)
  Collection         BrandLine Material StyleCode GridValue  \
0         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   
1         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   
2         KC  CALVIN KLEIN SUN    45073  CK20541S   5719235   
3         KC  CALVIN KLEIN SUN    45073  CK20541S   5719235   
4         KC  CALVIN KLEIN SUN    45073  CK20541S   5719605   

                  Style Region Size     Color    Month Demand  
0     CK20541S/57/BLACK   AMER   57     BLACK  09/2023   26.0  
1     CK20541S/57/BLACK   EMEA   57     BLACK  09/2023   10.0  
2  CK20541S/57/DARK TOR   AMER   57  DARK TOR  09/2023   33.0  
3  CK20541S/57/DARK TOR   EMEA   57  DARK TOR  09/2023   21.0  
4   CK20541S/57/CRYSTAL   AMER   57   CRYSTAL  09/2023   11.0  
NaN counts in demand_long:
 Collec

In [None]:
# merge by style region and month/ deduplicate
# This will automatically fill the demand=nan with 0.0
demand_monthly = demand_long.groupby(meta_names + ["Size", "Color",'Month'], as_index=False)["Demand"].sum()

In [58]:
# count rows where demand is exactly zero
demand_zero = (demand_monthly['Demand'] == 0).sum()
print(f"rows in demand_monthly with Demand == 0: {demand_zero}")

rows in demand_monthly with Demand == 0: 2563


In [59]:
print(demand_monthly.head())

  Collection         BrandLine Material StyleCode GridValue  \
0         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   
1         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   
2         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   
3         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   
4         KC  CALVIN KLEIN SUN    45073  CK20541S   5719001   

               Style Region Size  Color    Month Demand  
0  CK20541S/57/BLACK   AMER   57  BLACK  2023-09   26.0  
1  CK20541S/57/BLACK   AMER   57  BLACK  2023-10   28.0  
2  CK20541S/57/BLACK   AMER   57  BLACK  2023-11   27.0  
3  CK20541S/57/BLACK   AMER   57  BLACK  2023-12   26.0  
4  CK20541S/57/BLACK   AMER   57  BLACK  2024-01    9.0  


## 2.Merge the products sheet

In [None]:
# load and merge all the products sheet into one dataframe, only keep the common columns

def _norm_cols(df):
    df.columns = [str(c).strip().upper() for c in df.columns]
    return df

product_sheets = [
    ("Calvin Klein", "Calvin Klein_Sept24 ATP.xlsx", "SUN"),
    ("Calvin Klein", "Calvin Klein_Sept24 ATP.xlsx", "OPH"),
    ("Lacoste", "LACOSTE_Sept24 ATP.xlsx", "LACOSTE OPTICAL"),
    ("Lacoste", "LACOSTE_Sept24 ATP.xlsx", "LACOSTE SUN"),
    ("Nike", "Nike_Sept24 ATP.xlsm", "Nike Sept 24 Optical"),
    ("Nike", "Nike_Sept24 ATP.xlsm", "Nike Sept 24 Sun"),
]

dfs = []
for brand, fname, sheet in product_sheets:
    df = pd.read_excel(base / fname, sheet_name=sheet)
    df = _norm_cols(df)
    
    df["_BRAND"] = brand
    df["_SHEET_TYPE"] = "Optical" if any(x in sheet.upper() for x in ["OPTICAL", "OPH"]) else "Sun"
    
    dfs.append(df)

common_cols = set(dfs[0].columns)
for df in dfs[1:]:
    common_cols = common_cols.intersection(set(df.columns))

parts = [df[list(common_cols)] for df in dfs]
products = pd.concat(parts, ignore_index=True)

print("shared cols:", list(products.columns))
print("shape", products.shape)

shared cols: ['FRAMESHAPE', 'FIT', 'FAMILY', 'MATERIALNUMBER', 'USRETAILPRICE', 'IMAGE', '_SHEET_TYPE', 'PROTOTYPECODE', 'RELEASEDATE', 'RECOMMENDEDREASONS', 'BASECURVE', 'GENDER', 'SIZES', 'EUROWHOLESALEPRICE', '_BRAND', 'MADEIN', 'COLORCODE', 'NOTES', 'MATERIALCODE1', 'COLORADD', 'MATERIALCODE2', 'SUNOPTICAL', 'USWHOLESALEPRICE', 'BRAND', 'FRAMECONSTRUCTION', 'EURORETAILPRICE', 'RXABLE', 'COLORDESCRIPTION']
shape: (316, 28)


In [61]:
print(products.head())

           FRAMESHAPE     FIT      FAMILY MATERIALNUMBER USRETAILPRICE  IMAGE  \
0           BUTTERFLY  GLOBAL  AVANTGARDE       CK24110S        209,00    NaN   
1           BUTTERFLY  GLOBAL  AVANTGARDE       CK24110S        209,00    NaN   
2           BUTTERFLY  GLOBAL  AVANTGARDE       CK24110S        209,00    NaN   
3           BUTTERFLY  GLOBAL  AVANTGARDE       CK24110S        209,00    NaN   
4  MODIFIED RECTANGLE  GLOBAL  AVANTGARDE       CK24111S        209,00    NaN   

  _SHEET_TYPE PROTOTYPECODE              RELEASEDATE RECOMMENDEDREASONS  ...  \
0         Sun      E11397D1  CALVIN KLEIN - SEP 2024                ADV  ...   
1         Sun      E11397D1  CALVIN KLEIN - SEP 2024      Potential ADV  ...   
2         Sun      E11397D1  CALVIN KLEIN - SEP 2024      Potential ADV  ...   
3         Sun      E11397D1  CALVIN KLEIN - SEP 2024      Potential ADV  ...   
4         Sun      E11397A1  CALVIN KLEIN - SEP 2024                NaN  ...   

  MATERIALCODE1 COLORADD MATERIA

In [62]:
# diagnose NaNs and drop columns that are entirely NaN
nan_counts = products.isna().sum()
print("NaN counts in products:")
print(nan_counts)

all_nan_cols = nan_counts[nan_counts == len(products)].index.tolist()
print(f"dropping {len(all_nan_cols)} all-NaN columns: {all_nan_cols}")
products = products.drop(columns=all_nan_cols)
print("new shape:", products.shape)
print("remaining cols:", list(products.columns))

NaN counts in products:
FRAMESHAPE              0
FIT                     0
FAMILY                  0
MATERIALNUMBER          0
USRETAILPRICE           0
IMAGE                 316
_SHEET_TYPE             0
PROTOTYPECODE           0
RELEASEDATE             0
RECOMMENDEDREASONS    181
BASECURVE               0
GENDER                  0
SIZES                   0
EUROWHOLESALEPRICE      0
_BRAND                  0
MADEIN                  0
COLORCODE               0
NOTES                 302
MATERIALCODE1           0
COLORADD              316
MATERIALCODE2           0
SUNOPTICAL              0
USWHOLESALEPRICE        0
BRAND                   0
FRAMECONSTRUCTION       0
EURORETAILPRICE         0
RXABLE                  0
COLORDESCRIPTION        0
dtype: int64
dropping 2 all-NaN columns: ['IMAGE', 'COLORADD']
new shape: (316, 26)
remaining cols: ['FRAMESHAPE', 'FIT', 'FAMILY', 'MATERIALNUMBER', 'USRETAILPRICE', '_SHEET_TYPE', 'PROTOTYPECODE', 'RELEASEDATE', 'RECOMMENDEDREASONS', 'BASECURVE',