In [None]:
# ==========================================
# Import Required Libraries
# ==========================================
import os
import glob
import re
import pandas as pd

def load_product_data(input_folder: str) -> pd.DataFrame:
    all_dfs = []
    for file_path in glob.glob(os.path.join(input_folder, "product_master_*.xlsb")):
        year_match = re.search(r"(\d{4})", os.path.basename(file_path))
        year = year_match.group(1) if year_match else None
        temp_df = pd.read_excel(file_path, sheet_name="DANH MỤC", skiprows=1, engine="pyxlsb")
        temp_df["applied_year"] = year
        all_dfs.append(temp_df)
    combined_df = pd.concat(all_dfs, ignore_index=True)
    return combined_df

if __name__ == "__main__":
    input_folder = r"D:\footwear_retail_chain_project\0. input_data\product\raw_data"
    product_df = load_product_data(input_folder)
    
def clean_product_data(df: pd.DataFrame) -> pd.DataFrame:
    df = df[[
        "SKU", "Style color", "Style", "Color", "Color Group", "Listing Price",
        "Price Group", "Brand", "Gender", "Product Group", "Detailed Product Group",
        "Shoe Product", "Specialized Function", "Dedicated functions", "Size Group",
        "Size", "Size Range", "Age Group", "Activity Group", "Collection", "Copyright group",
        "Image copyright", "Product Line", "Design Inspiration", "Lifestyle Group", "Launch Season",
        "Similar group", "Heel Height", "applied_year"
    ]]
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # clean sku
    df["sku"] = df["sku"].apply(
        lambda x: str(x) + "00" 
        if len(str(x)) < 14 
        else str(x)
        ).astype(str).str.upper()

    # clean color
    df["color"] = (
        df["color"]
        .fillna(df["style_color"].astype(str).str[-3:])
        .replace(["", "0", 0], "000")
    ).astype(str).str.upper()

    # clean color group
    df["color_group"] = df["color_group"].replace({
        "TRẮNG": "WHITE",
        "ĐEN": "BLACK",
        "ĐEN ": "BLACK",
        "SÁNG": "LIGHT",
        "TỐI": "DARK",
        "NHIỀU MÀU": "MULTI",
        "MÀU SẮC": "MULTI",
        "KHÁC": "OTHER",
    }).astype(str).str.upper()

    # clean listing price
    df["listing_price"] = df["listing_price"].fillna(0).astype(int)

    # clean price group
    df["price_group"] = pd.cut(
        df["listing_price"],
        bins=[-float("inf"), 70_000, 100_000, 200_000, 300_000, 400_000, 
            500_000, 600_000, 800_000, 900_000, 1_000_000, 1_200_000, float("inf")],
        labels=["<=70", "70<=100", "100<=200", "200<=300", "300<=400", 
                "400<=500", "500<=600", "600<=800", "800<=900", 
                "900<=1000", "1000<=1200", ">1200"]
    )

    # specialized_function
    

    return df

if __name__ == "__main__":
    input_folder = r"D:\footwear_retail_chain_project\0. input_data\product\raw_data"
    product_df = clean_product_data(product_df)
    product_df.info()

# ==========================================
# Export Cleaned DataFrame to CSV
# ==========================================

# Define output directory and file path
output_path = r"D:\footwear_retail_chain_project\0. input_data\product\processed_data"
output_file = os.path.join(output_path, "product_master.csv")

# Save DataFrame to CSV
product_df.to_csv(output_file, index=False)
print(f"✅ Cleaned distribution data exported to: {output_file}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sku"] = df["sku"].apply(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["color"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["color_group"] = df["color_group"].replace({
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = val

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392477 entries, 0 to 392476
Data columns (total 29 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   sku                     392477 non-null  object  
 1   style_color             392477 non-null  object  
 2   style                   392477 non-null  object  
 3   color                   392477 non-null  object  
 4   color_group             392477 non-null  object  
 5   listing_price           392477 non-null  int64   
 6   price_group             392477 non-null  category
 7   brand                   392477 non-null  object  
 8   gender                  392477 non-null  object  
 9   product_group           392477 non-null  object  
 10  detailed_product_group  392477 non-null  object  
 11  shoe_product            392477 non-null  object  
 12  specialized_function    392477 non-null  object  
 13  dedicated_functions     392392 non-null  object  
 14  size

In [11]:
product_df["price_group"].unique()

array(['200<300', '100<200', '<100', '300<600', '600<900', '>=900', nan,
       '70<100', '<70', '<=70', '400<500', '300<400', '500<600',
       '600<800', '>=1200', '900<1000', '800<900', '1000<1200'],
      dtype=object)

In [10]:
product_df[product_df["sku"] == "DTB04103000001"]

Unnamed: 0,sku,style_color,style,color,color_group,listing_price,price_group,brand,gender,product_group,...,collection,copyright_group,image_copyright,product_line,design_inspiration,lifestyle_group,launch_season,similar_group,heel_height,applied_year
76026,DTB04103000001,DTB041030000,DTB041030,0,DARK,38000.0,200<300,BITI'S,BOY,SAN,...,CƠ BẢN,CƠ BẢN,CƠ BẢN,KHÁC,Không có,Casual,KHÁC,QLNQ,Cao 3 phân,2022
166169,DTB04103000001,DTB041030000,DTB041030,0,DARK,99000.0,70<100,BITI'S,BOY,SAN,...,CƠ BẢN,CƠ BẢN,CƠ BẢN,KHÁC,Không có,Casual,KHÁC,QLNQ,Cao 3 phân,2023
262180,DTB04103000001,DTB041030000,DTB041030,0,DARK,69000.0,<=70,BITI'S,BOY,SAN,...,CƠ BẢN,CƠ BẢN,CƠ BẢN,KHÁC,Không có,Casual,KHÁC,QLNQ,Cao 3 phân,2024
363782,DTB04103000001,DTB041030000,DTB041030,0,DARK,39000.0,<=70,BITI'S,BOY,SAN,...,CƠ BẢN,CƠ BẢN,CƠ BẢN,KHÁC,Không có,Casual,KHÁC,QLNQ,Cao 3 phân,2025


In [None]:
# Specialized Function
df["Specialized Function"].str.strip()
map_Specialized_Function = {"Hàng ngày":"Daily Use","Bóng đá":"Football","Đi bộ":"Walking","Chạy bộ":"Jogging","Y tế":"Medical","Công trình":"Construction","Khiêu vũ":"Dancing"}
df["Specialized Function"] = df["Specialized Function"].replace(map_Specialized_Function)

# Dedicated functions
df["Dedicated functions"].str.strip()
map_Dedicated_Function = {"Không có":"None","Sân đất nện":"Clay court","Sân cỏ nhân tạo":"Artificial turf","Giày đi bộ":"Walking shoes","20km":"20km","Sân futsal":"futsal court","40km":"40km","Sân Futsal":"futsal court","Giày bảo hộ":"Safety shoes","10km":"10km"}
df["Dedicated functions"] = df["Dedicated functions"].replace(map_Dedicated_Function)

# Size Group
df["Size Group"].str.strip()
map_Size_Group = {"Ngoại lệ":"Exception","Khung size chính":"Main size frame", "Khác":"Other","Đặc biệt":"Special"}
df["Size Group"] = df["Size Group"].replace(map_Size_Group)

# Age Group
replacements = {
    "đến": "To",
    "tuổi": "yrs old",
    "Trên": "Above",
    "Khác": "Other"
}
df["Age Group"] = df["Age Group"].astype(str)
df["Age Group"] = df["Age Group"].replace(replacements, regex=True)
df["Age Group"]

# Activity Group
df["Activity Group"].str.strip()
map_Activity_Group = {"Thường nhật/Trường học":"Daily/School","Văn phòng":"Office","Khác":"Other","Thể thao":"Sport","Chuyên biệt":"Specialized"}
df["Activity Group"] = df["Activity Group"].replace(map_Activity_Group)

# Copyright group
df["Copyright group"].str.strip()
map_Copyright_group = {"CƠ BẢN":"BASIC"}
df["Copyright group"] = df["Copyright group"].replace(map_Copyright_group)

# Image copyright
df["Image copyright"].str.strip()
map_Image_copyright = {"CƠ BẢN":"BASIC","KHÁC":"OTHER","KHÔNG":"NONE","KHÔNG CÓ":"NONE","VƯỜN RAU CỦA MẸ":"MOTHER'S VEGETABLE GARDEN","GẤU CHÚ TÂM":"Bear Mind","Gấu Chú Tâm":"Bear Mind"}
df["Image copyright"] = df["Image copyright"].replace(map_Image_copyright)

# Product Line
df["Product Line"].str.strip()
map_Product_Line = {"KHÁC":"OTHER","ÊMBRACE":"EMBRACE"}
df["Product Line"] = df["Product Line"].replace(map_Product_Line)