In [1]:
# ==========================================
# Import Required Libraries
# ==========================================
import os
import glob
import re
import pandas as pd


# ==========================================
# Function: load_product_data
# Purpose :
#   - Load raw product master data from multiple .xlsb files
#   - Extract applied year from filename
#   - Combine into a single DataFrame
# ==========================================
def load_product_data(input_folder: str) -> pd.DataFrame:
    """
    Load raw product master data from multiple .xlsb files.

    Parameters
    ----------
    input_folder : str
        Path to the folder containing product master .xlsb files.

    Returns
    -------
    pd.DataFrame
        Combined DataFrame with applied year column.
    """
    all_dfs = []

    # Step 1: Iterate through all .xlsb files in the input folder
    for file_path in glob.glob(os.path.join(input_folder, "product_master_*.xlsb")):
        # Extract year from filename
        year_match = re.search(r"(\d{4})", os.path.basename(file_path))
        year = year_match.group(1) if year_match else None

        # Read the "DANH MỤC" sheet
        temp_df = pd.read_excel(file_path, sheet_name="DANH MỤC", skiprows=1, engine="pyxlsb")
        temp_df["applied_year"] = year
        all_dfs.append(temp_df)

    # Step 2: Combine all DataFrames
    combined_df = pd.concat(all_dfs, ignore_index=True)

    return combined_df


# ==========================================
# Function: clean_product_data
# Purpose :
#   - Select relevant columns
#   - Standardize column names
#   - Clean and transform key fields (sku, color, price, etc.)
#   - Create price group categories
# ==========================================
def clean_product_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardize raw product master data.

    Parameters
    ----------
    df : pd.DataFrame
        Raw product master DataFrame.

    Returns
    -------
    pd.DataFrame
        Cleaned and standardized DataFrame.
    """
    # Step 1: Keep only relevant columns
    df = df[[
        "SKU", "Style color", "Style", "Color", "Color Group", "Listing Price",
        "Price Group", "Brand", "Gender", "Product Group", "Detailed Product Group",
        "Shoe Product", "Specialized Function", "Dedicated functions", "Size Group",
        "Size", "Size Range", "Age Group", "Activity Group", "Collection", "Copyright group",
        "Image copyright", "Product Line", "Design Inspiration", "Lifestyle Group", "Launch Season",
        "Similar group", "Heel Height", "applied_year"
    ]]

    # Step 2: Standardize column names
    df.columns = df.columns.str.lower().str.replace(" ", "_")

    # Step 3: Clean SKU
    df["sku"] = (
        df["sku"].apply(lambda x: str(x) + "00" if len(str(x)) < 14 else str(x))
        .astype(str)
        .str.upper()
    )

    # Step 4: Clean color
    df["color"] = (
        df["color"]
        .fillna(df["style_color"].astype(str).str[-3:])
        .replace(["", "0", 0], "000")
        .astype(str)
        .str.upper()
    )

    # Step 5: Clean color group
    df["color_group"] = (
        df["color_group"]
        .replace({
            "TRẮNG": "WHITE",
            "ĐEN": "BLACK",
            "ĐEN ": "BLACK",
            "SÁNG": "LIGHT",
            "TỐI": "DARK",
            "NHIỀU MÀU": "MULTI",
            "MÀU SẮC": "MULTI",
            "KHÁC": "OTHER",
        })
        .astype(str)
        .str.upper()
    )

    # Step 6: Clean listing price
    df["listing_price"] = df["listing_price"].fillna(0).astype(int)

    # Step 7: Create price group
    df["price_group"] = pd.cut(
        df["listing_price"],
        bins=[-float("inf"), 70_000, 100_000, 200_000, 300_000, 400_000,
              500_000, 600_000, 800_000, 900_000, 1_000_000, 1_200_000, float("inf")],
        labels=["<=70", "70<=100", "100<=200", "200<=300", "300<=400",
                "400<=500", "500<=600", "600<=800", "800<=900",
                "900<=1000", "1000<=1200", ">1200"]
    )

    # (Optional) Step 8: Handle specialized_function / dedicated_functions
    # Placeholder for additional cleaning logic

    return df


# ==========================================
# Main Script
# ==========================================
if __name__ == "__main__":
    # Define input folder
    input_folder = r"D:\footwear_retail_chain_project\0. input_data\product\raw_data"

    # Step 1: Load raw data
    product_df = load_product_data(input_folder)

    # Step 2: Clean data
    product_df = clean_product_data(product_df)

    # Step 3: Inspect cleaned DataFrame
    product_df.info()

    # Step 4: Export cleaned DataFrame to CSV
    output_path = r"D:\footwear_retail_chain_project\0. input_data\product\processed_data"
    output_file = os.path.join(output_path, "product_master.csv")

    product_df.to_csv(output_file, index=False)
    print(f"✅ Cleaned product data exported to: {output_file}")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["sku"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["color"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df["color_group"] = (
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the docume

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 392477 entries, 0 to 392476
Data columns (total 29 columns):
 #   Column                  Non-Null Count   Dtype   
---  ------                  --------------   -----   
 0   sku                     392477 non-null  object  
 1   style_color             392477 non-null  object  
 2   style                   392477 non-null  object  
 3   color                   392477 non-null  object  
 4   color_group             392477 non-null  object  
 5   listing_price           392477 non-null  int64   
 6   price_group             392477 non-null  category
 7   brand                   392477 non-null  object  
 8   gender                  392477 non-null  object  
 9   product_group           392477 non-null  object  
 10  detailed_product_group  392477 non-null  object  
 11  shoe_product            392477 non-null  object  
 12  specialized_function    392477 non-null  object  
 13  dedicated_functions     392392 non-null  object  
 14  size

In [2]:
# # Specialized Function
# df["Specialized Function"].str.strip()
# map_Specialized_Function = {"Hàng ngày":"Daily Use","Bóng đá":"Football","Đi bộ":"Walking","Chạy bộ":"Jogging","Y tế":"Medical","Công trình":"Construction","Khiêu vũ":"Dancing"}
# df["Specialized Function"] = df["Specialized Function"].replace(map_Specialized_Function)

# # Dedicated functions
# df["Dedicated functions"].str.strip()
# map_Dedicated_Function = {"Không có":"None","Sân đất nện":"Clay court","Sân cỏ nhân tạo":"Artificial turf","Giày đi bộ":"Walking shoes","20km":"20km","Sân futsal":"futsal court","40km":"40km","Sân Futsal":"futsal court","Giày bảo hộ":"Safety shoes","10km":"10km"}
# df["Dedicated functions"] = df["Dedicated functions"].replace(map_Dedicated_Function)

# # Size Group
# df["Size Group"].str.strip()
# map_Size_Group = {"Ngoại lệ":"Exception","Khung size chính":"Main size frame", "Khác":"Other","Đặc biệt":"Special"}
# df["Size Group"] = df["Size Group"].replace(map_Size_Group)

# # Age Group
# replacements = {
#     "đến": "To",
#     "tuổi": "yrs old",
#     "Trên": "Above",
#     "Khác": "Other"
# }
# df["Age Group"] = df["Age Group"].astype(str)
# df["Age Group"] = df["Age Group"].replace(replacements, regex=True)
# df["Age Group"]

# # Activity Group
# df["Activity Group"].str.strip()
# map_Activity_Group = {"Thường nhật/Trường học":"Daily/School","Văn phòng":"Office","Khác":"Other","Thể thao":"Sport","Chuyên biệt":"Specialized"}
# df["Activity Group"] = df["Activity Group"].replace(map_Activity_Group)

# # Copyright group
# df["Copyright group"].str.strip()
# map_Copyright_group = {"CƠ BẢN":"BASIC"}
# df["Copyright group"] = df["Copyright group"].replace(map_Copyright_group)

# # Image copyright
# df["Image copyright"].str.strip()
# map_Image_copyright = {"CƠ BẢN":"BASIC","KHÁC":"OTHER","KHÔNG":"NONE","KHÔNG CÓ":"NONE","VƯỜN RAU CỦA MẸ":"MOTHER'S VEGETABLE GARDEN","GẤU CHÚ TÂM":"Bear Mind","Gấu Chú Tâm":"Bear Mind"}
# df["Image copyright"] = df["Image copyright"].replace(map_Image_copyright)

# # Product Line
# df["Product Line"].str.strip()
# map_Product_Line = {"KHÁC":"OTHER","ÊMBRACE":"EMBRACE"}
# df["Product Line"] = df["Product Line"].replace(map_Product_Line)