In [1]:
# ==========================================
# Import Required Libraries
# ==========================================
import os
import glob
import re
import pandas as pd


# ==========================================
# Function: load_distribution_data
# Purpose :
#   - Read all distribution master Excel files across years
#   - Extract year information from filenames
#   - Concatenate into a single DataFrame
# ==========================================
def load_distribution_data(input_folder: str) -> pd.DataFrame:
    """
    Load and combine distribution master data across multiple years.

    Every file in ``input_folder`` matching ``distribution_master_*.xlsx``
    is read from sheet "1.Data dữ liệu" (first row skipped), tagged with
    the year found in its filename, and appended to one DataFrame.

    Parameters
    ----------
    input_folder : str
        Path to the folder containing raw distribution master Excel files.

    Returns
    -------
    pd.DataFrame
        Concatenated DataFrame containing all years' data,
        with an additional column 'applied_year' (the four-digit year as
        a string, or None when the filename contains no four-digit run).

    Raises
    ------
    ValueError
        If no file in ``input_folder`` matches the expected naming pattern.
    """

    pattern = os.path.join(input_folder, "distribution_master_*.xlsx")

    # glob returns paths in arbitrary, OS-dependent order; sorting keeps the
    # row order of the combined frame reproducible between runs.
    file_paths = sorted(glob.glob(pattern))

    # Fail early with an actionable message instead of letting pd.concat
    # raise its generic "No objects to concatenate" error.
    if not file_paths:
        raise ValueError(
            f"No files matching 'distribution_master_*.xlsx' found in: {input_folder}"
        )

    all_dfs = []
    for file_path in file_paths:
        # Extract year from filename (e.g., distribution_master_2021.xlsx → "2021")
        year_match = re.search(r"(\d{4})", os.path.basename(file_path))
        year = year_match.group(1) if year_match else None

        # Load sheet "1.Data dữ liệu", skip first row (header starts from 2nd row)
        temp_df = pd.read_excel(file_path, sheet_name="1.Data dữ liệu", skiprows=1)

        # Tag every row with the year the file applies to
        temp_df["applied_year"] = year
        all_dfs.append(temp_df)

    # Stack all yearly frames into one, renumbering the index
    return pd.concat(all_dfs, ignore_index=True)


# ==========================================
# Function: clean_distribution_data
# Purpose :
#   - Clean and standardize raw distribution data
#   - Drop irrelevant columns
#   - Standardize column names and values
#   - Handle missing values
#   - Create active store flag
# ==========================================
def clean_distribution_data(df: pd.DataFrame) -> pd.DataFrame:
    """
    Clean and standardize raw distribution data.

    The input frame is never modified; all work happens on a copy.
    Every step tolerates missing columns: a step whose source column is
    absent is skipped instead of raising ``KeyError``.

    Processing steps:
      1. Drop "Unnamed*" columns and a fixed list of irrelevant columns.
      2. Rename the remaining raw headers to snake_case names.
      3-4. Translate raw codes to standardized values. Mappings whose
         targets are all numeric are applied with ``replace`` (unmapped
         values are kept); all others use ``map`` (unmapped values
         become NaN).
      5. Fill missing values with per-column defaults; a missing
         ``store_code`` falls back to ``customer_code``.
      6-7. Derive ``active_store`` (0 when ``close_year`` is set, else 1)
         and drop ``close_year``. Skipped when ``close_year`` is absent.
      8. Cast selected columns to their target dtypes (best effort).

    Parameters
    ----------
    df : pd.DataFrame
        Raw distribution master DataFrame.

    Returns
    -------
    pd.DataFrame
        Cleaned and standardized DataFrame.
    """

    # Step 1: Drop unnecessary columns
    # astype(str) keeps the filter safe for non-string column labels;
    # .copy() guarantees later column assignments never touch the caller's
    # frame or a view of it.
    unnamed_mask = df.columns.astype(str).str.startswith("Unnamed")
    df = df.loc[:, ~unnamed_mask].copy()
    df = df.drop(
        columns=[
            "Address 1", "Address 2", "Address 3", "Business forms",
            "Tháng mở", "Năm mở", "Tháng đóng", "Note"
        ],
        errors="ignore"  # avoid errors if columns are missing
    )

    # Step 2: Standardize column names
    # ("Custmer code" is spelled this way on purpose: it is the raw header key.)
    df = df.rename(columns={
        "Custmer code": "customer_code",
        "Site store": "store_code",
        "B2B/B2C": "b2b_b2c",
        "Channel": "channel",
        "Region": "region",
        "City Level": "city_level",
        "Store Concept": "store_concept",
        "Store Level": "store_level",
        "Trade term": "trade_term",
        "Area group": "area_group",
        "Store type": "store_type",
        "Urbanization": "urbanization",
        "Customer Name": "customer_name",
        "Branch": "branch",
        "Show room area": "showroom_area",
        "Warehouse area": "warehouse_area",
        "Năm đóng": "close_year"
    })

    # Step 3: Define value mappings
    # NOTE(review): several mappings carry a "" key, but blank cells may
    # arrive as NaN rather than "" depending on how the file was read;
    # for columns without a Step 5 default such blanks then stay NaN.
    # Confirm whether they should become "other".
    mappings = {
        "b2b_b2c": {"B2B": 0, "B2C": 1},
        "channel": {
            "CHTT": "retail_store", "ONLINE": "online",
            "TGPP": "distribution_intermediary", "ST": "supermarket"
        },
        "region": {
            "KVMN": "southern_area", "KVMB": "northern_area",
            "KVMT": "western_area", "KVMTR": "central_area",
            "KVTN": "highland_area", "KVCA": "cambodia_area",
            "Trung Quốc": "china_area", "Khác": "other"
        },
        "city_level": {
            "Cấp 1": "level_1", "Cấp 2": "level_2",
            "Cấp TW": "central_level", "Còn lại": "other",
            "Khác": "other", "": "other"
        },
        "store_concept": {
            "Biti's": "regular_store", "Biti’s": "regular_store",
            "Biti’s Hunter": "specialized_product_store",
            "Biti’s Premium": "premium_store", "BITI'S WEB": "online_store",
            "CHC": "standard_store", "CHCD": "specialized_business_store",
            "DL": "agency", "Outlet": "outlet", "Siêu thị": "supermarket",
            "TMDT": "ecommerce_platform", "Khác": "other", "": "other"
        },
        "store_level": {
            "TIÊU BIỂU": "best_store", "A+": "A+", "A": "A", "B": "B", "C": "C",
            "N": "new_store", "Closed": "other", "NA": "other", "NA ": "other"
        },
        "trade_term": {
            "Drop-ship": "drop_ship",
            "Mua đứt bán đoạn": "buy_and_sell",
            "Ký gửi": "consignment"
        },
        "area_group": {
            "<= 50 m2": "<=50m2", "50 m2 < và <= 100 m2": "50-100m2",
            "100 m2 < và <= 150 m2": "100-150m2", "150 m2 < và <= 200 m2": "150-200m2",
            "> 200 m2": ">200m2", "Closed": "other", "NA": "other", "NA ": "other"
        },
        "store_type": {
            "Siêu thị": "supermarket", "Độc lâp": "independent_store",
            "Độc lập": "independent_store", "Doc lap": "independent_store",
            "Store in Store": "store_in_store", "Store In Store": "store_in_store",
            "Online": "online", "Trong mall": "shopping_mall",
            "Pop - up": "pop_up", "POP-UP": "pop_up",
            "Khác": "other", "": "other"
        },
        "urbanization": {
            "TT hành chính kinh tế": "downtown", "Nông thôn": "rural",
            "Nội thành": "urban", "Khác": "other", "": "other"
        },
        "branch": {
            "CNMB": "northern_branch", "CNMTR-TN": "central_highland_branch",
            "CNMN": "southern_branch", "CNMT": "western_branch",
            "CNTP": "pioneer_branch", "CNLC": "lao_cai_branch",
            "CNCA": "cambodia_branch"
        },
        "showroom_area": {"Closed": 0, "NA": 0},
        "warehouse_area": {"Closed": 0, "NA": 0},
    }

    # Step 4: Apply value mappings
    for col, mapping in mappings.items():
        if col not in df.columns:
            continue
        if all(isinstance(v, (int, float)) for v in mapping.values()):
            # Numeric targets: only the listed sentinels are swapped,
            # every other value in the column is kept untouched.
            df[col] = df[col].replace(mapping)
        else:
            # Categorical targets: values missing from the mapping become
            # NaN and are (for some columns) defaulted in Step 5.
            df[col] = df[col].map(mapping)

    # Step 5: Handle missing values
    # DataFrame.fillna ignores dict keys that are not columns of the frame.
    fill_na = {
        "channel": "partner",
        "store_concept": "other",
        "showroom_area": 0,
        "warehouse_area": 0,
        "area_group": "other",
        "store_level": "other",
        "customer_name": "unknown"
    }
    df = df.fillna(value=fill_na)

    # For store_code: if NA → take customer_code (only when both exist)
    if "store_code" in df.columns and "customer_code" in df.columns:
        df["store_code"] = df["store_code"].fillna(df["customer_code"])

    # Step 6: Create active store flag (a recorded close year means closed)
    # Step 7: Drop the "close_year" column once the flag is derived
    # Without close_year there is nothing to derive the flag from, so the
    # column is not created rather than filled with a guessed value.
    if "close_year" in df.columns:
        df["active_store"] = df["close_year"].notna().map({True: 0, False: 1})
        df = df.drop(columns=["close_year"])

    # Step 8: Align data types
    # errors="ignore" leaves a column unchanged when it cannot be cast.
    dtype_mappings = {
        "store_code": int,
        "b2b_b2c": int,
        "showroom_area": float,
        "warehouse_area": float,
        "applied_year": int,
        "active_store": int
    }
    for col, dtype in dtype_mappings.items():
        if col in df.columns:
            df[col] = df[col].astype(dtype, errors="ignore")

    return df


# ==========================================
# Main Script
# ==========================================
if __name__ == "__main__":
    # Script entry point: load the raw yearly Excel files, clean them,
    # print a summary, and export the result as a single CSV.

    # Define input folder
    input_folder = r"D:\footwear_retail_chain_project\0. input_data\distribution\raw_data"

    # Step 1: Load raw distribution data
    distribution_df = load_distribution_data(input_folder)

    # Step 2: Clean data
    distribution_df = clean_distribution_data(distribution_df)

    # Step 3: Inspect cleaned DataFrame
    distribution_df.info()

    # ==========================================
    # Export Cleaned DataFrame to CSV
    # ==========================================

    # Define output directory and file path
    output_path = r"D:\footwear_retail_chain_project\0. input_data\distribution\processed_data"

    # to_csv does not create missing parent folders, so make sure the
    # target directory exists before writing.
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, "distribution_master.csv")

    # Save DataFrame to CSV
    distribution_df.to_csv(output_file, index=False)
    print(f"✅ Cleaned distribution data exported to: {output_file}")

  warn(msg)
  warn(msg)
  warn(msg)


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13441 entries, 0 to 13440
Data columns (total 18 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   customer_code   13441 non-null  object 
 1   store_code      13441 non-null  int64  
 2   b2b_b2c         13441 non-null  int64  
 3   channel         13441 non-null  object 
 4   region          13441 non-null  object 
 5   city_level      13441 non-null  object 
 6   store_concept   13441 non-null  object 
 7   store_level     13441 non-null  object 
 8   trade_term      13441 non-null  object 
 9   area_group      13441 non-null  object 
 10  store_type      13441 non-null  object 
 11  urbanization    13441 non-null  object 
 12  customer_name   13441 non-null  object 
 13  branch          13441 non-null  object 
 14  showroom_area   13441 non-null  float64
 15  warehouse_area  13441 non-null  float64
 16  applied_year    13441 non-null  int64  
 17  active_store    13441 non-null  int64  

  warn(msg)
  df[col] = df[col].replace(mapping)
  df["store_code"] = df["store_code"].fillna(df["customer_code"])
