In [None]:
# ==========================================
# Import Required Libraries
# ==========================================
import pandas as pd
import os

# ==========================================
# Function: sales_data_for_db
# Purpose :
#   - Prepare sales records for PostgreSQL database
#   - Standardize column names and dtypes
#   - Aggregate data at (customer, product, year, month, week) level
# ==========================================
def sales_data_for_db(sales_path: str) -> pd.DataFrame:
    """
    Prepare sales records for PostgreSQL database.

    Parameters
    ----------
    sales_path : str
        File path to the sales CSV file.

    Returns
    -------
    pd.DataFrame
        Cleaned and aggregated sales dataset with standardized column names
        and optimized data types.
    """

    # Step 1: Define columns to read
    sales_usecols = [
        "MÃ KHÁCH",
        "MÃ HÀNG",
        "SLTT",
        "TGTT-GTT",
        "TGTT-GV",
        "TGTT-GSCK",
        "NĂM",
        "THÁNG",
        "TUẦN",
    ]

    # Step 2: Load dataset with selected columns
    raw_sales_df = pd.read_csv(sales_path, usecols=sales_usecols)

    # Step 3: Rename columns to English-friendly names
    cleaned_sales_df = raw_sales_df.rename(
        columns={
            "MÃ KHÁCH": "customer_id",
            "MÃ HÀNG": "product_id",
            "SLTT": "sales_qty",
            "TGTT-GTT": "sales_gross_amt",
            "TGTT-GV": "sales_cogs_amt",
            "TGTT-GSCK": "sales_net_amt",
            "NĂM": "year",
            "THÁNG": "month",
            "TUẦN": "week",
        }
    )

    # Step 4: Keep only the last two digits of month/week
    cleaned_sales_df["month"] = cleaned_sales_df["month"].astype(str).str[-2:]
    cleaned_sales_df["week"] = cleaned_sales_df["week"].astype(str).str[-2:]

    # Step 5: Standardize data types
    cleaned_sales_df = cleaned_sales_df.astype(
        {
            "customer_id": "string",
            "product_id": "string",
            "sales_qty": "int16",
            "sales_gross_amt": "int32",
            "sales_cogs_amt": "int32",
            "sales_net_amt": "int32",
            "year": "int16",
            "month": "int8",
            "week": "int8",
        }
    )

    # Step 6: Aggregate sales data at (customer, product, year, month, week) level
    cleaned_sales_df = (
        cleaned_sales_df.groupby(
            ["customer_id", "product_id", "year", "month", "week"], as_index=False
        )
        .agg(
            {
                "sales_qty": "sum",
                "sales_gross_amt": "sum",
                "sales_cogs_amt": "sum",
                "sales_net_amt": "sum",
            }
        )
    )

    # Step 7: Sort records for better readability
    cleaned_sales_df = cleaned_sales_df.sort_values(
        by=["year", "month", "week", "customer_id", "product_id"],
        ascending=[False, False, False, True, True],
    ).reset_index(drop=True)

    return cleaned_sales_df

In [2]:
# ==========================================
# Run Sales Data Cleaning and Export Results
# ==========================================

# Define input file paths
sales_paths = {
    "2022": r"D:\footwear_retail_chain_project\0. input_data\sales\raw_data\sales_raw_2022_adjusted.csv",
    "2023": r"D:\footwear_retail_chain_project\0. input_data\sales\raw_data\sales_raw_2023.csv",
    "2024": r"D:\footwear_retail_chain_project\0. input_data\sales\raw_data\sales_raw_2024.csv",
    "2025": r"D:\footwear_retail_chain_project\0. input_data\sales\raw_data\sales_raw_2025.csv",
}

# Define output folder
output_dir = r"D:\footwear_retail_chain_project\0. input_data\sales\cleaned_data"

# Process each file and export cleaned results
for year, path in sales_paths.items():
    sales_df = sales_data_for_db(path)
    output_file = os.path.join(output_dir, f"sales_cleaned_{year}.csv")
    sales_df.to_csv(output_file, index=False)
    print(f"✅ Cleaned sales data for {year} saved to {output_file}")

  raw_sales_df = pd.read_csv(sales_path, usecols=sales_usecols)


✅ Cleaned sales data for 2022 saved to D:\footwear_retail_chain_project\0. input_data\sales\cleaned_data\sales_cleaned_2022.csv


  raw_sales_df = pd.read_csv(sales_path, usecols=sales_usecols)


✅ Cleaned sales data for 2023 saved to D:\footwear_retail_chain_project\0. input_data\sales\cleaned_data\sales_cleaned_2023.csv


  raw_sales_df = pd.read_csv(sales_path, usecols=sales_usecols)


✅ Cleaned sales data for 2024 saved to D:\footwear_retail_chain_project\0. input_data\sales\cleaned_data\sales_cleaned_2024.csv
✅ Cleaned sales data for 2025 saved to D:\footwear_retail_chain_project\0. input_data\sales\cleaned_data\sales_cleaned_2025.csv
