In [1]:
# ==========================================
# Import Required Libraries
# ==========================================
import os
import glob
import numpy as np
import pandas as pd


# ==========================================
# Function: clean_retail_price_2022
# Purpose :
#   - Standardize and clean retail price data for 2022
#   - Convert all prices to VND
#   - Expand availability by month
#   - Reshape into long format
# ==========================================
def clean_retail_price_2022(path: str) -> pd.DataFrame:
    """
    Clean and standardize 2022 retail price data.

    Steps:
    - Read and concatenate all retail price Excel files.
    - Normalize dates and convert to monthly periods.
    - Convert all prices to VND.
    - Keep only the latest valid price per product/month.
    - Generate monthly availability flags for 2022.
    - Reshape into long format with product_id, retail_price_vnd, month.

    Parameters
    ----------
    path : str
        Directory containing retail_price_*_2022.xlsx files.

    Returns
    -------
    pd.DataFrame
        Cleaned retail price data.
    """
    # Step 1: Read and combine Excel files
    df = pd.concat(
        [
            pd.read_excel(file, usecols=[1, 2, 3, 6, 7])
            for file in glob.glob(os.path.join(path, "retail_price_*_2022.xlsx"))
        ],
        ignore_index=True,
    )
    df.columns = ["product_id", "retail_price", "currency", "valid_from", "valid_to"]

    # Step 2: Normalize dates → monthly period
    df[["valid_from", "valid_to"]] = df[["valid_from", "valid_to"]].replace(
        "31.12.9999", "31.12.2050"
    )
    df["valid_from"] = pd.to_datetime(
        df["valid_from"], format="%d.%m.%Y", errors="coerce"
    ).dt.to_period("M")
    df["valid_to"] = pd.to_datetime(
        df["valid_to"], format="%d.%m.%Y", errors="coerce"
    ).dt.to_period("M")

    # Step 3: Convert all prices to VND
    df["retail_price"] = pd.to_numeric(df["retail_price"], errors="coerce").fillna(0).astype("int32")
    df["retail_price_vnd"] = np.where(
        df["currency"] == "USD",
        np.round(df["retail_price"] * 25000),
        df["retail_price"]
    ).astype("int32")

    # Step 4: Keep only the latest updated price
    df = (
        df.sort_values(by=["product_id", "valid_from", "valid_to", "retail_price_vnd"])
          .drop_duplicates(subset=["product_id", "valid_from"], keep="last")
    )

    # Step 5: Generate availability flags (Jan–Dec 2022)
    months_2022 = pd.period_range("2022-01", "2022-12", freq="M")
    for month in months_2022:
        df[str(month)] = ((df["valid_from"] <= month) & (df["valid_to"] >= month)).astype("int16")

    # Step 6: Reshape to long format
    df = df.melt(
        id_vars=["product_id", "retail_price_vnd"],
        value_vars=[str(m) for m in months_2022],
        var_name="month",
        value_name="price_availability",
    )

    # Step 7: Keep only available prices
    df = (
        df[df["price_availability"] == 1]
        .drop(columns=["price_availability"])
        .drop_duplicates(subset=["product_id", "month"], keep="last")
    )

    # Step 8: Extract month number (1–12)
    df["month"] = df["month"].str[5:].astype("int16")

    return df


# ==========================================
# Function: join_sales_with_retail_price
# Purpose :
#   - Merge sales data with retail price
#   - Compute gross sales amount
# ==========================================
def join_sales_with_retail_price(sales_2022_path: str, retail_price_2022_path: str) -> pd.DataFrame:
    """
    Join 2022 sales data with cleaned retail price data.

    Parameters
    ----------
    sales_2022_path : str
        Path to sales CSV file.
    retail_price_2022_path : str
        Path to retail price directory.

    Returns
    -------
    pd.DataFrame
        Sales joined with retail price and gross sales amount.
    """
    # Step 1: Load sales data
    sales_2022 = pd.read_csv(sales_2022_path)
    sales_2022["THÁNG2"] = sales_2022["THÁNG"].astype(str).str[4:].astype(int)

    # Step 2: Load cleaned retail price data
    retail_price_2022 = clean_retail_price_2022(retail_price_2022_path)

    # Step 3: Merge sales with retail price
    join_data_2022 = pd.merge(
        sales_2022,
        retail_price_2022,
        left_on=["SKU", "THÁNG2"],
        right_on=["product_id", "month"],
        how="left",
    )

    # Step 4: Compute gross sales amount
    join_data_2022["retail_price_vnd"] = join_data_2022["retail_price_vnd"].fillna(0).astype("int32")
    join_data_2022["TGTT-GTT"] = join_data_2022["retail_price_vnd"] * join_data_2022["SLTT"]

    # Step 5: Select & rename columns
    join_data_2022 = join_data_2022[
        ["MÃ KHÁCH", "SKU", "SLTT", "TGTT-GTT", "TGTT-GV", "TGTT-GSCK", "NĂM", "THÁNG", "TUẦN", "retail_price_vnd"]
    ].rename(columns={"SKU": "MÃ HÀNG"})

    return join_data_2022


# ==========================================
# Execution
# ==========================================
if __name__ == "__main__":
    sales_2022_path = r"D:\footwear_retail_chain_project\0. input_data\sales\raw_data\sales_raw_2022.csv"
    retail_price_2022_path = r"D:\footwear_retail_chain_project\0. input_data\retail_price"

    join_data_2022 = join_sales_with_retail_price(sales_2022_path, retail_price_2022_path)
    print("✅ Sales joined with retail prices for 2022")
    print(join_data_2022.head())

  sales_2022 = pd.read_csv(sales_2022_path)


✅ Sales joined with retail prices for 2022
   MÃ KHÁCH         MÃ HÀNG  SLTT  TGTT-GTT  TGTT-GV  TGTT-GSCK   NĂM  \
0  60000003  DEB008500DEN34     4   1220000   829600     963800  2022   
1  60000003  DEB008500DEN35     4   1220000   829600     963800  2022   
2  60000003  DEB008500DEN36     4   1220000   829600     963800  2022   
3  60000003  DEB008500DEN37     4   1220000   829600     963800  2022   
4  60000003  DEB008500REU34     4   1220000   829600     963800  2022   

     THÁNG    TUẦN  retail_price_vnd  
0  2022001  202203            305000  
1  2022001  202203            305000  
2  2022001  202203            305000  
3  2022001  202203            305000  
4  2022001  202203            305000  


In [2]:
# ==========================================
# Recheck Retail Price Processing Results
# Product: DEB008500DEN34
# ==========================================


# ------------------------------------------
# Step 1: Load raw retail price data
# ------------------------------------------
path_price = r"D:\footwear_retail_chain_project\0. input_data\retail_price"

raw_price = pd.concat(
    [
        pd.read_excel(file, usecols=[1, 2, 3, 6, 7])
        for file in glob.glob(os.path.join(path_price, "retail_price_*_2022.xlsx"))
    ],
    ignore_index=True,
)
raw_price.columns = ["product_id", "retail_price", "currency", "valid_from", "valid_to"]

# Filter for specific product
raw_price = (
    raw_price[raw_price["product_id"] == "DEB008500DEN34"]
    .drop_duplicates(
        subset=["product_id", "valid_from", "valid_to", "retail_price"],
        keep="last",
    )
    .sort_values(by=["product_id", "valid_from", "valid_to", "retail_price"])
)

print("✅ Raw retail price data (DEB008500DEN34):")
display(raw_price)


# ------------------------------------------
# Step 2: Load processed sales–price data
# ------------------------------------------
processed_price = (
    join_data_2022[join_data_2022["MÃ HÀNG"] == "DEB008500DEN34"]
    .groupby(by=["MÃ HÀNG", "THÁNG"])
    .agg(average_price=("retail_price_vnd", "mean"))
    .reset_index()
    .sort_values(by=["THÁNG"])
)

print("✅ Processed retail price data (DEB008500DEN34):")
display(processed_price)

✅ Raw retail price data (DEB008500DEN34):


Unnamed: 0,product_id,retail_price,currency,valid_from,valid_to
3430629,DEB008500DEN34,305000.0,VND,01.01.2022,13.02.2022
2160319,DEB008500DEN34,305000.0,VND,01.01.2022,31.12.9999
4016607,DEB008500DEN34,305000.0,VND,01.01.2023,31.12.9999
5227614,DEB008500DEN34,325000.0,VND,01.01.2023,31.12.9999
1539489,DEB008500DEN34,280000.0,VND,03.02.2021,31.12.2021
2380083,DEB008500DEN34,299455.0,VND,14.02.2022,14.09.2022
4016606,DEB008500DEN34,299455.0,VND,14.02.2022,31.12.2022
5227613,DEB008500DEN34,319091.0,VND,15.09.2022,31.12.2022


✅ Processed retail price data (DEB008500DEN34):


Unnamed: 0,MÃ HÀNG,THÁNG,average_price
0,DEB008500DEN34,2022001,305000.0
1,DEB008500DEN34,2022002,299455.0
2,DEB008500DEN34,2022003,299455.0
3,DEB008500DEN34,2022004,299455.0
4,DEB008500DEN34,2022005,299455.0
5,DEB008500DEN34,2022006,299455.0
6,DEB008500DEN34,2022007,299455.0
7,DEB008500DEN34,2022008,299455.0
8,DEB008500DEN34,2022009,319091.0
9,DEB008500DEN34,2022010,319091.0


In [3]:
# ==========================================
# Compare Sales Results (2022 vs 2023 vs 2024)
# ==========================================

def summarize_sales(df: pd.DataFrame, year: int) -> None:
    """
    Print sales summary (quantity, gross amount, net amount).

    Parameters
    ----------
    df : pd.DataFrame
        Sales data for the year.
    year : int
        Year of the sales data.
    """
    sales_qty = df["SLTT"].sum()
    gross_amt = df["TGTT-GTT"].sum()
    net_amt = df["TGTT-GSCK"].sum()

    print(f"📊 Sales Summary {year}")
    print(f"   - Sales quantity : {sales_qty:,}")
    print(f"   - Gross amount   : {gross_amt:,}")
    print(f"   - Net amount     : {net_amt:,}")
    print("-" * 40)


# ------------------------------------------
# 2022 (already joined with retail price)
# ------------------------------------------
summarize_sales(join_data_2022, 2022)

# ------------------------------------------
# 2023
# ------------------------------------------
sales_2023 = pd.read_csv(
    r"D:\footwear_retail_chain_project\0. input_data\sales\raw_data\sales_raw_2023.csv"
)
summarize_sales(sales_2023, 2023)

# ------------------------------------------
# 2024
# ------------------------------------------
sales_2024 = pd.read_csv(
    r"D:\footwear_retail_chain_project\0. input_data\sales\raw_data\sales_raw_2024.csv"
)
summarize_sales(sales_2024, 2024)

📊 Sales Summary 2022
   - Sales quantity : 8,055,190
   - Gross amount   : 2,569,777,366,486
   - Net amount     : 2,277,043,458,782
----------------------------------------


  sales_2023 = pd.read_csv(


📊 Sales Summary 2023
   - Sales quantity : 6,386,469
   - Gross amount   : 2,114,829,142,804
   - Net amount     : 1,869,450,634,284
----------------------------------------


  sales_2024 = pd.read_csv(


📊 Sales Summary 2024
   - Sales quantity : 6,192,119
   - Gross amount   : 1,976,836,210,159
   - Net amount     : 1,748,603,616,677
----------------------------------------


In [4]:
# Export 2022 data to CSV
join_data_2022.to_csv(r'D:\footwear_retail_chain_project\0. input_data\sales\raw_data\sales_raw_2022_adjusted.csv')