In [1]:
# Define a function to clean sales records for SQL database
# Data schema:
    # customer_id (str), 
    # product_id (str), 
    # sales_qty (int16),
    # sales_gross_amt (int32),
    # sales_cogs_amt (int32),
    # sales_net_amt (int32),
    # year (int16), 
    # month (int8), 
    # week (int8)

In [2]:
import pandas as pd
import numpy as np
import polars as pl

def sales_data_for_db(sales_path):

    # Determine columns to be used
    sales_usecols = [
        'MÃ KHÁCH',
        'MÃ HÀNG',
        'SLTT',
        'TGTT-GTT',
        'TGTT-GV',
        'TGTT-GSCK',
        'NĂM',
        'THÁNG',
        'TUẦN'
        ]

    # Read the sales data and rename columns
    raw_sales_df = pd.read_csv(sales_path, usecols=sales_usecols)
    cleaned_sales_df = raw_sales_df.rename(
        columns={
            'MÃ KHÁCH': 'customer_id',
            'MÃ HÀNG': 'product_id',
            'SLTT': 'sales_qty',
            'TGTT-GTT': 'sales_gross_amt',
            'TGTT-GV': 'sales_cogs_amt',
            'TGTT-GSCK': 'sales_net_amt',
            'NĂM': 'year',
            'THÁNG': 'month',
            'TUẦN': 'week'
        })

    # Extract the last two digits of month and week
    cleaned_sales_df['month'] = cleaned_sales_df['month'].astype(str).str[-2:]
    cleaned_sales_df['week'] = cleaned_sales_df['week'].astype(str).str[-2:]

    # Change data types
    cleaned_sales_df = cleaned_sales_df.astype({
        'customer_id': 'str',
        'product_id': 'str',
        'sales_qty': 'int16',
        'sales_gross_amt': 'int32',
        'sales_cogs_amt': 'int32',
        'sales_net_amt': 'int32',
        'year': 'int16',
        'month': 'int8',
        'week': 'int8'
    })

    # Group by customer_id, product_id, year, month, and week
    cleaned_sales_df = cleaned_sales_df.groupby(
        ['customer_id', 'product_id', 'year', 'month', 'week'],
        as_index=False
    ).agg({
        'sales_qty': 'sum',
        'sales_gross_amt': 'sum',
        'sales_cogs_amt': 'sum',
        'sales_net_amt': 'sum'
    })

    # Sort by customer_id, product_id, year, month, and week
    cleaned_sales_df = cleaned_sales_df.sort_values(
        by=['year', 'month', 'week', 'customer_id', 'product_id'],
        ascending=[False, False, False, True, True]
    ).reset_index(drop=True)

    return cleaned_sales_df

In [3]:
# Run the function
sales_path_2022 = "D:\\footwear_retail_chain_project\\0. input_data\\sales\\raw_data\\sales_raw_2022_adjusted.csv"
sales_path_2023 = "D:\\footwear_retail_chain_project\\0. input_data\\sales\\raw_data\\sales_raw_2023.csv"
sales_path_2024 = "D:\\footwear_retail_chain_project\\0. input_data\\sales\\raw_data\\sales_raw_2024.csv"

sales_2022 = sales_data_for_db(sales_path_2022)
sales_2023 = sales_data_for_db(sales_path_2023)
sales_2024 = sales_data_for_db(sales_path_2024)

# Export to CSV
sales_2022.to_csv("D:\\footwear_retail_chain_project\\0. input_data\\sales\\cleaned_data\\sales_cleaned_2022.csv", index=False)
sales_2023.to_csv("D:\\footwear_retail_chain_project\\0. input_data\\sales\\cleaned_data\\sales_cleaned_2023.csv", index=False)
sales_2024.to_csv("D:\\footwear_retail_chain_project\\0. input_data\\sales\\cleaned_data\\sales_cleaned_2024.csv", index=False)

  raw_sales_df = pd.read_csv(sales_path, usecols=sales_usecols)
  raw_sales_df = pd.read_csv(sales_path, usecols=sales_usecols)
  raw_sales_df = pd.read_csv(sales_path, usecols=sales_usecols)
