In [1]:
# ----------------------------------------
# Environment & Path Setup
# ----------------------------------------
# Puts the parent of the current working directory at the front of sys.path
# so that modules living there can be imported by later cells.

import sys
from pathlib import Path

# Path() is the current working directory, so this resolves to its parent.
python_dir = Path().resolve().parent
python_dir_str = str(python_dir)

# Re-running this cell used to push another copy of the same entry onto
# sys.path each time. Drop any existing copies first, then insert once at the
# front so the directory keeps top import priority.
sys.path[:] = [entry for entry in sys.path if entry != python_dir_str]
sys.path.insert(0, python_dir_str)

print("Python path configured:", python_dir)


Python path configured: D:\projects\online-shop-2024-analysis\python


In [2]:
# ----------------------------------------
# Database Connection
# ----------------------------------------
# Imports pandas (used by every query cell below) and builds the shared
# `engine` object that is passed to pd.read_sql / DataFrame.to_sql.

import pandas as pd
# Project-local helper; presumably found through the sys.path entry added in
# the setup cell - TODO confirm the package location.
from scripts.db_connection import get_mysql_engine

engine = get_mysql_engine()
# NOTE(review): this message only reports that get_mysql_engine() returned.
# Whether a connection has actually been opened depends on the helper - confirm.
print("Database engine ready")


Database engine ready


In [4]:
# Sanity check: how many customer_region rows fall into each region.
# The query has no ORDER BY, so row order is whatever the database returns.
region_count_query = """
SELECT region, COUNT(*) AS customer_count
FROM customer_region
GROUP BY region
"""

region_check = pd.read_sql(sql=region_count_query, con=engine)

region_check


Unnamed: 0,region,customer_count
0,North,4890
1,East,3290
2,South,1540
3,West,280


In [5]:
# Sanity check: revenue per region computed by the database itself, using an
# inner join of fact_sales to customer_region on customer_id.
region_revenue_query = """
SELECT
    cr.region,
    SUM(fs.revenue) AS total_revenue
FROM fact_sales fs
JOIN customer_region cr
    ON fs.customer_id = cr.customer_id
GROUP BY cr.region
"""

region_sales_check = pd.read_sql(sql=region_revenue_query, con=engine)

region_sales_check


Unnamed: 0,region,total_revenue
0,North,16823939.27
1,East,11259547.98
2,South,4985027.89
3,West,954610.58


In [8]:
# =====================================================
# Load Required Tables
# =====================================================
# Pulls each source table in full into its own DataFrame; the later cells
# build every dashboard dataset from these in-memory frames.


def _load(query):
    """Run `query` through the shared engine and return the rows as a DataFrame."""
    return pd.read_sql(query, engine)


fact_sales = _load("SELECT * FROM fact_sales")
dim_product = _load("SELECT * FROM dim_product")
dim_date = _load("SELECT * FROM dim_date")
customer_region = _load("SELECT * FROM customer_region")
# Note the table is customer_rfm_segments although the variable is customer_rfm.
customer_rfm = _load("SELECT * FROM customer_rfm_segments")

print("Tables loaded successfully")


Tables loaded successfully


In [9]:
# =====================================================
# Sales Trend Dataset (Year-Month Level)
# =====================================================
# One row per (year, month) with summed revenue and the number of distinct
# order_id values. year/month are presumably supplied by dim_date.

sales_trend = (
    fact_sales
    # validate="many_to_one" makes pandas raise MergeError if date_id is
    # repeated in dim_date; without it a duplicate key would silently copy
    # fact rows and inflate the totals below.
    .merge(dim_date, on="date_id", how="left", validate="many_to_one")
    # Fact rows without a matching date get NaN year/month and are excluded
    # by groupby's default dropna=True.
    .groupby(["year", "month"])
    .agg(
        total_revenue=("revenue", "sum"),
        total_orders=("order_id", "nunique")
    )
    .reset_index()
)

sales_trend.head()


Unnamed: 0,year,month,total_revenue,total_orders
0,2023,11,2167424.54,780
1,2023,12,2656485.05,974
2,2024,1,3043299.68,1047
3,2024,2,2527362.36,924
4,2024,3,3016162.11,1042


In [10]:
# =====================================================
# Product Performance Dataset
# =====================================================
# One row per product (id, name, category) with total quantity sold and
# total revenue.

product_performance = (
    fact_sales
    # validate="many_to_one" makes pandas raise MergeError if product_id is
    # repeated in dim_product; without it a duplicate key would silently copy
    # fact rows and inflate quantity and revenue.
    .merge(dim_product, on="product_id", how="left", validate="many_to_one")
    # Rows whose product_name or category is NaN (e.g. no match in
    # dim_product) are excluded by groupby's default dropna=True.
    .groupby(["product_id", "product_name", "category"])
    .agg(
        total_quantity=("quantity", "sum"),
        total_revenue=("revenue", "sum")
    )
    .reset_index()
)

product_performance.head()


Unnamed: 0,product_id,product_name,category,total_quantity,total_revenue
0,1,Office Chair,Furniture,2.0,280.2
1,2,Coffee Maker,Home & Kitchen,22.0,9258.98
2,3,Document Scanner,Electronics,40.0,15353.63
3,4,Desk Mat,Accessories,60.0,30000.1
4,5,Tablet Stand,Accessories,29.0,12823.29


In [11]:
# =====================================================
# Region-wise Sales Dataset
# =====================================================
# One row per region with summed revenue and the number of distinct order_id
# values.

region_sales = (
    fact_sales
    # validate="many_to_one" makes pandas raise MergeError if a customer_id
    # appears more than once in customer_region; without it such a customer's
    # sales would be counted once per region row.
    .merge(customer_region, on="customer_id", how="left", validate="many_to_one")
    # Sales for customers with no region row get NaN region and are excluded
    # by groupby's default dropna=True.
    .groupby("region")
    .agg(
        total_revenue=("revenue", "sum"),
        total_orders=("order_id", "nunique")
    )
    .reset_index()
)

region_sales.head()


Unnamed: 0,region,total_revenue,total_orders
0,East,11259547.98,3940
1,North,16823939.27,5871
2,South,4985027.89,1859
3,West,954610.58,330


In [12]:
# =====================================================
# Customer Segment Performance Dataset
# =====================================================
# One row per RFM segment with summed revenue and the number of distinct
# order_id values.

segment_sales = (
    fact_sales
    # validate="many_to_one" makes pandas raise MergeError if a customer_id
    # appears more than once in customer_rfm; without it such a customer's
    # sales would be counted once per segment row.
    .merge(customer_rfm, on="customer_id", how="left", validate="many_to_one")
    # Sales for customers with no segment get NaN Segment and are excluded by
    # groupby's default dropna=True.
    .groupby("Segment")
    .agg(
        total_revenue=("revenue", "sum"),
        total_orders=("order_id", "nunique")
    )
    .reset_index()
)

segment_sales.head()


Unnamed: 0,Segment,total_revenue,total_orders
0,About To Sleep,2275361.45,811
1,Champions,7217583.68,1891
2,Hibernating,413810.94,422
3,Lost,13344526.69,4502
4,Loyal Customers,4259873.61,1497


In [13]:
# =====================================================
# Customer Segment Performance Dataset
# =====================================================
# NOTE(review): this cell rebuilds segment_sales exactly as the cell directly
# above it does; one of the two can be deleted. It is kept here because this
# is the binding that the persist cell ends up writing.

segment_sales = (
    fact_sales
    # validate="many_to_one" makes pandas raise MergeError if a customer_id
    # appears more than once in customer_rfm; without it such a customer's
    # sales would be counted once per segment row.
    .merge(customer_rfm, on="customer_id", how="left", validate="many_to_one")
    # Sales for customers with no segment get NaN Segment and are excluded by
    # groupby's default dropna=True.
    .groupby("Segment")
    .agg(
        total_revenue=("revenue", "sum"),
        total_orders=("order_id", "nunique")
    )
    .reset_index()
)

segment_sales.head()


Unnamed: 0,Segment,total_revenue,total_orders
0,About To Sleep,2275361.45,811
1,Champions,7217583.68,1891
2,Hibernating,413810.94,422
3,Lost,13344526.69,4502
4,Loyal Customers,4259873.61,1497


In [14]:
# =====================================================
# Persist Dashboard Tables (NO LOGIC IN BI)
# =====================================================
# Writes each aggregated frame to its own table so the BI layer only reads
# finished numbers. if_exists="replace" drops and recreates a table that is
# already there; index=False keeps the DataFrame index out of the table.

dashboard_tables = [
    ("dash_sales_trend", sales_trend),
    ("dash_product_performance", product_performance),
    ("dash_region_sales", region_sales),
    ("dash_segment_sales", segment_sales),
]

# Written one after another in the listed order.
for table_name, frame in dashboard_tables:
    frame.to_sql(table_name, engine, if_exists="replace", index=False)

print("Dashboard tables successfully created in MySQL")


Dashboard tables successfully created in MySQL
