In [None]:
import pandas as pd

FILE_PATH = r"C:\Users\tkorz\OneDrive\Documents\revenue-analysis-case-study\data\processed\case_study_cleaned.xlsx"


def _with_surrogate_key(frame, key_name):
    """Return de-duplicated rows of *frame* plus a 1-based surrogate key column.

    Rows keep their first-seen order; the key is simply the row position + 1.
    """
    keyed = frame.drop_duplicates().reset_index(drop=True)
    keyed[key_name] = keyed.index + 1
    return keyed


def build_star_schema(rr_customer, rr_waterfall):
    """Build the star-schema dimension and fact tables.

    Args:
        rr_customer: DataFrame with columns ``company``, ``customer``,
            ``date`` and ``revenue``.
        rr_waterfall: DataFrame with columns ``company``, ``customer``,
            ``date``, ``category`` and ``revenue``.

    Returns:
        A dict mapping table name to DataFrame, in this order:
        ``dim_company``, ``dim_customer``, ``dim_date``, ``dim_category``,
        ``fact_recurring_revenue``, ``fact_waterfall``.

    The inputs are not modified; ``date`` is coerced with ``pd.to_datetime``
    on copies. All lookups are left joins, so a fact row whose value is
    missing from a dimension gets a NaN key rather than being dropped.
    """
    # Ensure date format (on copies, so the caller's frames are untouched)
    rr_customer = rr_customer.assign(date=pd.to_datetime(rr_customer["date"]))
    rr_waterfall = rr_waterfall.assign(date=pd.to_datetime(rr_waterfall["date"]))

    # DIM COMPANY
    dim_company = _with_surrogate_key(rr_customer[["company"]], "company_key")

    # DIM CUSTOMER (one row per customer/company pair)
    dim_customer = (
        rr_customer[["customer", "company"]]
        .drop_duplicates()
        .reset_index(drop=True)
        .merge(dim_company, on="company", how="left")
    )
    dim_customer["customer_key"] = dim_customer.index + 1

    # DIM DATE (combine all unique dates from both tables)
    all_dates = (
        pd.concat([rr_customer["date"], rr_waterfall["date"]])
        .drop_duplicates()
        .sort_values()
        .reset_index(drop=True)
    )
    dim_date = pd.DataFrame({"date": all_dates})
    dim_date["date_key"] = dim_date.index + 1
    dim_date["year"] = dim_date["date"].dt.year
    dim_date["month"] = dim_date["date"].dt.month
    dim_date["month_name"] = dim_date["date"].dt.month_name()
    dim_date["quarter"] = dim_date["date"].dt.quarter

    # DIM CATEGORY
    dim_category = _with_surrogate_key(rr_waterfall[["category"]], "category_key")

    # dim_customer is unique on (customer, company), so the lookup must use
    # both columns. Joining on "customer" alone duplicates fact rows whenever
    # the same customer name exists under more than one company.
    customer_lookup = dim_customer[["customer", "company", "customer_key"]]
    customer_join_cols = ["customer", "company"]
    date_lookup = dim_date[["date", "date_key"]]

    # FACT RECURRING REVENUE
    fact_recurring_revenue = (
        rr_customer
        .merge(dim_company, on="company", how="left")
        .merge(customer_lookup, on=customer_join_cols, how="left")
        .merge(date_lookup, on="date", how="left")
    )
    fact_recurring_revenue = fact_recurring_revenue[
        ["company_key", "customer_key", "date_key", "revenue"]
    ]

    # FACT WATERFALL
    fact_waterfall = (
        rr_waterfall
        .merge(dim_company, on="company", how="left")
        .merge(customer_lookup, on=customer_join_cols, how="left")
        .merge(date_lookup, on="date", how="left")
        .merge(dim_category, on="category", how="left")
    )
    fact_waterfall = fact_waterfall[
        ["company_key", "customer_key", "date_key", "category_key", "revenue"]
    ]

    return {
        "dim_company": dim_company,
        "dim_customer": dim_customer,
        "dim_date": dim_date,
        "dim_category": dim_category,
        "fact_recurring_revenue": fact_recurring_revenue,
        "fact_waterfall": fact_waterfall,
    }


# Guarded so that importing this file does not read the workbook or write
# CSVs; when run as a script or notebook cell the names below are still
# created at module level, exactly as before.
if __name__ == "__main__":
    # Load data
    rr_customer = pd.read_excel(FILE_PATH, sheet_name="Recurring Revenue by Customer")
    rr_waterfall = pd.read_excel(FILE_PATH, sheet_name="Recurring Revenue Waterfall")

    # Ensure date format
    rr_customer["date"] = pd.to_datetime(rr_customer["date"])
    rr_waterfall["date"] = pd.to_datetime(rr_waterfall["date"])

    star_schema = build_star_schema(rr_customer, rr_waterfall)
    dim_company = star_schema["dim_company"]
    dim_customer = star_schema["dim_customer"]
    dim_date = star_schema["dim_date"]
    dim_category = star_schema["dim_category"]
    fact_recurring_revenue = star_schema["fact_recurring_revenue"]
    fact_waterfall = star_schema["fact_waterfall"]

    # EXPORT FOR POWER BI (one CSV per table, named after the table)
    for table_name, table in star_schema.items():
        table.to_csv(f"{table_name}.csv", index=False)

    print("Star schema tables created successfully.")

    # Show output samples
    for table_name, table in star_schema.items():
        print(f"\nSample {table_name}:")
        print(table.head())