# Setup

In [None]:
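# NOTE: unused PySpark import kept for reference; the visible cells of this notebook use Snowpark instead.
# from pyspark.sql import functions as F, Window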

# 5.1 Customer Segmentation with RFM

In [None]:
# Local configuration (builds a Session from a TOML file; kept commented out)

# import toml
# from snowflake.snowpark import Session
# from snowflake.snowpark.functions import max, count, sum, datediff, col, current_date

# # 1. Connection name in the TOML file
# CONNECTION_NAME = "snowflake_connection"

# # 2. Read the TOML config
# TOML_FILE_PATH = "connections.toml"
# try:
#     config_data = toml.load(TOML_FILE_PATH)
#     connection_config_dict = config_data[CONNECTION_NAME] 
    
# except Exception as e:
#     print(f"Lỗi khi đọc file TOML: {e}")
#     exit()

# # 3. Create Session using .configs()
# # This method takes the entire configuration dictionary read in step 2
# try:
#     session = Session.builder.configs(connection_config_dict).create()
    
#     print("✅ Kết nối Snowpark thành công.")
#     print(f"Database: {session.get_current_database()}")

# except Exception as e:
#     print(f"❌ Lỗi kết nối Snowpark: {e}")

# ###
# # Customer RFM Segmentation using Snowpark
# ###
# # from snowflake.snowpark.session import Session



In [None]:
-- Set the role and database context, then preview ten rows of the customer table.
-- NOTE(review): an unquoted identifier containing a dot is unusual for a role
-- name; confirm that `tpch.developer` is the intended role.
USE ROLE tpch.developer;
USE DATABASE TPCH_ANALYTICS_DB;
SELECT *
FROM ANALYTICS.CUSTOMER_SILVER
LIMIT 10

In [None]:
from snowflake.snowpark.context import get_active_session

# Obtain the currently active Snowpark session.
session = get_active_session()

# Bind both source tables in a single statement.
customers, orders = (
    session.table(table_name)
    for table_name in ("ANALYTICS.CUSTOMER_SILVER", "ANALYTICS.ORDERS_SILVER")
)
print("Load completed")

# Preview ten customer rows.
customers.show(10)

In [None]:
# cache_result() does not cache in place: it returns a NEW DataFrame backed by
# a temporary table. Rebind the names so the following cells actually read
# from the cached copies instead of re-querying the base tables.
customers = customers.cache_result()
orders = orders.cache_result()

In [None]:
# NOTE: this import shadows Python's built-in `max` and `sum` in the notebook namespace.
from snowflake.snowpark.functions import max, count, sum, datediff, col, current_date

# Calculate RFM metrics: one row per customer with
#   LAST_ORDER_DATE - most recent order date
#   FREQUENCY       - number of orders
#   MONETARY        - total order value
#   RECENCY_DAYS    - days between LAST_ORDER_DATE and today
# The join is a LEFT join, so customers without orders are kept; for them
# FREQUENCY is 0 (count ignores NULLs) and the other metrics are NULL.
# NOTE(review): RECENCY_DAYS is relative to current_date(), so the values
# change from day to day.
rfm_df = (
    customers
    .join(orders, customers["C_CUSTKEY"] == orders["O_CUSTKEY"], "left")
    .group_by("C_CUSTKEY", "C_NAME")
    .agg([
        max("O_ORDERDATE").alias("LAST_ORDER_DATE"),
        count("O_ORDERKEY").alias("FREQUENCY"),
        sum("O_TOTALPRICE").alias("MONETARY")
    ])
    .with_column("RECENCY_DAYS", datediff("day", col("LAST_ORDER_DATE"), current_date()))
    # Materialise once. cache_result() returns a new DataFrame, so it must be
    # part of the assignment; the write, count and show below then reuse the
    # cached rows instead of re-running the join and aggregation each time.
    .cache_result()
)

# Save to table
rfm_df.write.mode("overwrite").save_as_table("ANALYTICS.CUSTOMER_RFM_SCORES")

print("✅ RFM Segmentation completed!")
print(f"   Total customers processed: {rfm_df.count()}")

# Show sample
rfm_df.show(10)

In [None]:
# Preview up to ten customers whose FREQUENCY is greater than zero.
(
    rfm_df
    .filter(col("FREQUENCY") > 0)
    .show(10)
)


# 5.2 Sales Trend Analysis

In [None]:
from snowflake.snowpark.functions import *

In [None]:
# Aggregate orders into one row per calendar month: order count, total
# revenue and average order value.
monthly_sales = (
    orders
    .with_column("MONTH", date_trunc("month", col("O_ORDERDATE")))
    .group_by("MONTH")
    .agg([
        count("O_ORDERKEY").alias("ORDER_COUNT"),
        sum("O_TOTALPRICE").alias("TOTAL_REVENUE"),
        avg("O_TOTALPRICE").alias("AVG_ORDER_VALUE")
    ])
    # cache_result() returns a new DataFrame, so it has to be part of the
    # chain/assignment to have any effect. It is applied BEFORE the sort
    # because reading a cached temp table back does not guarantee row order.
    .cache_result()
    .sort("MONTH")
)

# Convert to pandas for visualization (rows arrive ordered by MONTH).
df_pandas = monthly_sales.to_pandas()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Dual-axis chart built from df_pandas: monthly total revenue as bars on the
# left Y axis and average order value as a line on the right Y axis.

# Ensure MONTH is a datetime column so matplotlib uses a date X axis.
df_pandas['MONTH'] = pd.to_datetime(df_pandas['MONTH'])

fig, ax1 = plt.subplots(figsize=(12, 6))

# --- Primary Y axis: revenue as a bar chart ---
color_revenue = 'tab:blue'
ax1.set_xlabel('Tháng')
ax1.set_ylabel('Tổng Doanh Thu (TOTAL_REVENUE)', color=color_revenue)

ax1.bar(
    df_pandas['MONTH'],
    df_pandas['TOTAL_REVENUE'],
    color=color_revenue,
    label='Tổng Doanh Thu',
    width=20,  # in X-axis data units; on a date axis that means days
    zorder=1
)
ax1.tick_params(axis='y', labelcolor=color_revenue)
ax1.grid(True, axis='y')
ax1.patch.set_visible(False)

# --- Secondary Y axis (shares X with ax1): average order value as a line ---
ax2 = ax1.twinx()
color_avg = 'tab:red'
ax2.set_ylabel('Giá Trị Đơn Hàng TB (AVG_ORDER_VALUE)', color=color_avg)

ax2.plot(
    df_pandas['MONTH'],
    df_pandas['AVG_ORDER_VALUE'],
    color=color_avg,
    linestyle='-',
    marker='o',
    label='Giá Trị Đơn Hàng TB',
    zorder=3
)
ax2.tick_params(axis='y', labelcolor=color_avg)

# Both series carry a label, but a legend only appears when it is requested.
# Merge the entries from the two axes into one legend, drawn on the top axes.
bar_handles, bar_labels = ax1.get_legend_handles_labels()
line_handles, line_labels = ax2.get_legend_handles_labels()
ax2.legend(bar_handles + line_handles, bar_labels + line_labels, loc='upper left')

ax1.set_title('Doanh Thu Hàng Tháng (Cột) và Giá Trị Đơn Hàng TB (Đường)')
fig.autofmt_xdate()

plt.show()
# plt.savefig('revenue_vs_aov_fixed.png')

# end