In [0]:
# MACHINE LEARNING - Customer Segmentation using K-Means
# Purpose: Create data-driven customer segments

from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline
from pyspark.sql import functions as F

# Read the `gold_customer_metrics` table.
customer_data = spark.table("gold_customer_metrics")

# Columns fed to the clustering model.
features = ["recency_days", "frequency", "monetary_value"]

# Keep only the customer key plus the model inputs, and discard every row
# that has a null in any of those columns.
ml_data = customer_data.select(["customer_unique_id", *features]).dropna()

# Draw a seeded sample of roughly 10% of the rows to train on.
# NOTE(review): the original comment gave "reduce model size" as the reason;
# sampling shrinks the training set rather than the fitted model - confirm
# the intent.
ml_data_sampled = ml_data.sample(fraction=0.1, seed=42)

# count() is a Spark action: it evaluates the sample just to report its size.
sampled_customer_count = ml_data_sampled.count()
print(f"Training on {sampled_customer_count} customers (sampled)")

# Stage 1: pack the individual feature columns into a single vector column.
assembler = VectorAssembler(inputCols=features, outputCol="features_raw")

# Stage 2: standardise the assembled vector (centre on the mean, scale to
# unit standard deviation). Its input is wired to the assembler's output
# column instead of repeating the column name.
scaler = StandardScaler(
    withMean=True,
    withStd=True,
    inputCol=assembler.getOutputCol(),
    outputCol="features",
)

# Stage 3: K-Means with three clusters and a fixed seed, reading the scaled
# vector and writing the assigned cluster index to "ml_segment".
kmeans = KMeans(
    featuresCol=scaler.getOutputCol(),
    predictionCol="ml_segment",
    k=3,
    seed=42,
)

# Chain the three stages so they are fitted and applied as one unit.
pipeline = Pipeline(stages=[assembler, scaler, kmeans])

# Fit on the sample only, then assign a segment to every row of the full
# (non-sampled) data set.
print("Training K-Means model...")
model = pipeline.fit(ml_data_sampled)
predictions = model.transform(ml_data)

# Per-segment profile: how many customers fall in each cluster and the mean
# of each clustering input, listed in ascending segment order.
# Maps output column alias -> source column to average.
average_columns = {
    "avg_recency": "recency_days",
    "avg_frequency": "frequency",
    "avg_monetary": "monetary_value",
}

per_segment = predictions.groupBy("ml_segment")
segment_analysis = per_segment.agg(
    F.count("*").alias("customer_count"),
    *(F.avg(source).alias(alias) for alias, source in average_columns.items()),
).orderBy("ml_segment")

# Render the summary table in the notebook output.
display(segment_analysis)

# Persist the customer -> segment mapping as a Delta table, replacing any
# previous contents of that table.
segments_table = "gold_customer_segments_ml"

(
    predictions
    .select("customer_unique_id", "ml_segment")
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable(segments_table)
)

# The status glyphs were stored as mojibake (UTF-8 bytes decoded as cp1252).
# They are written as escapes so the source stays pure ASCII and cannot be
# re-mangled by another encoding round trip:
#   \u2713     -> check mark
#   \U0001F389 -> party popper
print(f"\u2713 Created {segments_table}")
print("\n\U0001F389 Machine Learning complete! Customer segments created.")