# In [None]:  (notebook cell marker left over from the export)

# 3. Customer Segmentation with RFM & K-Means
# **Goal:** Use RFM (Recency, Frequency, Monetary) analysis and K-Means clustering to segment customers.


import pandas as pd
import numpy as np
import datetime as dt
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
import seaborn as sns

# Read the cleaned transactions; InvoiceDate is parsed to datetimes so the
# recency arithmetic further down can subtract dates directly.
df = pd.read_csv(
    '../data/cleaned_retail.csv',
    parse_dates=['InvoiceDate'],
)
df.head()


# ### Calculate RFM Metrics


# Reference point for recency: one day after the most recent invoice in the data,
# so the most recent buyer ends up with a recency of at least 1 day.
snapshot_date = df['InvoiceDate'].max() + dt.timedelta(days=1)

# One row per customer:
#   Recency   -> whole days between the customer's latest invoice and snapshot_date
#   Frequency -> number of distinct invoices
#   Monetary  -> sum of TotalPrice over all of the customer's rows
rfm = (
    df.groupby('CustomerID')
    .agg({
        'InvoiceDate': lambda dates: (snapshot_date - dates.max()).days,
        'InvoiceNo': 'nunique',
        'TotalPrice': 'sum',
    })
    .rename(columns={
        'InvoiceDate': 'Recency',
        'InvoiceNo': 'Frequency',
        'TotalPrice': 'Monetary',
    })
)

rfm.head()


# ### Preprocess RFM Data for Clustering
# K-Means is sensitive to skewed data and differing scales. We will log-transform to reduce skewness and then scale the data.

# Log-transform the RFM features to reduce skew, then standardise them.
#
# The features are selected explicitly (instead of copying all of `rfm`) so that
# columns added later in this script ('Cluster', 'Segment') never reach the
# numeric transform if this section is re-run.
rfm_features = ['Recency', 'Frequency', 'Monetary']

# log(x + 1) is only finite for x > -1. A customer whose net 'Monetary' is -1 or
# lower (this can happen when the data contains refunds/credit notes) would
# produce -inf or NaN, which scikit-learn rejects with
# "Input X contains infinity or a value too large for dtype('float64')".
# Clipping at 0 treats such customers as having zero spend for clustering
# purposes; the original values in `rfm` are left untouched.
rfm_log = rfm[rfm_features].clip(lower=0).apply(np.log1p)

# Fail early with a clear message if the inputs themselves were NaN/inf,
# rather than letting the scaler raise a less specific error.
if not np.isfinite(rfm_log.to_numpy(dtype=float)).all():
    raise ValueError(
        "RFM features contain NaN or infinite values after the log transform; "
        "inspect 'rfm' before clustering."
    )

# Standardise each feature to zero mean / unit variance so that no single
# feature dominates the Euclidean distances used by K-Means.
scaler = StandardScaler()
rfm_scaled = pd.DataFrame(
    scaler.fit_transform(rfm_log),
    index=rfm_log.index,
    columns=rfm_log.columns,
)

# Determine Optimal Number of Clusters (Elbow Method)

# Fit one K-Means model per candidate k (1..10) and record its inertia.
inertia = {}
for k in range(1, 11):
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(rfm_scaled)
    inertia[k] = kmeans.inertia_

# Plot inertia against k; the "elbow" is where adding clusters stops paying off.
k_values = list(inertia)
fig, ax = plt.subplots(figsize=(8, 5))
ax.plot(k_values, [inertia[k_value] for k_value in k_values], marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
ax.set_xticks(k_values)
ax.grid(True)
fig.tight_layout()

# Write the figure to disk before showing it.
fig.savefig('../images/kmeans_elbow_plot.png')
plt.show()


# From the plot, `k=4` appears to be a good elbow point.


# Fit the final model with the k read off the elbow plot (k=4); the fixed
# random_state keeps the run reproducible for a given input.
kmeans = KMeans(n_clusters=4, random_state=42, n_init=10).fit(rfm_scaled)

# Attach each customer's cluster id to the untransformed RFM table so the
# clusters can be profiled in the original units.
rfm['Cluster'] = kmeans.labels_

rfm.head()


# Analyze the Customer Segments


# Profile each cluster by its average Recency, Frequency and Monetary value
# (in the original, untransformed units), rounded to two decimals.
cluster_analysis = (
    rfm.groupby('Cluster')[['Recency', 'Frequency', 'Monetary']]
    .mean()
    .round(2)
)

print(cluster_analysis)


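# **Interpretation** (Recency is days since the last purchase, so *low* Recency means a *recent* buyer):
# - **Cluster 0:** Low Recency, High Frequency, High Monetary -> **Champions/Best Customers**
# - **Cluster 1:** High Recency, Low Frequency, Low Monetary -> **At-Risk/Lost Customers**
# - **Cluster 2:** Mid Recency, Low Frequency, Low Monetary -> **Potential/New Customers**
# - **Cluster 3:** Low Recency, Low Frequency, Low Monetary -> **Loyal/Recent Customers**
# Note: K-Means cluster numbers are arbitrary labels. Check each description against the printed
# `cluster_analysis` table whenever the data or the model settings change.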


# Translate numeric cluster ids into business-friendly segment names.
# The id -> name pairing is hard-coded and only valid for the cluster
# numbering produced by the model fitted above.
segment_names = {
    0: 'Champions',
    1: 'At-Risk',
    2: 'Potential',
    3: 'Loyal',
}
rfm['Segment'] = rfm['Cluster'].map(segment_names)

rfm.head()



# Visualize the Segments

# Scatter of Recency vs. Monetary, coloured by segment. Both axes use a log
# scale because the raw values are heavily skewed.
fig, ax = plt.subplots(figsize=(10, 8))
sns.scatterplot(
    data=rfm,
    x='Recency',
    y='Monetary',
    hue='Segment',
    palette='bright',
    s=60,
    ax=ax,
)
ax.set_title('Customer Segments by Recency and Monetary Value')
ax.set_yscale('log')
ax.set_xscale('log')
fig.tight_layout()

# Write the figure to disk before showing it.
fig.savefig('../images/customer_segments.png')
plt.show()



# Persist the per-customer RFM table (index included) for downstream use.
segments_csv_path = '../data/customer_segments.csv'
rfm.to_csv(segments_csv_path)
print("\nSegmented customer data saved to 'customer_segments.csv'.")

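# Error output captured from a notebook run:
#   ValueError: Input X contains infinity or a value too large for dtype('float64').
# NOTE(review): presumably raised when non-finite values from the log transform reach
# StandardScaler / KMeans (log(x + 1) is -inf at x == -1) - confirm against the data.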