In [None]:
""" 
# ITS 2122 - Phase 3: Advanced Analytics (RFM Customer Segmentation)
**Author:** Sachintha Keshan  
**Date:** 2025-08-19  
**Purpose:** Implement the RFM (Recency, Frequency, Monetary) model to segment customers based on purchasing behavior, enabling data-driven marketing strategies.

## Overview
This notebook moves beyond descriptive analysis and focuses on customer segmentation using the RFM model:
- Calculate RFM metrics: Recency, Frequency, and Monetary values for each customer
- Assign RFM scores using quintile-based ranking
- Combine scores to generate RFM segments
- Map segments to descriptive categories (e.g., Champions, At-Risk, Loyal Customers)
- Provide insights for targeted marketing and customer retention strategies
"""

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

# Give every chart drawn below seaborn's "whitegrid" look.
sns.set(style="whitegrid")

# Read the cleaned export (per its file name, restricted to rows that carry a
# CustomerID) and parse InvoiceDate in the same pipeline. Values that cannot be
# parsed as dates become NaT instead of raising.
df = pd.read_csv(
    '../data/processed/online_retail_clean_with_customerids.csv',
    encoding='ISO-8859-1',
).assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate'], errors='coerce')
)

# Quick sanity check: row/column counts, then the first few records.
print("Dataset shape:", df.shape)
df.head()

In [None]:
# Reference point for Recency: the latest invoice timestamp in the data,
# pushed forward by exactly one day.
snapshot_date = pd.Timedelta(1, unit='D') + df['InvoiceDate'].max()
print("Snapshot Date:", snapshot_date)

In [None]:
# Build the per-customer RFM table: one row per CustomerID.
#
# TotalPrice (Quantity * UnitPrice) is otherwise only created by a later cell,
# so on a fresh kernel this cell would raise KeyError unless the CSV already
# ships that column. Derive it here when it is absent so the notebook survives
# Restart & Run All; an existing column is left untouched.
if 'TotalPrice' not in df.columns:
    df['TotalPrice'] = df['Quantity'] * df['UnitPrice']

# Named aggregation yields the final column names directly, so no in-place
# rename is needed afterwards.
rfm = (
    df.groupby('CustomerID')
    .agg(
        # Recency: whole days between the snapshot and the customer's last invoice.
        Recency=('InvoiceDate', lambda x: (snapshot_date - x.max()).days),
        # Frequency: number of distinct invoices.
        Frequency=('InvoiceNo', 'nunique'),
        # Monetary: summed line revenue.
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()
)

rfm.head()

In [None]:

# RFM scoring.
#
# This cell is self-contained: it derives TotalPrice, rebuilds the RFM table
# from `df`, and then scores it, so its result does not depend on whether the
# earlier aggregation cell was executed.


def _quintile_scores(values, labels, rank_first=False):
    """Bin ``values`` into equal-frequency groups and return integer scores.

    Parameters
    ----------
    values : pd.Series
        Metric to score. Must not contain NaN (the caller drops them first),
        otherwise the final integer cast fails.
    labels : list of int
        Score assigned to each bin, listed from the lowest-valued bin to the
        highest. The number of bins equals ``len(labels)``.
    rank_first : bool, default False
        When True, bin on ``values.rank(method='first')`` instead of the raw
        values. Ranks are unique, so heavily tied metrics can still be split
        into the requested number of bins.

    Returns
    -------
    pd.Series
        Integer scores aligned to ``values.index``.

    Notes
    -----
    ``pd.qcut`` raises ``ValueError`` when ties make two quantile edges equal;
    it does not return NaN in that case, so filling NaN afterwards cannot rescue
    it. When binning on raw values fails, this helper reports it and falls back
    to rank-based binning.
    """
    n_bins = len(labels)
    if not rank_first:
        try:
            return pd.qcut(values, n_bins, labels=labels).astype(int)
        except ValueError:
            print(
                f"qcut could not form {n_bins} distinct bins on raw "
                f"'{values.name}' values; binning on ranks instead"
            )
    return pd.qcut(values.rank(method='first'), n_bins, labels=labels).astype(int)


# Keep only rows that can be attributed to a customer and (re)compute line
# revenue. Using assign on the filtered frame keeps this step idempotent when
# the cell is re-run.
df = (
    df.dropna(subset=['CustomerID'])
    .assign(TotalPrice=lambda d: d['Quantity'] * d['UnitPrice'])
)

# Snapshot date: one day after the last invoice date.
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# Per-customer metrics; named aggregation sets the final column names.
rfm = (
    df.groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', lambda x: (snapshot_date - x.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()
)

# Customers whose invoice dates were all unparseable (NaT) end up with a NaN
# Recency; they cannot be scored, so report and drop them.
print("Missing values before dropping:\n", rfm.isna().sum())
rfm = rfm.dropna(subset=['Recency', 'Frequency', 'Monetary'])
print("After dropping NaN:", rfm.shape)

# Quintile scores, 1 (worst) to 5 (best).
# - Recency labels are reversed because fewer days since last purchase is better.
# - Frequency is always binned on ranks, exactly as before.
rfm = rfm.assign(
    R_Score=_quintile_scores(rfm['Recency'], labels=[5, 4, 3, 2, 1]),
    F_Score=_quintile_scores(rfm['Frequency'], labels=[1, 2, 3, 4, 5], rank_first=True),
    M_Score=_quintile_scores(rfm['Monetary'], labels=[1, 2, 3, 4, 5]),
)

# Combined views: a three-character code such as "545" and the summed score.
rfm = rfm.assign(
    RFM_Segment=(
        rfm['R_Score'].astype(str)
        + rfm['F_Score'].astype(str)
        + rfm['M_Score'].astype(str)
    ),
    RFM_Score=rfm[['R_Score', 'F_Score', 'M_Score']].sum(axis=1),
)

# Preview (bare expression so the notebook renders the table).
rfm.head(10)

In [None]:
def map_segment(row):
    """Label one customer with a business segment based on its RFM scores.

    ``row`` is any record that can be indexed by ``'R_Score'``, ``'F_Score'``
    and ``'M_Score'``. The rules are tried from top to bottom and the first
    one that matches decides the label; anything unmatched is ``'Others'``.
    """
    recency_score = row['R_Score']

    # High on all three dimensions.
    if recency_score >= 4 and row['F_Score'] >= 4 and row['M_Score'] >= 4:
        return 'Champions'

    # At least mid-range on both recency and frequency.
    if recency_score >= 3 and row['F_Score'] >= 3:
        return 'Loyal Customers'

    # High recency score but low frequency score.
    if recency_score >= 4 and row['F_Score'] <= 2:
        return 'New Customers'

    # Low recency score despite a high frequency score.
    if recency_score <= 2 and row['F_Score'] >= 4:
        return 'At-Risk Customers'

    return 'Others'

# Run the rule-based mapper over every customer row (axis=1) and store the label.
rfm['Segment'] = rfm.apply(func=map_segment, axis=1)

# Number of customers per segment.
rfm['Segment'].value_counts()

In [None]:
# Bar chart of customers per segment, with bars ordered by segment size.
plt.figure(figsize=(10, 6))
sns.countplot(
    data=rfm,
    x='Segment',
    order=rfm['Segment'].value_counts().index,
).set(
    title="Customer Segments (RFM)",
    xlabel="Segment",
    ylabel="Number of Customers",
)
plt.show()

In [None]:
# Persist the segmented table without the DataFrame index.
rfm.to_csv(
    path_or_buf="../data/processed/phase3_rfm_segments.csv",
    index=False,
)
print(" RFM segmentation saved ")