In [None]:
"""
# ITS 2122 - Phase 4: Strategic Recommendations
**Author:** Sherul Dhanushka  
**Date:** 2025-08-25  
**Purpose:** Translate RFM customer segmentation results into actionable business strategies, 
with a focus on differentiating between wholesalers and retail customers.

## Overview
This notebook builds upon the RFM segmentation analysis and investigates the wholesaler hypothesis:
- Analyze the distribution of Monetary values per customer to detect skewness and potential wholesaler clusters
- Use visualizations (histograms, boxplots, log-transforms) to highlight differences between low-value retail customers and high-value wholesalers
- Provide strategic recommendations for wholesalers and retail segments
- Support marketing, customer relationship management, and inventory planning decisions with data-driven insights
"""


In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path

sns.set(style="whitegrid")

# Input files for this notebook (paths are relative to the notebook's folder).
RFM_PATH = Path("../data/processed/phase3_rfm_segments.csv")
DATA_PATH = Path("../data/processed/online_retail_clean_with_customerids.csv")

# Fail fast with an actionable message if either input is missing, instead of
# letting pd.read_csv raise a bare error further down.
if not RFM_PATH.exists():
    raise FileNotFoundError(f"{RFM_PATH} not found. Run Phase 3 first.")
if not DATA_PATH.exists():
    raise FileNotFoundError(f"{DATA_PATH} not found. Generate the cleaned dataset first.")

# RFM table written by Phase 3.
rfm = pd.read_csv(RFM_PATH)

# Cleaned transaction-level dataset, used for customer-level revenue checks.
df = pd.read_csv(DATA_PATH)

# Parse invoice timestamps; errors='coerce' turns unparseable values into NaT
# rather than raising.
df['InvoiceDate'] = pd.to_datetime(df['InvoiceDate'], errors='coerce')

print("RFM shape:", rfm.shape)
rfm.head()

In [None]:
# Per-segment profile: number of customers plus the mean Monetary, Frequency
# and Recency values, one row per Segment.
segment_summary = (
    rfm
    .groupby('Segment')
    .agg(
        Num_Customers=('CustomerID', 'count'),
        Avg_Spend=('Monetary', 'mean'),
        Avg_Orders=('Frequency', 'mean'),
        Avg_Recency_Days=('Recency', 'mean'),
    )
    .reset_index()
)

segment_summary


In [None]:
# Step 1: Total spend per customer (sum of TotalPrice over each CustomerID's rows)
customer_spend = df.groupby('CustomerID')['TotalPrice'].sum()

# Step 2: Visualize the distribution to spot wholesalers (high spenders).
# Only the 0-10,000 window is drawn, so the data is restricted to that window
# *before* binning. Binning the full series and zooming afterwards spreads the
# 100 bins across the whole spend range (stretched by a few very large
# accounts) and leaves only a handful of wide bars visible.
spend_zoom_max = 10000
spend_in_view = customer_spend[customer_spend.between(0, spend_zoom_max)]

fig, ax = plt.subplots(figsize=(10, 6))
sns.histplot(spend_in_view, bins=100, kde=True, ax=ax)
ax.set_xlim(0, spend_zoom_max)  # zoom in on most customers
ax.set_title("Distribution of Customer Spend (£)")
ax.set_xlabel("Total Spend")
ax.set_ylabel("Number of Customers")
plt.show()

In [None]:
# Heuristic cut-off: a customer whose total spend exceeds £5000 is treated as
# a likely wholesaler.
wholesaler_threshold = 5000

# Strictly above the cut-off -> wholesaler; at or below it -> retail.
is_wholesaler = customer_spend.gt(wholesaler_threshold)
is_retail = customer_spend.le(wholesaler_threshold)

wholesalers = customer_spend.loc[is_wholesaler]
retail_customers = customer_spend.loc[is_retail]

print(f"Wholesalers count: {len(wholesalers)}")
print(f"Retail customers count: {len(retail_customers)}")

In [None]:
# Attach each customer's total spend to their RFM row. Renaming the Series
# gives the merged column its name ("TotalSpend"). merge() defaults to an
# inner join, so only CustomerIDs present on both sides are kept.
rfm_with_spend = rfm.merge(customer_spend.rename("TotalSpend"), on="CustomerID")

# Tag wholesaler vs retail with a vectorized comparison instead of a
# row-by-row apply: strictly above the threshold -> 'Wholesaler', else 'Retail'.
rfm_with_spend['CustomerType'] = (
    rfm_with_spend['TotalSpend']
    .gt(wholesaler_threshold)
    .map({True: 'Wholesaler', False: 'Retail'})
)

# Compare average RFM metrics between the two customer types.
behavior_summary = (
    rfm_with_spend
    .groupby('CustomerType')
    .agg(
        Avg_Spend=('Monetary', 'mean'),
        Avg_Orders=('Frequency', 'mean'),
        Avg_Recency_Days=('Recency', 'mean'),
    )
    .reset_index()
)

behavior_summary

In [None]:
# Bar chart of how many customers carry each CustomerType tag; the bar order
# is fixed so Retail is always drawn first.
fig, ax = plt.subplots(figsize=(8, 5))
sns.countplot(
    data=rfm_with_spend,
    x='CustomerType',
    order=['Retail', 'Wholesaler'],
    ax=ax,
)
ax.set_title("Retail vs Wholesaler Count")
ax.set_ylabel("Number of Customers")
plt.show()

In [None]:
# Persist the Phase 4 tables next to the other processed datasets.
output_dir = Path("../data/processed")
# parents=True so the save also works when the intermediate folders are missing.
output_dir.mkdir(parents=True, exist_ok=True)

segment_summary.to_csv(output_dir / "phase4_segment_summary.csv", index=False)
rfm_with_spend.to_csv(output_dir / "phase4_rfm_with_spend.csv", index=False)

# Report the directory actually written to (the old message named "outputs/").
print(f"Phase 4 outputs saved in {output_dir} directory.")