# 02 Modeling and Analysis
## Online Retail Analysis Project

This notebook covers:
1. Customer Segmentation using RFM Analysis and K-Means Clustering
2. Product Recommendation System
3. Customer Purchase Patterns (sales forecasting is not implemented in this notebook)
4. Business Insights and Recommendations

## 1. Import Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity
import warnings
# Silence all Python warnings for the rest of the session so cell output stays readable.
warnings.filterwarnings('ignore')

# Set visualization style
# Matplotlib 3.6 renamed its bundled seaborn styles with a 'seaborn-v0_8-' prefix.
# Fall back to the pre-3.6 name so this cell does not raise on older installs.
plot_style = 'seaborn-v0_8-darkgrid'
if plot_style not in plt.style.available:
    plot_style = 'seaborn-darkgrid'
plt.style.use(plot_style)
sns.set_palette('husl')

## 2. Load Processed Data

In [None]:
# Read the preprocessed transactions and parse InvoiceDate into datetimes in one chain
df = pd.read_csv('../data/processed_data.csv').assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate'])
)

print(f"Dataset shape: {df.shape}")
df.head()

## 3. Customer Segmentation - RFM Analysis

In [None]:
# Calculate RFM metrics
# Recency is measured against the day after the latest invoice in the data,
# so every customer's recency is at least 1 day.
snapshot_date = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# One row per customer:
#   Recency   - days between the snapshot date and the customer's last invoice
#   Frequency - number of distinct invoices
#   Monetary  - summed TotalPrice
rfm = (
    df.groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', lambda dates: (snapshot_date - dates.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()
)

print("RFM Metrics:")
print(rfm.describe())
print(f"\nTotal customers: {len(rfm)}")

In [None]:
# RFM Score assignment (1-4 scale)
# Each metric is cut into quartiles and the quartile label is stored as an int.
# Recency labels run 4 -> 1 so that a smaller recency gets the higher score.
rfm['R_Score'] = pd.qcut(rfm['Recency'], q=4, labels=[4, 3, 2, 1]).astype(int)
# Frequency is ranked with method='first' before binning, which gives every
# customer a distinct value so the quartile edges cannot coincide.
rfm['F_Score'] = pd.qcut(
    rfm['Frequency'].rank(method='first'), q=4, labels=[1, 2, 3, 4]
).astype(int)
rfm['M_Score'] = pd.qcut(rfm['Monetary'], q=4, labels=[1, 2, 3, 4]).astype(int)

# Combined score: plain sum of the three component scores
rfm['RFM_Score'] = rfm['R_Score'].add(rfm['F_Score']).add(rfm['M_Score'])

print("RFM Scores:")
score_columns = ['CustomerID', 'Recency', 'Frequency', 'Monetary', 'R_Score', 'F_Score', 'M_Score', 'RFM_Score']
print(rfm[score_columns].head(10))

In [None]:
# Customer segmentation based on RFM scores
def rfm_segment(row):
    """Map a combined RFM score to a named customer segment.

    Parameters
    ----------
    row : mapping-like
        Any object indexable by ``'RFM_Score'`` (for example one row of ``rfm``).

    Returns
    -------
    str
        'Champions' (score >= 10), 'Loyal Customers' (>= 8),
        'Potential Loyalists' (>= 6), 'At Risk' (>= 4), otherwise 'Lost'.
    """
    score = row['RFM_Score']
    # (minimum score, segment name) pairs, checked from the highest floor down
    tiers = (
        (10, 'Champions'),
        (8, 'Loyal Customers'),
        (6, 'Potential Loyalists'),
        (4, 'At Risk'),
    )
    for floor, label in tiers:
        if score >= floor:
            return label
    return 'Lost'

# Label every customer by passing each row of rfm through rfm_segment
rfm['Segment'] = rfm.apply(rfm_segment, axis=1)

# Display segment distribution
print("Customer Segment Distribution:")
# Number of customers in each segment; this Series also feeds the bar chart below
segment_counts = rfm['Segment'].value_counts()
print(segment_counts)

# Visualize segment distribution
# The calls below all act on pyplot's current figure, so their order matters:
# create the figure, draw the bars, decorate, then render.
plt.figure(figsize=(10, 6))
segment_counts.plot(kind='bar', color='steelblue')
plt.title('Customer Segment Distribution', fontsize=16, fontweight='bold')
plt.xlabel('Segment', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
# Tilt the segment names on the x-axis by 45 degrees
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

In [None]:
# Segment characteristics
# Per-segment mean of each RFM metric plus the customer count, rounded to 2 decimals
segment_analysis = (
    rfm.groupby('Segment')
    .agg(
        Avg_Recency=('Recency', 'mean'),
        Avg_Frequency=('Frequency', 'mean'),
        Avg_Monetary=('Monetary', 'mean'),
        Customer_Count=('CustomerID', 'count'),
    )
    .round(2)
)

print("\nSegment Characteristics:")
print(segment_analysis)

## 4. K-Means Clustering

In [None]:
# Prepare data for clustering
# Work on a copy of the three raw RFM columns so rfm itself is left untouched
rfm_clustering = rfm.loc[:, ['Recency', 'Frequency', 'Monetary']].copy()

# Standardize the features
scaler = StandardScaler()
rfm_scaled = scaler.fit_transform(rfm_clustering)

# DataFrame view of the scaled array, used here only for the summary printout
rfm_scaled_df = pd.DataFrame(
    data=rfm_scaled,
    columns=['Recency', 'Frequency', 'Monetary'],
)

print("Scaled RFM Data:")
print(rfm_scaled_df.describe())

In [None]:
# Elbow method to find optimal number of clusters
# For each candidate k, fit K-Means on the scaled RFM data and record
# the inertia and the silhouette score of the resulting labelling.
K_range = range(2, 11)
inertias, silhouette_scores = [], []

for k in K_range:
    kmeans = KMeans(n_clusters=k, random_state=42, n_init=10).fit(rfm_scaled)
    inertias.append(kmeans.inertia_)
    silhouette_scores.append(silhouette_score(rfm_scaled, kmeans.labels_))

# Plot the two model-selection curves side by side:
# left panel = inertia per k (elbow method), right panel = silhouette score per k
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

# Left panel: inertia for each k in K_range
ax1.plot(K_range, inertias, 'bo-', linewidth=2, markersize=8)
ax1.set_xlabel('Number of Clusters', fontsize=12)
ax1.set_ylabel('Inertia', fontsize=12)
ax1.set_title('Elbow Method', fontsize=14, fontweight='bold')
ax1.grid(True)

# Right panel: silhouette score for each k in K_range
ax2.plot(K_range, silhouette_scores, 'ro-', linewidth=2, markersize=8)
ax2.set_xlabel('Number of Clusters', fontsize=12)
ax2.set_ylabel('Silhouette Score', fontsize=12)
ax2.set_title('Silhouette Score by K', fontsize=14, fontweight='bold')
ax2.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Apply K-Means with the chosen k.
# k = 4 is set by hand here; it is not derived programmatically from the curves above.
optimal_k = 4
kmeans = KMeans(n_clusters=optimal_k, random_state=42, n_init=10)
# Fit on the scaled features and store each customer's cluster label on rfm
rfm['Cluster'] = kmeans.fit(rfm_scaled).labels_

print(f"K-Means Clustering with {optimal_k} clusters")
print(f"Silhouette Score: {silhouette_score(rfm_scaled, rfm['Cluster']):.3f}")
print(f"\nCluster Distribution:")
print(rfm['Cluster'].value_counts().sort_index())

In [None]:
# Cluster characteristics
# Per-cluster mean of each RFM metric plus the customer count, rounded to 2 decimals
cluster_analysis = (
    rfm.groupby('Cluster')
    .agg(
        Avg_Recency=('Recency', 'mean'),
        Avg_Frequency=('Frequency', 'mean'),
        Avg_Monetary=('Monetary', 'mean'),
        Customer_Count=('CustomerID', 'count'),
    )
    .round(2)
)

print("\nCluster Characteristics:")
print(cluster_analysis)

In [None]:
# Visualize clusters using PCA
# Project the three scaled RFM features onto two principal components so the
# K-Means labels can be shown on a 2-D scatter plot.
pca = PCA(n_components=2)
rfm_pca = pca.fit_transform(rfm_scaled)

plt.figure(figsize=(12, 8))
# One point per customer, coloured by its K-Means cluster label
scatter = plt.scatter(rfm_pca[:, 0], rfm_pca[:, 1], c=rfm['Cluster'], cmap='viridis', s=50, alpha=0.6)
plt.xlabel('First Principal Component', fontsize=12)
plt.ylabel('Second Principal Component', fontsize=12)
plt.title('Customer Clusters (PCA Visualization)', fontsize=16, fontweight='bold')
plt.colorbar(scatter, label='Cluster')
plt.tight_layout()
plt.show()

# Report how much of the variance in the scaled features the two components retain
print(f"Explained variance ratio: {pca.explained_variance_ratio_}")
print(f"Total variance explained: {sum(pca.explained_variance_ratio_):.2%}")

## 5. Product Recommendation System

In [None]:
# Create customer-product matrix
# Rows are customers, columns are stock codes, and each cell is the summed Quantity.
# Customer/product pairs that never occur are filled with 0.
customer_product = (
    df.groupby(['CustomerID', 'StockCode'])['Quantity']
    .sum()
    .unstack(fill_value=0)
)

print(f"Customer-Product Matrix Shape: {customer_product.shape}")
print(f"Customers: {customer_product.shape[0]}, Products: {customer_product.shape[1]}")

In [None]:
# Calculate product similarity using cosine similarity
# Transposing puts products on the rows, so each product is compared by its
# vector of per-customer quantities. Every product column of customer_product
# is included; no subsetting is applied.
product_similarity = cosine_similarity(customer_product.T)

# Wrap the raw array so rows and columns can be looked up by stock code
product_similarity_df = pd.DataFrame(
    data=product_similarity,
    index=customer_product.columns,
    columns=customer_product.columns,
)

print("Product Similarity Matrix created")
print(f"Shape: {product_similarity_df.shape}")

In [None]:
# Function to get product recommendations
def get_product_recommendations(stock_code, n_recommendations=5,
                                similarity=None, transactions=None):
    """
    Get top N similar products for a given product

    Parameters
    ----------
    stock_code : hashable
        Product to find neighbours for; must be a column of the similarity matrix.
    n_recommendations : int, default 5
        Maximum number of products to return (values below 0 are treated as 0).
    similarity : pandas.DataFrame, optional
        Product-by-product similarity matrix indexed and labelled by stock code.
        Defaults to the notebook-level ``product_similarity_df``.
    transactions : pandas.DataFrame, optional
        Frame with 'StockCode' and 'Description' columns used to look up
        product descriptions. Defaults to the notebook-level ``df``.

    Returns
    -------
    pandas.DataFrame or str
        A frame with columns 'StockCode', 'Description' and 'Similarity'
        (rounded to 3 decimals), ordered from most to least similar.
        If ``stock_code`` is unknown, the message "Product <code> not found"
        is returned instead.
    """
    if similarity is None:
        similarity = product_similarity_df
    if transactions is None:
        transactions = df

    if stock_code not in similarity.columns:
        return f"Product {stock_code} not found"

    # Remove the query product by label rather than assuming it sorts first.
    # With tied scores (e.g. a product with an identical purchase vector) or an
    # all-zero column (self-similarity 0), a positional [1:] slice could return
    # the product itself and drop a genuine neighbour.
    scores = similarity[stock_code].drop(labels=stock_code, errors='ignore')
    # A stable sort keeps tie order deterministic between runs
    similar_products = (
        scores.sort_values(ascending=False, kind='mergesort')
        .head(max(n_recommendations, 0))
    )

    # Look up descriptions in one pass: first description seen for each code
    matching = transactions[transactions['StockCode'].isin(similar_products.index)]
    descriptions = (
        matching.drop_duplicates(subset='StockCode')
        .set_index('StockCode')['Description']
    )

    return pd.DataFrame({
        'StockCode': similar_products.index,
        # reindex yields NaN rather than raising when a code has no transaction row
        'Description': descriptions.reindex(similar_products.index).to_numpy(),
        'Similarity': similar_products.round(3).to_numpy(),
    })

# Example: recommendations for the stock code that appears on the most rows of df
sample_product = df['StockCode'].mode().iloc[0]
# Use the first description recorded for that stock code
sample_description = df.loc[df['StockCode'] == sample_product, 'Description'].iloc[0]

print(f"Recommendations for '{sample_description}' (StockCode: {sample_product}):")
print(get_product_recommendations(sample_product))

## 6. Customer Purchase Patterns

In [None]:
# Customer purchase frequency distribution
# Number of distinct invoices per customer
purchase_freq = df.groupby('CustomerID')['InvoiceNo'].nunique()

plt.figure(figsize=(12, 6))
purchase_freq.hist(bins=50, edgecolor='black')
plt.xlabel('Number of Purchases', fontsize=12)
plt.ylabel('Number of Customers', fontsize=12)
plt.title('Customer Purchase Frequency Distribution', fontsize=16, fontweight='bold')
# Dashed reference lines at the mean (red) and median (green); the values appear in the legend
plt.axvline(purchase_freq.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {purchase_freq.mean():.1f}')
plt.axvline(purchase_freq.median(), color='green', linestyle='--', linewidth=2, label=f'Median: {purchase_freq.median():.1f}')
plt.legend()
plt.tight_layout()
plt.show()

In [None]:
# Basket size: total Quantity summed over all rows of each invoice
basket_size = df.groupby('InvoiceNo')['Quantity'].sum()

plt.figure(figsize=(12, 6))
basket_size.hist(bins=50, edgecolor='black', color='coral')
plt.xlabel('Items per Transaction', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Transaction Basket Size Distribution', fontsize=16, fontweight='bold')
# Dashed reference lines at the mean (red) and median (green); the values appear in the legend
plt.axvline(basket_size.mean(), color='red', linestyle='--', linewidth=2, label=f'Mean: {basket_size.mean():.1f}')
plt.axvline(basket_size.median(), color='green', linestyle='--', linewidth=2, label=f'Median: {basket_size.median():.1f}')
plt.legend()
plt.tight_layout()
plt.show()

## 7. Business Insights

In [None]:
# Calculate key metrics
# Totals over the whole dataset
total_revenue = df['TotalPrice'].sum()
total_transactions = df['InvoiceNo'].nunique()
total_customers = df['CustomerID'].nunique()

# Ratios derived from the totals: revenue per distinct invoice and revenue per
# distinct customer (the latter is what the report labels as lifetime value)
avg_order_value = total_revenue / total_transactions
avg_customer_value = total_revenue / total_customers

# Emit the whole report in one call; the text matches line-by-line printing
print("\n".join([
    "=" * 60,
    "KEY BUSINESS METRICS",
    "=" * 60,
    f"Total Revenue: £{total_revenue:,.2f}",
    f"Total Transactions: {total_transactions:,}",
    f"Total Customers: {total_customers:,}",
    f"Average Order Value: £{avg_order_value:,.2f}",
    f"Average Customer Lifetime Value: £{avg_customer_value:,.2f}",
    "=" * 60,
]))

In [None]:
# Top customers by revenue
# Sum TotalPrice per customer, order from highest to lowest, keep the first ten
top_customers = (
    df.groupby('CustomerID')['TotalPrice']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

print("\nTop 10 Customers by Revenue:")
for i, (customer_id, revenue) in enumerate(top_customers.items(), start=1):
    print(f"{i}. Customer {customer_id}: £{revenue:,.2f}")

In [None]:
# Top products by revenue
# Revenue is summed per (StockCode, Description) pair, so a stock code that
# appears with several descriptions is ranked as separate entries.
top_products = (
    df.groupby(['StockCode', 'Description'])['TotalPrice']
    .sum()
    .sort_values(ascending=False)
    .head(10)
)

print("\nTop 10 Products by Revenue:")
for i, ((stock_code, description), revenue) in enumerate(top_products.items(), start=1):
    print(f"{i}. {description} ({stock_code}): £{revenue:,.2f}")

## 8. Save Results

In [None]:
# Save the per-customer table (RFM metrics, scores, segment and cluster columns).
# CustomerID is a regular column, so the row index is not written.
rfm.to_csv(path_or_buf='../data/customer_segments.csv', index=False)
print("Customer segments saved to '../data/customer_segments.csv'")

# Save the per-segment summary; the index holds the segment names, so it is kept
segment_analysis.to_csv(path_or_buf='../data/segment_analysis.csv')
print("Segment analysis saved to '../data/segment_analysis.csv'")

## Summary and Recommendations

### Key Findings:

1. **Customer Segmentation:**
   - Identified distinct customer segments (Champions, Loyal, Potential Loyalists, At Risk, Lost)
   - K-Means clustering on the standardized RFM metrics, with k = 4 chosen after inspecting the elbow and silhouette plots, grouped customers into 4 clusters

2. **Product Recommendations:**
   - Built an item-based collaborative filtering system for product recommendations (cosine similarity between products over the customer–product quantity matrix)
   - Can suggest similar products based on customer purchase patterns

3. **Business Insights:**
   - Calculated key metrics: Revenue, AOV, and average revenue per customer (used here as a simple proxy for Customer LTV)
   - Identified top customers and products for targeted marketing

### Recommendations:

1. **For Champions & Loyal Customers:**
   - Implement VIP programs and exclusive offers
   - Request product reviews and referrals

2. **For Potential Loyalists:**
   - Send targeted promotions to increase purchase frequency
   - Offer loyalty rewards to encourage repeat purchases

3. **For At Risk Customers:**
   - Launch win-back campaigns
   - Send personalized offers based on previous purchases

4. **For Lost Customers:**
   - Survey to understand churn reasons
   - Offer significant incentives to re-engage

5. **Product Strategy:**
   - Focus inventory on top-performing products
   - Use recommendation system for cross-selling and upselling
   - Bundle frequently co-purchased items