# Customer Segmentation - Exploratory Analysis

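This notebook walks through the customer segmentation analysis end to end: it loads the Online Retail transactions, cleans them, computes RFM (Recency, Frequency, Monetary) metrics, assigns each customer to a segment, and saves the results to `data/processed/`.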

## Table of Contents
1. [Data Loading](#data-loading)
2. [Data Exploration](#data-exploration)
3. [RFM Analysis](#rfm-analysis)
4. [Segmentation](#segmentation)
5. [Insights & Recommendations](#insights)

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from pathlib import Path
import warnings
warnings.filterwarnings('ignore')

# Import project modules
import sys
sys.path.append('../src')
from data_loader import load_data_optimized
from data_cleaner import clean_data
from rfm_analysis import calculate_rfm, segment_customers, get_segment_summary

# Notebook-wide display configuration
pd.options.display.max_columns = None  # None = never truncate columns
pd.options.display.max_rows = 100
sns.set_style('whitegrid')
plt.rcParams.update({'figure.figsize': (12, 6)})

print("✓ Libraries imported successfully")

## 1. Data Loading <a id='data-loading'></a>

In [None]:
# Load the raw transactions workbook (path is relative to the notebook's
# working directory).
data_path = Path('../data/raw/OnlineRetail.xlsx')

# Fail fast: every later cell reads `df`, so carrying on without the file
# would only resurface as a confusing NameError further down the notebook.
if not data_path.exists():
    raise FileNotFoundError(
        f"❌ Data file not found: {data_path}\n"
        "Please run: python scripts/download_data.py"
    )

df = load_data_optimized(str(data_path))
print(f"✓ Loaded {len(df):,} rows")

## 2. Data Exploration <a id='data-exploration'></a>

In [None]:
# Preview the first rows of the raw data to sanity-check that the columns
# loaded as expected (bare last expression, so Jupyter renders it as a table)
df.head()

In [None]:
# Summary statistics of the raw data, i.e. before clean_data is applied
df.describe()

In [None]:
# Structural overview of the raw frame (info() prints its report directly
# rather than returning a value to display)
df.info()

In [None]:
# Null count per column; only columns that actually have gaps are shown,
# worst offenders first
missing = df.isna().sum()
missing.loc[missing.gt(0)].sort_values(ascending=False)

### Clean the data

In [None]:
# Clean data. The result is bound to a new name, so the raw `df` stays
# available to the exploration cells above.
# NOTE(review): assumes clean_data returns a new frame rather than mutating
# `df` in place — confirm in src/data_cleaner.
cleaned_df = clean_data(df)
# The leading "\n" presumably separates this line from progress output
# printed by clean_data — verify.
print(f"\n✓ Cleaned data: {len(cleaned_df):,} rows")

## 3. RFM Analysis <a id='rfm-analysis'></a>

In [None]:
# Calculate RFM metrics from the cleaned transactions.
# Later cells read the 'Recency', 'Frequency' and 'Monetary' columns of the result.
rfm = calculate_rfm(cleaned_df)
rfm.head()

In [None]:
# RFM distribution: one histogram per metric, laid out side by side
fig, axes = plt.subplots(1, 3, figsize=(15, 4))

# (column, bar colour, x-axis label) for each panel, left to right
panels = [
    ('Recency', 'skyblue', 'Days'),
    ('Frequency', 'lightgreen', 'Number of Purchases'),
    ('Monetary', 'lightcoral', 'Total Spending ($)'),
]

for ax, (metric, colour, xlabel) in zip(axes, panels):
    ax.hist(rfm[metric], bins=50, color=colour, edgecolor='black')
    ax.set_title(f'{metric} Distribution')
    ax.set_xlabel(xlabel)

plt.tight_layout()
plt.show()

## 4. Segmentation <a id='segmentation'></a>

In [None]:
# Segment customers. Later cells rely on the result carrying a 'Segment' label
# and 'R_Score', 'F_Score', 'M_Score' columns alongside the RFM metrics.
rfm_segmented = segment_customers(rfm)
rfm_segmented.head(10)

In [None]:
# Segment summary. Later cells read its 'Segment', 'Customer_Count',
# 'Customer_Percentage', 'Total_Revenue' and 'Revenue_Percentage' columns.
segment_summary = get_segment_summary(rfm_segmented)
segment_summary

In [None]:
# Donut chart of how many customers fall into each segment
segment_counts = rfm_segmented['Segment'].value_counts()

donut_options = dict(
    names=segment_counts.index,
    values=segment_counts.values,
    hole=0.4,
    title='Customer Segment Distribution',
)
fig = px.pie(**donut_options)
fig.show()

In [None]:
# Revenue by segment: bar height and colour both encode Total_Revenue.
# update_layout returns the figure, so the tick rotation is chained on.
fig = px.bar(
    segment_summary,
    x='Segment',
    y='Total_Revenue',
    color='Total_Revenue',
    color_continuous_scale='Viridis',
    title='Revenue Contribution by Segment',
).update_layout(xaxis_tickangle=-45)
fig.show()

## 5. Insights & Recommendations <a id='insights'></a>

In [None]:
# Top segments by customer count
print("Top Segments by Customer Count:")
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump that print() produces.
segment_summary.nlargest(5, 'Customer_Count')[['Segment', 'Customer_Count', 'Customer_Percentage']]

In [None]:
# Top segments by revenue
print("\nTop Segments by Revenue:")
# Leave the frame as the cell's last expression so Jupyter renders it as a
# rich table instead of the plain-text dump that print() produces.
segment_summary.nlargest(5, 'Total_Revenue')[['Segment', 'Total_Revenue', 'Revenue_Percentage']]

In [None]:
# Champions analysis: headline numbers for the 'Champions' segment
champions = rfm_segmented.loc[rfm_segmented['Segment'].eq('Champions')]

# Build the report first, then emit it in one go; the text is identical to
# printing each line separately.
champion_report = [
    "\nChampions (Best Customers):",
    f"  Count: {len(champions):,}",
    f"  Total Revenue: ${champions['Monetary'].sum():,.2f}",
    f"  Avg Frequency: {champions['Frequency'].mean():.1f}",
    f"  Avg Recency: {champions['Recency'].mean():.0f} days",
]
print("\n".join(champion_report))

In [None]:
# 3D scatter of the R/F/M scores, one colour per segment
score_axes = {'x': 'R_Score', 'y': 'F_Score', 'z': 'M_Score'}

fig = px.scatter_3d(
    rfm_segmented,
    color='Segment',
    opacity=0.7,
    title='3D RFM Score Distribution by Segment',
    **score_axes,
)
fig.show()

### Save Results

In [None]:
# Persist both result tables as CSV, creating the output folder if needed
output_dir = Path('../data/processed')
output_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): index=False omits the index of both frames. That fits
# segment_summary (earlier cells read 'Segment' as a column); confirm the
# customer identifier in rfm_segmented is also a column and not the index.
for filename, frame in (
    ('rfm_results.csv', rfm_segmented),
    ('segment_summary.csv', segment_summary),
):
    frame.to_csv(output_dir / filename, index=False)

print("✓ Results saved to data/processed/")