In [None]:
# Customer Segmentation Analysis

# This notebook performs a customer segmentation analysis using the RFM (Recency, Frequency, Monetary) model and k-means clustering. The goal is to identify different customer segments for targeted marketing strategies.

## 1. Introduction and Data Import
# In this section, I import the necessary libraries and load the customer transaction data.

# Importing necessary libraries
import pandas as pd  # For data manipulation and analysis
import numpy as np  # For numerical operations
import matplotlib.pyplot as plt  # For plotting graphs
import seaborn as sns  # For statistical data visualization
from sklearn.preprocessing import StandardScaler  # For feature scaling
from sklearn.cluster import KMeans  # For k-means clustering
from sklearn.metrics import silhouette_score  # For evaluating clustering performance

# Suppress warnings for clean output
import warnings
warnings.filterwarnings("ignore")

# Read the raw customer transaction records from disk into a DataFrame
data = pd.read_csv(filepath_or_buffer='customer_data.csv')  # Replace 'customer_data.csv' with your actual file name

# Preview the top of the table (five rows, the pandas default)
data.head(5)

## 2. Data Preprocessing
# In this step, I check for missing values, convert data types, and create a new feature called `TotalPrice` to facilitate further analysis.

# Count the missing entries in every column
data.isna().sum()

# Clean the table in one non-mutating chain:
#   1. discard every row that has a missing value in any column,
#   2. parse 'InvoiceDate' into datetimes,
#   3. store 'CustomerID' as a string so it is treated as a label, not a number,
#   4. append 'TotalPrice' (quantity * unit price) as the last column.
data = data.dropna().assign(
    InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']),
    CustomerID=lambda frame: frame['CustomerID'].astype('str'),
    TotalPrice=lambda frame: frame['Quantity'] * frame['UnitPrice'],
)

## 3. Exploratory Data Analysis (EDA)
# Here, I explore the data by plotting the distribution of total prices and quantities to understand the data distribution and detect any anomalies.

# Summary statistics for the numeric columns
data.describe()


def _plot_distribution(values, title, xlabel):
    """Draw a 50-bin histogram of ``values`` on a new 10x6 figure and show it.

    Args:
        values: The column (Series) whose distribution is plotted.
        title: Text placed above the plot.
        xlabel: Label for the x-axis; the y-axis is always 'Frequency'.
    """
    plt.figure(figsize=(10, 6))
    axes = sns.histplot(values, bins=50)
    axes.set_title(title)
    axes.set_xlabel(xlabel)
    axes.set_ylabel('Frequency')
    plt.show()


# Distribution of per-line totals
_plot_distribution(data['TotalPrice'], 'Distribution of Total Prices', 'Total Price')

# Distribution of purchased quantities
_plot_distribution(data['Quantity'], 'Distribution of Quantities', 'Quantity')

## 4. Feature Engineering
# In this step, I create RFM (Recency, Frequency, Monetary) features for clustering analysis.

# Create RFM (Recency, Frequency, Monetary) features for clustering
import datetime as dt  # Import datetime for date manipulations

# Reference ("present") date for the recency calculation.
# 2011-12-10 is this notebook's original snapshot date and is kept as a floor,
# so results are unchanged whenever every purchase happened before it. If the
# data contains later purchases, the reference moves to the day after the most
# recent one; otherwise recency would be negative and break the log transform
# applied further down.
latest_purchase = data['InvoiceDate'].max()
present_date = max(
    pd.Timestamp(dt.datetime(2011, 12, 10)),
    latest_purchase.floor('D') + pd.Timedelta(days=1),
)

# One row per customer:
#   Recency   - whole days between the customer's last purchase and present_date
#   Frequency - number of DISTINCT invoices. A plain row count would count line
#               items instead of purchases when an invoice spans several rows
#               (assumes one row per invoice line - confirm against the data).
#   Monetary  - net amount spent (sum of TotalPrice)
rfm_data = (
    data.groupby('CustomerID')
    .agg(
        Recency=('InvoiceDate', lambda dates: (present_date - dates.max()).days),
        Frequency=('InvoiceNo', 'nunique'),
        Monetary=('TotalPrice', 'sum'),
    )
    .reset_index()
)

# Log transformation for better distribution.
# log1p is only finite for inputs greater than -1. A customer whose net spend
# is negative (possible when 'Quantity' holds negative values, e.g. returns)
# would otherwise become NaN/-inf here and make KMeans fail later, so each
# feature is clipped at 0 first. Non-negative values are left untouched.
rfm_data['Recency_log'] = np.log1p(rfm_data['Recency'].clip(lower=0))  # Log transform recency
rfm_data['Frequency_log'] = np.log1p(rfm_data['Frequency'].clip(lower=0))  # Log transform frequency
rfm_data['Monetary_log'] = np.log1p(rfm_data['Monetary'].clip(lower=0))  # Log transform monetary value

# Standardize the log-transformed RFM features so that each one contributes on
# the same scale to the distances k-means computes
scaler = StandardScaler()  # Initialize the scaler
rfm_scaled = scaler.fit_transform(rfm_data[['Recency_log', 'Frequency_log', 'Monetary_log']])  # Scale the RFM features

## 5. Clustering Analysis
# I determine the optimal number of clusters using the elbow method and fit the k-means algorithm.

# Elbow method: fit k-means for k = 1..10 and record each model's inertia
# (within-cluster sum of squared errors, SSE)
sse = {}
for k in range(1, 11):
    sse[k] = KMeans(n_clusters=k, random_state=42).fit(rfm_scaled).inertia_

# Draw SSE against k; the "elbow" where the curve flattens suggests a good k
plt.figure(figsize=(10, 6))
elbow_axes = sns.lineplot(x=list(sse), y=list(sse.values()))
elbow_axes.set_title('Elbow Method for Optimal k')
elbow_axes.set_xlabel('Number of clusters')
elbow_axes.set_ylabel('SSE')
plt.show()

# Final model: k-means with the chosen number of clusters (k=6 assumed here)
kmeans = KMeans(n_clusters=6, random_state=42)
kmeans.fit(rfm_scaled)

# Attach each customer's cluster label to the RFM table
rfm_data['Cluster'] = kmeans.labels_

# Silhouette score: how well separated the resulting clusters are
silhouette_avg = silhouette_score(rfm_scaled, rfm_data['Cluster'])
print(f'Silhouette Score: {silhouette_avg}')

## 6. Cluster Interpretation and Business Recommendations
# I interpret the clusters based on their RFM values and provide business recommendations for each cluster.

# Per-cluster profile: mean recency, frequency and monetary value, plus the
# number of customers assigned to the cluster. Named aggregation sets the
# output column names directly.
cluster_summary = (
    rfm_data.groupby('Cluster')
    .agg(
        Avg_Recency=('Recency', 'mean'),
        Avg_Frequency=('Frequency', 'mean'),
        Avg_Monetary=('Monetary', 'mean'),
        Num_Customers=('CustomerID', 'count'),
    )
    .reset_index()
)
print(cluster_summary)  # Print the cluster summary

# Interpret each cluster based on RFM values.
# NOTE: this text is hard-coded rather than derived from the fitted model, so
# it must be checked against the printed cluster_summary after every re-fit.
# With n_clusters=6 the valid cluster labels are 0 through 5.
interpretation = """
Cluster Interpretation:
- Cluster 0: High recency, low frequency, and low monetary value.
- Cluster 1: Low recency, high frequency, and high monetary value.
- Cluster 2: Moderate recency, high frequency, and moderate monetary value.
- Cluster 3: High recency, low frequency, and very low monetary value.
- Cluster 4: Low recency, low frequency, and low monetary value.
- Cluster 5: Very low recency, very high frequency, and very high monetary value.
"""
print(interpretation)  # Print the cluster interpretation

# Business recommendations based on clusters.
# Only clusters 0-5 exist, so the VIP recommendation refers to Cluster 5 alone
# (it previously also named a non-existent Cluster 6).
recommendations = """
Business Recommendations:
- For Clusters 0 & 4 (Low Engagement): Implement targeted marketing campaigns to increase their purchasing frequency. Offering personalized promotions and new product introductions can help increase engagement.
- For Cluster 3 (At Risk): Immediate action is needed to re-engage these customers. Consider sending "We Miss You" emails with special discounts or loyalty rewards to encourage them to shop again.
- For Clusters 1 & 2 (Loyal Customers): Focus on loyalty programs to maintain their current engagement level. Exclusive offers, early access to new products, and special loyalty discounts can help enhance their satisfaction and retention.
- For Cluster 5 (VIPs/Champions): Provide VIP treatment. This could include exclusive memberships, personal shopping assistants, and premium support services. Recognizing their value with exclusive VIP events or partnerships with other high-value brands could also be beneficial.
"""
print(recommendations)  # Print the business recommendations
