In [5]:
import pandas as pd
import plotly.express as px
from datetime import datetime

# Load the data; 'latin1' is used because the default encoding raised UnicodeDecodeError.
raw_df = pd.read_csv('data.csv', encoding='latin1')

# Data Cleansing and Feature Engineering
# Built as one chain from the untouched raw frame so that no column is ever assigned
# on a filtered slice (the source of SettingWithCopyWarning) and so that re-running
# this cell always yields the same `df`.
df = (
    raw_df
    # Rows without a customer cannot be attributed in the per-customer analysis.
    .dropna(subset=['CustomerID'])
    # Keep only rows with a positive quantity.
    .loc[lambda frame: frame['Quantity'] > 0]
    .assign(
        CustomerID=lambda frame: frame['CustomerID'].astype(int),
        InvoiceDate=lambda frame: pd.to_datetime(frame['InvoiceDate']),
        # Unparseable prices become NaN and are dropped in the next step.
        UnitPrice=lambda frame: pd.to_numeric(frame['UnitPrice'], errors='coerce'),
    )
    .dropna(subset=['UnitPrice'])
    # Line-level revenue, used for every GMV aggregate below.
    .assign(TotalPrice=lambda frame: frame['Quantity'] * frame['UnitPrice'])
)

# Time Series Analysis: Monthly GMV Trend
# Sum TotalPrice per calendar month. An explicit MonthEnd offset is passed instead of the
# string alias 'M': pandas deprecates 'M' in favour of 'ME' for resampling, while the offset
# object produces the same month-end bins without depending on which alias is accepted.
monthly_sales = (
    df.set_index('InvoiceDate')
    .resample(pd.offsets.MonthEnd())['TotalPrice']
    .sum()
    .reset_index()
)
# 'YYYY-MM' string label for the x axis ('M' is still the valid alias for monthly Periods).
monthly_sales['Month'] = monthly_sales['InvoiceDate'].dt.to_period('M').astype(str)

fig1 = px.line(
    monthly_sales,
    x='Month',
    y='TotalPrice',
    title='Monthly Gross Merchandise Value (GMV) Trend',
    labels={'TotalPrice': 'Aggregate GMV (£)'}
)
fig1.update_traces(mode='markers+lines')
# Raw string so the backslash in "\sum" is not parsed as a string escape (avoids SyntaxWarning).
fig1.update_layout(xaxis_title="Temporal Period (Month)", yaxis_title=r"GMV ($\sum$ Monetary Value)")
fig1.show()

# RFM Segmentation and Analysis
# Reference date: one day after the latest invoice, so every Recency is at least 1 day.
PRESENT = df['InvoiceDate'].max() + pd.Timedelta(days=1)

# One row per customer with the three RFM measures.
by_customer = df.groupby('CustomerID')
rfm_df = pd.DataFrame({
    # Whole days between the customer's last invoice and the reference date.
    'Recency': (PRESENT - by_customer['InvoiceDate'].max()).dt.days,
    # Number of distinct invoices.
    'Frequency': by_customer['InvoiceNo'].nunique(),
    # Total spend.
    'Monetary': by_customer['TotalPrice'].sum(),
}).reset_index()

# Assign RFM Scores
# Recency: quartiles with reversed labels, so the smallest Recency (most recent) scores 4.
rfm_df['R_Score'] = pd.qcut(rfm_df['Recency'], q=4, labels=[4, 3, 2, 1])

# Frequency: fixed bins via pd.cut, because pd.qcut raised ValueError on this column.
# Edges are inclusive upper bounds (right-closed intervals):
#   1 invoice -> 1, 2 invoices -> 2, 3-5 invoices -> 3, 6 or more -> 4.
# The previous left-closed bins started with [0, 1), which no customer can fall into
# (Frequency is always >= 1), so score 1 was never assigned and one-time buyers got 2.
# An infinite upper edge keeps the bins strictly increasing even when the largest
# Frequency is below 5 (the old `max + 1` edge made pd.cut raise in that case).
F_Bins = [0, 1, 2, 5, float('inf')]
F_Labels = [1, 2, 3, 4]

rfm_df['F_Score'] = pd.cut(
    rfm_df['Frequency'],
    bins=F_Bins,
    labels=F_Labels,
    right=True
).astype(int) # Cast to int for summation

# Monetary: quartiles, highest spenders score 4.
rfm_df['M_Score'] = pd.qcut(rfm_df['Monetary'], q=4, labels=[1, 2, 3, 4])

# Combined score: plain sum of the three component scores (range 3-12).
rfm_df['RFM_Score'] = rfm_df[['R_Score', 'F_Score', 'M_Score']].astype(int).sum(axis=1)

# Define Segments
def rfm_level(score):
    """Map a combined RFM score to its customer-segment label.

    Thresholds are checked from highest to lowest; the first one the score
    reaches decides the segment. Scores below 5 fall into the lowest segment.
    """
    tiers = (
        (10, '4) Champions'),
        (8, '3) Loyal Customers'),
        (5, '2) Potential Loyalists'),
    )
    for floor, segment in tiers:
        if score >= floor:
            return segment
    return '1) At Risk/Hibernating'

# Label every customer with the segment that matches their combined RFM score.
rfm_df['Customer_Segment'] = rfm_df['RFM_Score'].map(rfm_level)

# Visualization 2: Segment Distribution (Compositional Analysis)
# Customers per segment, largest segment first, as a two-column frame.
segment_counts = (
    rfm_df['Customer_Segment']
    .value_counts()
    .rename_axis('Customer_Segment')
    .reset_index(name='Count')
)

fig2 = px.bar(
    segment_counts,
    title='Customer Cohort Distribution via RFM Segmentation',
    x='Customer_Segment',
    y='Count',
    text='Count',
    color='Customer_Segment'
)
fig2.update_layout(
    xaxis_title="Customer Cohort",
    yaxis_title="Cardinality of Customers"
)
fig2.show()

# Visualization 3: Top N Product Categories (Pareto Analysis)
# Revenue per product description, ranked from highest to lowest; keep the ten largest.
revenue_by_sku = df.groupby('Description')['TotalPrice'].sum()
ranked_revenue = revenue_by_sku.sort_values(ascending=False)
product_sales = ranked_revenue.head(10).reset_index()

fig3 = px.bar(
    product_sales,
    title='Top 10 High-Contribution SKUs (GMV)',
    x='Description',
    y='TotalPrice',
    labels={'TotalPrice': 'GMV (£)'},
    color='TotalPrice',
    color_continuous_scale=px.colors.sequential.Viridis
)
# The r-prefix keeps the backslash in "\sum" literal, which avoids a SyntaxWarning.
fig3.update_layout(
    xaxis_title="Stock Keeping Unit (SKU) Description",
    yaxis_title=r"GMV ($\sum$ Monetary Value)"
)
# Slant the long product descriptions so they stay readable.
fig3.update_xaxes(tickangle=45)
fig3.show()


'M' is deprecated and will be removed in a future version, please use 'ME' instead.

