In [None]:
"""
# ITS 2122 - Phase 2: Exploratory Data Analysis (EDA) & Insight Generation
**Author:** Kaveesha Rukshan
**Date:** 2025-08-17  
**Purpose:** Explore and analyze the cleaned Online Retail II dataset to uncover patterns, trends, and insights that support business decisions.

## Overview
This notebook performs exploratory data analysis (EDA) tasks including:
- Temporal analysis: sales trends over months, days of the week, and hours of the day
- Geographic analysis: revenue contribution by country
- Product performance analysis: top products by quantity sold and total revenue
- Generating visualizations and insights to inform marketing, inventory, and operational strategies
"""

In [None]:
# ---------------------------------------------------------
# Cell 1: Imports and basic setup
# ---------------------------------------------------------
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Suppress every warning so library notices do not clutter the cell outputs
warnings.filterwarnings(action='ignore')

# Apply seaborn's "whitegrid" look to all figures drawn in this notebook
sns.set(style='whitegrid')

print("Libraries loaded successfully.")

In [None]:
# ---------------------------------------------------------
# Cell 2: Load the cleaned dataset
# ---------------------------------------------------------

# Location of the cleaned CSV (relative to this notebook's working directory)
DATA_PATH = '../data/processed/online_retail_clean_phase1.csv'

# Columns that the analysis cells below read from the DataFrame
REQUIRED_COLUMNS = {
    'Quantity', 'UnitPrice', 'TotalPrice',
    'Year', 'Month', 'DayOfWeek',
    'CustomerID', 'Description',
}

df = pd.read_csv(DATA_PATH)

# Fail fast with a clear message instead of a KeyError deep inside a later plotting cell
missing_columns = sorted(REQUIRED_COLUMNS - set(df.columns))
if missing_columns:
    raise ValueError(f"{DATA_PATH} is missing expected columns: {missing_columns}")

# Quick look at the data
df.head()

In [None]:
# ---------------------------------------------------------
# Cell 3: Basic dataset info
# ---------------------------------------------------------
# Print a concise summary of the DataFrame (column names, non-null counts, dtypes)
# so missing values or unexpected types are visible before any analysis.
df.info()

In [None]:
# ---------------------------------------------------------
# Cell 4: Summary statistics
# ---------------------------------------------------------
# include='all' summarises every column, not only the numeric ones; statistics
# that do not apply to a column's dtype are reported as NaN.
df.describe(include='all')

In [None]:
# ---------------------------------------------------------
# Cell 5: Sales distribution plots
# ---------------------------------------------------------

# One histogram per numeric column of interest. Each entry pairs the column
# to plot with the title shown above its figure.
histogram_specs = [
    ('Quantity', "Distribution of Quantity"),
    ('UnitPrice', "Distribution of Unit Price"),
    ('TotalPrice', "Distribution of Total Price (Revenue per transaction)"),
]

for column_name, figure_title in histogram_specs:
    plt.figure(figsize=(12,5))
    # bins=50 splits the value range into 50 equal-width intervals; no KDE overlay
    sns.histplot(df[column_name], bins=50, kde=False)
    plt.title(figure_title)
    plt.show()



In [None]:
# ---------------------------------------------------------
# Cell 6: Time-based analysis
# ---------------------------------------------------------

# Total revenue for every (Year, Month) pair. Year and Month keep the dtypes
# they have in df, so this frame can be reused unchanged outside this cell.
monthly_sales = df.groupby(['Year', 'Month'])['TotalPrice'].sum().reset_index()

# seaborn maps a numeric hue as a continuous variable (sequential colour ramp),
# which makes individual years hard to tell apart. Plot from a copy whose Year
# is a string so each year is treated as its own category with a distinct colour.
monthly_sales_plot = monthly_sales.assign(Year=monthly_sales['Year'].astype(str))

# One line per year, to compare growth and seasonality across years
plt.figure(figsize=(12,5))
sns.lineplot(data=monthly_sales_plot, x='Month', y='TotalPrice', hue='Year', marker='o')
plt.title("Monthly Sales Trend")
plt.xlabel("Month")
plt.ylabel("Total Revenue")
plt.show()

# Total revenue per day of the week
daily_sales = df.groupby('DayOfWeek')['TotalPrice'].sum().reset_index()

# Fixed calendar order for the x-axis.
# NOTE(review): assumes DayOfWeek holds English day names - confirm against Phase 1 output.
weekday_order = ['Monday','Tuesday','Wednesday','Thursday','Friday','Saturday','Sunday']

plt.figure(figsize=(12,5))
sns.barplot(
    data=daily_sales,
    x='DayOfWeek',
    y='TotalPrice',
    order=weekday_order
)
plt.title("Sales by Day of Week")
plt.xlabel("Day of Week")
plt.ylabel("Total Revenue")
plt.show()


In [None]:
# ---------------------------------------------------------
# Cell 7: Top customers
# ---------------------------------------------------------

# Revenue summed per customer, then ranked from highest to lowest; keep the first ten
revenue_per_customer = df.groupby('CustomerID')['TotalPrice'].sum()
ranked_customers = revenue_per_customer.sort_values(ascending=False)
top_customers = ranked_customers.head(10)

# Customer IDs are converted to strings so they are drawn as categories
customer_labels = top_customers.index.astype(str)
customer_revenue = top_customers.values

# Vertical bar chart: one bar per customer
plt.figure(figsize=(12,5))
sns.barplot(x=customer_labels, y=customer_revenue)
plt.title("Top 10 Customers by Revenue")
plt.xlabel("Customer ID")
plt.ylabel("Total Revenue")
plt.xticks(rotation=45)  # Tilt the labels so neighbouring IDs do not overlap
plt.show()


In [None]:
# ---------------------------------------------------------
# Cell 8: Top products
# ---------------------------------------------------------

# Revenue summed per product description, then ranked from highest to lowest; keep the first ten
revenue_per_product = df.groupby('Description')['TotalPrice'].sum()
ranked_products = revenue_per_product.sort_values(ascending=False)
top_products = ranked_products.head(10)

# Horizontal bars: revenue on the x-axis, product descriptions on the y-axis
product_revenue = top_products.values
product_labels = top_products.index

plt.figure(figsize=(12,5))
sns.barplot(x=product_revenue, y=product_labels)
plt.title("Top 10 Products by Revenue")
plt.xlabel("Total Revenue")
plt.ylabel("Product Description")
plt.show()


In [None]:
# ---------------------------------------------------------
# Cell 9: Correlation heatmap
# ---------------------------------------------------------

# Pairwise correlation between the three numeric columns used in this analysis:
# Quantity, UnitPrice and TotalPrice.
numeric_columns = ['Quantity', 'UnitPrice', 'TotalPrice']

plt.figure(figsize=(8,6))

correlation_matrix = df[numeric_columns].corr()

# annot=True prints each coefficient inside its cell; the 'coolwarm' colormap
# gives opposite ends of the value range contrasting colours.
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm')

plt.title("Correlation Between Numerical Features")
plt.show()


In [None]:
# ---------------------------------------------------------
# Cell 10: Save outputs for presentation
# ---------------------------------------------------------

# Directory that receives the exported tables. Create it (and any missing
# parents) first: to_csv does not create directories and would raise OSError
# on a fresh checkout where the folder does not exist yet.
output_dir = Path('../reports/figures')
output_dir.mkdir(parents=True, exist_ok=True)

# Monthly revenue table; index=False because the default integer index carries no information
monthly_sales.to_csv(output_dir / 'monthly_sales.csv', index=False)

# Top customers by revenue; the index (CustomerID) is kept as the first CSV column
top_customers.to_csv(output_dir / 'top_customers.csv')

# Top products by revenue; the index (Description) is kept as the first CSV column
top_products.to_csv(output_dir / 'top_products.csv')

print("EDA outputs saved.")
