# Exploratory Data Analysis - Online Shoppers Dataset

This notebook explores the UCI Online Shoppers Purchasing Intention Dataset.

In [None]:
import sys

sys.path.append("..")  # Add parent directory to path

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from src.data_loader import get_dataset_info, load_online_shoppers

# Notebook-wide look and feel: seaborn grid theme plus default figure size
# and font size applied to every matplotlib figure created below.
sns.set_style("whitegrid")
plt.rcParams.update(
    {
        "figure.figsize": (12, 6),
        "font.size": 10,
    }
)

print("EDA Notebook for Online Shoppers Dataset")

## 1. Load Data

In [None]:
# Load the dataset through the project loader imported from src.data_loader.
# The result is unpacked into X (used below as a DataFrame of features) and
# y (used below as the Revenue target Series).
X, y = load_online_shoppers()
# Project helper defined outside this notebook; presumably reports a summary
# of X and y -- TODO confirm against src.data_loader. Because it is the last
# expression in the cell, any non-None return value is also rendered as output.
get_dataset_info(X, y)

## 2. Target Distribution

In [None]:
# Class frequencies of the target, ordered from most to least frequent
# (value_counts sorts descending). Computed once and reused by both panels.
class_counts = y.value_counts()
class_palette = ["#FF6B6B", "#4ECDC4"]

fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))
bar_ax, pie_ax = axes[0], axes[1]

# Left panel: absolute counts per class
class_counts.plot.bar(ax=bar_ax, color=class_palette)
bar_ax.set_title("Target Distribution (Revenue)", fontsize=14)
bar_ax.set_xlabel("Revenue", fontsize=12)
bar_ax.set_ylabel("Count", fontsize=12)
bar_ax.tick_params(axis="x", rotation=0)

# Right panel: the same counts as proportions
class_counts.plot.pie(ax=pie_ax, autopct="%1.1f%%", colors=class_palette)
pie_ax.set_ylabel("")
pie_ax.set_title("Class Proportion", fontsize=14)

plt.tight_layout()
plt.show()

# Ratio of the most frequent class to the second most frequent one
majority_count = class_counts.iloc[0]
minority_count = class_counts.iloc[1]
print(f"Class imbalance ratio: {majority_count / minority_count:.2f}:1")

## 3. Numeric Features Distribution

In [None]:
# Six numeric columns to inspect, one histogram each
numeric_cols = [
    "Administrative",
    "Informational",
    "ProductRelated",
    "BounceRates",
    "ExitRates",
    "PageValues",
]

# Shared histogram styling so every panel looks the same
hist_style = {"bins": 30, "edgecolor": "black", "alpha": 0.7, "color": "steelblue"}

# 2 x 3 grid, flattened so axes can be paired one-to-one with the columns
fig, axes = plt.subplots(nrows=2, ncols=3, figsize=(15, 10))
axes = axes.flatten()

for ax, col in zip(axes, numeric_cols):
    X[col].hist(ax=ax, **hist_style)
    ax.set_title(f"{col} Distribution", fontsize=12, fontweight="bold")
    ax.set_xlabel(col, fontsize=10)
    ax.set_ylabel("Frequency", fontsize=10)
    ax.grid(alpha=0.3)

plt.tight_layout()
plt.show()

## 4. Correlation Matrix

In [None]:
# Pairwise correlations between the numeric columns only
# (DataFrame.corr uses its default method).
numeric_features = X.select_dtypes(include=[np.number])
corr_matrix = numeric_features.corr()

plt.figure(figsize=(12, 10))
sns.heatmap(
    corr_matrix,
    annot=False,
    cmap="coolwarm",
    center=0,
    square=True,
    linewidths=0.5,
    cbar_kws={"label": "Correlation"},
)
plt.title("Feature Correlation Matrix", fontsize=16, fontweight="bold")
plt.tight_layout()
plt.show()

# Rank feature pairs by correlation.
# The matrix is symmetric with a self-correlation diagonal, so only the strict
# upper triangle (k=1) is kept: each pair appears exactly once and the diagonal
# is excluded by position rather than by an exact `!= 1.0` float comparison
# (which would also hide distinct features that are perfectly correlated).
upper_triangle = np.triu(np.ones(corr_matrix.shape, dtype=bool), k=1)
corr_pairs = corr_matrix.where(upper_triangle).unstack().dropna()
corr_pairs = corr_pairs.sort_values(ascending=False)
print("\nTop 5 Feature Correlations:")
print(corr_pairs.head(5))

## 5. Categorical Features

In [None]:
categorical_cols = ["Month", "VisitorType"]
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 5))

# One bar chart per categorical column; bars follow value_counts order
# (most frequent category first).
for ax, col in zip(axes, categorical_cols):
    X[col].value_counts().plot.bar(ax=ax, color="steelblue", edgecolor="black")
    ax.set_title(f"{col} Distribution", fontsize=14, fontweight="bold")
    ax.set_xlabel(col, fontsize=12)
    ax.set_ylabel("Count", fontsize=12)
    ax.tick_params(axis="x", rotation=45)
    ax.grid(axis="y", alpha=0.3)

plt.tight_layout()
plt.show()

## 6. Feature vs Target Analysis

In [None]:
# PageValues vs Revenue (key discriminative feature)
# Work on a copy so the feature frame X is not mutated by adding the target.
X_with_target = X.copy()
X_with_target["Revenue"] = y

# Create the figure/axes explicitly and hand the axes to pandas. Without `ax=`,
# DataFrame.boxplot(by=...) opens its own figure, which left the figure created
# by a preceding plt.figure() empty and rendered as a blank plot.
fig, ax = plt.subplots(figsize=(10, 6))
X_with_target.boxplot(column="PageValues", by="Revenue", ax=ax, patch_artist=True)
fig.suptitle("")  # drop the automatic "Boxplot grouped by ..." super-title
ax.set_title("PageValues Distribution by Revenue", fontsize=14, fontweight="bold")
ax.set_xlabel("Revenue (Purchase)", fontsize=12)
ax.set_ylabel("PageValues", fontsize=12)
plt.show()

# Descriptive statistics (count, mean, std, quartiles, ...) per Revenue class
print("\nPageValues Statistics by Revenue:")
print(X_with_target.groupby("Revenue")["PageValues"].describe())

## 7. Summary Statistics

In [None]:
# Class counts are computed once. The negative class is derived as
# `total - positives` instead of `(~y).sum()`: `~` is a logical NOT only for a
# boolean target; on a 0/1 integer target it is a bitwise NOT (-1/-2) and would
# report negative counts. The subtraction gives the same result for booleans
# and the correct result for 0/1 integers.
n_labels = len(y)
n_purchase = y.sum()
n_no_purchase = n_labels - n_purchase

n_numeric = len(X.select_dtypes(include=[np.number]).columns)
n_categorical = len(X.select_dtypes(include=["object", "bool"]).columns)

print("=" * 80)
print("DATASET SUMMARY")
print("=" * 80)
print(f"Total samples: {len(X):,}")
print(f"Total features: {X.shape[1]}")
print(f"Numeric features: {n_numeric}")
print(f"Categorical features: {n_categorical}")
print("\nClass distribution:")
print(f"  No Purchase: {n_no_purchase:,} ({n_no_purchase / n_labels * 100:.2f}%)")
print(f"  Purchase: {n_purchase:,} ({n_purchase / n_labels * 100:.2f}%)")
print(f"\nClass imbalance ratio: {n_no_purchase / n_purchase:.2f}:1")
print(f"Missing values: {X.isnull().sum().sum()}")
print("=" * 80)