# EDA — Week 2 Analytics

Load processed data and explore patterns in orders and users.

In [None]:
import sys
from pathlib import Path

import pandas as pd

# Locate the project root: the parent of the directory that holds this file.
# `__file__` is not defined when this code runs as a notebook cell or in an
# interactive session, so fall back to the current working directory there.
try:
    ROOT = Path(__file__).resolve().parents[1]
except NameError:
    # Assumes the kernel was started in the notebook's own directory
    # (Jupyter's default) — TODO confirm for other launch setups.
    ROOT = Path.cwd().resolve().parent

# Make the project's `src` directory importable for later cells.
if str(ROOT / 'src') not in sys.path:
    sys.path.insert(0, str(ROOT / 'src'))

# Load the processed tables and report their (rows, columns) shapes.
orders = pd.read_parquet(ROOT / 'data' / 'processed' / 'orders_clean.parquet')
users = pd.read_parquet(ROOT / 'data' / 'processed' / 'users.parquet')
print('orders', orders.shape)
print('users', users.shape)

# NOTE(review): later cells reference `analytics` and `plt`, which are not
# defined in the visible cells — confirm they are created before those cells
# run.

## Summary Statistics

In [None]:
# Descriptive statistics for the analytics table.
print('Analytics Summary Stats:')
summary_stats = analytics.describe()
print(summary_stats)

# Row count per cleaned status value; missing statuses are counted too.
print('\nStatus breakdown:')
status_tally = analytics['status_clean'].value_counts(dropna=False)
print(status_tally)

# Number of missing values in each column.
print('\nMissing values:')
missing_per_column = analytics.isna().sum()
print(missing_per_column)

## Visualizations

In [None]:
# Figure 1: histogram of the non-missing order amounts, saved as a PNG.
fig, ax = plt.subplots()
amounts = analytics['amount'].dropna()
amounts.hist(bins=10, ax=ax, edgecolor='black')
ax.set(
    xlabel='Amount',
    ylabel='Frequency',
    title='Distribution of Order Amounts',
)

# Create the output directory on demand before writing the image.
fig_path = ROOT.joinpath('reports', 'figures', 'amount_distribution.png')
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, dpi=100, bbox_inches='tight')
plt.close()
print(f'Saved: {fig_path}')

In [None]:
# Figure 2: Status breakdown
# Bar chart of how many rows fall under each cleaned status value
# (missing statuses are counted as their own bar).
fig, ax = plt.subplots(figsize=(8, 5))
status_counts = analytics['status_clean'].value_counts(dropna=False)
status_counts.plot(kind='bar', ax=ax, color='steelblue')
ax.set_xlabel('Status')
ax.set_ylabel('Count')
ax.set_title('Orders by Status')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
fig_path = ROOT / 'reports' / 'figures' / 'status_breakdown.png'
# savefig does not create missing directories; make sure the target exists so
# this cell works even when no earlier figure cell has been run.
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, dpi=100, bbox_inches='tight')
# Close this specific figure rather than whichever one is "current".
plt.close(fig)
print(f'Saved: {fig_path}')

In [None]:
# Figure 3: Country coverage
# Horizontal bar chart of row counts per country value (missing countries are
# counted as their own bar).
fig, ax = plt.subplots(figsize=(8, 5))
country_counts = analytics['country'].value_counts(dropna=False)
country_counts.plot(kind='barh', ax=ax, color='coral')
ax.set_xlabel('Count')
ax.set_title('Orders by Country')
fig.tight_layout()
fig_path = ROOT / 'reports' / 'figures' / 'country_breakdown.png'
# savefig does not create missing directories; make sure the target exists so
# this cell works even when no earlier figure cell has been run.
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, dpi=100, bbox_inches='tight')
# Close this specific figure rather than whichever one is "current".
plt.close(fig)
print(f'Saved: {fig_path}')

In [None]:
# Figure 4: Amount by status (box plot)
# One box per cleaned status value, showing the spread of order amounts.
fig, ax = plt.subplots(figsize=(9, 5))
analytics.boxplot(column='amount', by='status_clean', ax=ax)
ax.set_xlabel('Status')
ax.set_ylabel('Amount')
ax.set_title('Amount Distribution by Status')
# Remove the automatic "grouped by" super-title that boxplot(by=...) adds;
# target this figure explicitly instead of the implicit current figure.
fig.suptitle('')
fig.tight_layout()
fig_path = ROOT / 'reports' / 'figures' / 'amount_by_status.png'
# savefig does not create missing directories; make sure the target exists so
# this cell works even when no earlier figure cell has been run.
fig_path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(fig_path, dpi=100, bbox_inches='tight')
# Close this specific figure rather than whichever one is "current".
plt.close(fig)
print(f'Saved: {fig_path}')

In [None]:
# Summary output
# Print the ETL run metadata (everything except the 'config' entry) and list
# the PNG figures present in reports/figures/.
import json  # imported here so the cell does not depend on an earlier import

print('\n=== ETL Run Metadata ===')
meta_path = ROOT / 'data' / 'processed' / '_run_meta.json'
# Read as UTF-8 explicitly; the default text encoding is platform-dependent.
with open(meta_path, encoding='utf-8') as f:
    meta = json.load(f)

for key, val in meta.items():
    # The 'config' entry is skipped and never printed.
    if key != 'config':
        print(f'{key}: {val}')

print('\n=== Figures saved to reports/figures/ ===')
figures_dir = ROOT / 'reports' / 'figures'
# Sorted so the listing is stable between runs.
for fig_file in sorted(figures_dir.glob('*.png')):
    print(f'  - {fig_file.name}')