In [None]:
"""

# ITS 2122 - Phase 5: Data Enrichment  
**Author:** Nethranjali Jayasanki  
**Date:** 2025-08-26  
**Purpose:** Enhance the Online Retail II dataset by integrating external data sources through API calls for enriched insights.

## Overview
This notebook focuses on enriching the cleaned and analyzed dataset by:
- Integrating external currency conversion data via a free API (e.g., ExchangeRate-API or Open Exchange Rates; this notebook queries ExchangeRate-API with GBP as the base currency)  
- Converting the top 100 transactions (ranked by `TotalPrice`) from GBP into USD and EUR  
- Adding two new fields: `TotalPrice_USD` and `TotalPrice_EUR`  
- Demonstrating the business value of enrichment for tasks such as:
  - Financial reporting to international stakeholders  
  - Regional pricing and strategy development  
  - Comparative analysis across currencies

  """


In [None]:
import pandas as pd
import requests
from pathlib import Path

# Location of the cleaned CSV produced by Phase 1.
DATA_PATH = Path("../data/processed") / "online_retail_clean_phase1.csv"

# Read the file when it is present; otherwise stop with an actionable message.
if DATA_PATH.exists():
    df = pd.read_csv(DATA_PATH)
else:
    raise FileNotFoundError(f"{DATA_PATH} not found. Run Phase 1 first.")

# Report the dimensions, then show the first rows as the cell output.
print("Dataset shape:", df.shape)
df.head()

In [None]:
# Line-level revenue: units sold multiplied by the price per unit.
df['TotalPrice'] = df['Quantity'].mul(df['UnitPrice'])

# Rank every row by revenue (largest first) and keep an independent copy of the
# first 100, so later column additions do not touch `df`.
top_100 = (
    df.sort_values(by='TotalPrice', ascending=False)
      .iloc[:100]
      .copy()
)

top_100.head()

In [None]:
# Endpoint returning exchange rates quoted against GBP as the base currency.
# NOTE: this is the `latest` endpoint, so the rates are those available at fetch
# time, not the rates that applied on each transaction's date.
api_url = "https://api.exchangerate-api.com/v4/latest/GBP"

# A timeout keeps the cell from hanging indefinitely if the service is unreachable.
response = requests.get(api_url, timeout=10)
if response.status_code != 200:
    # Include the status code so a failed run can be diagnosed from the message.
    raise Exception(f"Failed to fetch exchange rates (HTTP {response.status_code})")

rates = response.json()['rates']

# Fail with an explicit message if the payload lacks a currency we need.
missing_currencies = [code for code in ('USD', 'EUR') if code not in rates]
if missing_currencies:
    raise KeyError(f"Exchange rate response is missing: {', '.join(missing_currencies)}")

# Get USD and EUR rates (amount of target currency per 1 GBP)
gbp_to_usd = rates['USD']
gbp_to_eur = rates['EUR']

print(f"GBP → USD: {gbp_to_usd}")
print(f"GBP → EUR: {gbp_to_eur}")

In [None]:
# Map each new column to the GBP-based rate used to fill it (USD first, then EUR).
conversion_rates = {
    'TotalPrice_USD': gbp_to_usd,
    'TotalPrice_EUR': gbp_to_eur,
}
for converted_column, gbp_rate in conversion_rates.items():
    top_100[converted_column] = top_100['TotalPrice'] * gbp_rate

# Show the original and converted totals side by side for the first rows.
preview_columns = ['InvoiceNo', 'TotalPrice', 'TotalPrice_USD', 'TotalPrice_EUR']
top_100[preview_columns].head()

In [None]:
# Export folder, relative to the working directory; created on first run and
# reused (without error) on later runs.
OUTPUT_DIR = Path("outputs")
OUTPUT_DIR.mkdir(exist_ok=True)

# Write the enriched top-100 rows as CSV, leaving out the DataFrame index.
enriched_csv_path = OUTPUT_DIR.joinpath("phase5_top100_converted.csv")
top_100.to_csv(enriched_csv_path, index=False)
print("Phase 5 enriched data saved to outputs/phase5_top100_converted.csv")

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Horizontal bar chart of the first 20 rows of `top_100` by USD value.
# InvoiceNo is cast to str so seaborn treats it as a categorical label even if
# read_csv parsed it as an integer column; with two numeric axes seaborn could
# otherwise pick the wrong orientation. orient='h' makes the layout explicit.
# NOTE: rows sharing an InvoiceNo are combined into a single bar by barplot's
# default estimator (the mean).
top_20 = top_100.head(20).assign(
    InvoiceNo=lambda frame: frame['InvoiceNo'].astype(str)
)

fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=top_20, x='TotalPrice_USD', y='InvoiceNo', orient='h', ax=ax)
ax.set_title("Top 20 Transactions in USD")
ax.set_xlabel("Value in USD")
ax.set_ylabel("Invoice No")
plt.show()