# Orders JSON → pandas: Step-by-step Notebook

This notebook shows how to work with a nested JSON dataset for an **order system** using **pandas**.

**What you'll learn:**
- Load JSON and normalize nested structures
- Build `orders` and `order_items` tables
- Compute KPIs (revenue, average order value — AOV)
- Aggregate by date, customer, and product
- Update/append and re-save to JSON/CSV
- Create simple charts with matplotlib


In [None]:
# Imports
import json
from pathlib import Path
from datetime import datetime, timedelta
import random

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# Base directory for the notebook's data files (current directory by default) — adjust as needed.
BASE = Path(".")


In [None]:
# (Skip if you already have a JSON file) — Create sample dataset
# Seed both random sources used by the generator so the sample file is reproducible.
random.seed(42)
np.random.seed(42)

# Customers the generator picks from: (customer_id, name, email) rows expanded into dicts.
customers = [
    {"customer_id": cust_id, "name": cust_name, "email": cust_email}
    for cust_id, cust_name, cust_email in (
        ("C001", "Alice", "alice@example.com"),
        ("C002", "Bob", "bob@example.com"),
        ("C003", "Chai", "chai@example.com"),
        ("C004", "Dao", "dao@example.com"),
    )
]

# Product catalog: (sku, product_name, unit_price, category) rows expanded into dicts.
catalog = [
    {"sku": sku, "product_name": label, "unit_price": price, "category": group}
    for sku, label, price, group in (
        ("P001", "Mechanical Keyboard", 1890.0, "Accessories"),
        ("P002", "Gaming Mouse", 890.0, "Accessories"),
        ("P003", "USB-C Hub", 1290.0, "Accessories"),
        ("P004", "27\" Monitor", 5990.0, "Displays"),
        ("P005", "Laptop Stand", 790.0, "Accessories"),
    )
]

def make_order(order_idx, start_date, customer_pool=None, product_catalog=None):
    """Build one synthetic order as a plain, JSON-serializable dict.

    Parameters
    ----------
    order_idx : int
        Sequence number; the order id is ``f"ORD-{1000 + order_idx}"``.
    start_date : datetime
        Base timestamp. The order timestamp is offset from it by a random
        0–9 days, 9–19 hours and 0–59 minutes.
    customer_pool : list of dict, optional
        Customers to pick from. Defaults to the module-level ``customers``.
    product_catalog : list of dict, optional
        Products to pick from; each needs ``sku``, ``product_name``,
        ``unit_price`` and ``category``. Defaults to the module-level ``catalog``.

    Returns
    -------
    dict
        Order with keys ``order_id``, ``order_date`` (ISO string), ``customer``,
        ``status``, ``payment``, ``shipping_address``, ``items``,
        ``shipping_fee`` and ``discount``. ``payment["amount"]`` is
        items subtotal + shipping fee - discount, rounded to 2 decimals.

    Notes
    -----
    Draws from both ``random`` and ``np.random``. The order of the draws is
    kept exactly as before so a seeded run still produces the same sample.
    """
    if customer_pool is None:
        customer_pool = customers
    if product_catalog is None:
        product_catalog = catalog

    order_dt = start_date + timedelta(
        days=int(np.random.randint(0, 10)),
        hours=int(np.random.randint(9, 20)),
        minutes=int(np.random.randint(0, 60)),
    )
    order_id = f"ORD-{1000 + order_idx}"
    # Copy the picked customer so orders do not share (and cannot mutate) the fixture dict.
    customer = dict(random.choice(customer_pool))
    n_items = int(np.random.randint(1, 4))
    items = []
    for _ in range(n_items):
        p = random.choice(product_catalog)
        qty = int(np.random.randint(1, 4))
        items.append({
            "sku": p["sku"],
            "product_name": p["product_name"],
            "unit_price": p["unit_price"],
            "quantity": qty,
            "category": p["category"]
        })
    subtotal = sum(i["unit_price"] * i["quantity"] for i in items)
    shipping_fee = float(np.random.choice([0.0, 60.0, 80.0, 100.0]))
    discount = float(np.random.choice([0.0, 50.0, 100.0, 150.0]))
    total = subtotal + shipping_fee - discount
    # np.random.choice returns numpy scalars; cast every chosen label to a plain str
    # so the payload contains only built-in types.
    status = str(np.random.choice(["paid", "paid", "paid", "pending", "refunded"], p=[0.6, 0.15, 0.1, 0.1, 0.05]))
    payment_method = str(np.random.choice(["credit_card", "promptpay", "bank_transfer"]))
    city = str(np.random.choice(["Bangkok", "Chiang Mai", "Khon Kaen", "Phuket"]))
    currency = "THB"

    return {
        "order_id": order_id,
        "order_date": order_dt.isoformat(),
        "customer": customer,
        "status": status,
        "payment": {"method": payment_method, "currency": currency, "amount": round(float(total), 2)},
        "shipping_address": {"city": city, "country": "TH"},
        "items": items,
        "shipping_fee": shipping_fee,
        "discount": discount
    }

# Generate 15 sample orders (indices 1..15) and write them as pretty-printed JSON under BASE.
start_date = datetime(2025, 9, 5, 9, 0, 0)
orders = [make_order(order_no, start_date) for order_no in range(1, 16)]

raw_path = BASE / "orders.json"
raw_path.write_text(json.dumps(orders, ensure_ascii=False, indent=2), encoding="utf-8")
raw_path

In [None]:
# Load JSON and build two tables: `orders_df` (one row per order) and `items_df` (one row per order line).
# `json`, `pd` and `np` come from the imports cell at the top of the notebook.
# Read from BASE so this cell looks in the same place the sample file was written.
with (BASE / 'orders.json').open('r', encoding='utf-8') as f:
    data = json.load(f)

# Flatten nested dicts into columns such as customer_name / payment_amount;
# the nested `items` list is dropped here and expanded separately below.
orders_df = pd.json_normalize(data, sep='_').drop(columns=['items'])

# One record per item, carrying the parent order's id, date, customer and status.
items_records = [
    {
        'order_id': order['order_id'],
        'order_date': order['order_date'],
        'customer_id': order['customer']['customer_id'],
        'customer_name': order['customer']['name'],
        'status': order['status'],
        'sku': it['sku'],
        'product_name': it['product_name'],
        'category': it['category'],
        'unit_price': it['unit_price'],
        'quantity': it['quantity'],
        'line_total': it['unit_price'] * it['quantity'],
    }
    for order in data
    for it in order['items']
]
items_df = pd.DataFrame(items_records)

# Unparseable dates become NaT instead of raising.
orders_df['order_date'] = pd.to_datetime(orders_df['order_date'], errors='coerce')
items_df['order_date'] = pd.to_datetime(items_df['order_date'], errors='coerce')

# Recompute each order's total from its lines and flag whether it agrees with the recorded payment amount.
order_line_totals = items_df.groupby('order_id', as_index=False)['line_total'].sum().rename(columns={'line_total':'items_subtotal'})
orders_df = orders_df.merge(order_line_totals, on='order_id', how='left')
orders_df['computed_total'] = orders_df['items_subtotal'] + orders_df['shipping_fee'] - orders_df['discount']
orders_df['payment_amount'] = orders_df['payment_amount'].astype(float)
orders_df['total_match'] = np.isclose(orders_df['computed_total'].round(2), orders_df['payment_amount'].round(2))

orders_df.head()

In [None]:
# KPIs — revenue and AOV (average order value) count only orders whose status is 'paid'
paid_amounts = orders_df.loc[orders_df['status'] == 'paid', 'payment_amount']

num_orders = len(orders_df)
# Name the id column explicitly instead of taking "the first column containing 'customer'",
# which silently depends on column order.
num_customers = orders_df['customer_customer_id'].nunique()
total_revenue = paid_amounts.sum()
aov = paid_amounts.mean()  # NaN when there are no paid orders
num_orders, num_customers, round(total_revenue, 2), round(aov, 2)

In [None]:
# Revenue by day: sum of payment amounts of paid orders, grouped by calendar date
revenue_by_day = (
    orders_df[orders_df['status'] == 'paid']
    .assign(order_day=lambda paid: paid['order_date'].dt.date)
    .groupby('order_day')['payment_amount']
    .sum()
    .reset_index(name='revenue_THB')
)
revenue_by_day.head()

In [None]:
# Plot: revenue by day (one plot, default colors)
# `plt` comes from the imports cell at the top of the notebook; no re-import needed here.
fig, ax = plt.subplots()
ax.plot(revenue_by_day['order_day'], revenue_by_day['revenue_THB'], marker='o')
ax.set_title('Revenue by Day (THB)')
ax.set_xlabel('Day')
ax.set_ylabel('Revenue (THB)')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Top customers by revenue — only 'paid' orders count as revenue,
# matching the definition used for the KPIs and the revenue-by-day table.
# Column name depends on the separator used when flattening the JSON.
name_col = 'customer_name' if 'customer_name' in orders_df.columns else 'customer.name'
top_customers = (
    orders_df.loc[orders_df['status'] == 'paid']
    .groupby(name_col, as_index=False)['payment_amount']
    .sum()
    .sort_values('payment_amount', ascending=False)
    .head(5)
    .rename(columns={'payment_amount': 'revenue_THB', name_col: 'customer_name'})
)
top_customers

In [None]:
# Plot: top customers (bar chart, one bar per customer)
fig, ax = plt.subplots()
ax.bar(top_customers['customer_name'], top_customers['revenue_THB'])
ax.set_title('Top Customers by Revenue (THB)')
ax.set_xlabel('Customer')
ax.set_ylabel('Revenue (THB)')
ax.tick_params(axis='x', labelrotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Product performance — units and line revenue from 'paid' orders only,
# so the figures use the same revenue definition as the KPIs (pending/refunded lines are excluded).
# Uses the status captured in items_df when the tables were built.
product_perf = (
    items_df.loc[items_df['status'] == 'paid']
    .groupby(['sku', 'product_name'], as_index=False)
    .agg(units_sold=('quantity', 'sum'), revenue_THB=('line_total', 'sum'))
    .sort_values(['revenue_THB', 'units_sold'], ascending=False)
)
product_perf.head()

In [None]:
# Update one pending order to paid (if any)
# Only 'pending' orders qualify; a refunded order must not be flipped back to paid.
# NOTE: this edits orders_df in place, so each re-run converts the next pending order.
pending = orders_df.loc[orders_df['status'] == 'pending']
if not pending.empty:
    target = pending.iloc[0]['order_id']
    orders_df.loc[orders_df['order_id'] == target, 'status'] = 'paid'
    print('Updated', target, 'to paid')
else:
    print('No pending orders to update.')

In [None]:
# Append a new order and re-save JSON/CSVs
new_order = {
    "order_id": "ORD-1999",
    "order_date": datetime(2025,9,15,12,0,0).isoformat(),
    "customer": {"customer_id":"C002","name":"Bob","email":"bob@example.com"},
    "status": "paid",
    "payment": {"method": "credit_card", "currency": "THB"},
    "shipping_address": {"city": "Bangkok", "country": "TH"},
    "items": [
        {"sku":"P003","product_name":"USB-C Hub","unit_price":1290.0,"quantity":1,"category":"Accessories"},
        {"sku":"P005","product_name":"Laptop Stand","unit_price":790.0,"quantity":1,"category":"Accessories"}
    ],
    "shipping_fee": 60.0,
    "discount": 150.0
}
# Derive the payment amount from the order itself (items + shipping - discount = 1990.0)
# so it always agrees with the line items.
new_order["payment"]["amount"] = round(
    sum(it["unit_price"] * it["quantity"] for it in new_order["items"])
    + new_order["shipping_fee"]
    - new_order["discount"],
    2,
)

# Load original file, append, and save
with (BASE / 'orders.json').open('r', encoding='utf-8') as f:
    arr = json.load(f)

# Carry over status edits made on orders_df so the saved JSON reflects them too.
status_by_id = dict(zip(orders_df['order_id'], orders_df['status']))
for order in arr:
    order['status'] = status_by_id.get(order['order_id'], order['status'])

arr.append(new_order)
with (BASE / 'orders_updated.json').open('w', encoding='utf-8') as f:
    json.dump(arr, f, ensure_ascii=False, indent=2)

# Save tables too
# NOTE: these tables were built from orders.json, so ORD-1999 is not in them;
# re-run the load cell against orders_updated.json to include it.
orders_df.to_csv(BASE / 'orders_clean.csv', index=False)
items_df.to_csv(BASE / 'order_items.csv', index=False)

print('Wrote orders_updated.json, orders_clean.csv, order_items.csv')