In [1]:
from pathlib import Path

import altair as alt
import pandas as pd

In [2]:
# Price-quote table, read directly from the public bucket.
PRICES_URL = 'https://autocpi-public.s3.eu-west-2.amazonaws.com/lrpd/db_prices.parquet'
prices_df = pd.read_parquet(PRICES_URL)

# Preview the first rows to check the columns that the later cells rely on
# (quote_date, region, price, item_id).
prices_df.head()

Unnamed: 0,quote_date,shop_code,item_id_raw,region,price,indicator_box,item_id
0,200102.0,808.0,210101,12.0,0.35,Q,210101
1,199603.0,32.0,210101,3.0,0.42,,210101
2,198905.0,3.0,210101,8.0,0.44,,210101
3,199511.0,52.0,210101,2.0,0.64,,210101
4,200105.0,126.0,210101,8.0,0.8,,210101


In [3]:
# Item lookup table (item_id -> description and quote date range), read from
# the same public bucket as the price quotes.
ITEMS_URL = 'https://autocpi-public.s3.eu-west-2.amazonaws.com/lrpd/db_item.parquet'
items_df = pd.read_parquet(ITEMS_URL)

# Preview the first rows.
items_df.head()

Unnamed: 0,item_id,description,date_quote_s,date_quote_e,n_obs
0,210101,LARGE LOAF-WHITE-SLICED-800G,198802,200401,36039
1,210102,LARGE LOAF-WHITE-UNSLICED-800G,198802,202510,56917
2,210105,LARGE WHOLEMEAL LOAF-UNSLICED,198802,200301,27161
3,210106,SIX BREAD ROLLS-WHITE/BROWN,198802,202510,67469
4,210107,"BROWN LOAF,400G,SLICED-GRAN",198903,200401,29361


In [55]:
import altair as alt
import pandas as pd

# Clothing comparisons: the item_ids that represent the men's and women's
# version of each garment, plus the first month (inclusive) to plot.
# An optional 'end_date' key (inclusive) is honoured as well.
comparisons = {
    'Jeans': {
        'mens': [510106],
        'womens': [510248, 510249],
        'start_date': '2004-04-01'
    },
    'Casual Shirt': {
        'mens': [510131, 510126],
        'womens': [510206, 510257],
        'start_date': '2013-02-01'
    },
    'Jacket': {
        'mens': [510104, 510116],
        'womens': [510208, 510219],
        'start_date': '1997-02-01'
    }
}

# (label written to the 'gender' column, key inside each comparison entry)
gender_keys = [('Men', 'mens'), ('Women', 'womens')]

# One frame per (category, gender); stacked into final_df at the end.
all_data = []

for category, items in comparisons.items():
    # Keep only the quotes for the items in this comparison.
    wanted_ids = items['mens'] + items['womens']
    category_data = prices_df.loc[prices_df['item_id'].isin(wanted_ids)].copy()

    # quote_date is a YYYYMM value; turn it into a timestamp.
    category_data['quote_date'] = pd.to_datetime(category_data['quote_date'], format='%Y%m')

    # Restrict to the configured (inclusive) date window, if any.
    if 'start_date' in items:
        on_or_after_start = category_data['quote_date'] >= items['start_date']
        category_data = category_data[on_or_after_start]
    if 'end_date' in items:
        on_or_before_end = category_data['quote_date'] <= items['end_date']
        category_data = category_data[on_or_before_end]

    # Mean price per item per month. Averaging per item first means every
    # item counts equally in the per-gender mean below, regardless of how
    # many quotes it has.
    item_monthly = (
        category_data
        .groupby(['item_id', 'quote_date'])['price']
        .mean()
        .reset_index()
    )

    # Men's rows are appended before women's rows for every category.
    for gender_label, items_key in gender_keys:
        gender_items = item_monthly[item_monthly['item_id'].isin(items[items_key])]
        gender_avg = (
            gender_items
            .groupby('quote_date')['price']
            .mean()
            .reset_index()
            .assign(gender=gender_label, category=category)
        )
        all_data.append(gender_avg)

# Long-form result with columns: quote_date, price, gender, category.
final_df = pd.concat(all_data, ignore_index=True)


# Dropdown that chooses which clothing category is displayed; starts on 'Jeans'.
category_dropdown = alt.binding_select(
    name='Clothing Type: ',
    options=list(comparisons.keys()),
)
category_select = alt.selection_point(
    value='Jeans',
    bind=category_dropdown,
    fields=['category'],
)

# Hover selection keyed on the date nearest to the mouse. empty=False means
# nothing counts as selected until the pointer is over the chart.
hover = alt.selection_point(
    on='mouseover',
    nearest=True,
    fields=['quote_date'],
    empty=False,
)

# Every layer starts from the same data, restricted to the chosen category.
base = alt.Chart(final_df).transform_filter(category_select)

# Layer 1: one price line per gender.
lines = base.mark_line(size=2).encode(
    x=alt.X('quote_date:T').title('Date').axis(format='%Y', grid=False),
    y=(
        alt.Y('price:Q')
        .title('Average Price (£)')
        .scale(zero=False)
        .axis(grid=False)
    ),
    color=(
        alt.Color('gender:N')
        .title('Gender')
        .scale(scheme='category10')
        .legend(symbolType='stroke', symbolStrokeWidth=3)
    ),
)

# Tooltip shown on the hover points.
hover_tooltip = [
    alt.Tooltip('category:N', title='Category'),
    alt.Tooltip('gender:N', title='Gender'),
    alt.Tooltip('quote_date:T', title='Date', format='%Y-%m'),
    alt.Tooltip('price:Q', title='Avg Price', format=',.2f'),
]

# Layer 2: points that are invisible except at the hovered date. This layer
# owns the hover parameter. The legend is suppressed here because the line
# layer already provides one.
points = (
    base.mark_point(size=100)
    .encode(
        x='quote_date:T',
        y='price:Q',
        color=alt.Color('gender:N').legend(None),
        opacity=alt.condition(hover, alt.value(1), alt.value(0)),
        tooltip=hover_tooltip,
    )
    .add_params(hover)
)

# Layer 3: vertical guide line at the hovered date.
rule = (
    base.mark_rule(color='gray', strokeWidth=1)
    .encode(
        x='quote_date:T',
        opacity=alt.condition(hover, alt.value(0.5), alt.value(0)),
    )
    .transform_filter(hover)
)

# Stack the layers, attach the category dropdown and apply sizing/title.
chart1 = (
    (lines + points + rule)
    .add_params(category_select)
    .properties(
        width=800,
        height=450,
        title="Price Comparison: Men's vs Women's Clothing",
    )
    .configure_view(strokeWidth=0)
)

chart1

In [54]:
# Save the clothing comparison chart as JSON.
# Create the output folder first: on a fresh checkout 'charts/' may not exist
# yet, and saving into a missing directory raises FileNotFoundError.
Path('charts').mkdir(parents=True, exist_ok=True)
chart1.save('charts/clothing_price_comparison.json')

In [None]:
import altair as alt
import pandas as pd
import numpy as np

# Region codes mapped to display names, listed in the order used for the
# region dropdown. Codes that are not listed here map to NaN below.
region_names = {
    2: 'London',
    3: 'South East',
    4: 'South West',
    5: 'East Anglia',
    6: 'East Midlands',
    7: 'West Midlands',
    8: 'Yorkshire & Humber',
    9: 'North West',
    10: 'North',
    11: 'Wales',
    12: 'Scotland',
    13: 'Northern Ireland'
}

# Quotes for item 640226 (Private Health Club Annual Fee), with quote_date
# parsed from its YYYYMM form into a timestamp and a readable region_name
# column added next to the numeric region code.
health_club = prices_df.loc[prices_df['item_id'] == 640226].assign(
    quote_date=lambda quotes: pd.to_datetime(quotes['quote_date'], format='%Y%m'),
    region_name=lambda quotes: quotes['region'].map(region_names),
)

# Columns that identify one (month, region) group.
key_cols = ['quote_date', 'region', 'region_name']

# Price deciles (10th to 90th percentile) per month and region, in long form:
# one row per (month, region, quantile level).
# NOTE(review): groupby leaves out rows whose key is NaN by default, so quotes
# whose region code has no entry in region_names do not appear here; confirm
# that this is intended.
deciles = (
    health_club
    .groupby(key_cols)['price']
    .quantile([0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9])
    .reset_index()
)
deciles.columns = key_cols + ['quantile', 'price']

# Adjacent decile pairs that bound each shaded band: (lower, upper, label).
band_pairs = [
    (0.1, 0.2, '10-20th percentile'),
    (0.2, 0.3, '20-30th percentile'),
    (0.3, 0.4, '30-40th percentile'),
    (0.4, 0.5, '40-50th percentile'),
    (0.5, 0.6, '50-60th percentile'),
    (0.6, 0.7, '60-70th percentile'),
    (0.7, 0.8, '70-80th percentile'),
    (0.8, 0.9, '80-90th percentile')
]


def _decile_rows(level):
    """Return the key columns and price for a single quantile level."""
    return deciles.loc[deciles['quantile'] == level, key_cols + ['price']]


# For every band, line up the lower and upper decile of the same month and
# region side by side. Both inputs carry a 'price' column, so the merge
# suffixes turn them into price_lower / price_upper.
bands_data = [
    _decile_rows(lower_q)
    .merge(_decile_rows(upper_q), on=key_cols, suffixes=('_lower', '_upper'))
    .assign(band=band_name)
    for lower_q, upper_q, band_name in band_pairs
]

# Columns: quote_date, region, region_name, price_lower, price_upper, band.
final_bands = pd.concat(bands_data, ignore_index=True)

# Dropdown options: region display names in region_names order, restricted to
# regions that actually have band data. Deriving the list from region_names
# avoids keeping a second hand-written copy of the same names in sync.
regions_with_data = set(final_bands['region_name'].unique())
region_name_list = [name for name in region_names.values() if name in regions_with_data]

# Region dropdown bound to a point selection on region_name; starts on London.
region_dropdown = alt.binding_select(
    options=region_name_list,
    name='Region: ',
)
region_select = alt.selection_point(
    fields=['region_name'],
    bind=region_dropdown,
    value='London'
)

# Colour ramp that is symmetric about the median: the outermost bands
# (10-20th, 80-90th) are lightest and the two bands touching the median
# (40-50th, 50-60th) are darkest.
# The domain is given explicitly so each colour is tied to a named band and
# does not depend on the default sort order of the labels. There are eight
# bands, so the range has eight colours; a single "centre" colour has no band
# to belong to and would only shift the ramp off the median.
band_order = [band_name for _, _, band_name in band_pairs]
half_palette = [
    '#D4EDDA',  # Light green (tails)
    '#A8D5BA',  # Medium-light green
    '#7BC96F',  # Medium green
    '#4CAF50',  # Green (next to the median)
]
band_colors = half_palette + half_palette[::-1]

# Area chart: each band is drawn between its lower (y2) and upper (y) decile.
chart = alt.Chart(final_bands).transform_filter(
    region_select
).mark_area(opacity=0.7).encode(
    x=alt.X('quote_date:T',
            title='Date',
            axis=alt.Axis(format='%Y', grid=False)),
    y=alt.Y('price_upper:Q',
            title='Annual Fee (£)',
            scale=alt.Scale(zero=False),
            axis=alt.Axis(grid=False)),
    y2=alt.Y2('price_lower:Q'),
    color=alt.Color('band:N',
                    scale=alt.Scale(domain=band_order, range=band_colors),
                    legend=None),
    tooltip=[
        alt.Tooltip('quote_date:T', title='Date', format='%Y-%m'),
        alt.Tooltip('region_name:N', title='Region'),
        alt.Tooltip('band:N', title='Percentile Range'),
        alt.Tooltip('price_lower:Q', title='Lower Bound (£)', format=',.2f'),
        alt.Tooltip('price_upper:Q', title='Upper Bound (£)', format=',.2f')
    ]
).add_params(
    region_select
).properties(
    width=800,
    height=450,
    title="Private Health Club Annual Fee by Region"
).configure_view(
    strokeWidth=0
)

chart

In [56]:
# Save the health club fee chart as JSON.
# Create the output folder first: on a fresh checkout 'charts/' may not exist
# yet, and saving into a missing directory raises FileNotFoundError.
Path('charts').mkdir(parents=True, exist_ok=True)
chart.save('charts/health_club_fees_by_region.json')