This notebook can be run to analyse whether a specified event had any impact within the Visa data.

In [None]:
# Directory added to the import path below.
# NOTE(review): hard-coded absolute path — adjust it when running outside this environment.
project_path = "/home/jupyter"
import sys
# Extend the module search path with the project root and the
# ft-geographic-and-timing source directory.
# (presumably where fintrans_toolbox and synthetic_control live — TODO confirm)
sys.path.append(project_path)
sys.path.append(f'{project_path}/ft-geographic-and-timing/src/')
import synthetic_control as sc

from google.cloud import bigquery
import numpy as np
from fintrans_toolbox.src import bq_utils as bq
from fintrans_toolbox.src import table_utils as t
import random
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer 
import pandas as pd
import matplotlib.pyplot as plt
import plotly.express as px

## Specify event details:

In [None]:
# Event configuration: edit the values in this cell, then re-run the notebook.
# Dates appear to be 'YYYYMM' strings; data_from is compared against
# time_period_value in load_data.

host_name = 'Holyhead' # Name of host city, used for formatting
# NOTE(review): Holyhead is normally postal district LL65 — confirm 'LL56' is not a transposition.
host_district = 'LL56' # District of interest
# range(54, 79) yields LL54..LL78, which includes the host district; prep_data
# excludes the host from the surrounding-districts aggregate.
nearby_districts = ['LL33'] + [ f'LL{i}' for i in range(54, 79)] # Surrounding districts that may also be affected
origin_districts = ['BT'] # Cardholder origin districts of interest
international_interest = True # Specify whether international metrics at host_district are of interest
origin_countries = ['REPUBLIC OF IRELAND'] # Cardholder origin country of interest

mcg_interest = 'All' # MCG of interest; matched against the mcg column in load_data

event_month = '202412' # Date of event
treatment_date = '202411' # Pre-event date
data_from = '202201' # Start of time series and index start, '201901' for full

donor_pool_size = 100 # Number of postal districts to use as the donor pool for synthetic control. Use 100 for quick running.

- Total spend, cardholders and transactions at the host district vs. the surrounding districts vs. the rest of the UK

- Year-on-year (Y-Y) and month-on-month (M-M) percentage changes

- If international_interest is True - international metrics + origin country metric

- Synthetic control

# Sum spend

In [None]:
def calc_index_yymm(df, group, need_datetime):
    """Add indexed, month-on-month and year-on-year columns for spend and cardholders.

    Rows are sorted by ``date_time``. Within each ``group`` the ``spend`` and
    ``cardholders`` series are divided by their first (earliest) value to form
    an index. Percentage changes are then taken against the previous row of
    the group (month-on-month) and against the previous row of the group that
    shares the same ``month`` value (year-on-year).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``date_time``, ``spend``, ``cardholders`` and every column
        named in ``group``. ``date_time`` must be datetime-like when
        ``need_datetime`` is true; otherwise a ``month`` column must already
        be present. The frame passed in is left unmodified.
    group : list of str
        Columns identifying one series, e.g. ``['merchant_location']``.
    need_datetime : bool
        When true, derive ``year`` and ``month`` columns from ``date_time``.

    Returns
    -------
    pandas.DataFrame
        A new frame sorted by ``date_time`` with a fresh integer index and the
        added columns ``index_<m>``, ``mm_perc_<m>``, ``mm_perc_<m>_index``,
        ``yy_perc_<m>`` and ``yy_perc_<m>_index`` for ``<m>`` in
        ``spend`` and ``cardholders`` (percentages, i.e. already times 100).

    Notes
    -----
    Changes are computed between adjacent rows, so they only equal true
    month-on-month / year-on-year changes when every series has exactly one
    row per month with no gaps. A first value of zero yields infinite or NaN
    index values.
    """
    # sort_values returns a new frame, so every column added below lands on
    # that new frame and the caller's DataFrame is never mutated.
    df = df.sort_values(['date_time']).reset_index(drop=True)

    if need_datetime:
        df['year'] = df['date_time'].dt.year
        df['month'] = df['date_time'].dt.month

    month_group = group + ['month']
    metrics = ('spend', 'cardholders')

    # Index each series to its first observation within the group.
    for metric in metrics:
        df[f'index_{metric}'] = df.groupby(group)[metric].transform(
            lambda x: x / x.iloc[0]
        )

    # 'mm' compares with the previous row of the series; 'yy' compares with the
    # previous row of the series that falls in the same calendar month.
    for prefix, keys in (('mm', group), ('yy', month_group)):
        # Raw metrics first, then their indexed counterparts (keeps column order stable).
        for source_fmt, suffix in (('{}', ''), ('index_{}', '_index')):
            for metric in metrics:
                source = source_fmt.format(metric)
                df[f'{prefix}_perc_{metric}{suffix}'] = (
                    df.groupby(keys)[source].pct_change(periods=1) * 100
                )

    return df

In [None]:
def load_data(mcg_interest, data_from):
    """Load monthly postal-district Visa spend data from BigQuery.

    Parameters
    ----------
    mcg_interest : str
        Value the ``mcg`` column must equal (e.g. ``'All'``).
    data_from : str
        Earliest ``time_period_value`` to include (inclusive). It is compared
        as a string, so it should use the same format as the column
        (appears to be ``'YYYYMM'`` — TODO confirm).

    Returns
    -------
    pandas.DataFrame
        The query result (``time_period_value``, ``merchant_location``,
        ``spend``, ``transactions``, ``cardholders``) after being passed
        through ``t.create_date_time``.
        # presumably that helper adds the ``date_time`` column used downstream — verify
    """
    client = bigquery.Client()

    # The two user-supplied values are bound as named query parameters instead
    # of being pasted into the SQL text, so a quote or other special character
    # in them cannot break or change the query.
    sql = """SELECT time_period_value, merchant_location, spend, transactions, cardholders
      FROM ons-fintrans-data-prod.fintrans_visa.spend_merchant_location
      WHERE time_period = 'Month' AND 
      merchant_location_level = 'POSTAL_DISTRICT' AND
      cardholder_issuing_level = 'All' AND
      mcg = @mcg_interest AND
      time_period_value >= @data_from
      ORDER BY time_period_value, merchant_location
      """

    job_config = bigquery.QueryJobConfig(
        query_parameters=[
            bigquery.ScalarQueryParameter("mcg_interest", "STRING", mcg_interest),
            bigquery.ScalarQueryParameter("data_from", "STRING", data_from),
        ]
    )

    df_full = client.query(sql, job_config=job_config).to_dataframe()
    df_full = t.create_date_time(df_full)

    return df_full

In [None]:
def prep_data(host_district, nearby_districts, df):
    """Split the data into host / surrounding districts / rest of UK and add metrics.

    Parameters
    ----------
    host_district : str
        ``merchant_location`` value of the district of interest.
    nearby_districts : list of str
        ``merchant_location`` values treated as the surrounding area. The host
        district is excluded from this aggregate even if it is listed here.
    df : pandas.DataFrame
        Needs ``merchant_location``, ``date_time``, ``spend``, ``transactions``
        and ``cardholders`` columns. The frame passed in is left unmodified.

    Returns
    -------
    pandas.DataFrame
        Concatenation (in this order) of the host district rows, the
        ``'Rest of UK'`` aggregate and the ``'Surrounding Districts'``
        aggregate, each processed by ``calc_index_yymm``. Columns present only
        on the host rows are NaN for the two aggregates.
    """
    # Keep only locations whose name contains a digit; this removes the
    # non-district locations. A boolean mask is used so no helper column is
    # written into the caller's frame.
    is_district = df["merchant_location"].str.contains(r"\d", na=False)
    districts = df.loc[is_district]

    is_host = districts['merchant_location'] == host_district
    is_nearby = districts['merchant_location'].isin(nearby_districts)

    def _sum_by_month(rows, label):
        # Collapse many districts into one series per date and label it.
        totals = (
            rows.groupby(["date_time"])
            .agg({"spend": "sum", "transactions": "sum", "cardholders": "sum"})
            .reset_index()
        )
        totals['merchant_location'] = label
        return totals

    host = districts[is_host].copy()
    area = _sum_by_month(districts[is_nearby & ~is_host], 'Surrounding Districts')
    uk = _sum_by_month(districts[~is_nearby & ~is_host], 'Rest of UK')

    area = calc_index_yymm(area, group=['merchant_location'], need_datetime=True)
    uk = calc_index_yymm(uk, group=['merchant_location'], need_datetime=True)
    host = calc_index_yymm(host, group=['merchant_location'], need_datetime=True)

    all_dfs = pd.concat([host, uk, area])

    return all_dfs

In [None]:
# Query the monthly postal-district data for the chosen MCG, then split it into
# host district / surrounding districts / rest of UK and add the index,
# month-on-month and year-on-year columns.
df = load_data(mcg_interest, data_from)
all_dfs = prep_data(host_district, nearby_districts, df)

In [None]:
# NOTE(review): unused scratch code, left commented out. If it is revived, check that
# the '%Y-%d-%m' (year-day-month) output format is intended; otherwise remove this cell.
# from datetime import datetime

# date_object = datetime.strptime(data_from, '%Y%m').date()
# formatted_date = date_object.strftime('%Y-%d-%m')

In [None]:
def plot_metrics(df, metric, title, data_from, mcg_interest, host_district):
    """Show a line chart of ``metric`` over ``date_time``, one line per location.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs ``date_time``, ``merchant_location`` and the ``metric`` column.
    metric : str
        Column plotted on the y-axis. If its name contains ``'index'`` the
        subtitle also states the base period.
    title : str
        Chart title.
    data_from : str
        Base period shown in the subtitle of indexed metrics.
    mcg_interest : str
        MCG label shown in the subtitle.
    host_district : str
        Name of the host trace, which is drawn with the thickest line.

    Returns
    -------
    None
        The figure is displayed with ``fig.show()``.
    """
    # Only indexed metrics mention the base period.
    if 'index' in metric:
        subtitle = f'Indexed to {data_from} | MCG = {mcg_interest}'
    else:
        subtitle = f'MCG = {mcg_interest}'

    fig = px.line(
        df,
        x="date_time",
        y=metric,
        color='merchant_location',
        title=title,
        template='simple_white',
        color_discrete_sequence=["#206095", "#A09FA0", "#871A5B"],
        height=500,
        width=800,
    )
    fig.update_layout(title_subtitle_text=subtitle, legend_title_text='Location:')

    # Per-trace line styling, applied by trace name in this order.
    trace_styles = [
        ('Rest of UK', dict(dash='dash')),
        ('Surrounding Districts', dict(width=1.5)),
        (host_district, dict(width=2.7)),
    ]
    for trace_name, line_style in trace_styles:
        fig.update_traces(selector=dict(name=trace_name), line=line_style)

    fig.show()

In [None]:
# Plot the indexed series so the chart matches its 'Indexed spend' title and
# gets the "Indexed to ..." subtitle. Each location is indexed to its own first
# value, so the three lines can be compared on one axis.
plot_metrics(all_dfs, metric = 'index_spend', 
             title = 'Indexed spend', 
             data_from = data_from, mcg_interest = mcg_interest, host_district = host_district)