In [1]:
import os
import scipy
import numpy as np
import pandas as pd
import geopandas as gpd
from tqdm import tqdm
from sklearn.impute import KNNImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

In [2]:
# Inclusive date window (YYYY-MM-DD): used to filter the mobility reports and
# to build the daily date range every district is expected to cover.
start_date = '2020-03-01'
end_date = '2022-03-31'
# NOTE(review): not referenced by any other cell visible in this notebook.
long_term_exposure_years = 5
# Number of neighbours passed to the KNN imputer in the spatial interpolation step.
N_neighbors_spatial_interpolation = 3
# Passed as `limit` to the linear temporal interpolation, i.e. the maximum
# number of consecutive missing days that get filled.
N_neighbors_temporal_interpolation = 3

In [None]:
# Read the three yearly GB region mobility reports.
mobility_2020 = pd.read_csv(os.path.join("mobility", "2020_GB_Region_mobility_report.csv"))
mobility_2021 = pd.read_csv(os.path.join("mobility", "2021_GB_Region_mobility_report.csv"))
mobility_2022 = pd.read_csv(os.path.join("mobility", "2022_GB_Region_mobility_report.csv"))

mobility_variables = [
    'retail_and_recreation_percent_change_from_baseline',
    'grocery_and_pharmacy_percent_change_from_baseline',
    'parks_percent_change_from_baseline',
    'transit_stations_percent_change_from_baseline',
    'workplaces_percent_change_from_baseline',
    'residential_percent_change_from_baseline',
]

# Exact-match renames of `sub_region_2` values: several report districts are
# folded into a single name so they can be averaged together later on.
merged_district_names = {
    'Suffolk Coastal District': 'East Suffolk District',
    'Waveney District': 'East Suffolk District',
    'London Borough of Hackney': 'Hackney and City of London',
    'City of London': 'Hackney and City of London',
}

# Stack the years and keep only the region keys, the date and the mobility variables.
mobility_df = pd.concat([mobility_2020, mobility_2021, mobility_2022], axis=0)
mobility_df = mobility_df[['sub_region_1', 'sub_region_2', 'date'] + mobility_variables]
mobility_df = mobility_df.assign(date=pd.to_datetime(mobility_df['date']))

# Keep rows inside the inclusive [start_date, end_date] window that carry a sub_region_1.
inside_window = (mobility_df['date'] >= start_date) & (mobility_df['date'] <= end_date)
has_region_1 = ~mobility_df['sub_region_1'].isnull()
mobility_df = mobility_df[inside_window & has_region_1]

# Dates go back to 'YYYY-MM-DD' strings; the later cells compare them as strings.
mobility_df = mobility_df.assign(
    date=mobility_df['date'].dt.strftime('%Y-%m-%d'),
    sub_region_2=mobility_df['sub_region_2'].replace(merged_district_names),
)

mobility_df.head()

In [None]:
# Load the district geometries and collect the distinct district names / ids
# (in order of first appearance) for the later per-district loops.
ltla_geojson_path = os.path.join("gis", 'lad19.geojson')
ltlas_gdf = gpd.GeoDataFrame.from_file(ltla_geojson_path)
districts = ltlas_gdf['district_name'].unique()
district_ids = ltlas_gdf['district_id'].unique()
ltlas_gdf.head()


In [5]:
# Build the lookup that translates mobility-report region names into the
# district names used by `ltlas_gdf`:
#   name_mapping : report name -> district name
#   in_region_1  : districts whose data is reported under `sub_region_1`
#                  (rather than `sub_region_2`)
name_mapping = {}
in_region_1 = []
missing_district_upsampling_map = {
    "Cornwall and Isles of Scilly": "Cornwall",
}

# Hand-written aliases, checked before any naming pattern.
# district name -> (report names mapped to it, whether it is added to in_region_1)
manual_aliases = {
    "Kingston upon Hull, City of": (("Kingston upon Hull",), True),
    "Bristol, City of": (("Bristol City",), True),
    "Herefordshire, County of": (("Herefordshire",), True),
    "Folkestone and Hythe": (("Folkestone & Hythe District",), False),
    "St Albans": (("Saint Albans District",), False),
    "St. Helens": (("Metropolitan Borough of St Helens",), False),
    "Derry City and Strabane": (("Derry and Strabane",), True),
    "Na h-Eileanan Siar": (("Na h-Eileanan an Iar",), True),
    "Orkney Islands": (("Orkney",), True),
    "City of Edinburgh": (("Edinburgh",), True),
    "Somerset West and Taunton": (("Taunton Deane", "West Somerset District"), False),
    "Rhondda Cynon Taf": (("Rhondda Cynon Taff",), True),
}

# Naming patterns tried in this order; for each pattern `sub_region_2` is
# checked before `sub_region_1`.
alias_templates = (
    "%s District",
    "%s Borough",
    "%s County Borough",
    "Borough of %s",
    "Metropolitan Borough of %s",
    "London Borough of %s",
    "Royal Borough of %s",
    "City of %s",
    "City of %s District",
    "%s Council",
    "%s Principal Area",
    "%s Principle Area",  # spelling kept as-is: it has to match the report text verbatim
)

# The report's region names do not change inside the loop, so compute them once.
regions_1 = pd.unique(mobility_df['sub_region_1'])
regions_2 = pd.unique(mobility_df['sub_region_2'])

for district_name in pd.unique(ltlas_gdf['district_name']):
    # Exact match on sub_region_2: nothing to translate.
    if district_name in regions_2:
        continue

    if district_name in manual_aliases:
        report_names, listed_as_region_1 = manual_aliases[district_name]
        for report_name in report_names:
            name_mapping[report_name] = district_name
        if listed_as_region_1:
            in_region_1.append(district_name)
        continue

    for template in alias_templates:
        candidate = template % district_name
        if candidate in regions_2:
            name_mapping[candidate] = district_name
            break
        if candidate in regions_1:
            name_mapping[candidate] = district_name
            in_region_1.append(district_name)
            break
    else:
        # No pattern matched: the district is either reported verbatim under
        # sub_region_1, is scheduled for upsampling, or is unmatched.
        if district_name in regions_1 or district_name in missing_district_upsampling_map:
            in_region_1.append(district_name)
        else:
            # Unmatched districts are only printed so they can be inspected.
            print(district_name)


In [None]:
# Translate report names to district names, then average the rows that now
# share (date, sub_region_1, sub_region_2). dropna=False keeps the groups
# whose sub_region_2 is missing.
for region_column in ('sub_region_1', 'sub_region_2'):
    mobility_df[region_column] = mobility_df[region_column].replace(name_mapping)
group_keys = ['date', 'sub_region_1', 'sub_region_2']
mobility_df = mobility_df.groupby(group_keys, dropna=False)[mobility_variables].mean().reset_index()

# Rows that carry a district's data at the sub_region_1 level (no sub_region_2).
reported_at_region_1 = mobility_df['sub_region_1'].isin(in_region_1) & mobility_df['sub_region_2'].isnull()
mobility_df_1 = mobility_df[reported_at_region_1]
mobility_df_2 = mobility_df[~reported_at_region_1]

# Split the districts the same way so each half is joined on the matching key.
district_in_region_1 = ltlas_gdf['district_name'].isin(in_region_1)
ltlas_gdf_1 = ltlas_gdf[district_in_region_1]
ltlas_gdf_2 = ltlas_gdf[~district_in_region_1]

# Right joins: every district survives, even when it has no mobility rows.
mobility_df_2_merged = pd.merge(
    mobility_df_2, ltlas_gdf_2, left_on="sub_region_2", right_on="district_name", how="right")
mobility_df_1_merged = pd.merge(
    mobility_df_1, ltlas_gdf_1, left_on="sub_region_1", right_on="district_name", how="right")

output_columns = ['district_id', 'district_name', 'date'] + mobility_variables
mobility_df_merged = pd.concat([mobility_df_2_merged, mobility_df_1_merged], axis=0)[output_columns]
mobility_df_merged.head()

In [None]:
# Every district must have one row per calendar day of the study window.
dates = pd.date_range(start=start_date, end=end_date)

# Collect an all-NaN placeholder record for each (district, date) pair that is
# absent from the merged mobility data.
new_records = []
for district_name in districts:
    district_df = mobility_df_merged[mobility_df_merged['district_name'] == district_name]
    district_id = district_df['district_id'].values[0]
    observed_dates = pd.unique(district_df['date'])
    if len(observed_dates) == len(dates):
        # Same number of distinct dates as the window: treated as complete.
        continue
    observed_lookup = set(observed_dates)
    for date in dates:
        date_label = date.strftime('%Y-%m-%d')
        if date_label in observed_lookup:
            continue
        new_record = {
            'district_id': district_id,
            'district_name': district_name,
            'date': date_label,
        }
        new_record.update({mobility_variable: np.nan for mobility_variable in mobility_variables})
        new_records.append(new_record)

missing_records_df = pd.DataFrame.from_records(new_records)

# Districts scheduled for upsampling are represented by their placeholder
# records only; their rows from the merge are left out.
is_upsampled_district = mobility_df_merged['district_name'].isin(missing_district_upsampling_map.keys())
mobility_ltla_gdf = pd.concat([mobility_df_merged[~is_upsampled_district], missing_records_df], axis=0)
mobility_ltla_gdf = mobility_ltla_gdf.sort_values(['district_name', 'date'])
# reset_index() without drop=True keeps the previous labels in an 'index' column.
mobility_ltla_gdf = mobility_ltla_gdf.reset_index()
mobility_ltla_gdf.head()

In [8]:
# Each district must cover exactly as many distinct dates as the study window.
expected_date_count = len(dates)
for district in districts:
    district_dates = mobility_ltla_gdf.loc[mobility_ltla_gdf['district_name'] == district, 'date']
    assert district_dates.nunique(dropna=False) == expected_date_count

# Each date must cover exactly as many distinct districts as the boundary file.
expected_district_count = ltlas_gdf['district_name'].nunique(dropna=False)
for date in dates:
    date_label = date.strftime('%Y-%m-%d')
    districts_on_date = mobility_ltla_gdf.loc[mobility_ltla_gdf['date'] == date_label, 'district_name']
    assert districts_on_date.nunique(dropna=False) == expected_district_count

In [None]:
# Fill districts that have no mobility rows of their own with the values of the
# region named in `missing_district_upsampling_map`, matched by date.
#
# Fixes over the previous per-date loop:
#   * a donor value of 0.0 is a valid observation and is now copied (it used to
#     be skipped by a truthiness test);
#   * a date without a donor row leaves the target NaN instead of raising
#     IndexError;
#   * the donor values are looked up once per district instead of re-filtering
#     both frames for every (variable, date) pair.
for missing_district, upsampled_district in missing_district_upsampling_map.items():
    print("Upsampling missing values for district: %s " % missing_district)

    # One donor row per date; when several rows share a date the first one is
    # used, as before.
    donor_by_date = (
        mobility_df[mobility_df['sub_region_1'] == upsampled_district]
        .drop_duplicates(subset='date', keep='first')
        .set_index('date')[mobility_variables]
    )

    target_rows = mobility_ltla_gdf['district_name'] == missing_district
    target_dates = mobility_ltla_gdf.loc[target_rows, 'date']

    for mobility_variable in mobility_variables:
        # Indexed like the target rows; NaN where the donor has no value for the date.
        donor_values = target_dates.map(donor_by_date[mobility_variable])
        # Only gaps are filled; values already present in the target are kept.
        mobility_ltla_gdf.loc[target_rows, mobility_variable] = (
            mobility_ltla_gdf.loc[target_rows, mobility_variable].fillna(donor_values)
        )

In [None]:
# Temporal interpolation
# Fill short gaps (at most N_neighbors_temporal_interpolation consecutive days)
# inside each district's series by linear interpolation.
#
# The interpolated values must keep the row labels of `mobility_ltla_gdf`:
# `fillna` with a Series aligns on the index, so interpolating on a re-indexed
# copy (labels 0..n-1) only ever filled the district whose rows happen to carry
# those labels and silently left every other district untouched.
# `groupby(...).transform` returns a Series labelled like the original frame.
#
# Assumes rows are ordered by date within each district (the frame is sorted by
# ['district_name', 'date'] when it is built).
for mobility_variable in mobility_variables:
    interpolated = (
        mobility_ltla_gdf
        .groupby('district_name', sort=False)[mobility_variable]
        .transform(lambda district_series: district_series.interpolate(
            method="linear", limit=N_neighbors_temporal_interpolation))
    )
    # Only gaps are filled; observed values are never overwritten.
    mobility_ltla_gdf[mobility_variable] = mobility_ltla_gdf[mobility_variable].fillna(interpolated)
mobility_ltla_gdf.head()

In [None]:
# Spatial interpolation
def spatial_distance(X1, X2, missing_values=None):
    """Euclidean distance between two rows based on their last two entries.

    Only the trailing two values of each row are compared (the callers place
    'district_lon' and 'district_lat' there); every other entry is ignored.
    `missing_values` is accepted but not read here -- presumably it is part of
    the callable-metric signature expected by the imputer; confirm against the
    scikit-learn documentation.
    """
    coords_a, coords_b = X1[-2:], X2[-2:]
    return scipy.spatial.distance.euclidean(coords_a, coords_b)


# Attach the district coordinates: they are the only inputs of the distance
# used by the KNN imputer below.
coordinate_columns = ['district_lon', 'district_lat']
mobility_ltla_gdf = pd.merge(
    mobility_ltla_gdf, ltlas_gdf, on=["district_name", "district_id"], how="left")
mobility_ltla_gdf = mobility_ltla_gdf[
    ['date', 'district_name', 'district_id'] + mobility_variables + coordinate_columns]

# For every day, fill the remaining gaps of a district from its
# N_neighbors_spatial_interpolation nearest districts (distance-weighted).
for date in tqdm(dates, desc="Mobility data spatial interpolation"):
    date = date.strftime('%Y-%m-%d')
    on_date = mobility_ltla_gdf['date'] == date
    day_values = mobility_ltla_gdf.loc[on_date, mobility_variables]

    # KNNImputer drops features that are entirely missing when it is fitted, so
    # its output would have fewer columns than `mobility_variables` and slicing
    # by position would label a coordinate column as a mobility variable.
    # Impute only the variables that have at least one observation on this day;
    # variables without any observation stay NaN.
    imputable_variables = [
        mobility_variable for mobility_variable in mobility_variables
        if day_values[mobility_variable].notna().any()
    ]
    if not imputable_variables:
        continue
    if not day_values[imputable_variables].isna().any().any():
        # Nothing is missing on this day: skip the (expensive) imputer.
        continue

    # Coordinates go last: spatial_distance reads the final two entries of a row.
    mobility_matrix = mobility_ltla_gdf.loc[on_date, imputable_variables + coordinate_columns]
    imputer = KNNImputer(n_neighbors=N_neighbors_spatial_interpolation, weights='distance', metric=spatial_distance)
    imputed_matrix = imputer.fit_transform(mobility_matrix)
    imputed_df = pd.DataFrame(
        data=imputed_matrix[:, :len(imputable_variables)],
        index=mobility_matrix.index,
        columns=imputable_variables,
    )
    # fillna aligns on index and columns, so only the gaps receive imputed values.
    mobility_ltla_gdf.loc[on_date, imputable_variables] = (
        mobility_ltla_gdf.loc[on_date, imputable_variables].fillna(value=imputed_df)
    )

In [12]:
# Remove both coordinate columns in a single call.
mobility_ltla_gdf = mobility_ltla_gdf.drop(columns=['district_lon', 'district_lat'])

In [None]:
# Preview the first rows of the mobility frame.
mobility_ltla_gdf.head()

In [14]:
# Collapse the mobility variables into a single `mobility_index`:
# standardise them, project onto the principal components kept by PCA(.95),
# weight each component by its share of the retained explained variance and
# sum the weighted components per row.
scaler = StandardScaler()
scaled_mobility_data = scaler.fit_transform(mobility_ltla_gdf[mobility_variables])
pca = PCA(.95)
pca.fit(scaled_mobility_data)

# Weights are the explained-variance ratios renormalised to sum to 1.
explained_ratio = pca.explained_variance_ratio_
weight_vector = explained_ratio / explained_ratio.sum()

# Broadcasting scales column k by weight_vector[k].
transformed_mobility = pca.transform(scaled_mobility_data) * weight_vector
mobility_ltla_gdf['mobility_index'] = transformed_mobility.sum(axis=1)

In [15]:
# Cast every numeric output column to float, then write the result with one
# decimal place and "N/A" for missing values.
float_columns = mobility_variables + ['mobility_index']
mobility_ltla_gdf[float_columns] = mobility_ltla_gdf[float_columns].astype(float)
mobility_ltla_gdf.to_csv("../mobility.csv", float_format="%.1f", na_rep="N/A", index=False)
