In [1]:
import os
import pandas as pd
import geopandas as gpd
import numpy as np

In [2]:
# Study window as ISO 'YYYY-MM-DD' strings; later cells treat both ends as inclusive.
start_date, end_date = "2020-03-01", "2022-03-31"

In [3]:
# Load the four per-metric tables that live side by side in the "covid" folder.
covid_dir = "covid"
cases, deaths, tests, vaccination = (
    pd.read_csv(os.path.join(covid_dir, file_name))
    for file_name in ("cases.csv", "deaths.csv", "tests.csv", "vaccination.csv")
)

In [None]:
# Combine the per-metric tables into one frame keyed by area and day. The outer
# join keeps area/day pairs that appear in only some of the tables.
merge_keys = ["areaCode", "areaName", "areaType", "date"]
covid_df = cases
for metric_df in (deaths, tests, vaccination):
    covid_df = pd.merge(covid_df, metric_df, on=merge_keys, how="outer")

# Parse the dates to clip the frame to [start_date, end_date] (both inclusive),
# then store them as 'YYYY-MM-DD' strings again for the later cells.
covid_df['date'] = pd.to_datetime(covid_df['date'])
in_window = (covid_df["date"] >= start_date) & (covid_df["date"] <= end_date)
covid_df = covid_df[in_window]
covid_df['date'] = covid_df['date'].dt.strftime('%Y-%m-%d')

# Metric columns carried through the rest of the notebook.
covid_variables = [
    "newCasesBySpecimenDate",
    "newCasesPCROnlyBySpecimenDate",
    "newDeaths28DaysByDeathDate",
    "newPCRTestsBySpecimenDate",
    "newVirusTestsBySpecimenDate",
    "newPeopleVaccinatedFirstDoseByVaccinationDate",
    "newPeopleVaccinatedSecondDoseByVaccinationDate",
    "newPeopleVaccinatedThirdInjectionByVaccinationDate",
]
covid_df.head()

In [None]:
# Number of distinct area codes present in the merged COVID table.
unique_area_codes = covid_df['areaCode'].unique()
len(unique_area_codes)

In [None]:
# Number of distinct area codes whose code starts with "E".
area_codes = covid_df['areaCode']
e_prefixed_codes = area_codes[area_codes.str.startswith("E")]
len(e_prefixed_codes.unique())

In [None]:
# Read the district boundary file and keep the distinct district names and ids
# for the completeness checks further down.
lad_path = os.path.join("gis", 'lad19.geojson')
ltlas_gdf = gpd.GeoDataFrame.from_file(lad_path)
districts = ltlas_gdf['district_name'].unique()
district_ids = ltlas_gdf['district_id'].unique()
ltlas_gdf.head()

In [None]:
# Attach the COVID rows to the districts. The left join keeps every district
# from the boundary file, including those with no matching COVID rows.
covid_gdf = pd.merge(
    ltlas_gdf,
    covid_df,
    how="left",
    left_on="district_id",
    right_on="areaCode",
)
kept_columns = ['district_name', 'district_id', 'date', *covid_variables]
covid_gdf = covid_gdf[kept_columns]
covid_gdf.head()


In [None]:
# Give every district exactly one row per day of the study window: days with no
# reported data are added with every COVID variable set to NaN.
dates = pd.date_range(start=start_date, end=end_date)
date_labels = dates.strftime('%Y-%m-%d')  # same 'YYYY-MM-DD' text as covid_gdf['date']

new_records = []
for district_name in districts:
    district_df = covid_gdf[covid_gdf['district_name'] == district_name]
    district_id = district_df['district_id'].values[0]
    # Build the lookup of already-present days once per district instead of
    # recomputing and scanning .unique() for every single date.
    present_dates = set(district_df['date'].dropna())
    for date_label in date_labels:
        if date_label not in present_dates:
            new_records.append({
                'district_id': district_id,
                'district_name': district_name,
                'date': date_label,
                **dict.fromkeys(covid_variables, np.nan),
            })

missing_records_df = pd.DataFrame.from_records(new_records)
covid_gdf = pd.concat([covid_gdf, missing_records_df], axis=0)
# Districts without any COVID rows came out of the left join with a null date;
# their real days were just added above, so the placeholder row can go. Dropping
# it before re-indexing keeps the final index contiguous.
covid_gdf = covid_gdf[covid_gdf['date'].notnull()]
covid_gdf = covid_gdf.sort_values(['district_name', 'date']).reset_index(drop=True)
covid_gdf.head()


In [10]:
# Every district must have a row for each day in the study window.
expected_day_count = len(dates)
for district in districts:
    district_dates = covid_gdf.loc[covid_gdf['district_name'] == district, 'date']
    assert len(district_dates.unique()) == expected_day_count

# Every day must have a row for each district in the boundary file.
expected_district_count = len(ltlas_gdf['district_name'].unique())
for date in dates:
    one_day_df = covid_gdf[covid_gdf['date'] == date.strftime('%Y-%m-%d')]
    assert len(one_day_df['district_name'].unique()) == expected_district_count

In [None]:
# Build the nation-level frame (pillar tests + vaccinations) with exactly one
# row per nation and day of the study window.
dates = pd.date_range(start=start_date, end=end_date)
date_labels = dates.strftime('%Y-%m-%d')  # 'YYYY-MM-DD' text, matching the CSV 'date' column

test_data_nation_level = pd.read_csv(os.path.join("covid", "nation_pillar_tests.csv"))
test_data_nation_level = test_data_nation_level.drop('areaType', axis=1)
vacc_data_nation_level = pd.read_csv(os.path.join("covid","nation_vaccination.csv"))
vacc_data_nation_level = vacc_data_nation_level.drop('areaType', axis=1)
covid_nation_df = pd.merge(
    test_data_nation_level,
    vacc_data_nation_level,
    on=['areaCode', 'areaName', 'date'],
    how="outer",
)

# Columns that are filled with NaN on days a nation has no row for.
nation_variables = ['newPillarOneTwoTestsByPublishDate',
                    'newPeopleVaccinatedFirstDoseByPublishDate',
                    'newPeopleVaccinatedSecondDoseByPublishDate',
                    'newPeopleVaccinatedThirdInjectionByPublishDate']

new_records = []
for nation_id in covid_nation_df['areaCode'].unique():
    nation_df = covid_nation_df[covid_nation_df['areaCode'] == nation_id]
    nation_name = nation_df['areaName'].values[0]
    # Build the lookup of already-present days once per nation instead of
    # recomputing and scanning .unique() for every single date.
    present_dates = set(nation_df['date'].dropna())
    for date_label in date_labels:
        if date_label not in present_dates:
            new_records.append({
                'areaCode': nation_id,
                'areaName': nation_name,
                'date': date_label,
                **dict.fromkeys(nation_variables, np.nan),
            })

missing_records_df = pd.DataFrame.from_records(new_records)
covid_nation_df = pd.concat([covid_nation_df, missing_records_df], axis=0)
# Keep only days inside the study window (this also removes rows with a null
# date). The district-level frame is clipped the same way, and later cells copy
# nation values onto district rows by position, so both must cover the same days.
covid_nation_df = covid_nation_df[covid_nation_df['date'].isin(date_labels)]
covid_nation_df = covid_nation_df.sort_values(['areaName', 'date']).reset_index(drop=True)
covid_nation_df.head()


In [12]:
# For missing testing data (at the local authority level) in Scotland, Wales, Northern Ireland:
# estimate each district's daily tests as the nation's daily total scaled by the
# district's share of the nation's population.
census_df = pd.read_csv('../census.csv')

# First letter of the district code -> nation name as used in covid_nation_df['areaName'].
nation_by_prefix = {"N": "Northern Ireland", "S": "Scotland", "W": "Wales"}
# National population = sum over all census districts sharing the prefix;
# computed once per nation rather than once per district.
nation_population = {
    prefix: census_df[census_df['district_id'].str.startswith(prefix)]['population'].sum()
    for prefix in nation_by_prefix
}

for district_id in covid_gdf['district_id'].unique():
    prefix = district_id[:1]
    if prefix not in nation_by_prefix:
        # "E" districts keep their reported values. Any other unrecognised
        # prefix is left untouched instead of reusing the previous nation's data.
        continue

    district_population = census_df[census_df['district_id'] == district_id]['population'].values[0]
    nation_df = covid_nation_df[covid_nation_df['areaName'] == nation_by_prefix[prefix]]
    total_nation_population = nation_population[prefix]

    # Scale the national daily totals by the population share. The values are
    # assigned by position, which relies on nation_df and the district's rows
    # both being sorted by date and covering the same days.
    total_test_nation_level = nation_df['newPillarOneTwoTestsByPublishDate'].values
    estimated_test_district_level = total_test_nation_level * district_population / total_nation_population
    covid_gdf.loc[(covid_gdf['district_id'] == district_id), 'newVirusTestsBySpecimenDate'] = estimated_test_district_level

In [13]:
# Fill the district-level vaccination columns for Wales and Northern Ireland
# from the nation-level totals, scaled by each district's population share.
census_df = pd.read_csv('../census.csv')
vacc_data_nation_level = pd.read_csv(os.path.join("covid", "nation_vaccination.csv"))

publish_date_variables = ['newPeopleVaccinatedFirstDoseByPublishDate',
                          'newPeopleVaccinatedSecondDoseByPublishDate',
                          'newPeopleVaccinatedThirdInjectionByPublishDate']

for district_id in covid_gdf['district_id'].unique():
    # Districts whose code starts with "E" or "S" are not touched by this cell.
    if district_id.startswith("E") or district_id.startswith("S"):
        continue

    district_population = census_df.loc[census_df['district_id'] == district_id, 'population'].values[0]

    # Select the nation frame and national population that match the code prefix.
    for prefix, nation_name in (("N", "Northern Ireland"), ("W", "Wales")):
        if district_id.startswith(prefix):
            nation_df = covid_nation_df[covid_nation_df['areaName'] == nation_name]
            total_nation_population = census_df[census_df['district_id'].str.startswith(prefix)]['population'].sum()

    district_rows = covid_gdf['district_id'] == district_id
    for variable in publish_date_variables:
        # Nation columns are "...ByPublishDate"; district columns are "...ByVaccinationDate".
        target_column = variable.replace("PublishDate", "VaccinationDate")
        total_vacc_nation_level = nation_df[variable].values
        estimated_vacc_district_level = total_vacc_nation_level * district_population / total_nation_population
        covid_gdf.loc[district_rows, target_column] = estimated_vacc_district_level


In [14]:
# Replace the long source column names with the short names used in the export.
short_column_names = {
    "newCasesBySpecimenDate": "cases",
    "newCasesPCROnlyBySpecimenDate": "cases_PCR",
    "newDeaths28DaysByDeathDate": "deaths",
    "newPCRTestsBySpecimenDate": "tests_PCR",
    "newVirusTestsBySpecimenDate": "tests",
    "newPeopleVaccinatedFirstDoseByVaccinationDate": "vacc_first_dose",
    "newPeopleVaccinatedSecondDoseByVaccinationDate": "vacc_second_dose",
    "newPeopleVaccinatedThirdInjectionByVaccinationDate": "vacc_third_injection",
}
covid_gdf = covid_gdf.rename(columns=short_column_names)


In [15]:
# Fix the final column order (identifiers first, then metrics) and store every
# metric column as float.
covid_variables = [
    'cases', 'cases_PCR', 'deaths', 'tests', 'tests_PCR',
    'vacc_first_dose', 'vacc_second_dose', 'vacc_third_injection',
]
identifier_columns = ['district_name', 'district_id', 'date']
covid_gdf = covid_gdf[identifier_columns + covid_variables]
covid_gdf[covid_variables] = covid_gdf[covid_variables].astype(float)


In [16]:
# Write the final table without the index: floats get one decimal place and
# missing values are written as "N/A".
covid_gdf.to_csv(
    "../covid.csv",
    index=False,
    na_rep="N/A",
    float_format="%.1f",
)