In [1]:
import os
import scipy
import numpy as np
import pandas as pd
import geopandas as gpd
from sklearn.impute import KNNImputer

In [2]:
# Notebook-level parameters: a date range and a neighbour count.
# NOTE(review): none of these names is referenced in the cells visible here —
# presumably they are consumed elsewhere; confirm before removing.
start_date, end_date = "2020-03-01", "2022-03-31"
N_neighbors_spatial_interpolation = 3

In [3]:
# # https://opendata.camden.gov.uk/Maps/Postcodes-Local-Authorities-only-v01/g3bz-7ur8
# # load mapping from postcode to local authority
# postcode_mapping = pd.read_csv("../gis/postcodes_to_las.csv")
# postcode_mapping.head()

In [None]:
# NHS Postcode Directory (UK full, May 2019):
# https://geoportal.statistics.gov.uk/datasets/ons::nhs-postcode-directory-uk-full-may-2019/about
#
# The CSV has no header row, so columns are addressed by position. Only
# positions 0, 1 and 8 are used below, so `usecols` restricts parsing to those
# three instead of loading every column of the directory and discarding the rest.
postcode_mapping = pd.read_csv(
    "gis/NHSPD_MAY_2019_UK_FULL/Data/nhg19may.csv",
    low_memory=False,
    header=None,
    usecols=[0, 1, 8],
)
# Explicit re-selection pins the column order before the positional rename.
postcode_mapping = postcode_mapping[[0, 1, 8]]
postcode_mapping.columns = ['Postcode 1', 'Postcode 2', 'Local Authority Code']
postcode_mapping.head()

In [5]:
# Condition name -> code string. The values are matched against the `areaid`
# column of the QOF achievement table; the keys become the output column names.
disease_coding = dict(
    Asthma="ASTHMA 1",
    COPD="COPD 1",
    Dementia="DEM 1",
    Diabetes="DM 1",
    Hypertension="BP 1",
    Obesity="Obesity 1",
    Stroke="STROKE 1",
)

In [None]:
# District table read from the lad19 GeoJSON. The distinct district names and
# ids are extracted once here because later cells loop over `district_ids`.
ltlas_gdf = gpd.GeoDataFrame.from_file(os.path.join("gis", 'lad19.geojson'))
districts = ltlas_gdf['district_name'].unique()
district_ids = ltlas_gdf['district_id'].unique()
ltlas_gdf.head()

In [None]:
# Display the rows whose district id begins with "N" (the ids used for the
# Northern Ireland health-trust mapping all have this prefix).
ltlas_gdf.loc[ltlas_gdf['district_id'].str.startswith("N")]

In [8]:
# Health trust -> district ids it covers.
# Source map: https://www.nisra.gov.uk/sites/nisra.gov.uk/files/publications/Maps-of-NI-Health-Trusts-and-Local-Government-Districts.pdf
# Note: "N09000007" is listed under both Belfast and South Eastern, and
# "N09000010" under both South Eastern and Southern, so anything that reduces
# this to a single trust per district depends on the dict's iteration order.
NI_health_trust_to_local_authority = {
    "Belfast": [
        "N09000003",
        "N09000007",
    ],
    "Northern": [
        "N09000001",
        "N09000004",
        "N09000008",
        "N09000009",
    ],
    "South Eastern": [
        "N09000011",
        "N09000007",
        "N09000010",
    ],
    "Southern": [
        "N09000002",
        "N09000010",
    ],
    "Western": [
        "N09000005",
        "N09000006",
    ],
}

# Smoking figure per health trust, keyed by the same trust names as above.
# Source: https://www.health-ni.gov.uk/publications/health-survey-northern-ireland-first-results-201819
# (presumably percentages — TODO confirm against the survey tables)
NI_smoking_prevalence = dict([
    ("Belfast", 20.4688930448713),
    ("Northern", 17.5673075712139),
    ("South Eastern", 15.2428542967955),
    ("Southern", 20.4448074256214),
    ("Western", 19.6618106966883),
])

In [None]:
# Smoking spreadsheet for Great Britain; a later cell reads its
# 'Local Authority Code' and 'Smoking' columns.
GB_smoking_prevalence = pd.read_excel(io="comorbidity/Smoking/GB_smoking_data.xlsx")
GB_smoking_prevalence.head()

In [10]:
# district id -> smoking figure.
# Start from the GB spreadsheet (a later row overwrites an earlier one with the
# same code), then add every Northern Ireland district with its trust's figure.
smoking_data = dict(
    zip(GB_smoking_prevalence['Local Authority Code'], GB_smoking_prevalence['Smoking'])
)
for health_trust, trust_district_ids in NI_health_trust_to_local_authority.items():
    # A district listed under several trusts ends up with the figure of the
    # trust that is iterated last.
    smoking_data.update(dict.fromkeys(trust_district_ids, NI_smoking_prevalence[health_trust]))

In [None]:
# The QOF achievement dump has no header row, so column names are supplied here.
achievement_names = ["id", "year", 'numerator', 'denominator', 'ratio', 'centile', 'orgcode', 'areaid', 'active']
# Load the dump and keep only the rows whose `areaid` is one of the codes in
# `disease_coding`.
achievement_df = (
    pd.read_csv(
        "comorbidity/QOF/qofdb_achievement.csv.gz",
        header=None,
        names=achievement_names,
        low_memory=False,
    )
    .loc[lambda table: table['areaid'].isin(disease_coding.values())]
)
achievement_df.head()

In [None]:
# The QOF organisation dump has no header row either; only the organisation
# code and its postcode are kept for joining.
org_names = ["orgcode", 'level', 'name', 'addr', 'postcode', 'website']
org_df = (
    pd.read_csv(
        "comorbidity/QOF/qofdb_organisation.csv.gz",
        header=None,
        names=org_names,
        low_memory=False,
    )
    .loc[:, ['orgcode', 'postcode']]
)
org_df.head()

In [None]:
# Stage 1: attach a local authority code to every achievement row
# (achievement -> organisation postcode -> postcode directory) and total the
# numerators/denominators per authority, indicator and year.
df = (
    achievement_df
    .merge(org_df, on="orgcode", how="left")
    .merge(postcode_mapping, left_on="postcode", right_on="Postcode 2", how="left")
    .loc[:, ['id', 'year', 'numerator', 'denominator', 'areaid', 'Local Authority Code']]
    .groupby(['Local Authority Code', 'areaid', 'year'])[['numerator', 'denominator']]
    .sum()
    .reset_index()
)

# Stage 2: rows are sorted by year within each authority/indicator pair, so
# `.last()` keeps the most recent year; prevalence is then numerator over
# denominator, times 100.
df = (
    df
    .sort_values(['Local Authority Code', 'areaid', 'year'])
    .groupby(['Local Authority Code', 'areaid'])
    .last()
    .reset_index()
    .assign(prevalence=lambda latest: latest['numerator'] / latest['denominator'] * 100)
    .rename(columns={"Local Authority Code": "district_id", "areaid": "disease"})
    .loc[:, ['district_id', 'disease', 'prevalence']]
)

# Stage 3: translate indicator codes back to condition names, spread them into
# one column per condition, and right-join onto the district table so every
# district in `ltlas_gdf` gets a row.
df = (
    df
    .assign(disease=lambda table: table['disease'].map(
        {code: name for name, code in disease_coding.items()}
    ))
    .pivot(index="district_id", columns="disease", values="prevalence")
    .reset_index()
    .merge(ltlas_gdf, on="district_id", how="right")
    .loc[:, ['district_name', 'district_id'] + list(disease_coding.keys())]
    .sort_values(['district_name', 'district_id'])
    .reset_index(drop=True)
)
df.head()

In [14]:
def get_smoking_data(district_id):
    """Return the smoking figure stored for ``district_id``.

    Parameters
    ----------
    district_id
        Key to look up in the module-level ``smoking_data`` dict.

    Returns
    -------
    The stored value, or ``np.nan`` when ``smoking_data`` has no entry for the
    district.
    """
    # Look up the *argument*; the previous version indexed the literal string
    # 'district_id', which is never a key.
    return smoking_data.get(district_id, np.nan)


# Go through the helper so districts without smoking data become NaN instead of
# raising KeyError from a direct dict lookup.
df['Smoking'] = df['district_id'].apply(get_smoking_data)

In [15]:
# England: keep, per area code, the value from the latest time period in the
# Fingertips HIV extract (rows are sorted by period so `.last()` picks it).
hiv_data_england = pd.read_csv("comorbidity/HIV/fingertips_api_hiv.csv")
hiv_data_england = hiv_data_england[['Area Code', 'Time period', 'Value']]
hiv_data_england = hiv_data_england.sort_values(['Area Code', 'Time period']).reset_index()
hiv_data_england = hiv_data_england.groupby(['Area Code']).last().reset_index()

# NOTE(review): the 'N' and 'S' branches below multiply a count/population ratio
# by 100, while the 'W' branch divides 49 by 100 — confirm that all of these end
# up in the same unit as the Fingertips 'Value' column.
df['HIV'] = np.nan
for district_id in district_ids:
    # Reset on every iteration: without this, an English district with no
    # Fingertips row (or an id with an unhandled prefix) would silently be
    # assigned the value computed for the previous district.
    value = np.nan
    if district_id.startswith('N'):
        value = 1123/1890000 * 100 #https://www.publichealth.hscni.net/sites/default/files/2020-12/HIV%20%20Report%202020%20tables%20and%20charts%20%282019%20data%29.pdf
    elif district_id.startswith('S'):
        value = 6100/5460000 * 100 #https://www.hps.scot.nhs.uk/publications/hps-weekly-report/volume-54/issue-25/hiv-infection-in-scotland-summary-report-to-31-december-2019/
    elif district_id.startswith('W'):
        value = 49/100 #https://gov.wales/written-statement-hiv-aids-rates-prevention-and-treatment-services-wales
    elif district_id.startswith('E'):
        england_matches = hiv_data_england.loc[hiv_data_england['Area Code'] == district_id, 'Value']
        if len(england_matches):
            value = england_matches.values[0]
    df.loc[df['district_id'] == district_id, 'HIV'] = value

In [16]:
# Overwrite the England figures with the latest value from each condition's
# Fingertips extract.

# District merging after 2019: old district id -> id of the authority whose
# Fingertips row should be used for it.
merged_district_ids = {
    "E07000004": "E06000060",  # Aylesbury Vale -> Buckinghamshire
    "E07000005": "E06000060",  # Chiltern -> Buckinghamshire
    "E07000006": "E06000060",  # South Bucks -> Buckinghamshire
    "E07000007": "E06000060",  # Wycombe -> Buckinghamshire
    "E07000151": "E06000062",  # Daventry -> West Northamptonshire
    "E07000154": "E06000062",  # Northampton -> West Northamptonshire
    "E07000155": "E06000062",  # South Northamptonshire -> West Northamptonshire
    "E07000150": "E06000061",  # Corby -> North Northamptonshire
    "E07000152": "E06000061",  # East Northamptonshire -> North Northamptonshire
    "E07000153": "E06000061",  # Kettering -> North Northamptonshire
    "E07000156": "E06000061",  # Wellingborough -> North Northamptonshire
}

for disease_name in ["Asthma", "COPD", "Dementia", "Diabetes", "Hypertension", "Obesity", "Stroke"]:
    data_england = pd.read_csv("comorbidity/England/fingertips_api_%s.csv" % disease_name.lower(), low_memory=False)
    # One row per area code, taken from the latest time period.
    data_england = (
        data_england[['Area Code', 'Time period', 'Value']]
        .sort_values(['Area Code', 'Time period'])
        .reset_index()
        .groupby(['Area Code'])
        .last()
        .reset_index()
    )

    for original_district_id in district_ids:
        if not original_district_id.startswith('E'):
            continue

        # Look the district up under its successor's id when it was merged,
        # but write the result back to the original district's row.
        district_id = merged_district_ids.get(original_district_id, original_district_id)
        area_values = data_england.loc[data_england['Area Code'] == district_id, 'Value']
        if len(area_values) == 0:
            continue

        value = area_values.values[0]
        # A missing Fingertips value leaves the existing figure untouched.
        if not pd.isnull(value):
            df.loc[df['district_id'] == original_district_id, disease_name] = value

In [17]:
# Left-join `df` onto the district table by id and name, then cut the result
# back to exactly the columns `df` had before the join.
columns = df.columns
new_df = df.merge(ltlas_gdf, on=["district_id", 'district_name'], how="left")[list(columns)]

In [18]:
# Sanity check: the output has as many distinct district ids as the district
# table (missing values are counted, matching len(unique())).
assert ltlas_gdf['district_id'].nunique(dropna=False) == new_df['district_id'].nunique(dropna=False)

In [19]:
# Lower-case every column name, then export: floats are written with three
# decimals and missing values as the literal text "N/A".
new_df.columns = [column_name.lower() for column_name in new_df.columns]
new_df.to_csv("../comorbidity.csv", float_format="%.3f", na_rep="N/A", index=False)