## To Do

* IMD - get from postcode linkage?
* Do an education Table 1 with some relevant education-related variables
* ?? have a look at datasets table from protocol paper and re-imagine ??


In [None]:
import os
import pandas as pd
from tableone import TableOne

In [None]:
def get_condition_df(code_list_dir):
    """Build a person-level table of boolean condition flags from code lists.

    Every ``*.csv`` file in ``code_list_dir`` is treated as one condition: the
    file name without its ``.csv`` suffix becomes the column name, and the
    first column of the file supplies the codes to look for in the SRCode
    table. Files are processed in sorted name order so the resulting column
    order is reproducible.

    Args:
        code_list_dir: Directory holding one CSV code list per condition.
            The files are read from this same directory.

    Returns:
        A DataFrame with a ``person_id`` column plus one column per
        condition, set to True for people with at least one SRCode row
        matching the code list and False otherwise.
    """
    person_table = "`yhcr-prd-phm-bia-core.CY_FDM_PrimaryCare_v4.person`"
    srcode_table = "`yhcr-prd-phm-bia-core.CY_FDM_PrimaryCare_v4.tbl_SRCode`"
    output = pd.read_gbq(f"SELECT person_id FROM {person_table}")

    for filename in sorted(os.listdir(code_list_dir)):
        if not filename.endswith(".csv"):
            continue
        condition_name = filename[:-4]

        print(f"Getting {condition_name}")
        # Read from the directory that was listed, not from a path relative
        # to the current working directory.
        codes = pd.read_csv(os.path.join(code_list_dir, filename)).iloc[:, 0]

        if codes.empty:
            # An empty list would render as "IN ()", which is not valid SQL;
            # nobody can match an empty code list, so flag everyone False.
            output[condition_name] = False
            continue

        # NOTE(review): codes are always rendered as quoted string literals,
        # including on the integer (SNOMED) branch - confirm this matches the
        # type of src_snomedcode.
        code_list = ", ".join(f'"{code}"' for code in codes)

        # Integer-typed code lists are matched against the SNOMED column,
        # anything else against the CTV3 column. is_integer_dtype accepts any
        # integer width, so the choice does not depend on the platform's
        # default int size.
        if pd.api.types.is_integer_dtype(codes):
            code_type = "src_snomedcode"
        else:
            code_type = "src_ctv3code"

        sql_query = f"""
            SELECT person_id, 
            FROM {srcode_table}
            WHERE {code_type} IN ({code_list})
            GROUP BY person_id
        """
        bq_result_df = pd.read_gbq(sql_query)
        bq_result_df[condition_name] = True
        # Outer merge keeps people found only in SRCode; fillna(False) then
        # marks every person without a match (for this or an earlier
        # condition) as False.
        output = output.merge(bq_result_df, on="person_id", how="outer").fillna(False)
    return output


def get_cohort_profile_data(
    dataset_id, code_list_dir="/home/jupyter/c_brad_profile/condition_codes"
):
    """Download person-level demographic, geographic and condition data.

    Queries the ``observation_period`` table of ``dataset_id`` joined to the
    staging demographics table and the LSOA / ward lookup tables, adds an
    ``is_bradford`` flag, and left-merges the condition flags produced by
    ``get_condition_df``.

    Args:
        dataset_id: BigQuery dataset (within ``yhcr-prd-phm-bia-core``) whose
            ``observation_period`` table defines the rows returned.
        code_list_dir: Directory of condition code lists handed to
            ``get_condition_df``. Defaults to the path previously hard-coded
            here, so existing single-argument calls behave as before.

    Returns:
        A DataFrame with columns ``person_id``, ``age``, ``ethnic_group``,
        ``sex``, ``LSOA_code``, ``lat_long``, ``LSOA_name``, ``ward_name``,
        ``ward_code`` and ``is_bradford``, followed by the condition columns
        from ``get_condition_df``.
    """
    # generate sql query to collect relevant person-level data
    # Age in whole years at the end of the observation period.
    age = "FLOOR(DATE_DIFF(obs.observation_period_end_date, demo.DOB_formatted, DAY) / 365.25) AS age"
    # Ethnic group is the text before the first colon of census_ethnicity;
    # rows where the pattern does not match are labelled "Unknown".
    ethnic_group_regex = "REGEXP_EXTRACT(demo.census_ethnicity, r'^(.+?):')"
    ethnic_group = f"""
        CASE
            WHEN {ethnic_group_regex} IS NOT NULL THEN {ethnic_group_regex}
            ELSE "Unknown"
        END AS ethnic_group
    """
    # Map the two remapped_gender codes handled here to labels; any other
    # value becomes "Unknown".
    sex = """
        CASE
            WHEN demo.remapped_gender = 45766034 THEN "Male"
            WHEN demo.remapped_gender = 45766035 THEN "Female"
            ELSE "Unknown"
        END AS sex
    """
    PROJECT = "yhcr-prd-phm-bia-core"
    person_table = f"{PROJECT}.{dataset_id}.person"
    demographics_table = f"{PROJECT}.CY_STAGING_DATABASE.src_DemoGraphics_MASTER"
    obs_period_table = f"{PROJECT}.{dataset_id}.observation_period"
    table_one_data_sql = f"""
        SELECT obs.person_id, {age}, {ethnic_group}, {sex},  
            demo.LSOA as LSOA_code, lsoas.lat_long, lsoas.LSOA_name, wards.ward_name, 
            wards.ward_code
        FROM `{obs_period_table}` obs
        LEFT OUTER JOIN `{demographics_table}` demo
        ON obs.person_id = demo.person_id
        LEFT OUTER JOIN `{PROJECT}.CY_LOOKUPS.tbl_lsoa_boundaries` lsoas
        ON lsoas.LSOA_code = demo.LSOA
        LEFT OUTER JOIN `{PROJECT}.CY_LOOKUPS.tbl_lsoa_to_ward` wards
        ON wards.LSOA_code = demo.LSOA
    """

    # download data
    print("Downloading data from GCP")
    df = pd.read_gbq(table_one_data_sql)

    # create boolean value for bradford LSOAs; na=False makes rows with no
    # LSOA name False directly instead of filling an object column afterwards
    df["is_bradford"] = df.LSOA_name.str.contains("Bradford", na=False)

    # add conditions from codes
    condition_df = get_condition_df(code_list_dir)
    df = df.merge(condition_df, on="person_id", how="left")

    return df


def get_table_one(data):
    """Build a TableOne summary of the cohort data.

    Summarises age, ethnic group, sex, ward and every condition column. Wards
    of non-Bradford rows are collapsed into a single "Rest of the Country"
    category and "Unknown" ethnic groups are set to null. The work is done on
    a copy, so the caller's DataFrame is left unmodified.

    Args:
        data: DataFrame shaped like the output of ``get_cohort_profile_data``:
            ten leading demographic/geographic columns (including ``age``,
            ``ethnic_group``, ``sex``, ``ward_name`` and ``is_bradford``)
            followed by one column per condition.

    Returns:
        The ``TableOne`` object built from the selected columns.
    """
    # Condition columns are identified by position: everything after the ten
    # columns produced by get_cohort_profile_data (nine from its SELECT plus
    # is_bradford).
    condition_cols = list(data.columns[10:])
    columns = ["age", "ethnic_group", "sex", "ward_name"] + condition_cols
    categorical = ["ethnic_group", "sex", "ward_name"] + condition_cols
    print("Building Table 1")

    # Copy only the summarised columns so the relabelling below does not leak
    # back into the caller's frame.
    table_data = data[columns].copy()

    # Rows outside Bradford that do have a ward are pooled into one category;
    # rows with no ward at all stay null.
    rest_of_country_wards = ~data.is_bradford & ~data.ward_name.isna()
    table_data.loc[rest_of_country_wards, "ward_name"] = "Rest of the Country"

    # Null out the "Unknown" label so it is a missing value rather than a
    # category of its own (presumably so TableOne reports it as missing).
    table_data.loc[table_data.ethnic_group == "Unknown", "ethnic_group"] = None

    table_1 = TableOne(table_data, columns, categorical)
    return table_1

In [None]:
# Download the person-level cohort data for the primary care dataset; kept in
# a notebook-level variable because later cells reuse it.
cohort_data = get_cohort_profile_data("CY_FDM_PrimaryCare_v4")
# Left as the final bare expression so the notebook displays the returned table.
get_table_one(cohort_data)

In [None]:
from google.cloud import bigquery
import geopandas

# Select every column of the ward boundaries lookup table.
sql = """
    SELECT *
    FROM `yhcr-prd-phm-bia-core.CY_LOOKUPS.tbl_ward_boundaries`
"""
# Run the query with a default BigQuery client and load the result via
# to_geodataframe() (presumably the table has a GEOGRAPHY column - confirm).
ward_gdf = bigquery.Client().query(sql).to_geodataframe()


In [None]:
# Print the column names, dtypes and non-null counts of the ward boundaries.
ward_gdf.info()

In [None]:
def contains_bradford(x):
    """Return True if any string in the Series ``x`` contains "Bradford"."""
    return x.str.contains("Bradford").any()


# One row per ward code: ``n`` counts the non-null LSOA names in the ward and
# ``contains_bradford`` records whether any of those names mention Bradford.
ward_counts = (
    cohort_data.groupby("ward_code")["LSOA_name"]
    .agg(n="count", contains_bradford=contains_bradford)
    .reset_index()
)

# Join the ward boundary columns on whatever column names the two frames share
# and wrap the result as a GeoDataFrame.
ward_counts = geopandas.GeoDataFrame(ward_counts.merge(ward_gdf))

In [None]:
# Print the column names, dtypes and non-null counts of the merged ward table.
ward_counts.info()

In [None]:
import contextily as cx

# Reproject to EPSG:3857 before plotting; add_basemap below is called without
# a crs argument (presumably it expects Web Mercator - confirm).
ward_counts = ward_counts.to_crs(epsg=3857)

# Choropleth of the per-ward count, restricted to wards flagged as containing
# a Bradford LSOA.
ax = ward_counts.loc[ward_counts["contains_bradford"]].plot(
    column="n",
    cmap="OrRd",
    alpha=0.5,
    edgecolor="k",
    linewidth=2,
    figsize=(20, 20),
)

# NOTE(review): confirm the Stamen provider is still exposed by the installed
# contextily / xyzservices version.
cx.add_basemap(ax, source=cx.providers.Stamen.TonerLite)
ax.set_axis_off()