# ASD Cohort - Initial Analysis

So I've re-done the "rough and ready" analysis using a cohort of ages 0-18 for the descriptive analyses, and run a quick regression on ASD diagnoses from the education census cohort. I've also re-plotted the bars as boxplots now that I have a decent plotting library at my disposal.

In [None]:
import pandas as pd
from google.cloud import bigquery
import contextily as cx
import geopandas
import numpy as np
from tableone import TableOne
import matplotlib.pyplot as plt
import plotly.express as px
import statsmodels
import statsmodels.api as sm
import statsmodels.formula.api as smf

In [None]:
def add_regression_data(df_orig):
    """Return a copy of ``df_orig`` with the covariates used by the regressions.

    ``ethnic_group`` is collapsed to three levels: ``"white"``, ``"asian"``
    (from ``"Asian or Asian British"``) and ``"other"`` (every other known
    group). The 0/1 indicator columns ``asian``, ``other`` and ``white`` are
    always added, even when a level does not occur in the data, together with
    a 0/1 ``male`` indicator. If an ``asd`` column is present it is cast to
    ``int``. Rows whose ethnic group is ``"Unknown"`` or missing are dropped.

    Args:
        df_orig: DataFrame with at least ``ethnic_group`` and ``sex`` columns.
            It is not modified.

    Returns:
        The filtered DataFrame with the extra indicator columns.

    Note:
        ``male`` is 1 only where ``sex == "Male"``; every other value,
        including ``"Unknown"``, is coded 0 and so shares the baseline with
        ``"Female"``.
    """
    df = df_orig.copy()
    raw = df["ethnic_group"]

    # Use pandas' null-aware checks instead of ``is None``: missing values can
    # be stored as NaN/NA, and those must count as unknown rather than being
    # swept into the "other" category.
    known = raw.notna() & (raw != "Unknown")
    is_other = known & ~raw.isin(["White", "Asian or Asian British"])
    collapsed = raw.where(known).mask(is_other, "other")
    # "White" -> "white", "Asian or Asian British" -> "asian".
    df["ethnic_group"] = collapsed.str.split().str[0].str.lower()

    # Build the indicators by direct comparison so that all three columns
    # exist regardless of which levels are present, and so that no
    # index-aligned join is needed.
    for level in ("asian", "other", "white"):
        df[level] = (df["ethnic_group"] == level).astype(int)

    df["male"] = (df["sex"] == "Male").astype(int)
    if "asd" in df.columns:
        df["asd"] = df["asd"].astype(int)

    return df[df["ethnic_group"].notna()]

def get_odds(model, alpha=0.05):
    """Return odds ratios and their confidence bounds for a fitted model.

    The coefficients and confidence limits are exponentiated, so for a logit
    model they read as odds ratios.

    Args:
        model: Fitted results object exposing ``params`` and
            ``conf_int(alpha=...)`` (e.g. a statsmodels results instance).
        alpha: Significance level passed to ``conf_int``. The default of 0.05
            gives a 95% interval, i.e. the 2.5% and 97.5% limits.

    Returns:
        DataFrame with the columns ``"Odds Ratio"``, the lower bound and the
        upper bound, with the bounds labelled by their percentile (``"2.5%"``
        and ``"97.5%"`` by default). The first row is dropped, on the
        assumption that it is the intercept.
    """
    # Label the bounds with the percentiles they actually are: a two-sided
    # interval at level alpha spans alpha/2 to 1 - alpha/2.
    lower_label = f"{100 * alpha / 2:g}%"
    upper_label = f"{100 * (1 - alpha / 2):g}%"

    table = model.conf_int(alpha=alpha)
    table.columns = [lower_label, upper_label]
    table.insert(0, "Odds Ratio", model.params)
    return np.exp(table.iloc[1:])

In [None]:
%%bigquery asd_data
-- Load every column of the ASD master table; the result is referenced as `asd_data` in later cells.
SELECT * FROM `yhcr-prd-phm-bia-core.CY_ASD_data.ASD_master_tab`

## Basic Demographic Makeup

In [None]:
# Descriptive summary table for individuals aged 18 or under.
categorical = [
    "ethnic_group",
    "sex",
    "perm_exclusion",
    "fixed_term_exclusion",
    "has_protection_plan",
    "in_care",
    "child_in_need",
]
# Every summarised column: the non-categorical ones first, then the categorical ones.
columns = ["diagnosis_date", "age", "age_at_diagnosis", *categorical]
table_1 = TableOne(
    asd_data[asd_data["age"] <= 18],
    columns=columns,
    categorical=categorical,
)
table_1

## Age at Diagnosis

### By gender:

In [None]:
# Box plot of age at diagnosis for the 18-and-under rows, one box per sex.
fig = px.box(
    asd_data.loc[asd_data["age"].le(18)],
    y="age_at_diagnosis",
    color="sex",
    width=600,
    height=500,
)
fig.show(renderer="png")

### By Ethnic Group:

In [None]:
# Box plot of age at diagnosis for the 18-and-under rows, one box per ethnic group.
fig = px.box(
    asd_data.loc[asd_data["age"].le(18)],
    y="age_at_diagnosis",
    color="ethnic_group",
    width=800,
    height=500,
)
fig.show(renderer="png")

## Gender subdivided by Ethnicity

In [None]:
# Age at diagnosis grouped by sex on the x-axis and coloured by ethnic group.
# Rows with sex "Unknown" are left out, as are rows with age above 18.
fig = px.box(
    asd_data.loc[asd_data["age"].le(18) & asd_data["sex"].ne("Unknown")],
    x="sex",
    y="age_at_diagnosis",
    color="ethnic_group",
    width=800,
    height=500,
)
fig.show(renderer="png")

## Ethnicity subdivided by Gender

In [None]:
# Age at diagnosis grouped by ethnic group on the x-axis and coloured by sex.
# Rows with sex "Unknown" are left out, as are rows with age above 18.
fig = px.box(
    asd_data.loc[asd_data["age"].le(18) & asd_data["sex"].ne("Unknown")],
    x="ethnic_group",
    y="age_at_diagnosis",
    color="sex",
    width=800,
    height=500,
)
fig.show(renderer="png")

## Regression on Age at Diagnosis

I've done a quick regression on age at diagnosis, the results are below but here's a quick ELI5 breakdown:

* The reference baseline is a female, white individual
* Males are diagnosed an estimated 1.33 years earlier than females (coefficient -1.33, controlling for ethnicity)
* Asian individuals are diagnosed an estimated 5.03 years earlier than white individuals (coefficient -5.03, controlling for gender)
* Individuals of other ethnicities are diagnosed an estimated 4.83 years earlier than white individuals (coefficient -4.83, controlling for gender)

In [None]:
# Regress age at diagnosis on the male / asian / other indicators.
# White and non-male are the reference levels because their indicators are omitted.
age_data = add_regression_data(asd_data)
age_model = smf.glm("age_at_diagnosis ~ male + asian + other", data=age_data)
age_reg = age_model.fit()
age_reg.summary()

## Geographic Distribution (within Bradford)

In [None]:
# Ward boundary table, fetched as a GeoDataFrame.
sql = """
    SELECT *
    FROM `yhcr-prd-phm-bia-core.CY_LOOKUPS.tbl_ward_boundaries`
"""
ward_gdf = bigquery.Client().query(sql).to_geodataframe()


def contains_bradford(x):
    """Return whether any string in ``x`` contains "Bradford"."""
    return x.str.contains("Bradford").any()


# Per ward: n counts the non-null lsoa_name values among the 18-and-under rows,
# and contains_bradford flags wards where some lsoa_name mentions Bradford.
ward_counts = (
    asd_data.loc[asd_data.age <= 18, ["ward_code", "lsoa_name"]]
    .groupby("ward_code")
    .agg([("n", "count"), ("contains_bradford", contains_bradford)])
    .reset_index()
)
# Flatten the two-level column index produced by the list-style aggregation.
ward_counts.columns = ["ward_code", "n", "contains_bradford"]
# Attach the boundary columns on the columns the two tables share.
ward_counts = geopandas.GeoDataFrame(ward_counts.merge(ward_gdf))

### Top 20 Wards:

In [None]:
# The twenty wards with the highest n, largest first.
top_wards = ward_counts.sort_values("n", ascending=False).head(20)
top_wards[["ward_name", "n"]]

### Choropleth Map of residence:

In [None]:
# Reproject to Web Mercator (EPSG:3857), the projection of the web-tile basemap added below.
ward_counts = ward_counts.to_crs(epsg=3857)
# Draw only the wards flagged contains_bradford, shaded by their count n.
ax = ward_counts[ward_counts.contains_bradford].plot(
    column="n",
    alpha=0.5,
    edgecolor="k",
    linewidth=1,
    cmap="OrRd",
    figsize=(10, 10),
)
# The Stamen-hosted tile sets (including TonerLite) have been retired and are no
# longer exposed under cx.providers.Stamen, so use the CartoDB Positron light
# basemap, which needs no API key.
cx.add_basemap(ax, source=cx.providers.CartoDB.Positron)

## Simple regression on diagnosis

In [None]:
def return_yr_date_diff_sql(from_date, to_date, var_name):
    """Build a SQL select expression for the whole years between two dates.

    The expression divides the day difference ``to_date - from_date`` by
    365.25, floors it, and aliases the result as ``var_name``.

    Args:
        from_date: SQL expression for the earlier date.
        to_date: SQL expression for the later date.
        var_name: Alias given to the resulting column.

    Returns:
        The SQL fragment as a string.
    """
    return f"FLOOR(DATE_DIFF({to_date}, {from_date}, DAY) / 365.25) AS {var_name}"
# Select expression for age in whole years between date of birth and the query date.
age = return_yr_date_diff_sql("demo.DOB_formatted", "CURRENT_DATE()", "age")

# Captures the text before the first ":" in census_ethnicity.
ethnic_group_regex = "REGEXP_EXTRACT(demo.census_ethnicity, r'^(.+?):')"
# Fall back to "Unknown" when the pattern does not match (REGEXP_EXTRACT yields NULL).
ethnic_group = f"""
    CASE
        WHEN {ethnic_group_regex} IS NOT NULL THEN {ethnic_group_regex}
        ELSE "Unknown"
    END AS ethnic_group
"""

# Map the two recognised remapped_gender codes to labels; any other value becomes "Unknown".
sex = """
    CASE
        WHEN demo.remapped_gender = 45766034 THEN "Male"
        WHEN demo.remapped_gender = 45766035 THEN "Female"
        ELSE "Unknown"
    END AS sex
"""
project = "yhcr-prd-phm-bia-core"
# NOTE(review): the census table id is left unquoted while the demographics one is
# wrapped in backticks -- confirm the difference is intentional.
census_table = f"{project}.CY_FDM_DepartmentForEducation.src_census"
demographics_table = f"`{project}.CY_STAGING_DATABASE.src_DemoGraphics_MASTER`"
# Assemble the full query: census rows left-joined to demographics on person_id.
# NOTE(review): census_sql is not referenced again in the visible part of the
# notebook; the %%bigquery cell that follows appears to inline the same query.
census_sql = f"""
    SELECT census.person_id, {age}, {sex}, {ethnic_group}, AcademicYear, 
        CensusDate, CensusTerm, FSMEligible, SENprovision,  SENprovisionMajor, 
        SENUnitIndicator, 
    FROM {census_table} census
    LEFT JOIN {demographics_table} demo
    ON census.person_id = demo.person_id
"""

In [None]:
%%bigquery census_data
-- School-census rows left-joined to the demographics table on person_id.
-- age is whole years between DOB_formatted and the query date; sex and ethnic_group
-- fall back to "Unknown" when the code or the ethnicity prefix is not recognised.
SELECT census.person_id, FLOOR(DATE_DIFF(CURRENT_DATE(), demo.DOB_formatted, DAY) / 365.25) AS age, 
    CASE
        WHEN demo.remapped_gender = 45766034 THEN "Male"
        WHEN demo.remapped_gender = 45766035 THEN "Female"
        ELSE "Unknown"
    END AS sex, 
    CASE
        WHEN REGEXP_EXTRACT(demo.census_ethnicity, r'^(.+?):') IS NOT NULL THEN REGEXP_EXTRACT(demo.census_ethnicity, r'^(.+?):')
        ELSE "Unknown"
    END AS ethnic_group, AcademicYear, CensusDate, CensusTerm, FSMEligible, 
    SENprovision,  SENprovisionMajor, SENUnitIndicator, 
FROM yhcr-prd-phm-bia-core.CY_FDM_DepartmentForEducation.src_census census
LEFT JOIN `yhcr-prd-phm-bia-core.CY_STAGING_DATABASE.src_DemoGraphics_MASTER` demo
ON census.person_id = demo.person_id

In [None]:
# Print a summary of the census extract (columns, dtypes, non-null counts).
census_data.info()

In [None]:
# Keep only the census rows for individuals aged 18 or under.
census_data = census_data.loc[census_data["age"].le(18)]

In [None]:
def make_list(x):
    """Return True if any value in ``x`` differs from "N"."""
    return any(prov != "N" for prov in x)


# Mark every row of the ASD table so the left merge below carries the flag across.
asd_data["asd"] = True

# One row per person_id / sex / ethnic_group / age combination: FSMEligible is true
# if any census row is truthy, SENprovision if any row is not "N".
per_person = (
    census_data
    .groupby(["person_id", "sex", "ethnic_group", "age"])
    .agg({"FSMEligible": "any", "SENprovision": make_list})
    .reset_index()
)
asd_flags = asd_data[["person_id", "asd"]]
# People with no match in the ASD table get a missing flag, which fillna turns into False.
census_agg = per_person.merge(asd_flags, on="person_id", how="left").fillna(False)

In [None]:
# Descriptive summary of the aggregated census cohort; age is the only
# column not treated as categorical.
columns = ["sex", "ethnic_group", "age", "FSMEligible", "SENprovision", "asd"]
categorical = [name for name in columns if name != "age"]
table_1 = TableOne(
    census_agg,
    columns=columns,
    categorical=categorical,
)
table_1

In [None]:
# Logistic regression of the ASD flag on the male / asian / other indicators.
# White and non-male are the reference levels because their indicators are omitted.
diag_data = add_regression_data(census_agg)
diag_model = smf.logit("asd ~ male + asian + other", data=diag_data)
diag_reg = diag_model.fit()

In [None]:
# Display the fitted logit model's summary table.
diag_reg.summary()

The following are the odds ratios for diagnosis, with a white, female baseline - hopefully pretty self explanatory but let me know if there are any questions:

In [None]:
# Odds ratios and confidence-interval bounds from the fitted logit model
# (get_odds exponentiates the coefficients and drops the first row).
get_odds(diag_reg)