# EYFSP-ASD Cohort 

In [None]:
import pandas as pd
from google.cloud import bigquery

## EYFSP Scores

* Only need data post 2013 EYFSP scores - 71,018 entries
* A few entries with NULL for post-2013 EYFSP scores  - 20 removed
* Some entries with "A" = "Not Assessed"  - assumed "NULL by another name" - 175 removed
* Some duplicate entries (15) after having removed NULLs and "A"s - seem to be repeated EYFSP tests over different academic years - all contradict one another - every entry for an affected person removed for simplicity given low number (30 rows by the counts here: 71,018 - 20 - 175 - 70,793; ~0.04% of data)

Final Count 70,793 with full post 2013 EYFSP scores and no duplicates

In [None]:
# Google Cloud project that the BigQuery client is bound to
PROJECT = "yhcr-prd-phm-bia-core"
# Shared BigQuery client, created once and reused for every query in the notebook
CLIENT = bigquery.Client(project=PROJECT)
def run_sql_query(sql, destination=None):
    """Run a SQL statement through the shared BigQuery client and wait for it.

    The statement is executed exactly as it would be in the BigQuery SQL
    Workspace. When ``destination`` is given, the query results are written to
    that table id, replacing any existing contents (``WRITE_TRUNCATE``).

    Args:
        sql: string, the SQL statement to run
        destination: string (default: None), table id in which to store the
            query results; when None the results are not stored

    Returns:
        bigquery.table.Table, the destination table, when ``destination`` is
            provided
        -- otherwise --
        bigquery.job.QueryJob, the completed query job

    Example:
    ```python
    # query an example table and store the results in a new table
    table = run_sql_query(
        sql = "SELECT * FROM `example.table.id`",
        destination = "destination.for.results"
    )

    # cap the "value" column at 100, nothing is stored
    query = run_sql_query(
        sql = "UPDATE example.table.id SET value = 100 WHERE value > 100"
    )
    ```
    """
    # a job config is only needed when the results have to be persisted
    job_config = (
        bigquery.QueryJobConfig(
            destination=destination,
            write_disposition="WRITE_TRUNCATE",
        )
        if destination
        else None
    )

    query_job = CLIENT.query(sql, job_config=job_config)  # submit the job
    query_job.result()  # block until the job has finished

    if not destination:
        return query_job
    return CLIENT.get_table(destination)

In [None]:
#### Build the base cohort table from the EYFSP source table ####

eyfs_source_table = "yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation.src_EYFSP"

# SQL expression for the end year of the academic year (last 4 characters of
# AcademicYear, cast to an integer)
ac_yr_end_col_string = "\n    CAST(RIGHT(AcademicYear, 4) AS INT) \n"

# the 17 post-2013 EYFSP score columns
eyfsp_cols = [
    "COMG01", "COMG02", "COMG03",
    "PHYG04", "PHYG05",
    "PSEG06", "PSEG07", "PSEG08",
    "LITG09", "LITG10",
    "MATG11", "MATG12",
    "UTWG13", "UTWG14", "UTWG15",
    "EXPG16", "EXPG17",
]

# comma-separated score columns for the SELECT list of the inner query
eyfsp_cols_string = ", ".join(eyfsp_cols)
# the same columns, each cast to an integer, for the outer SELECT list
eyfsp_int_cols_string = ", ".join(
    f"CAST({col} AS INT) AS {col}" for col in eyfsp_cols
)

# filter: every score column must be non-NULL
not_null_check = " AND ".join(f"{col} IS NOT NULL" for col in eyfsp_cols)
# filter: no score column may hold "A" (not assessed)
not_a_check = " AND ".join(f'{col} != "A"' for col in eyfsp_cols)

# number of surviving rows per person; anything above 1 marks a duplicated person
duplicate_rows_string = (
    "\n"
    "    COUNT(person_id) OVER \n"
    "        (PARTITION BY person_id)\n"
    "    AS duplicate_count\n"
)

# assemble the query: filter to complete post-2013 rows, then keep only
# persons that appear exactly once
sql = f"""
    WITH eyfsp AS
    (
        SELECT person_id, 
            {ac_yr_end_col_string} AS AcademicYearEnd, 
            {eyfsp_cols_string}, 
            {duplicate_rows_string} 
        FROM `{eyfs_source_table}`
        WHERE {ac_yr_end_col_string} >= 2013
            AND {not_null_check}
            AND ({not_a_check})
    )
    SELECT person_id, AcademicYearEnd, {eyfsp_int_cols_string} 
    FROM eyfsp
    WHERE duplicate_count = 1
"""

# run the query and store the results in the study table (overwritten if present)
ey_asd_table = "yhcr-prd-phm-bia-core.CB_MYSPACE_SR.EYFSP_autism_study_data"
query = run_sql_query(sql, ey_asd_table)

## FSM

Census table doesn't look quite right:
* multiple duplicate person entries over several academic years and often duplicate entries within academic years
* duplicates often contradict one another
* Different naming conventions for Ethnic groups among contradicting duplicate entries - e.g. same person will be recorded as both "Pakistani" and "ASIA" in different entries
* FSM data (of particular interest for autism study) has no "missingness" information - entries are either "Yes" or NULL - duplicate person entries often have mix of "Yes" and NULL over same census year 

Current approach - probably needs revisiting:
* use DfE census for FSM - get gender / ethnicity from person table
* join census data to EYFSP data on person and academic year to get relevant year's info
* treatment of contradicting duplicate entries - group multiple entries for person AND academic year - if any FSM fields recorded as "Yes" then record as "Yes", otherwise NULL if all agree NULL (NB: the query below currently records the all-NULL case as false rather than NULL)
* RESULT = 70,277 of 70,793 EYFSP entries have census data for person + academic year - FSM eligibility is ~20% of dataset - roughly in line with region average - anecdotally would expect Bradford to be higher than region average

In [None]:
#### Join FSM eligibility from the DfE census onto the EYFSP cohort ####

dfe_census_table = "yhcr-prd-phm-bia-core.CB_FDM_DepartmentForEducation.src_census"

# end year of the academic year (last 4 characters of AcademicYear as an integer)
ac_yr_end_expr = "CAST(RIGHT(AcademicYear, 4) AS INT)"
ac_yr_end_col_string = f"""
    {ac_yr_end_expr} AS AcademicYearEnd
"""
# columns kept in the output table; AcademicYearEnd is not carried forward
all_cols_string = ", ".join(["all_data.person_id", "FSMeligible", *eyfsp_cols])

### SOLUTION 1 - person and year ###
# The census CTE collapses duplicate census rows to one row per person and
# academic year end. It groups by the derived year (the join key) rather than
# the raw AcademicYear string, so that two raw values sharing the same last
# four characters cannot produce two census rows and duplicate a cohort row.
# NOTE: FSMeligible is true if any grouped row is true; a group whose rows are
# all NULL comes out as false, not NULL.
sql = f"""
    WITH census AS
    (
        SELECT person_id,
            {ac_yr_end_col_string},
            CASE
                WHEN MAX(FSMeligible) = true THEN true
                ELSE false
            END AS FSMeligible
        FROM `{dfe_census_table}`
        GROUP BY person_id, {ac_yr_end_expr}
    )
    SELECT {all_cols_string}
    FROM `{ey_asd_table}` AS all_data
    INNER JOIN census
        ON all_data.person_id = census.person_id
        AND all_data.AcademicYearEnd = census.AcademicYearEnd
"""

# overwrite the study table with the joined result; cohort rows with no census
# match for that person and year are dropped by the INNER JOIN
query = run_sql_query(sql, ey_asd_table)

## ASD Diagnoses

Issues:
* SNOMED differs significantly from "read codes" provided with 2019 paper - far more codes from quick search under "autism"/"autistic" under SNOMED coding - also unsure of parent/child status of several codes/logic of tree structure
* Initial searches using conditions from OG read codes result in only ~180 kids in EYFSP+Census dataset with confirmed ASD - less than a fifth of what we'd expect (should be around 1%)
* Expanded search with more exhaustive list of codes only result in 181 persons from EYFSP+Census data
* including parent for Autistic Disorder - "Pervasive Developmental Disorder" finds 1230 diagnoses (about ~1.7% which seems about right) - CTV3 in primary care data suggests mapping of "Autistic Spectrum Disorder" to "Pervasive Developmental Disorder" - CTV3 code for "Autistic Spectrum Disorder" also appears in all Kelly studies - still need to clarify correctness of using this code

In [None]:
#### Collect autism SNOMED codes and flag diagnosed persons ####

# NOTE(review): "191690004" appeared twice in the original list; the repeat has
# been dropped. Confirm it was not a typo for a different code.
asd_snomed_codes = [
    "35919005", "442314000", "23560001", "231536004", "718393002", "408856003",
    "373618009", "71961003", "702450004", "723332005", "712884004",
    "39951000119105", "870307006", "870308001", "870305003", "870306002",
    "870303005", "870304004", "870269009", "870270005", "870268001", "870266002",
    "870267006", "870264004", "870265003", "870262000", "870263005", "870260008",
    "870261007", "870280009", "870282001", "68618008", "432091002", "708037001",
    "719600006", "766824003", "722287002", "771512003", "733623005", "43614003",
    "702732007", "408857007", "783089006", "191692007", "191693002", "191690004",
    "771448004", "770790004", "191689008"
]

# quote each code for use inside the SQL IN (...) list
asd_codes_sql = ", ".join(f'"{code}"' for code in asd_snomed_codes)

# build SQL query: the asd CTE holds one row per person with at least one
# matching code; the LEFT JOIN keeps every cohort row and flags the matches
sql = f"""
    WITH asd AS (
        SELECT person_id
        FROM `yhcr-prd-phm-bia-core.CB_FDM_PrimaryCare_v2.tbl_SRCode`
        WHERE SNOMEDCode IN ({asd_codes_sql})
        GROUP BY person_id
    )
    SELECT ey_asd.*, IF(asd.person_id IS NULL, false, true) AS asd_diagnosis
    FROM `{ey_asd_table}` AS ey_asd
    LEFT JOIN asd ON asd.person_id = ey_asd.person_id
"""

# overwrite the study table with the extra asd_diagnosis column
query = run_sql_query(sql=sql, destination=ey_asd_table)

## Gender

Bit odd using master person table as  

Current solution - join genders from DfE entries that have agreeing gender across duplicates - contradictions recorded as NULL

In [None]:
# Attach gender from the DfE census, keeping it only where a person's census
# rows agree.
#   genders           - one row per (person_id, Gender) pair, with the number of
#                       distinct pairs for that person; a NULL Gender forms its
#                       own pair, so a mix of a value and NULL counts as 2
#   non_error_genders - persons with exactly one pair, i.e. no contradiction
# Persons with contradicting entries get no match in the LEFT JOIN and so end
# up with a NULL gender. The table path is backtick-quoted to match the other
# queries in this notebook.
sql = f"""
    WITH genders AS (
        SELECT person_id, Gender,
            COUNT(person_id) OVER (PARTITION BY person_id) AS person_count
        FROM `{dfe_census_table}`
        GROUP BY person_id, Gender
    ),
    non_error_genders AS (
        SELECT person_id, Gender
        FROM genders
        WHERE person_count = 1
    )
    SELECT ey_asd.*, non_error_genders.Gender AS gender
    FROM `{ey_asd_table}` AS ey_asd
    LEFT JOIN non_error_genders
    ON non_error_genders.person_id = ey_asd.person_id
"""

# overwrite the study table with the extra gender column
query = run_sql_query(sql=sql, destination=ey_asd_table)

## Ethnicity 

Another faff:
1. race mapping in person master table is pretty odd and not that informative - majority NULL/equivalents - descriptions not too helpful
2. DfE census ethnicity very inconsistent and often contradictory - best obs seem to come from Ethnicity col - maps to census codes (https://www.gov.uk/guidance/alternative-provision-ap-census/codes)

One approach - collect obs that fall into codes that relate to two categories used in 2019 study "white british" and "pakistani" from edrecs census - any entry in census with obs matching one of the two groups of codes will be recorded as belonging to group - any entries with another ethnicity recorded as "OTHER" - any NULLs as NULL

Codes are as follows:

Pakistani: 	
APKN - Pakistani 	
AMPK - Mirpuri Pakistani 	 
AKPA - Kashmiri Pakistani 	
AOPK - Other Pakistani 	 	

White British: 	
WBRI - White - British 	
WENG - White - English  	
WNIR - White – Northern Irish 	
WSCO - White - Scottish 	
WWEL - White - Welsh  	
WOWB - Other White British 	 	

3. Demographics table - has well coded ethnicities but lots of nulls

Another approach - code all "White" group as "white british" - "Pakistani" group as "pakistani" - any other ethnicities as "other" - "Unknown" group as NULL

Use demographics approach for now

In [None]:
# DfE census ethnicity codes, kept for reference only: the query below works
# from the demographics table's census_ethnicity descriptions instead
pakistani_codes = ["APKN", "AMPK", "AKPA", "AOPK"]
white_brit_codes = ["WBRI", "WENG", "WNIR", "WSCO", "WWEL", "WOWB"]

pak_list = ", ".join(f'"{code}"' for code in pakistani_codes)
brit_list = ", ".join(f'"{code}"' for code in white_brit_codes)

all_eths = ", ".join(f'"{code}"' for code in pakistani_codes + white_brit_codes)

# Map census_ethnicity to three study groups. A NULL census_ethnicity needs its
# own branch: every LIKE on NULL evaluates to NULL, so without it missing
# values would fall through to ELSE and be miscoded as "other".
# NOTE(review): assumes at most one demographics row per person_id; if there
# can be more, the LEFT JOIN will duplicate cohort rows - confirm.
sql = f"""
    WITH eths AS (
        SELECT person_id,
            CASE WHEN census_ethnicity IS NULL THEN NULL
                WHEN census_ethnicity LIKE "White%" THEN "white_brit"
                WHEN census_ethnicity LIKE "%Pakistan%" THEN "pakistani"
                WHEN census_ethnicity LIKE "Unknown%" THEN NULL
                ELSE "other"
            END AS ethnicity
        FROM `yhcr-prd-phm-bia-core.CY_STAGING_DATABASE.src_DemoGraphics_MASTER`
    )
    SELECT ey_asd.*, eths.ethnicity
    FROM `{ey_asd_table}` AS ey_asd
    LEFT JOIN eths
    ON eths.person_id = ey_asd.person_id
"""

# overwrite the study table with the extra ethnicity column
query = run_sql_query(sql, destination=ey_asd_table)

## Age at extract

Actually pretty easy! Hurrah! - master person table has date of birth as datetime (`birth_datetime`) - just subtract from current date and convert to age in years as int - no issues:

In [None]:
master_person_table = "yhcr-prd-phm-bia-core.CY_FDM_MASTER.person"

# Age in whole years at extract time: days since birth / 365.25, rounded DOWN.
# FLOOR is required because BigQuery's CAST from FLOAT64 to INT rounds to the
# nearest integer (halves away from zero), which would otherwise report e.g.
# 5.6 years as 6.
age_calc = (
    "CAST(FLOOR(DATE_DIFF(CURRENT_DATE, master.birth_datetime, DAY) / 365.25) AS INT) "
    "AS extract_age"
)

# LEFT JOIN keeps every cohort row; persons missing from the master person
# table get a NULL extract_age
sql = f"""
    SELECT ey_asd.*, {age_calc}
    FROM `{ey_asd_table}` AS ey_asd
    LEFT JOIN `{master_person_table}` master
    ON ey_asd.person_id = master.person_id
"""

# overwrite the study table with the extra extract_age column
query = run_sql_query(sql=sql, destination=ey_asd_table)

In [None]:
# Load the finished cohort table into a pandas DataFrame.
# Reads from ey_asd_table (the table the cells above write to) instead of a
# separately hard-coded table id, which pointed at the CY_MYSPACE_SR dataset
# while the pipeline writes to CB_MYSPACE_SR.
ey_asd_df = CLIENT.query(f"SELECT * FROM `{ey_asd_table}`").to_dataframe()

In [None]:
# Summarise the extracted cohort: column names, dtypes and non-null counts
ey_asd_df.info()

In [None]:
# Number of cohort rows flagged with an ASD diagnosis (sum of the asd_diagnosis column)
ey_asd_df.asd_diagnosis.sum()

In [None]:
%%bigquery 
-- Spot check: show the demographics master row(s) held for a single person_id.
-- NOTE(review): ad-hoc lookup with a hard-coded identifier; its output is not
-- stored in a variable and nothing else in the notebook uses it.
SELECT * FROM yhcr-prd-phm-bia-core.CY_STAGING_DATABASE.src_DemoGraphics_MASTER
WHERE person_id = 368022