# EYFSP Scores + ASD Diagnoses Analysis

Load and format data for analysis:

In [None]:
import pandas as pd
import numpy as np

# Pull the study table from BigQuery into a DataFrame.
ey_asd = pd.read_gbq("yhcr-prd-phm-bia-core.CY_MYSPACE_SR.EYFSP_autism_study_data")

# The 17 EYFSP item columns; their row-wise sum is the total score.
eyfsp_cols = [
    "COMG01", "COMG02", "COMG03",
    "PHYG04", "PHYG05",
    "PSEG06", "PSEG07", "PSEG08",
    "LITG09", "LITG10",
    "MATG11", "MATG12",
    "UTWG13", "UTWG14", "UTWG15",
    "EXPG16", "EXPG17",
]

# The five items whose row-wise sum is the subscore.
subscore_cols = ["COMG01", "PHYG05", "PSEG07", "PSEG08", "EXPG17"]

# NOTE(review): DataFrame.sum skips missing values by default, so a row with
# missing items gets a partial sum rather than NaN - confirm this is intended.
for score_name, item_cols in (("total_score", eyfsp_cols),
                              ("subscore", subscore_cols)):
    ey_asd[score_name] = ey_asd[item_cols].sum(axis=1)

# create low score/subscore cols
def normalise(x):
    """Return ``x`` centred on its mean and divided by its ``std()``."""
    centred = x - x.mean()
    spread = x.std()
    return centred / spread
# Flag rows whose normalised score is below -1, for both the total score and
# the subscore.
for score_name in ("total_score", "subscore"):
    ey_asd[f"low_{score_name}"] = normalise(ey_asd[score_name]) < -1

# convert gender col to "Male?"
mf_mask = {"M": "Yes", "F": "No"}
ey_asd["male"] = ey_asd["gender"].map(mf_mask)

# Re-format ethnicities: raw code -> display label.
eth_map = {
    "white_brit": "White British",
    "pakistani": "Pakistani",
    "other": "Other",
}
# The raw codes, in the order listed above, are the only valid categories.
eth_dtype = pd.api.types.CategoricalDtype(categories=list(eth_map),
                                          ordered=False)
raw_ethnicity = ey_asd["ethnicity"].astype(eth_dtype)
ey_asd["ethnicity"] = raw_ethnicity.map(eth_map)

# Bin age at extract into "<10", one bin per year for 10-13, and ">13".
age_bin_dtype = pd.api.types.CategoricalDtype(
    categories=["<10", "10", "11", "12", "13", ">13"],
    ordered=False,
)
extract_age = ey_asd["extract_age"]

# Label/mask pairs, applied in order on top of the "<10" default. Missing ages
# come last so they end up as NaN rather than keeping the default.
bin_rules = [(str(year), extract_age == year) for year in range(10, 14)]
bin_rules.append((">13", extract_age > 13))
bin_rules.append((np.nan, extract_age.isna()))

# NOTE(review): a non-missing age matching none of the masks (e.g. a
# fractional age between 10 and 13) keeps the "<10" default - confirm that
# ages are whole years.
ey_asd["age_binned"] = "<10"
for label, mask in bin_rules:
    ey_asd.loc[mask, "age_binned"] = label
ey_asd["age_binned"] = ey_asd["age_binned"].astype(age_bin_dtype)

# Recode every boolean-typed column as "Yes"/"No" strings.
yn_map = {True: "Yes", False: "No"}
bool_cols = [col for col in ey_asd.columns
             if "bool" in str(ey_asd[col].dtype)]
for col in bool_cols:
    ey_asd[col] = ey_asd[col].map(yn_map)

# Remove the identifier and the raw gender column ("male" was derived from it).
drop_cols = ["person_id", "gender"]
ey_asd.drop(columns=drop_cols, inplace=True)

ey_asd.info()

In [None]:
# Area names for the 17 EYFS item columns, keyed by column code.
eyfs_area_names = {
    "COMG01": "Listening and attention",
    "COMG02": "Understanding",
    "COMG03": "Speaking",
    "PHYG04": "Moving and handling",
    "PHYG05": "Health and self-care",
    "PSEG06": "Self-confidence and self-awareness",
    "PSEG07": "Managing feelings and behaviour",
    "PSEG08": "Making relationships",
    "LITG09": "Reading",
    "LITG10": "Writing",
    "MATG11": "Numbers",
    "MATG12": "Shape, space and measures",
    "UTWG13": "People and communities",
    "UTWG14": "The World",
    "UTWG15": "Technology",
    "EXPG16": "Exploring and using media and materials",
    "EXPG17": "Being imaginative",
}

# Column name -> human-readable label; every EYFS item label carries the
# "EYFS - " prefix.
colnames_map = {
    "FSMeligible": "Eligible for Free School Meals",
    **{code: f"EYFS - {area}" for code, area in eyfs_area_names.items()},
    "asd_diagnosis": "Has ASD Diagnosis",
    "ethnicity": "Ethnicity",
    "extract_age": "Age At Extract",
    "total_score": "EYFS Score Total",
    "subscore": "5-item EYFS Score",
    "low_total_score": "Has Low Total Score",
    "low_subscore": "Has Low Subscore",
    "age_binned": "Age",
    "male": "Male",
}

In [None]:
from tableone import TableOne

# Columns left out of the summary table: the numeric scores, the individual
# EYFSP items, the grouping variable and the unbinned age.
num_cols = ["total_score", "subscore"]
non_table_cols = num_cols + eyfsp_cols + ["asd_diagnosis", "extract_age"]

# Everything that remains is reported, under its display name, as categorical.
cols = (ey_asd
        .drop(columns=non_table_cols)
        .rename(columns=colnames_map)
        .columns
        .tolist())

table_data = ey_asd.rename(columns=colnames_map)
tab = TableOne(table_data,
               columns=cols,
               categorical=cols,
               groupby="Has ASD Diagnosis")
tab

### Distribution of EYFSP Scores and Five-Item Subscore

The total EYFSP score is the simple sum of the scores 1-3 (1="Emerging", 2="Expected", 3="Exceeding") across all 17 EYFS development areas.

The five-item subscore is taken from Kelly et al. (2019), where it was developed by a small group of ASD assessment experts:

The social aspect mapped onto:
* Personal, social and emotional: managing feelings and behaviour.
* Personal, social and emotional: making relationships.

Language and communications aspect mapped onto:
* Communication and language: listening and attention.

Imagination aspect mapped onto:
* Expressive arts and design: being imaginative.

Repetitive and stereotyped behaviours mapped onto:
* Physical development: health and self-care.

Comparing the visualisation below with the Kelly et al. equivalent highlights something a little odd about the latter. The Kelly et al. histogram shows a max score for the 5-item subscore of 21 - the max score in any individual test is 3, so the max should be 15 (as in the histogram below), unless some weighting/other function of the score has been used.

The paper does refer to the subscore as a "weighted subscore" but, from the text, that appears to relate to the use of two EYFSP results for the social aspect of ASD, which is considered more important. No alternative weighting calculation of the subscore is mentioned, which suggests it should be a simple sum of the 5 tests with a max score of 15 - might need revisiting:

In [None]:
# Write the TableOne summary held in `tab` to a LaTeX file.
tab.to_latex("table_1.tex")

In [None]:
import plotly.express as px
from IPython.display import Image

# Histograms of the two raw scores on a shared axis.
newnames = {"total_score": "Total Score", "subscore": "Subscore"}
hist_title = "Distribution of EYFSP Scores"

hist_fig = px.histogram(ey_asd,
                        x=list(newnames),
                        width=800,
                        height=500,
                        title=hist_title)
hist_fig.update_xaxes(title="EYFSP Score", ticks="")
hist_fig.update_yaxes(title="Count", ticks="", showticklabels=False)

# Trace names are looked up by column name and replaced with display names.
for trace in hist_fig.data:
    trace.update(name=newnames[trace.name])

# Centre the title and blank out the legend heading.
hist_fig.update_layout(legend_title="",
                       title=dict(text=hist_title,
                                  y=0.9,
                                  x=0.5,
                                  xanchor="center",
                                  yanchor="top"))

# Render to PNG bytes and show them inline.
hist_img = hist_fig.to_image(format="png")
Image(hist_img)

Not sure how the density plots in original paper were generated - no explanation. I've had a go using a gaussian KDE and normalising the x and y values. Bit of a strange approach but seems to result in similar outputs:

In [None]:
import numpy as np
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import gaussian_kde

def return_eyfs_kde_plot(score_col, name):
    """Build a line trace of the standardised KDE for one ``ey_asd`` column.

    Parameters
    ----------
    score_col : str
        Column of the module-level ``ey_asd`` frame to plot.
    name : str
        Legend label for the returned trace.

    Returns
    -------
    go.Scatter
        Spline line with the normalised evaluation points on x and the
        rescaled density estimate on y.
    """
    scores = ey_asd[score_col]
    centre = scores.mean()
    spread = scores.std()

    # Fit the KDE on the normalised scores.
    kde = gaussian_kde((scores - centre) / spread)

    # Evaluate at every observed score value, padded by a fifth of a standard
    # deviation beyond the minimum and maximum.
    pad = spread / 5
    observed = list(scores.sort_values().unique())
    grid = np.array([scores.min() - pad, *observed, scores.max() + pad])
    xx = (grid - centre) / spread

    # Rescale the density values by their own standard deviation, then by 10.
    yy = kde.evaluate(xx)
    yy = yy / np.std(yy) / 10

    return go.Scatter(x=xx,
                      y=yy,
                      name=name,
                      mode="lines",
                      line=dict(shape="spline"))

# One KDE curve per score on a single set of axes.
density_fig = go.Figure()
score_labels = {"total_score": "Total Score", "subscore": "Subscore"}
for col, name in score_labels.items():
    density_fig.add_trace(return_eyfs_kde_plot(col, name))

# Dashed vertical marker at -1 on the normalised axis, kept out of the legend.
sd_line = go.Scatter(x=[-1, -1],
                     y=[0, 1],
                     mode="lines",
                     line=dict(color="black", dash="dash"),
                     showlegend=False)
density_fig.add_trace(sd_line)

# Fixed axis ranges, sizing and a centred title.
density_fig.update_layout(xaxis_range=[-2.7, 2.7],
                          yaxis_range=[0, 0.6],
                          height=600,
                          width=500,
                          xaxis_title="Normalised EYFSP Score",
                          yaxis_title="Density Estimate",
                          title=dict(text="Standardised Distribution of EYFSP Scores",
                                     y=0.93,
                                     x=0.5,
                                     xanchor="center",
                                     yanchor="top"))
density_fig.show()

In [None]:
# Left panel: histograms of the raw scores. Right panel: standardised KDE
# curves plus the -1 marker.
joint_density_fig = make_subplots(rows=1, cols=2)

for score_col in ("total_score", "subscore"):
    joint_density_fig.add_trace(go.Histogram(x=ey_asd[score_col]),
                                row=1, col=1)

score_labels = {"total_score": "Total Score", "subscore": "Subscore"}
for col, name in score_labels.items():
    joint_density_fig.add_trace(return_eyfs_kde_plot(col, name),
                                row=1, col=2)

sd_line = go.Scatter(x=[-1, -1],
                     y=[0, 1],
                     mode="lines",
                     line=dict(color="black", dash="dash"),
                     showlegend=False)
joint_density_fig.add_trace(sd_line, row=1, col=2)

# NOTE(review): the axis ranges, sizing and titles set on density_fig are not
# applied to this figure - confirm whether the right-hand panel should get the
# same layout.
joint_density_fig.show()

In [None]:
# Export the density figure as a JPEG. The original call used `fig`, a name
# that is not defined in the cells above; `density_fig` matches the output
# file name. NOTE(review): confirm this is the figure intended for export.
density_fig.write_image("scores_density.jpeg", scale=2)

In [None]:
# Place the histogram and density figures side by side. add_trace takes a
# single trace rather than a whole Figure, so each source figure's traces are
# copied across one at a time into its panel.
joint_density_fig = make_subplots(rows=1, cols=2)
for panel_col, source_fig in enumerate((hist_fig, density_fig), start=1):
    for trace in source_fig.data:
        joint_density_fig.add_trace(trace, row=1, col=panel_col)
joint_density_fig.show()

## Logistic Regression

### Prepping data:

The paper divides age at extract into 6/7/8+, with no explanation of the logic behind this. Age is also not included in the basic demographic info, so it is hard to tell the range of ages in the dataset used in the paper. Great. I'll just stick all ages except 7 and 16 (n = 1 and 2 respectively) into the model and see what happens.

In [None]:
# Encode the outcome as 0/1, then drop every row with a missing value.
int_map = {"No": 0, "Yes": 1}
ey_asd["asd_diagnosis"] = ey_asd["asd_diagnosis"].map(int_map)
ey_asd.dropna(axis=0, how="any", inplace=True)
ey_asd.info()

In [None]:
# Raw total score lying one standard deviation below the mean, computed over
# the rows currently in ey_asd.
ey_asd.total_score.mean() - ey_asd.total_score.std()

In [None]:
import statsmodels.api as sm
from IPython.display import *

def get_model_odds(model_results, return_probs=False):
    """Tabulate exponentiated coefficients and confidence intervals of a fit.

    Parameters
    ----------
    model_results
        Fitted results object exposing ``params`` and ``conf_int()``; the
        latter must return a frame whose columns ``0`` and ``1`` hold the
        lower and upper bounds.
    return_probs : bool, default False
        When true, every exponentiated value ``o`` is converted to
        ``o / (1 + o)``. This is applied to the coefficients as well as to the
        interval bounds so that both columns are on the same scale.

    Returns
    -------
    DataFrame
        Column ``coef`` with the transformed coefficients and column
        ``conf_int`` with the bounds rounded to 3 decimals and formatted as
        ``"(lower, upper)"``.
    """
    def _transform(log_odds):
        # exp() gives odds; o / (1 + o) maps odds onto the probability scale.
        odds = np.exp(log_odds)
        return odds / (1 + odds) if return_probs else odds

    # np.exp returns a new frame, so the results object is not modified.
    model_odds = _transform(model_results.conf_int())
    lower_int = model_odds[0].round(3).astype(str)
    upper_int = model_odds[1].round(3).astype(str)

    model_odds["coef"] = _transform(model_results.params)
    model_odds["conf_int"] = "(" + lower_int + ", " + upper_int + ")"
    return model_odds[["coef", "conf_int"]]


def display_logit_results(cols):
    """Fit a logit model of ``asd_diagnosis`` on ``cols`` and display it.

    ``cols`` are formula terms joined with ``+``; the model is fitted on the
    module-level ``ey_asd`` frame. Shows the fit summary, then the
    ``get_model_odds`` table without and with ``return_probs=True``, each
    under its own heading.
    """
    formula = "asd_diagnosis ~ " + " + ".join(cols)
    fitted = sm.Logit.from_formula(formula, data=ey_asd).fit(maxiter=100,
                                                             disp=True)
    display(fitted.summary())

    for heading, as_probs in (("<h4>Odds:</h4>", False),
                              ("<h4>Probs:</h4>", True)):
        table = get_model_odds(fitted, return_probs=as_probs)
        display(HTML(heading))
        display(table)
    

def save_model_odds(cols, name):
    """Fit the logit model for ``cols`` and write its odds table to ``<name>.tex``.

    ``cols`` are formula terms joined with ``+``; the model is fitted on the
    module-level ``ey_asd`` frame with ``asd_diagnosis`` as the outcome.
    """
    formula = "asd_diagnosis ~ " + " + ".join(cols)
    fitted = sm.Logit.from_formula(formula, data=ey_asd).fit(maxiter=100,
                                                             disp=True)
    get_model_odds(fitted).to_latex(f"{name}.tex")

In [None]:
def get_model_margeff(cols):
    """Fit the logit model for ``cols`` and return its marginal-effects summary.

    ``cols`` are formula terms joined with ``+``; the model is fitted on the
    module-level ``ey_asd`` frame with ``asd_diagnosis`` as the outcome.
    """
    formula = "asd_diagnosis ~ " + " + ".join(cols)
    fitted = sm.Logit.from_formula(formula, data=ey_asd).fit(maxiter=100,
                                                             disp=True)
    marginal_effects = fitted.get_margeff()
    return marginal_effects.summary()

## Total Score Models

In [None]:
param_cols = ["low_total_score"]
display_logit_results(param_cols)

In [None]:
get_model_margeff(["low_total_score"])

In [None]:
print(get_model_margeff(["FSMeligible", "ethnicity", "low_total_score*male",  
                       "low_total_score", "male", "age_binned"]))

In [None]:
param_cols = ["low_total_score"]
save_model_odds(param_cols, "tot_total_effects")

In [None]:
display_logit_results(["FSMeligible", "ethnicity", "low_total_score*male",  
                       "age_binned"])

In [None]:
save_params_data(["FSMeligible", "ethnicity", "low_total_score*male", 
                  "age_binned"], "total_score_params")

In [None]:
ey_asd["extract_age"] = ey_asd.extract_age.astype("int32")
display_logit_results(["FSMeligible", "ethnicity", "low_total_score*male", 
                       "low_total_score", "male", "extract_age"])

In [None]:
display_logit_results(["male", "ethnicity", "low_total_score", "male*low_total_score"])

## Low Subscore Models

In [None]:
display_logit_results(["low_subscore"])

In [None]:
save_params_data(["low_subscore"], "sub_total_effects")

In [None]:
display_logit_results(["age_binned", "FSMeligible", "ethnicity", "male*low_subscore"])

In [None]:
print(get_model_margeff(["age_binned", "FSMeligible", "ethnicity", "male*low_subscore"]))

In [None]:
save_params_data(["age_binned", "FSMeligible", "ethnicity", "male", 
                       "low_subscore", "male*low_subscore"], "subscore_params_data")

In [None]:
display_logit_results(["ethnicity", "male", "low_subscore", "male*low_subscore"])

## Subject Area Models

In [None]:
# Three-level outcome for each subject area. NOTE(review): "Expected" is
# listed first, presumably so that it acts as the reference level in the
# models - confirm.
eyfs_results_dtype = pd.api.types.CategoricalDtype(categories=["Expected",
                                                               "Emerging",
                                                               "Exceeding"],
                                                   ordered=False)

main_subjects = ["COM", "LIT", "MAT", "PHY", "PSE"]

for subject in main_subjects:
    # Select the item columns from eyfsp_cols rather than from every column
    # containing the prefix: once this cell has run, "<subject>_overall" also
    # contains the prefix, and a re-run would feed it into the masks below.
    subject_cols = [col for col in eyfsp_cols if col.startswith(subject)]
    overall_col = f"{subject}_overall"
    subject_scores = ey_asd[subject_cols]

    # Default is "Expected"; any item scored 1 makes the area "Emerging", and
    # all items scored 3 make it "Exceeding".
    ey_asd[overall_col] = "Expected"

    emerging_mask = (subject_scores == 1).any(axis=1)
    ey_asd.loc[emerging_mask, overall_col] = "Emerging"

    exceeding_mask = (subject_scores == 3).all(axis=1)
    ey_asd.loc[exceeding_mask, overall_col] = "Exceeding"

    ey_asd[overall_col] = ey_asd[overall_col].astype(eyfs_results_dtype)

    # A missing item score makes the overall result missing too.
    missing_mask = subject_scores.isna().any(axis=1)
    ey_asd.loc[missing_mask, overall_col] = np.nan

In [None]:
# One model per subject area, each printed between two separator rules.
separator = "=" * 80
for subject in main_subjects:
    overall_colname = f"{subject}_overall"
    print(separator, overall_colname, "", sep="\n")
    model_terms = ["ethnicity", overall_colname, "male",
                   "male*" + overall_colname]
    display_logit_results(model_terms)
    print(separator)

In [None]:
display_logit_results(["ethnicity", "male"] + [f"{col}_overall" for