## Neuropsych normalization
- This is currently shared by Rebekah Wickens

- Norms <b>must</b> be standardized to be hyphenated ranges (ex: 0-64, 64-100). 
- The second number in each hyphenated range should be interpreted as "up to but not including this number" <p>
    - ex: a 64 year old fits in the 64-100 range, not the 0-64 range. This is necessary for the script to derive the correct statistic for each norm/UDI pair.
- If all norms have a 'test' column that identifies which test they are, and follow the exact same format (ex: including all criteria, even the ones they don't use), then it will be very easy to loop through and generate the master table.

In [None]:
import pandas as pd
import numpy as np

# Build all possible UDIs into a dataframe
#
# Every combination of Sex (1-2), Language (1-2), Education (0-30) and
# Age (18-100) becomes one row. The cartesian product is generated in a single
# call instead of appending ~10k rows one at a time with .loc, which is very
# slow. Row order is the same as the nested loops would give: Sex varies
# slowest, Age fastest.

UDI_df = pd.MultiIndex.from_product(
    [[1, 2], [1, 2], range(0, 31), range(18, 101)],
    names=['Sex', 'Language', 'Education', 'Age'],
).to_frame(index=False)

In [None]:
# Create the master identifier (a string such as "S1-L2-E12-A64") and place it
# as the fifth column, right after Sex / Language / Education / Age.

UDI_df.insert(
    4,
    'UDI',
    ("S" + UDI_df['Sex'].astype(str)).str.cat(
        [
            "-L" + UDI_df['Language'].astype(str),
            "-E" + UDI_df['Education'].astype(str),
            "-A" + UDI_df['Age'].astype(str),
        ]
    ),
)

In [None]:
# Import the norms, then split each hyphenated range ("low-high") into separate
# numeric low/high columns stored as floats.

df = pd.read_excel('RPQ-normes.xlsx', sheet_name='Sheet3')

for source_col, prefix in (('Age', 'age'), ('Education', 'edu')):
    # Split once per criterion; element 0 is the lower bound, element 1 the upper.
    parts = df[source_col].str.split('-')
    df[prefix + '_low'] = parts.str[0].astype(float)
    df[prefix + '_high'] = parts.str[1].astype(float)

In [None]:
# Display the first rows of the norms frame, including the *_low / *_high columns.
df.head()

In [None]:
def BinBuilder(series, addmin=False, addmax = False, minvalue = 0, maxvalue = 100):
    """ Turn a norm criterion of hyphenated ranges into sorted bin edges for pd.cut

    Args:
        series: pandas Series of range labels such as "0-64" or "64-100".
            Missing entries are skipped.
        addmin: when true, `minvalue` is added as an extra edge.
        addmax: when true, `maxvalue` is added as an extra edge.
        minvalue: lower edge added when `addmin` is set.
        maxvalue: upper edge added when `addmax` is set. Any edge equal to
            `maxvalue` is left unshifted, whether or not `addmax` is set.

    Returns:
        Ascending list of unique float edges. Every edge except `maxvalue` is
        lowered by 0.001 so that, with pd.cut's default right-closed bins, the
        number before the hyphen means ">=" and the number after it means "<".
    """
    # Flatten every "low-high" label into its individual numeric bounds.
    # The builtin float is used because the np.float alias no longer exists
    # in current NumPy releases.
    edges = [
        float(bound)
        for label in series.dropna().astype(str)
        for bound in label.split('-')
    ]

    # add min/max, if applicable
    if addmin:
        edges.insert(0, float(minvalue))
    if addmax:
        edges.append(float(maxvalue))

    # remove dupes and sort, then shift every edge except the overall maximum
    return [
        edge if edge == maxvalue else edge - 0.001
        for edge in sorted(set(edges))
    ]

In [None]:
# Derive the bin edges for the BNT norm criteria.
# Age uses BinBuilder's defaults; Education additionally gets an explicit
# upper edge of 30.

BNT_age = BinBuilder(series=df['Age'])
BNT_edu = BinBuilder(series=df['Education'], maxvalue=30, addmax=True)
BNT_edu

In [None]:
# Bin each norm row by its lower age / education bound, glue the two bin labels
# into one string key, and reshape the Mean and SD columns into a single table
# keyed on that string.

df['age_bin'] = pd.cut(df['age_low'], bins=BNT_age)
df['edu_bin'] = pd.cut(df['edu_low'], bins=BNT_edu)
df['BNT_age+edu_bin'] = df['age_bin'].astype(str) + df['edu_bin'].astype(str)

BNT_mean = df.melt(id_vars='BNT_age+edu_bin', value_vars=['Mean'], value_name='BNT_mean')
BNT_sd = df.melt(id_vars='BNT_age+edu_bin', value_vars=['SD'], value_name='BNT_sd')

BNT = pd.merge(BNT_mean, BNT_sd, how='right', on='BNT_age+edu_bin')
# melt adds a 'variable' column to each side (suffixed by the merge); drop them.
BNT = BNT.drop(columns=[label for label in BNT.columns if 'variable' in label])
BNT

In [None]:
# apply bins to UDI frame, concatenate them
#
# The same edges (BNT_age / BNT_edu) are used as for the norms frame, so the
# concatenated string key matches the one built there. The two bare filter
# expressions that used to sit between these statements were evaluated and
# discarded (only a cell's last expression is displayed), so they are removed.

UDI_df['age_bin'] = pd.cut(x=UDI_df['Age'], bins=BNT_age)
UDI_df['edu_bin'] = pd.cut(x=UDI_df['Education'], bins=BNT_edu)

UDI_df['BNT_age+edu_bin'] = UDI_df['age_bin'].astype(str) + UDI_df['edu_bin'].astype(str)

UDI_df.head(10)

In [None]:
# Attach the BNT mean/SD to each UDI through the shared bin key.
# how='right' keeps every row of BNT: UDIs whose key has no norm are dropped.

UDI_df = pd.merge(UDI_df, BNT, how='right', on='BNT_age+edu_bin')

In [None]:
# Drop every helper column whose name contains 'bin', then preview the result.
UDI_df = UDI_df.drop(columns=[label for label in UDI_df.columns if 'bin' in label])
UDI_df.head(10)

In [None]:
# Write the UDI table to an Excel file, omitting the dataframe index.
UDI_df.to_excel('UDI_df.xlsx',index=False)

### Neuropsych norming with look-up tables

In [1]:
import pandas as pd
import numpy as np
import json

In [2]:
# Path to the JSON norming configuration that is passed to get_norming_config.
norming_config_file = "../workflow/tabular/DSF_norming_config.json"

In [8]:
def get_norming_config(config_file):
    """ Read the norming config json

    Args:
        config_file: path to the JSON configuration file.

    Returns:
        The parsed JSON content (whatever json.load produces for the file).
    """
    # Read explicitly as UTF-8 so non-ASCII text in the config does not depend
    # on the platform's default encoding.
    with open(config_file, 'r', encoding='utf-8') as f:
        return json.load(f)

def read_raw_scores(instrument):
    """ Load the raw score sheet described by an instrument dict

    Uses instrument["raw_data"] as the workbook path and
    instrument["raw_sheet"] as the sheet name. The second row of the sheet is
    taken as the header (header=1) and rows that are entirely empty are dropped.
    """
    scores = pd.read_excel(
        instrument["raw_data"],
        sheet_name=instrument["raw_sheet"],
        engine='openpyxl',
        header=1,
    )
    return scores.dropna(axis=0, how="all")

def read_baseline_scores(instrument):
    """ Load the baseline score sheet described by an instrument dict

    Uses instrument["baseline_data"] as the workbook path and
    instrument["baseline_sheet"] as the sheet name. The first row is the header
    (pandas default) and rows that are entirely empty are dropped.
    """
    baseline = pd.read_excel(
        instrument["baseline_data"],
        sheet_name=instrument["baseline_sheet"],
        engine='openpyxl',
    )
    return baseline.dropna(axis=0, how="all")

def get_valid_scores(df,instrument):
    """ Check and remove out of bound or NaN scores

    Args:
        df: dataframe holding the score column named instrument["name"].
            It is not modified; the cleaned rows are returned as a new frame.
        instrument: dict with "name" (score column) and "range", a dict whose
            "n/a", "min" and "max" entries are converted with int().

    Returns:
        A new dataframe with only the rows whose score is present, is not the
        "n/a" sentinel, and is one of the integers min..max (inclusive).
        Summary counts are printed along the way.
    """
    name = instrument["name"]
    score_range = instrument["range"]
    nan_val = int(score_range["n/a"])
    min_val = int(score_range["min"])
    max_val = int(score_range["max"])

    n_participants = len(df)
    n_multiple_visits = len(df[df.index.duplicated()])

    n_nan_val = len(df[df[name] == nan_val])
    n_missing_val = len(df[df[name].isna()])
    print(f"n_participants: {n_participants}, n_multiple_visits: {n_multiple_visits}")
    print(f"n_nan_val (i.e. {nan_val}): {n_nan_val}, n_missing_val: {n_missing_val}")

    # clean-up on a copy, so the caller's frame keeps its original values
    cleaned = df.copy()
    # np.nan is the supported spelling; the np.NaN alias is gone in NumPy 2.
    cleaned[name] = cleaned[name].replace({nan_val: np.nan})
    cleaned = cleaned[cleaned[name].notna()]
    # (min <= score <= max); membership in an integer range also rejects
    # fractional scores. NOTE(review): confirm that is intended.
    cleaned = cleaned[cleaned[name].isin(range(min_val, max_val + 1))]

    n_valid_scores = len(cleaned)
    print(f"n_valid_scores: {n_valid_scores}")
    return cleaned
    
def format_baseline_scores(df, strata_cols):
    """ Format baseline score sheet so it can be filtered in pandas

    For every column in `strata_cols`, two integer columns "<col>_min" and
    "<col>_max" are added to `df` (in place):
      * a "low-high" entry gives min=low and max=high;
      * a single value gives min=max=value.

    Args:
        df: baseline dataframe; modified in place and also returned.
        strata_cols: names of the columns to expand.

    Returns:
        The same dataframe object with the *_min / *_max columns added.
    """
    for col in strata_cols:
        # Numeric columns cannot hold "low-high" labels, and the .str accessor
        # raises on them, so they are handled before any string inspection.
        if pd.api.types.is_numeric_dtype(df[col]):
            df[f"{col}_min"] = df[col].astype(int)
            df[f"{col}_max"] = df[col].astype(int)
            continue

        as_text = df[col].astype(str).str.strip()

        # check if column has ranges separated by the "-" delimiter
        if as_text.str.contains("-", regex=False).any():
            bounds = as_text.str.split("-", expand=True)
            lower_text = bounds[0]
            # Rows holding a single value have no upper part: reuse the lower bound.
            upper_text = bounds[1].where(bounds[1].notna(), lower_text)
        else:
            lower_text = as_text
            upper_text = as_text

        df[f"{col}_min"] = pd.to_numeric(lower_text.str.strip()).astype(int)
        df[f"{col}_max"] = pd.to_numeric(upper_text.str.strip()).astype(int)

    return df

def get_scaled_score(participant, df):
    """ Look up the scaled score of one participant in the baseline table

    Every entry of `participant` (key k, value v) must satisfy
    df["k_min"] <= v <= df["k_max"] for a baseline row to match.

    Returns np.nan (after printing a notice) when no row matches; when several
    rows match, a notice is printed and the first match is used.
    """
    # Start with every baseline row selected, then narrow down per criterion.
    keep = np.ones(len(df), dtype=bool)
    for stratum, value in participant.items():
        lower_ok = df[f"{stratum}_min"] <= value
        upper_ok = df[f"{stratum}_max"] >= value
        keep &= (lower_ok & upper_ok).to_numpy()

    matches = df[keep]
    n_matches = len(matches)

    if n_matches == 0:
        print(f"No matches found for participant: {participant.name}, {dict(participant)}")
        return np.nan

    if n_matches > 1:
        print(f"Multiple matches found for participant: {participant.name}, {dict(participant)}")

    return matches["Scaled_score"].values[0]

In [9]:
# Load the norming configuration and pull out the pieces used below.
config = get_norming_config(norming_config_file)

instrument = config["instrument"]
stratification = config["stratification"]

name = instrument["name"]
participant_id_col = instrument["participant_id_column"]
strata_cols = list(stratification.keys())

print(f"Instrument name: {name}")
print(f"Using {strata_cols} as stratification columns")
print("IMPORTANT: Instrument name should match the column in the raw data sheet and the baseline score sheet")

raw_data_df = read_raw_scores(instrument)
baseline_df = read_baseline_scores(instrument)

# Keep only the participant id, the score column and the stratification columns,
# index by participant id, then drop missing / out-of-range scores.
raw_data_df = raw_data_df[[participant_id_col,name] + strata_cols]
raw_data_df = raw_data_df.set_index(participant_id_col)
raw_data_df = get_valid_scores(raw_data_df,instrument)

# The score column is expanded too, so raw scores can be matched against ranges.
baseline_df = format_baseline_scores(baseline_df, strata_cols + [name])

normed_data_df = raw_data_df.copy()

# Assign by row position, not by index label: the participant-id index is not
# unique when a participant has several visits, and a label-based
# .loc[idx, ...] assignment would write one visit's score onto all of that
# participant's rows.
normed_data_df["Scaled_score"] = [
    get_scaled_score(participant_data, baseline_df)
    for _, participant_data in raw_data_df.iterrows()
]

Instrument name: Digit Span Total (Raw score)
Using ['Age at time of assessment'] as stratification columns
IMPORTANT: Instrument name should match the column in the raw data sheet and the baseline score sheet
n_participants: 296, n_multiple_visits: 31
n_nan_val (i.e. 999): 0, n_missing_val: 134
n_valid_scores: 127
No matches found for participant: PD01622, Digit Span Total (Raw score)    18.0
Age at time of assessment       69.0
Name: PD01622, dtype: float64
No matches found for participant: PD01565, Digit Span Total (Raw score)    18.0
Age at time of assessment       52.0
Name: PD01565, dtype: float64
No matches found for participant: PD01412, Digit Span Total (Raw score)    17.0
Age at time of assessment       32.0
Name: PD01412, dtype: float64
No matches found for participant: PD01623, Digit Span Total (Raw score)    17.0
Age at time of assessment       61.0
Name: PD01623, dtype: float64
No matches found for participant: PD01133, Digit Span Total (Raw score)    20.0
Age at time of 

{'Digit Span Total (Raw score)': 16.0, 'Age at time of assessment': 78.0}