## Neuropsych normalization
- This is currently shared by Rebekah Wickens

- Norms <b>must</b> be standardized to be hyphenated ranges (ex: 0-64, 64-100). 
- The second number in each hyphenated range should be interpreted as "up to but not including this number".
    - ex: a 64 year old fits in the 64-100 range, not the 0-64 range. This is necessary for the script to derive the correct statistic for each norm/UDI pair.
- If all norms have a 'test' column that identifies which test they belong to, and follow exactly the same format (ex: including all criteria, even the ones they don't use), then it will be very easy to loop through them and generate the master table.

In [None]:
import pandas as pd
import numpy as np

# Build all possible UDIs into a dataframe

# Every combination of Sex (1, 2), Language (1, 2), Education (0..30) and
# Age (18..100) is generated in a single vectorised cartesian product instead
# of growing the frame one row at a time. The row order is the same as the
# nested loops would produce (Age varies fastest) and the columns are integers.
UDI_df = pd.MultiIndex.from_product(
    [[1, 2], [1, 2], range(0, 31), range(18, 101)],
    names=['Sex', 'Language', 'Education', 'Age'],
).to_frame(index=False)

# Kept so that anything relying on the running row count still finds it.
index_counter = len(UDI_df)

In [None]:
# Create the master identifier (as a string) in the form
# "S<Sex>-L<Language>-E<Education>-A<Age>" and place it as the fifth column.
UDI_df.insert(
    loc=4,
    column='UDI',
    value=(
        "S" + UDI_df['Sex'].astype(str)
        + "-L" + UDI_df['Language'].astype(str)
        + "-E" + UDI_df['Education'].astype(str)
        + "-A" + UDI_df['Age'].astype(str)
    ),
)

In [None]:
# Import the norms, then split each hyphenated range ("low-high") into
# separate numeric low/high columns (as floats).

df = pd.read_excel('RPQ-normes.xlsx', sheet_name='Sheet3')

for norm_column, stem in (('Age', 'age'), ('Education', 'edu')):
    # Split once per criterion and reuse the pieces for both bounds.
    range_parts = df[norm_column].str.split('-')
    df[f'{stem}_low'] = range_parts.str[0].astype(float)
    df[f'{stem}_high'] = range_parts.str[1].astype(float)

In [None]:
# Preview the first five norm rows, including the derived low/high columns.
df.head(n=5)

In [None]:
def BinBuilder(series, addmin=False, addmax = False, minvalue = 0, maxvalue = 100):
    """Turn a column of hyphenated ranges into sorted bin edges for ``pd.cut``.

    Args:
        series: pandas Series of strings such as ``"0-64"``; missing cells are
            ignored.
        addmin: when truthy, ``minvalue`` is added as an extra edge.
        addmax: when truthy, ``maxvalue`` is added as an extra edge.
        minvalue: lower edge used with ``addmin``.
        maxvalue: upper edge used with ``addmax``; an edge equal to this value
            is never shifted (see below), whether or not ``addmax`` is set.

    Returns:
        Ascending list of unique float edges. Every edge except ``maxvalue``
        is lowered by 0.001: ``pd.cut`` bins are closed on the right by
        default, so the shift makes the number before a hyphen mean ">=" and
        the number after it mean "<".
    """
    # Flatten every "low-high" cell into its individual numbers. The builtin
    # float is used because the np.float alias no longer exists in NumPy.
    edges = [
        float(token)
        for cell in series.dropna().str.split('-')
        for token in cell
    ]

    # add min/max, if applicable (position is irrelevant, edges are sorted next)
    if addmin:
        edges.append(float(minvalue))
    if addmax:
        edges.append(float(maxvalue))

    # remove dupes, sort, and shift every edge except the overall maximum
    return [edge if edge == maxvalue else edge - 0.001
            for edge in sorted(set(edges))]

In [None]:
# Derive the bin edges for each of BNT's criteria. The education edges are
# additionally capped with an explicit upper edge of 30.

BNT_age = BinBuilder(series=df['Age'])
BNT_edu = BinBuilder(series=df['Education'], maxvalue=30, addmax=True)
BNT_edu

In [None]:
# apply bins to the norm dataframe, concatenate them, and create the BNT norms table

# Each norm row is binned by the lower bound of its own range, then the two
# bin labels are joined into a single key shared with the UDI frame.
df['age_bin'] = pd.cut(x=df['age_low'], bins=BNT_age)
df['edu_bin'] = pd.cut(x=df['edu_low'], bins=BNT_edu)
df['BNT_age+edu_bin'] = df['age_bin'].astype(str) + df['edu_bin'].astype(str)

# Take Mean and SD straight from the same norm row. Melting the two columns
# separately and merging them back on the bin label pairs every Mean with
# every SD that shares a label, which produces mismatched rows as soon as a
# label occurs more than once.
BNT = (
    df[['BNT_age+edu_bin', 'Mean', 'SD']]
    .rename(columns={'Mean': 'BNT_mean', 'SD': 'BNT_sd'})
    .reset_index(drop=True)
)
BNT

In [None]:
# apply bins to UDI frame, concatenate them

# Bin every UDI with the same edges that were used for the norms so that the
# combined label below matches the label in the BNT table. (Two filter
# expressions whose results were discarded have been removed.)
UDI_df['age_bin'] = pd.cut(x=UDI_df['Age'], bins=BNT_age)
UDI_df['edu_bin'] = pd.cut(x=UDI_df['Education'], bins=BNT_edu)

UDI_df['BNT_age+edu_bin'] = (
    UDI_df['age_bin'].astype(str) + UDI_df['edu_bin'].astype(str)
)

UDI_df.head(10)

In [None]:
# Attach the BNT statistics to the UDIs through the shared bin label.
# NOTE(review): how='right' keeps every BNT row and drops UDIs whose label has
# no match in BNT - confirm that this, rather than how='left', is intended.

UDI_df = pd.merge(left=UDI_df, right=BNT, how='right', on='BNT_age+edu_bin')

In [None]:
# Drop, in place, every helper column whose name contains 'bin'.
UDI_df.drop(
    columns=[name for name in UDI_df.columns if 'bin' in name],
    inplace=True,
)
UDI_df.head(10)

In [None]:
# Write the finished UDI table to disk without the row index.
UDI_df.to_excel(excel_writer='UDI_df.xlsx', index=False)

### Neuropsych norming with look-up tables

In [66]:
import pandas as pd
import numpy as np
import json

In [2]:
# Path of the JSON norming configuration that is handed to get_norming_config.
norming_config_file = "../workflow/tabular/DSF_norming_config.json"

In [122]:
import pandas as pd
import numpy as np
import json

def get_norming_config(config_file):
    """Read the norming configuration for a given instrument.

    Args:
        config_file: path of the JSON configuration file.

    Returns:
        The parsed JSON content.
    """
    # JSON text is UTF-8; naming the encoding keeps accented instrument or
    # column names intact on platforms whose default encoding differs.
    with open(config_file, 'r', encoding='utf-8') as f:
        return json.load(f)

def read_raw_scores(instrument):
    """Load the raw score sheet described by the given dict.

    Reads the workbook at ``instrument["raw_data"]``, sheet
    ``instrument["raw_sheet"]``, taking the second row as the header, and
    returns it with completely empty rows removed.
    """
    sheet = pd.read_excel(
        instrument["raw_data"],
        sheet_name=instrument["raw_sheet"],
        engine='openpyxl',
        header=1,
    )
    return sheet.dropna(axis=0, how="all")

def read_baseline_scores(instrument):
    """Load the baseline (norm) score sheet described by the given dict.

    Reads the workbook at ``instrument["baseline_data"]``, sheet
    ``instrument["baseline_sheet"]``, and returns it with completely empty
    rows removed.
    """
    sheet = pd.read_excel(
        instrument["baseline_data"],
        sheet_name=instrument["baseline_sheet"],
        engine='openpyxl',
    )
    return sheet.dropna(axis=0, how="all")

def get_valid_scores(df,instrument):
    """Report on and remove missing or out-of-bound raw scores.

    Args:
        df: score table with the column ``instrument["raw_score_name"]``.
            NOTE: that column is updated in place on the caller's frame; the
            "n/a" code is replaced by NaN.
        instrument: dict with ``"raw_score_name"`` and a ``"range"`` dict
            holding ``"n/a"``, ``"min"`` and ``"max"`` (all convertible to
            int).

    Returns:
        The rows whose score is present and, if any invalid score exists, is
        a whole number with min <= score <= max.
    """
    name = instrument["raw_score_name"]
    score_range = instrument["range"]
    nan_val = int(score_range["n/a"])
    min_val = int(score_range["min"])
    max_val = int(score_range["max"])

    n_participants = len(df)
    n_multiple_visits = int(df.index.duplicated().sum())

    n_nan_val = int((df[name] == nan_val).sum())
    n_missing_val = int(df[name].isna().sum())
    print(f"n_listed_participants: {n_participants}, n_multiple_visits: {n_multiple_visits}")
    print(f"n_nan_val (i.e. {nan_val}): {n_nan_val}, n_missing_val: {n_missing_val}")
    print(f"Excluding ({n_missing_val}) participants with missing scores")

    # clean-up: treat the "n/a" code like a missing score, then drop all
    # missing scores (np.nan is the spelling that every NumPy version has)
    df[name] = df[name].replace({nan_val: np.nan})
    df = df[df[name].notna()]

    print(f"\nPossible score range: ({min_val},{max_val})")
    print(f"Available score range: ({df[name].min()},{df[name].max()})")

    # Valid scores are the whole numbers min_val..max_val, both ends included.
    is_valid = df[name].isin(range(min_val, max_val + 1))
    n_invalid_scores = int((~is_valid).sum())
    if n_invalid_scores > 0:
        print(f"n_invalid_scores: {n_invalid_scores}")
        print(f"Using participants only with valid scores")
        df = df[is_valid]

    return df
    
def format_baseline_scores(df, strata_cols):
    """Format the baseline score sheet so it can be filtered in pandas.

    For every column in ``strata_cols`` two integer columns ``<col>_min`` and
    ``<col>_max`` are added to ``df`` (in place).

    Convention: the upper limit is not included, for demographics and scores
    alike: "0-4" implies {0, 1, 2, 3}. A single value ``v`` therefore becomes
    the range ``v`` to ``v + 1``.

    Args:
        df: baseline table; it is modified and also returned.
        strata_cols: names of the columns to expand.

    Returns:
        ``(df, baselines_ranges)`` where ``baselines_ranges`` maps each column
        to ``(smallest <col>_min, largest <col>_max)``.
    """
    baselines_ranges = {}
    for col in strata_cols:
        lower_name, upper_name = f"{col}_min", f"{col}_max"

        # Only text-like columns can hold "low-high" ranges. Numeric columns
        # have no .str accessor, so they go straight to the plain-value path.
        is_numeric = df[col].dtype.kind in "biuf"
        as_text = df[col].astype(str)
        if not is_numeric and as_text.str.contains("-", regex=False).any():
            bounds = as_text.str.split("-", expand=True)
            df[lower_name] = bounds[0].astype(int)
            # Cells without a hyphen have no upper bound: use lower + 1.
            upper = bounds[1].astype(object)
            upper = upper.where(upper.notna(), df[lower_name] + 1)
            df[upper_name] = upper.astype(int)
        else:
            df[lower_name] = df[col].astype(int)
            df[upper_name] = df[col].astype(int) + 1  # see convention

        baselines_ranges[col] = (np.min(df[lower_name]), np.max(df[upper_name]))

    return df, baselines_ranges

def get_normed_score(participant, baseline_df, raw_score_name, norming_procedure="lookup_scaled_score"):
    """Find the baseline row matching a participant and return the normed score.

    Args:
        participant: mapping (e.g. a DataFrame row) of column name to value.
            Every key must have ``<key>_min`` / ``<key>_max`` columns in
            ``baseline_df``.
        baseline_df: baseline table with the ``_min`` / ``_max`` columns and
            either ``Scaled_score`` or ``Mean`` and ``SD``.
        raw_score_name: key of the participant's raw score.
        norming_procedure: "lookup_scaled_score" / "scaled_score" or
            "zscore" / "z-score" / "z_score" (case-insensitive). Defaults to
            the scaled-score look-up so three-argument calls keep working.

    Returns:
        ``(normed_score, note)``; the score is NaN when there is no match,
        more than one match, or the procedure is unknown.
    """
    # Filter rows matching participant values
    # Convention: the upper limit is not included: (0-4) implies {0,1,2,3}
    matches = baseline_df
    for k, v in participant.items():
        in_range = (matches[f"{k}_min"] <= v) & (matches[f"{k}_max"] > v)
        matches = matches[in_range]

    # Deal with zero or > 1 matches
    if len(matches) == 0:
        return np.nan, "Strata not found"

    if len(matches) > 1:
        label = getattr(participant, "name", None)
        print(f"Multiple matches found for participant: {label}, {dict(participant)}")
        print(f"Not assigning a scaled score for {label}")
        return np.nan, "Multiple strata matches found"

    # Exactly one match: select based on norming_procedure
    procedure = norming_procedure.lower()
    if procedure in ["lookup_scaled_score", "scaled_score"]:
        return matches["Scaled_score"].values[0], "Scaled score"

    if procedure in ["zscore", "z-score", "z_score"]:
        # The score being standardised is the participant's own raw score;
        # the matched baseline row only supplies Mean and SD.
        # NOTE(review): the filter above still requires <raw_score_name>_min /
        # _max columns in baseline_df - confirm z-score tables provide them.
        participant_dict = {"raw_score": participant[raw_score_name],
                            "Mean": matches["Mean"].values[0],
                            "SD": matches["SD"].values[0]}
        return z_score(participant_dict), "zscore"

    print(f"Unknown norming procedure")
    return np.nan, "Unknown norming procedure"

def z_score(participant):
    """Return (raw_score - Mean) / SD for the given mapping.

    ``participant`` must provide the keys "raw_score", "Mean" and "SD".
    """
    deviation = participant["raw_score"] - participant["Mean"]
    return deviation / participant["SD"]


In [127]:
# Load the configuration and pull out the settings used below.
config = get_norming_config(norming_config_file)

instrument = config["instrument"]
data_paths = config["data_paths"]
stratification = config["stratification"]

norming_procedure = instrument["norming_procedure"]

raw_score_name = instrument["raw_score_name"]
normed_score_name = instrument["normed_score_name"]

participant_id_col = data_paths["participant_id_column"]
strata_cols = list(stratification.keys())

print("-"*80)
print("Starting score normalization process...\n")
print(f"Instrument name: {raw_score_name}")
print(f"Using {strata_cols} as stratification columns\n")
print("-"*60)
print("***IMPORTANT: Instrument and demograhic column names should match in the raw data sheet and the baseline score sheet***")
print("-"*60)
print("")

raw_data_df = read_raw_scores(data_paths)
baseline_df = read_baseline_scores(data_paths)

# Keep only the id, the raw score and the stratification columns, indexed by
# participant id, then drop missing / out-of-range scores.
raw_data_df = raw_data_df[[participant_id_col,raw_score_name] + strata_cols]
raw_data_df = raw_data_df.set_index(participant_id_col)
valid_data_df = get_valid_scores(raw_data_df,instrument)

n_participants_to_normalized = len(valid_data_df)
print(f"\nn_participants to be normalized: {n_participants_to_normalized}")

# The raw score is treated like a stratum so that each baseline row is
# selected by demographics and score range together.
baseline_df, baselines_ranges = format_baseline_scores(baseline_df, strata_cols + [raw_score_name])
n_strata = len(baseline_df)

print("-"*60)
print(f"n_starta: {n_strata}")
print(f"starta ranges:\n{baselines_ranges}")
print(f"***IMPORTANT: Any raw scores beyond these ranges will not be normalized***")
print("-"*60)

print(f"Starting score normalization based on {norming_procedure}...")
normed_data_df = valid_data_df.copy()

# Score every row and store the results by position. Participant ids repeat
# when there are multiple visits, so writing with .loc[id, ...] would overwrite
# all visits of a participant with the result of the last one. Iterating the
# untouched valid_data_df also keeps the new columns out of the rows that are
# passed to get_normed_score.
normed_results = [
    get_normed_score(participant_data, baseline_df, raw_score_name, norming_procedure)
    for _, participant_data in valid_data_df.iterrows()
]
normed_data_df[normed_score_name] = [normed_score for normed_score, _ in normed_results]
normed_data_df["note"] = [note for _, note in normed_results]

participants_missing_matches = list(normed_data_df[normed_data_df[normed_score_name].isna()].index)
n_missing_matches = len(participants_missing_matches)

print("-"*60)
print(f"Participants (n={n_missing_matches}) are missing stratification matches")
print("-"*60)

# Save data
normed_data = data_paths["normed_data"]
normed_sheet = data_paths["normed_sheet"]

print(f"Saving normed data to: {normed_data}")
# NOTE(review): visits of one participant that share the same strata values
# have identical merge keys and are therefore multiplied by this merge -
# confirm whether a per-visit key is available.
save_df = pd.merge(raw_data_df[strata_cols + [raw_score_name]], 
                    normed_data_df[strata_cols + [normed_score_name, "note"]], 
                  on=[participant_id_col] + strata_cols, how="left")

save_df.to_excel(normed_data, sheet_name=normed_sheet)
print(f"Norming procedure completed for {raw_score_name}")
print("-"*80)
save_df.head()

--------------------------------------------------------------------------------
Starting score normalization process...

Instrument name: Digit Span Total (Raw score)
Using ['Age at time of assessment'] as stratification columns

------------------------------------------------------------
***IMPORTANT: Instrument and demograhic column names should match in the raw data sheet and the baseline score sheet***
------------------------------------------------------------

n_listed_participants: 296, n_multiple_visits: 31
n_nan_val (i.e. 999): 0, n_missing_val: 134
Excluding (134) participants with missing scores

Possible score range: (0,32)
Available score range: (9.0,32.0)

n_participants to be normalized: 162
------------------------------------------------------------
n_starta: 127
starta ranges:
{'Age at time of assessment': (30, 90), 'Digit Span Total (Raw score)': (0, 17)}
***IMPORTANT: Any raw scores beyond these ranges will not be normalized***
-----------------------------------

Unnamed: 0_level_0,Age at time of assessment,Digit Span Total (Raw score),Digit Span Total (Normed score),note
Patient #,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
PD00209,59.0,,,
PD00119/T1,66.0,,,
PD00820,69.0,,,
PD00262,71.0,,,
PD00523,84.0,,,
