In [6]:
import pandas as pd
from feature_calculation import build_biomarker_csv
import os

In [7]:
data = pd.read_csv("/Users/thomas.cong/Downloads/Voice_Data/W0000-NEU-CES-2157_Winterlight.csv")

# One entry per X_DUID: {"SUBJID": {duid: subject}, "VISIT": {duid: visit}}.
duid_lookup = data[["SUBJID", "X_DUID", "VISIT"]].drop_duplicates().set_index("X_DUID").to_dict()

# X_DUID -> subject id with a state suffix: "OFF" for visit "V1", "ON" for any other visit.
subject_dict = {
    duid: subjid + ("OFF" if duid_lookup["VISIT"][duid] == "V1" else "ON")
    for duid, subjid in duid_lookup["SUBJID"].items()
}
# X_DUID -> just the state suffix carried by the label above.
on_off_dict = {duid: ("OFF" if "OFF" in label else "ON") for duid, label in subject_dict.items()}
# X_DUID -> the part of the label before its first "O" (the suffix is cut off there).
subject_only = {duid: label.split("O")[0] for duid, label in subject_dict.items()}

In [8]:
# Load the biomarker table indexed by winterlight_id and add a "subjid" column by
# looking each index value up in subject_dict (ids without an entry become NaN).
biomarkers_2157_with_clinical = pd.read_csv(
    "./2157-Generated-Data/2157-Biomarkers.csv", index_col="winterlight_id"
).assign(subjid=lambda frame: frame.index.map(subject_dict))

In [10]:
# Stack the two clinical exports (cumc first, then rush) into one long table.
clinical_long = pd.concat(
    [
        pd.read_csv("Rolando CSV's/clinical-cumc.csv"),
        pd.read_csv("Rolando CSV's/clinical-rush.csv"),
    ]
)
# Long -> wide: one row per (subjid, state), one column per mds_updrs_label.
clinical_wide = clinical_long.pivot(
    index=["subjid", "state"], columns=["mds_updrs_label"], values="mds_updrs_value"
).reset_index()
# Fold the state into the subject id so the row key is subjid + state
# (presumably the same label format as the values of subject_dict - verify).
clinical_wide["subjid"] = clinical_wide["subjid"] + clinical_wide["state"]
UPDRS_Scores = clinical_wide.drop(columns=["state"]).set_index("subjid")

# Row key -> sum of every score column in that row.
UPDRSIII_Mapping = {label: sum(UPDRS_Scores.loc[label]) for label in UPDRS_Scores.index}
# Row key -> the "Speech" score as an int.
Speech_Mapping = {label: int(UPDRS_Scores.loc[label]["Speech"]) for label in UPDRS_Scores.index}
def add_clinical_info(biomarker_data_frame):
    """Attach subject and clinical columns to a biomarker frame, in place.

    The frame's index is looked up in the module-level ``subject_dict`` to
    produce ``subjid``; ``subjid`` is then looked up in ``UPDRSIII_Mapping``
    and ``Speech_Mapping`` to produce ``UPDRSIII`` and ``Speech``. A ``task``
    column, if present, is removed. ``Status`` is "O" plus the second piece of
    ``subjid`` split on "O" (e.g. "OFF"/"ON" for labels ending that way), and
    "HC" wherever that cannot be derived.

    Args:
        biomarker_data_frame: DataFrame whose index values are the keys used
            by ``subject_dict``. It is modified in place.

    Returns:
        The same DataFrame object that was passed in.
    """
    frame = biomarker_data_frame
    frame["subjid"] = frame.index.map(subject_dict)
    for column_name, lookup in (("UPDRSIII", UPDRSIII_Mapping), ("Speech", Speech_Mapping)):
        frame[column_name] = frame["subjid"].map(lookup)
    # errors="ignore" makes this a no-op when there is no "task" column.
    frame.drop(columns=["task"], inplace=True, errors="ignore")
    try:
        state_suffix = "O" + frame["subjid"].str.split("O").str[1]
        # Rows without a usable subjid come out as NaN and are labelled "HC".
        frame["Status"] = state_suffix.fillna("HC")
    except AttributeError:
        # The .str accessor is unavailable when subjid holds no strings at all.
        frame["Status"] = "HC"
    return frame
def normalize_biomarkers(biomarker_data_frame, clip=3):
    """Z-score every numeric column in place and clamp the result to [-clip, clip].

    Each numeric column is replaced by ``(value - mean) / std`` using the
    column's own mean and sample standard deviation (pandas default, ddof=1),
    then limited to the range ``[-clip, clip]``. Missing values stay missing.
    Non-numeric columns (for example string labels) are left untouched.

    Args:
        biomarker_data_frame: DataFrame to normalize. It is modified in place.
        clip: Largest absolute z-score kept; values beyond it are set to
            ``+clip`` / ``-clip``. Defaults to 3.

    Returns:
        The same DataFrame object that was passed in.
    """
    for col in biomarker_data_frame.columns:
        column = biomarker_data_frame[col]
        # Decide by dtype instead of attempting the arithmetic and swallowing the error.
        if not pd.api.types.is_numeric_dtype(column):
            continue
        mean = column.mean()
        std = column.std()
        # A constant column has std == 0 and a single-row column has std NaN;
        # dividing by either would wipe the column out, so only centre it.
        if pd.isna(std) or std == 0:
            std = 1.0
        biomarker_data_frame[col] = ((column - mean) / std).clip(lower=-clip, upper=clip)
    return biomarker_data_frame
# For every CSV in the generated-data folder: add the clinical columns, normalize,
# and write the result into the "Clinical" subfolder.
generated_dir = "2157-Generated-Data"
clinical_dir = os.path.join(generated_dir, "Clinical")
# DataFrame.to_csv does not create missing folders, so make sure the target exists.
os.makedirs(clinical_dir, exist_ok=True)
# os.listdir returns entries in arbitrary order; sort so every run prints and
# writes the files in the same order.
for file in sorted(os.listdir(generated_dir)):
    file_path = os.fsdecode(file)
    if not file_path.endswith(".csv"):
        continue
    # Everything after the first "-" with ".csv" removed,
    # e.g. "2157-Paragraph-Reading.csv" -> "Paragraph-Reading".
    info = file_path.partition("-")[2].replace(".csv", "")
    print(info)
    df = pd.read_csv(os.path.join(generated_dir, file_path), index_col="winterlight_id")
    df = add_clinical_info(df)
    df = normalize_biomarkers(df)
    print(df.shape)
    df.to_csv(os.path.join(clinical_dir, f"2157-Clinical-{info}-normalized.csv"))


Sustained-Vowel-Phonation
(63, 34)
Paragraph-Reading
(63, 34)
Paragraph-Recall
(21, 34)
Picture-Description
(126, 34)
Object-Naming
(62, 34)
Biomarkers
(524, 34)
DDK
(63, 34)
Phonemic-Fluency
(63, 34)
Semantic-Fleuncy
(63, 34)


In [53]:
# Load the two observation exports and the raw Winterlight export.
rush_data = pd.read_csv("Rolando CSV's/winterlight_rush_observations.csv")
cumc_data = pd.read_csv("Rolando CSV's/winterlight_cumc_observations.csv")
raw_data = pd.read_csv("/Users/thomas.cong/Downloads/Voice_Data/W0000-NEU-CES-2157_Winterlight-07-06-2023.csv")

# Upper-case every header so all three tables can be addressed with the same column names.
for frame in (rush_data, cumc_data, raw_data):
    frame.columns = frame.columns.str.upper()


In [None]:
# Keep only the rows whose X_DUID is at least 100000, separately for each table.
DUID_FLOOR = 100000

rush_mask = rush_data["X_DUID"].ge(DUID_FLOOR)
filtered_rush = rush_data.loc[rush_mask]

cumc_mask = cumc_data["X_DUID"].ge(DUID_FLOOR)
filtered_cumc = cumc_data.loc[cumc_mask]

raw_mask = raw_data["X_DUID"].ge(DUID_FLOOR)
filtered_raw = raw_data.loc[raw_mask]

# Number of distinct X_DUID values left in each filtered table (rush, cumc, raw).
for filtered in (filtered_rush, filtered_cumc, filtered_raw):
    print(filtered["X_DUID"].unique().shape)
# Distinct subject ids left in the raw table, then in the rush table.
for filtered in (filtered_raw, filtered_rush):
    print(filtered["SUBJID"].unique())


(217,)
(217,)
(322,)
['CU01' 'CU011' 'CU012' 'CU013' 'CU014' 'CU015' 'CU016' 'CU017' 'CU02'
 'CU03' 'CU04' 'CU05' 'CU06' 'CU07' 'CU08' 'CU09' 'CU10' 'RU001' 'RU002'
 'RU003' 'RU004' 'RU005']
['CU001' 'CU011' 'CU012' 'CU013' 'CU015' 'CU016' 'CU017' 'CU002' 'CU003'
 'CU004' 'CU005' 'CU006' 'CU007' 'CU008' 'CU009' 'CU010']


In [56]:
# Print the sorted distinct ZWGRPID values of rush_data, then of raw_data, for comparison.
for frame in (rush_data, raw_data):
    print(sorted(frame["ZWGRPID"].unique()))


['ddk', 'paragraph_reading', 'phonemic_fluency', 'picture_description', 'semantic_fluency', 'sustained_vowel_phonation']
['ddk', 'paragraph_reading', 'phonemic_fluency', 'picture_description', 'semantic_fluency', 'sustained_vowel_phonation']


        STUDYID  DOMAIN  SUBJID  VISIT  ZWDAT  ZWTIM  ZWCAT  ZWGRPID  ZWRUNID  \
X_DUID                                                                          
112287      762     762     762    762    762    762    762      762      762   
112288      762     762     762    762    762    762    762      762      762   
112289      762     762     762    762    762    762    762      762      762   
112290      762     762     762    762    762    762    762      762      762   
112291      762     762     762    762    762    762    762      762      762   
...         ...     ...     ...    ...    ...    ...    ...      ...      ...   
130946      762     762     762    762    762    762    762      762      762   
130947      762     762     762    762    762    762    762      762      762   
130948      762     762     762    762    762    762    762      762      762   
130949      762     762     762    762    762    762    762      762      762   
130950      762     762     

22 subjects (17 from CUMC, 5 from Rush) × 6 tasks × 2 visits (OFF and ON) = 264 total files (12 per subject)