In [1]:
import pandas as pd
from feature_calculation import build_biomarker_csv
import os
import numpy as np

In [2]:
# Raw Winterlight export. Only the SUBJID, X_DUID and VISIT columns are used in this cell.
data = pd.read_csv(
    "/Users/thomas.cong/Downloads/Voice_Data/W0000-NEU-CES-2157_Winterlight-07-06-2023.csv"
)

# Maps the subject ids found in the export onto three-digit ids
# ("CU01" -> "CU001"); ids that are already three-digit map to themselves.
# Any id missing from this table becomes NaN after `.map`.
subject_remapping = {
    "CU01": "CU001",
    "CU02": "CU002",
    "CU03": "CU003",
    "CU04": "CU004",
    "CU05": "CU005",
    "CU06": "CU006",
    "CU07": "CU007",
    "CU08": "CU008",
    "CU09": "CU009",
    "CU10": "CU010",
    "CU011": "CU011",
    "CU012": "CU012",
    "CU013": "CU013",
    "CU015": "CU015",
    "CU014": "CU014",
    "CU016": "CU016",
    "CU017": "CU017",
    "RU001": "RU001",
    "RU002": "RU002",
    "RU003": "RU003",
    "RU004": "RU004",
    "RU005": "RU005",
}
data["SUBJID"] = data["SUBJID"].map(subject_remapping)

# One row per distinct (SUBJID, X_DUID, VISIT) combination, keyed by X_DUID.
visit_table = data[["SUBJID", "X_DUID", "VISIT"]].drop_duplicates().set_index("X_DUID")
subject_by_duid = visit_table["SUBJID"].to_dict()
visit_by_duid = visit_table["VISIT"].to_dict()

# Visit V1 is tagged "OFF" and visit V2 is tagged "ON"; any other visit value
# leaves the subject id without a suffix.
visit_suffix = {"V1": "OFF", "V2": "ON"}

# X_DUID -> subject id with the visit suffix appended, e.g. "CU001OFF".
subject_dict = {
    duid: (
        subject + visit_suffix[visit_by_duid[duid]]
        if visit_by_duid[duid] in visit_suffix
        else subject
    )
    for duid, subject in subject_by_duid.items()
}

# X_DUID -> "OFF" when the label contains "OFF", otherwise "ON".
on_off_dict = {
    duid: ("OFF" if "OFF" in label else "ON") for duid, label in subject_dict.items()
}

# X_DUID -> the part of the label before its first "O" (the bare subject id
# for labels such as "CU001OFF" / "CU001ON").
subject_only = {duid: label.split("O")[0] for duid, label in subject_dict.items()}

In [3]:
# Load the combined biomarker table (indexed by winterlight_id) and attach the
# suffixed subject label looked up from `subject_dict`; ids that are not keys
# of `subject_dict` get NaN.
biomarkers_2157_with_clinical = pd.read_csv(
    "./2157-Generated-Data/2157-Biomarkers.csv", index_col="winterlight_id"
).assign(subjid=lambda frame: frame.index.map(subject_dict))

In [6]:
# Stack the two clinical tables (CUMC first, then Rush) under a fresh 0..n-1 index.
clinical_csv_paths = [
    "Rolando CSV's/clinical-cumc.csv",
    "Rolando CSV's/clinical-rush.csv",
]
UPDRS_Scores = pd.concat(
    [pd.read_csv(path) for path in clinical_csv_paths], ignore_index=True
)

# Lookup key for every row: subject id immediately followed by the state.
# When several rows share a key, the last row wins in each mapping below.
state_keys = [
    subjid + state
    for subjid, state in zip(UPDRS_Scores["subjid"], UPDRS_Scores["state"])
]
age_mapping = {key: int(age) for key, age in zip(state_keys, UPDRS_Scores["age"])}
weight_mapping = {
    key: int(weight) for key, weight in zip(state_keys, UPDRS_Scores["weight"])
}
sex_mapping = dict(zip(state_keys, UPDRS_Scores["sex"]))

# Long -> wide: one row per (subjid, state), one column per mds_updrs_label.
UPDRS_Scores = UPDRS_Scores.pivot(
    index=["subjid", "state"], columns=["mds_updrs_label"], values="mds_updrs_value"
).reset_index()
# Re-key the wide table by the same subjid+state label used for the mappings above.
UPDRS_Scores["subjid"] = UPDRS_Scores["subjid"] + UPDRS_Scores["state"]
UPDRS_Scores = UPDRS_Scores.drop(columns=["state"]).set_index("subjid")

# Sum of every label column in the row (a NaN in any column makes the sum NaN).
UPDRSIII_Mapping = {label: sum(scores) for label, scores in UPDRS_Scores.iterrows()}
# The "Speech" column on its own, as an int.
Speech_Mapping = {
    label: int(score) for label, score in UPDRS_Scores["Speech"].items()
}


def add_clinical_info(biomarker_data_frame):
    """Attach subject label, clinical columns and a Status column to a biomarker table.

    The frame is modified in place and also returned.

    Parameters
    ----------
    biomarker_data_frame : pandas.DataFrame
        Table whose index values are looked up in the notebook-level
        ``subject_dict``.

    Returns
    -------
    pandas.DataFrame
        The same object, with these columns added or overwritten: ``subjid``,
        ``UPDRSIII``, ``Speech``, ``Age``, ``Weight``, ``Sex`` and ``Status``.
        A ``task`` column, if present, is removed.
    """
    frame = biomarker_data_frame
    frame["subjid"] = frame.index.map(subject_dict)

    # Each clinical column is a lookup of the suffixed subject label in one of
    # the notebook-level mappings; labels missing from a mapping yield NaN.
    clinical_lookups = {
        "UPDRSIII": UPDRSIII_Mapping,
        "Speech": Speech_Mapping,
        "Age": age_mapping,
        "Weight": weight_mapping,
        "Sex": sex_mapping,
    }
    for column_name, lookup in clinical_lookups.items():
        frame[column_name] = frame["subjid"].map(lookup)

    if "task" in frame.columns:
        frame.drop(columns=["task"], inplace=True)

    try:
        # "CU001OFF" -> ["CU001", "FF"] -> "OFF"; "CU001ON" -> "ON".
        # Rows with no subject label, or a label without an "O", become "HC".
        after_first_o = frame["subjid"].str.split("O").str[1]
        frame["Status"] = ("O" + after_first_o).fillna("HC")
    except AttributeError:
        # Raised when `.str` is unavailable because the column holds no
        # strings at all; every row is then labelled "HC".
        frame["Status"] = "HC"
    return frame


def normalize_biomarkers(biomarker_data_frame, mode="full"):
    """Z-score the columns of a biomarker table, clipping every score to [-3, 3].

    Parameters
    ----------
    biomarker_data_frame : pandas.DataFrame
        Table to normalise. It is not modified by the "full" or "2157" modes.
    mode : str, default "full"
        * ``"full"``: each column is standardised with its own mean and
          sample standard deviation computed over all rows.
        * ``"2157"``: rows whose ``Status`` is ``"ON"`` or ``"OFF"`` and rows
          whose ``Status`` is ``"HC"`` are standardised independently (each
          with ``mode="full"``) and concatenated, ON/OFF rows first. Rows with
          any other ``Status`` are dropped.
        * any other value: the input object is returned untouched.

    Returns
    -------
    pandas.DataFrame
        A new, normalised frame for the "full" and "2157" modes.

    Notes
    -----
    A column is left as-is when its name is falsy, when computing its mean or
    standard deviation raises ``TypeError`` (e.g. text columns), or when its
    standard deviation is exactly zero. A column whose mean is zero is still
    normalised. If the standard deviation is NaN (fewer than two non-null
    values) the column becomes all-NaN.
    """
    if mode == "full":
        # Work on a copy so the caller's frame keeps its raw values.
        normalized = biomarker_data_frame.copy()
        for col in normalized.columns:
            if not col:
                continue
            series = normalized[col]
            try:
                std = series.std(skipna=True)
                mean = series.mean(skipna=True)
            except TypeError:
                # Non-numeric column: nothing to standardise.
                continue
            # Only a zero spread makes the z-score undefined; a zero mean is fine.
            if std == 0:
                continue
            normalized[col] = ((series - mean) / std).clip(lower=-3, upper=3)
        return normalized

    if mode == "2157":
        status = biomarker_data_frame["Status"]
        # Boolean indexing returns new frames, and "full" mode copies again,
        # so the input is never written to.
        patients = normalize_biomarkers(
            biomarker_data_frame[status.isin(["ON", "OFF"])], mode="full"
        )
        healthy = normalize_biomarkers(
            biomarker_data_frame[status == "HC"], mode="full"
        )
        return pd.concat([patients, healthy])

    return biomarker_data_frame


# For every CSV directly inside 2157-Generated-Data: add the clinical columns,
# then write the un-normalised table plus its "2157" and "full" normalised
# variants under Clinical/.
regular_dir = "2157-Generated-Data/Clinical/Regular"
normalized_dir = "2157-Generated-Data/Clinical/Normalized"

for file in os.listdir("2157-Generated-Data"):
    file_path = os.fsdecode(file)
    if file_path.endswith(".csv"):
        # "2157-Paragraph-Reading.csv" -> "Paragraph-Reading"
        info = "-".join(file_path.split("-")[1:]).replace(".csv", "")
        print(info)

        df = add_clinical_info(
            pd.read_csv("./2157-Generated-Data/" + file_path, index_col="winterlight_id")
        )
        # Written before any normalisation so this file holds the raw values.
        df.to_csv(f"{regular_dir}/2157-Clinical-{info}.csv")

        pd_df = normalize_biomarkers(df, mode="2157")
        pd_df.to_csv(f"{normalized_dir}/2157-{info}-pd-normalized.csv")

        full_df = normalize_biomarkers(df, mode="full")
        full_df.to_csv(f"{normalized_dir}/2157-{info}-full-normalized.csv")

Sustained-Vowel-Phonation
Paragraph-Reading
Paragraph-Recall
Picture-Description
Object-Naming
Biomarkers
DDK
Phonemic-Fluency
Semantic-Fleuncy


In [6]:
# Load the two per-site observation tables and the raw Winterlight export.
rush_data = pd.read_csv("Rolando CSV's/winterlight_rush_observations.csv")
cumc_data = pd.read_csv("Rolando CSV's/winterlight_cumc_observations.csv")
raw_data = pd.read_csv(
    "/Users/thomas.cong/Downloads/Voice_Data/W0000-NEU-CES-2157_Winterlight-07-06-2023.csv"
)

# Upper-case the column names of all three tables so they can be addressed
# with the same identifiers (X_DUID, SUBJID, ...).
for observation_table in (rush_data, cumc_data, raw_data):
    observation_table.columns = observation_table.columns.str.upper()

In [7]:
# Keep only the rows whose X_DUID is at least 100000, table by table.
rush_mask = rush_data["X_DUID"] >= 100000
filtered_rush = rush_data.loc[rush_mask]

cumc_mask = cumc_data["X_DUID"] >= 100000
filtered_cumc = cumc_data.loc[cumc_mask]

raw_mask = raw_data["X_DUID"] >= 100000
filtered_raw = raw_data.loc[raw_mask]

# Number of distinct X_DUID values left in each filtered table.
for filtered_table in (filtered_rush, filtered_cumc, filtered_raw):
    print(filtered_table["X_DUID"].unique().shape)

# Subject ids as spelled in the raw export versus the Rush table.
print(filtered_raw["SUBJID"].unique())
print(filtered_rush["SUBJID"].unique())

# Distinct ZWGRPID values (unfiltered tables).
print(sorted(rush_data["ZWGRPID"].unique()))
print(sorted(raw_data["ZWGRPID"].unique()))

(217,)
(217,)
(322,)
['CU01' 'CU011' 'CU012' 'CU013' 'CU014' 'CU015' 'CU016' 'CU017' 'CU02'
 'CU03' 'CU04' 'CU05' 'CU06' 'CU07' 'CU08' 'CU09' 'CU10' 'RU001' 'RU002'
 'RU003' 'RU004' 'RU005']
['CU001' 'CU011' 'CU012' 'CU013' 'CU015' 'CU016' 'CU017' 'CU002' 'CU003'
 'CU004' 'CU005' 'CU006' 'CU007' 'CU008' 'CU009' 'CU010']


22 subjects (17 from CUMC, 5 from Rush) × 6 × 2 = 264 total files (12 per subject)

In [9]:
def get_samples_per_subject(dataframe):
    """Count distinct X_DUID values per subject and medication state.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must provide ``SUBJID``, ``VISIT`` and ``X_DUID`` columns.

    Returns
    -------
    dict
        ``{(subject, status): count}`` where ``status`` is ``"OFF"`` for
        visit ``"V1"`` and ``"ON"`` for visit ``"V2"``. Rows from any other
        visit are ignored. All V1 entries come before the V2 entries.
    """
    visit_to_status = {"V1": "OFF", "V2": "ON"}
    counts = {}
    for visit, status in visit_to_status.items():
        visit_rows = dataframe.loc[dataframe["VISIT"] == visit]
        for subject in visit_rows["SUBJID"].unique():
            subject_rows = visit_rows.loc[visit_rows["SUBJID"] == subject]
            counts[(subject, status)] = len(subject_rows["X_DUID"].unique())
    return counts


# Per-subject sample counts for the raw export, then the Rush and CUMC tables.
for filtered_table in (filtered_raw, filtered_rush, filtered_cumc):
    print(get_samples_per_subject(filtered_table))

{('CU01', 'OFF'): 7, ('CU011', 'OFF'): 7, ('CU012', 'OFF'): 7, ('CU013', 'OFF'): 7, ('CU014', 'OFF'): 7, ('CU015', 'OFF'): 7, ('CU016', 'OFF'): 7, ('CU017', 'OFF'): 7, ('CU02', 'OFF'): 7, ('CU03', 'OFF'): 7, ('CU04', 'OFF'): 7, ('CU05', 'OFF'): 7, ('CU06', 'OFF'): 7, ('CU07', 'OFF'): 7, ('CU08', 'OFF'): 7, ('CU09', 'OFF'): 7, ('CU10', 'OFF'): 7, ('RU001', 'OFF'): 14, ('RU002', 'OFF'): 7, ('RU003', 'OFF'): 7, ('RU004', 'OFF'): 7, ('RU005', 'OFF'): 7, ('CU01', 'ON'): 7, ('CU011', 'ON'): 7, ('CU012', 'ON'): 7, ('CU013', 'ON'): 7, ('CU014', 'ON'): 7, ('CU015', 'ON'): 7, ('CU016', 'ON'): 7, ('CU017', 'ON'): 7, ('CU02', 'ON'): 7, ('CU03', 'ON'): 7, ('CU04', 'ON'): 7, ('CU05', 'ON'): 7, ('CU06', 'ON'): 7, ('CU07', 'ON'): 7, ('CU08', 'ON'): 7, ('CU09', 'ON'): 7, ('CU10', 'ON'): 7, ('RU001', 'ON'): 14, ('RU002', 'ON'): 7, ('RU003', 'ON'): 7, ('RU004', 'ON'): 7, ('RU005', 'ON'): 7}
{('CU001', 'OFF'): 7, ('CU011', 'OFF'): 7, ('CU012', 'OFF'): 7, ('CU013', 'OFF'): 7, ('CU015', 'OFF'): 7, ('CU016',

In [10]:
# Long-format clinical rows from both sites, CUMC first then Rush. The index
# is not reset, so row labels from the two files may repeat.
CLINICAL = pd.concat(
    [
        pd.read_csv("Rolando CSV's/clinical-cumc.csv"),
        pd.read_csv("Rolando CSV's/clinical-rush.csv"),
    ]
)