In [1]:
import os

import pandas as pd

In [2]:
### load the qualified-patient feature table (CSV) into a dataframe ###
qualified_features_csv = "../explore_data/intermediate_files/qualified_patients_features_nc_mild_window_df.csv"
with open(qualified_features_csv, mode="rt") as fin:
    # low_memory=False makes pandas parse the whole file at once rather than in chunks
    qualified_patients_features_df = pd.read_csv(
        fin,
        low_memory=False,
    )

In [6]:
### build the PURE AD, PURE LBD and MIXED AD + LBD row subsets ###
# Rows are selected by the coded NPADNC / NACCLEWY columns
# (presumably NACC neuropathology codes — confirm meanings against the data dictionary).
adnc_codes = qualified_patients_features_df["NPADNC"]
lewy_codes = qualified_patients_features_df["NACCLEWY"]
adnc_is_2_or_3 = adnc_codes.isin([2, 3])
adnc_is_0_or_1 = adnc_codes.isin([0, 1])

# PURE AD: NPADNC in {2, 3} and NACCLEWY == 0
pure_ad_patients_df = qualified_patients_features_df.loc[adnc_is_2_or_3 & (lewy_codes == 0)]
# PURE LBD: NPADNC in {0, 1} and NACCLEWY == 3
pure_lbd_patients_df = qualified_patients_features_df.loc[adnc_is_0_or_1 & (lewy_codes == 3)]
# MIXED AD + LBD: NPADNC in {2, 3} and NACCLEWY in {1, 2, 3}
mixed_ad_lbd_patients_df = qualified_patients_features_df.loc[adnc_is_2_or_3 & lewy_codes.isin([1, 2, 3])]

In [27]:
### get the PURE AD, PURE LBD, MIXED AD + LBD patient lists and the list of all remaining ("other") patients ###
# pd.unique keeps IDs in order of first appearance, so these three lists are
# deterministic for a given input file.
pure_ad_patients_list = list(pd.unique(pure_ad_patients_df["NACCID"]))
pure_lbd_patients_list = list(pd.unique(pure_lbd_patients_df["NACCID"]))
mixed_ad_lbd_patients_list = list(pd.unique(mixed_ad_lbd_patients_df["NACCID"]))

# Every patient that is in none of the three groups above.
# sorted() is required for reproducibility: the iteration order of a set of
# strings can change between interpreter sessions (hash randomization), and the
# seeded train_test_split that consumes this list only yields the same split
# when the input order is the same on every run.
other_patients_list = sorted(
    set(pd.unique(qualified_patients_features_df["NACCID"]))
    - set(pure_ad_patients_list)
    - set(pure_lbd_patients_list)
    - set(mixed_ad_lbd_patients_list)
)

In [30]:
# sizes: all patients, PURE AD, PURE LBD, MIXED AD + LBD, others
tuple(
    len(patient_group)
    for patient_group in (
        pd.unique(qualified_patients_features_df["NACCID"]),
        pure_ad_patients_list,
        pure_lbd_patients_list,
        mixed_ad_lbd_patients_list,
        other_patients_list,
    )
)

(1887, 757, 46, 572, 512)

In [10]:
from sklearn.model_selection import train_test_split
import numpy as np

In [12]:
# Seed passed to every train_test_split call in this notebook so the shuffles are repeatable.
random_state: int = 1234

In [14]:
# PURE AD patients: hold out a test set and a validation set of equal size,
# each ceil(20%) of the group; the remainder is the training set
valid_test_num_pure_ad = int(np.ceil(0.2 * len(pure_ad_patients_list)))
pure_ad_train_valid_pool, pure_ad_patient_list_test = train_test_split(
    pure_ad_patients_list,
    test_size=valid_test_num_pure_ad,
    shuffle=True,
    random_state=random_state,
)
pure_ad_patient_list_train, pure_ad_patient_list_valid = train_test_split(
    pure_ad_train_valid_pool,
    test_size=valid_test_num_pure_ad,
    shuffle=True,
    random_state=random_state,
)

In [15]:
# PURE LBD patients: hold out a test set and a validation set of equal size,
# each ceil(20%) of the group; the remainder is the training set
valid_test_num_pure_lbd = int(np.ceil(0.2 * len(pure_lbd_patients_list)))
pure_lbd_train_valid_pool, pure_lbd_patient_list_test = train_test_split(
    pure_lbd_patients_list,
    test_size=valid_test_num_pure_lbd,
    shuffle=True,
    random_state=random_state,
)
pure_lbd_patient_list_train, pure_lbd_patient_list_valid = train_test_split(
    pure_lbd_train_valid_pool,
    test_size=valid_test_num_pure_lbd,
    shuffle=True,
    random_state=random_state,
)

In [16]:
# MIXED AD + LBD patients: hold out a test set and a validation set of equal size,
# each ceil(20%) of the group; the remainder is the training set
valid_test_num_mixed_ad_lbd = int(np.ceil(0.2 * len(mixed_ad_lbd_patients_list)))
mixed_ad_lbd_train_valid_pool, mixed_ad_lbd_patient_list_test = train_test_split(
    mixed_ad_lbd_patients_list,
    test_size=valid_test_num_mixed_ad_lbd,
    shuffle=True,
    random_state=random_state,
)
mixed_ad_lbd_patient_list_train, mixed_ad_lbd_patient_list_valid = train_test_split(
    mixed_ad_lbd_train_valid_pool,
    test_size=valid_test_num_mixed_ad_lbd,
    shuffle=True,
    random_state=random_state,
)

In [32]:
# patients of other diseases: hold out a test set and a validation set of equal size,
# each ceil(20%) of the group; the remainder is the training set
valid_test_num_others = int(np.ceil(0.2 * len(other_patients_list)))
other_train_valid_pool, other_patient_list_test = train_test_split(
    other_patients_list,
    test_size=valid_test_num_others,
    shuffle=True,
    random_state=random_state,
)
other_patient_list_train, other_patient_list_valid = train_test_split(
    other_train_valid_pool,
    test_size=valid_test_num_others,
    shuffle=True,
    random_state=random_state,
)

In [35]:
# PURE AD split sizes: (train, valid, test)
tuple(
    len(split_ids)
    for split_ids in (pure_ad_patient_list_train, pure_ad_patient_list_valid, pure_ad_patient_list_test)
)

(453, 152, 152)

In [36]:
# PURE LBD split sizes: (train, valid, test)
tuple(
    len(split_ids)
    for split_ids in (pure_lbd_patient_list_train, pure_lbd_patient_list_valid, pure_lbd_patient_list_test)
)

(26, 10, 10)

In [37]:
# MIXED AD + LBD split sizes: (train, valid, test)
tuple(
    len(split_ids)
    for split_ids in (
        mixed_ad_lbd_patient_list_train,
        mixed_ad_lbd_patient_list_valid,
        mixed_ad_lbd_patient_list_test,
    )
)

(342, 115, 115)

In [38]:
# other-diseases split sizes: (train, valid, test)
tuple(
    len(split_ids)
    for split_ids in (other_patient_list_train, other_patient_list_valid, other_patient_list_test)
)

(306, 103, 103)

In [39]:
### assemble the overall train, valid and test patient lists from the four per-group splits ###
train_patients_list = [
    *pure_ad_patient_list_train,
    *pure_lbd_patient_list_train,
    *mixed_ad_lbd_patient_list_train,
    *other_patient_list_train,
]
valid_patients_list = [
    *pure_ad_patient_list_valid,
    *pure_lbd_patient_list_valid,
    *mixed_ad_lbd_patient_list_valid,
    *other_patient_list_valid,
]
test_patients_list = [
    *pure_ad_patient_list_test,
    *pure_lbd_patient_list_test,
    *mixed_ad_lbd_patient_list_test,
    *other_patient_list_test,
]

In [40]:
# overall split sizes: (train, valid, test)
tuple(len(split_ids) for split_ids in (train_patients_list, valid_patients_list, test_patients_list))

(1127, 380, 380)

In [44]:
### get the label dataframe from the original feature dataframe ###
def label_disease(row):
    """Map one row to its integer class id from the NPADNC and NACCLEWY codes.

    Parameters
    ----------
    row : mapping-like (e.g. a DataFrame row)
        Must provide "NPADNC"; "NACCLEWY" is only read when NPADNC is one of
        0, 1, 2, 3.

    Returns
    -------
    int
        0 -- PURE AD:        NPADNC in {2, 3} and NACCLEWY == 0
        1 -- PURE LBD:       NPADNC in {0, 1} and NACCLEWY == 3
        2 -- MIXED AD + LBD: NPADNC in {2, 3} and NACCLEWY in {1, 2, 3}
        3 -- OTHERS:         every other combination
    """
    adnc_code = row["NPADNC"]

    if adnc_code in (2, 3):
        lewy_code = row["NACCLEWY"]
        if lewy_code == 0:
            return 0  # PURE AD
        if lewy_code in (1, 2, 3):
            return 2  # MIXED AD + LBD
        return 3  # OTHERS

    if adnc_code in (0, 1) and row["NACCLEWY"] == 3:
        return 1  # PURE LBD

    return 3  # OTHERS
    
# Compute the class id of every row with label_disease and store it in a new "label" column.
row_labels = qualified_patients_features_df.apply(label_disease, axis=1)
qualified_patients_features_df["label"] = row_labels

In [55]:
### slice the labelled dataframe into train / valid / test by patient, then build the label tables ###
patient_ids = qualified_patients_features_df["NACCID"]
train_general_df = qualified_patients_features_df.loc[patient_ids.isin(train_patients_list)]
valid_general_df = qualified_patients_features_df.loc[patient_ids.isin(valid_patients_list)]
test_general_df = qualified_patients_features_df.loc[patient_ids.isin(test_patients_list)]

# One (NACCID, label) row per patient: the label of the patient's first row,
# re-indexed from 0.
label_columns = ["NACCID", "label"]
train_labels = (
    train_general_df.groupby("NACCID").head(1)[label_columns].reset_index(drop=True)
)
valid_labels = (
    valid_general_df.groupby("NACCID").head(1)[label_columns].reset_index(drop=True)
)
test_labels = (
    test_general_df.groupby("NACCID").head(1)[label_columns].reset_index(drop=True)
)

In [56]:
# number of labelled patients per split: (train, valid, test)
tuple(len(label_table) for label_table in (train_labels, valid_labels, test_labels))

(1127, 380, 380)

In [62]:
### Condition 1: use the first visit of each MILD window as the input ###
# "First" is the first row with CDRGLOB 0.5 or 1 per NACCID in the dataframe's
# current row order (assumes visits are stored chronologically — TODO confirm).
train_mild_rows = train_general_df.loc[train_general_df["CDRGLOB"].isin([0.5, 1])]
valid_mild_rows = valid_general_df.loc[valid_general_df["CDRGLOB"].isin([0.5, 1])]
test_mild_rows = test_general_df.loc[test_general_df["CDRGLOB"].isin([0.5, 1])]

train_first_visit_features_df = train_mild_rows.groupby("NACCID").head(1)
valid_first_visit_features_df = valid_mild_rows.groupby("NACCID").head(1)
test_first_visit_features_df = test_mild_rows.groupby("NACCID").head(1)

In [68]:
### Condition 2: use the last visit of each MILD window as the input ###
# "Last" is the last row with CDRGLOB 0.5 or 1 per NACCID in the dataframe's
# current row order (assumes visits are stored chronologically — TODO confirm).
train_mild_rows = train_general_df.loc[train_general_df["CDRGLOB"].isin([0.5, 1])]
valid_mild_rows = valid_general_df.loc[valid_general_df["CDRGLOB"].isin([0.5, 1])]
test_mild_rows = test_general_df.loc[test_general_df["CDRGLOB"].isin([0.5, 1])]

train_last_visit_features_df = train_mild_rows.groupby("NACCID").tail(1)
valid_last_visit_features_df = valid_mild_rows.groupby("NACCID").tail(1)
test_last_visit_features_df = test_mild_rows.groupby("NACCID").tail(1)

In [69]:
### Condition 3: use all the visits of 0.5 or 1 in each MILD window as the input ###
# Keeps every row whose CDRGLOB is 0.5 or 1, so a patient may contribute several rows.
mild_cdr_values = [0.5, 1]
train_is_mild = train_general_df["CDRGLOB"].isin(mild_cdr_values)
valid_is_mild = valid_general_df["CDRGLOB"].isin(mild_cdr_values)
test_is_mild = test_general_df["CDRGLOB"].isin(mild_cdr_values)

train_all_visits_features_df = train_general_df.loc[train_is_mild]
valid_all_visits_features_df = valid_general_df.loc[valid_is_mild]
test_all_visits_features_df = test_general_df.loc[test_is_mild]

In [73]:
### Renumber the rows of every feature dataframe from 0, discarding the old index ###
(
    train_first_visit_features_df,
    valid_first_visit_features_df,
    test_first_visit_features_df,
) = (
    frame.reset_index(drop=True)
    for frame in (
        train_first_visit_features_df,
        valid_first_visit_features_df,
        test_first_visit_features_df,
    )
)

(
    train_last_visit_features_df,
    valid_last_visit_features_df,
    test_last_visit_features_df,
) = (
    frame.reset_index(drop=True)
    for frame in (
        train_last_visit_features_df,
        valid_last_visit_features_df,
        test_last_visit_features_df,
    )
)

(
    train_all_visits_features_df,
    valid_all_visits_features_df,
    test_all_visits_features_df,
) = (
    frame.reset_index(drop=True)
    for frame in (
        train_all_visits_features_df,
        valid_all_visits_features_df,
        test_all_visits_features_df,
    )
)


In [84]:
### write all features and labels into CSV files: first-visit features ###
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists; this lets the notebook run end-to-end on a fresh checkout.
os.makedirs("./processed_data/first_visit_features", exist_ok=True)
# NOTE(review): these frames are row subsets of the labelled dataframe, so the
# exported files still contain the "label" column and the NPADNC / NACCLEWY
# columns it was derived from — presumably dropped before model training; confirm.
train_first_visit_features_df.to_csv("./processed_data/first_visit_features/train_first_visit_features.csv", index=False)
valid_first_visit_features_df.to_csv("./processed_data/first_visit_features/valid_first_visit_features.csv", index=False)
test_first_visit_features_df.to_csv("./processed_data/first_visit_features/test_first_visit_features.csv", index=False)

In [85]:
# Export the last-visit feature tables.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./processed_data/last_visit_features", exist_ok=True)
train_last_visit_features_df.to_csv("./processed_data/last_visit_features/train_last_visit_features.csv", index=False)
valid_last_visit_features_df.to_csv("./processed_data/last_visit_features/valid_last_visit_features.csv", index=False)
test_last_visit_features_df.to_csv("./processed_data/last_visit_features/test_last_visit_features.csv", index=False)

In [86]:
# Export the all-visits feature tables.
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./processed_data/all_visits_features", exist_ok=True)
train_all_visits_features_df.to_csv("./processed_data/all_visits_features/train_all_visits_features.csv", index=False)
valid_all_visits_features_df.to_csv("./processed_data/all_visits_features/valid_all_visits_features.csv", index=False)
test_all_visits_features_df.to_csv("./processed_data/all_visits_features/test_all_visits_features.csv", index=False)

In [87]:
# Export the per-patient label tables (columns: NACCID, label).
# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folder exists before writing.
os.makedirs("./processed_data/labels", exist_ok=True)
# NOTE(review): the label tables hold one row per patient in each split, while the
# feature tables only hold patients with at least one CDRGLOB 0.5/1 row; row counts
# and order are presumably aligned for this dataset — join on NACCID to be safe.
train_labels.to_csv("./processed_data/labels/train_labels.csv", index=False)
valid_labels.to_csv("./processed_data/labels/valid_labels.csv", index=False)
test_labels.to_csv("./processed_data/labels/test_labels.csv", index=False)