In [1]:
import pandas as pd
import pickle

In [2]:
# Load the complete NACC export (with visit dates) into a single DataFrame.
# low_memory=False makes pandas parse the file in one pass rather than in chunks.
with open("../data/data_with_date/nacc_csv_with_dates.csv", mode="rt") as nacc_file:
    nacc_csv = pd.read_csv(nacc_file, low_memory=False)

In [3]:
# Distinct NPADNC codes found in the raw export; [0, 1, 2, 3] indicates valid data.
nacc_csv["NPADNC"].unique()

array([-4,  2,  3,  1,  0,  8,  9])

In [4]:
# Distinct NACCLEWY codes found in the raw export; [0, 1, 2, 3] indicates valid data.
nacc_csv["NACCLEWY"].unique()

array([-4,  0,  3,  2,  1,  4,  8,  9])

In [5]:
# Keep only the rows where both autopsy fields hold one of the valid codes 0-3.
has_valid_lewy = nacc_csv["NACCLEWY"].isin([0, 1, 2, 3])
has_valid_adnc = nacc_csv["NPADNC"].isin([0, 1, 2, 3])
nacc_csv_all_autopsy = nacc_csv.loc[has_valid_lewy & has_valid_adnc]

In [6]:
# (number of distinct patients, number of rows) in the autopsy cohort
len(nacc_csv_all_autopsy["NACCID"].unique()), nacc_csv_all_autopsy.shape[0]

(2512, 12321)

In [7]:
# Split the autopsy cohort into per-category subsets to see how many patients
# fall into each category.
# Each mask is computed from nacc_csv_all_autopsy itself, so it shares the index
# of the frame it filters; building it from the larger, unfiltered nacc_csv only
# works as long as pandas can silently re-align the mask to the subset's index.
pure_ad = nacc_csv_all_autopsy.loc[
    nacc_csv_all_autopsy["NPADNC"].isin([2, 3]) & (nacc_csv_all_autopsy["NACCLEWY"] == 0)
]
pure_lbd = nacc_csv_all_autopsy.loc[
    nacc_csv_all_autopsy["NPADNC"].isin([0, 1]) & (nacc_csv_all_autopsy["NACCLEWY"] == 3)
]
mixed = nacc_csv_all_autopsy.loc[
    nacc_csv_all_autopsy["NPADNC"].isin([2, 3]) & nacc_csv_all_autopsy["NACCLEWY"].isin([1, 2, 3])
]

In [8]:
# Distinct patients per category, in the order (pure AD, pure LBD, mixed).
tuple(len(pd.unique(category["NACCID"])) for category in (pure_ad, pure_lbd, mixed))

(970, 53, 751)

In [9]:
# Rows per category, in the order (pure AD, pure LBD, mixed).
pure_ad.shape[0], pure_lbd.shape[0], mixed.shape[0]

(4937, 265, 3541)

In [10]:
# Collect the distinct patient ids belonging to each category.
pure_ad_patient_list = list(pure_ad["NACCID"].unique())
pure_lbd_patient_list = list(pure_lbd["NACCID"].unique())
mixed_patient_list = list(mixed["NACCID"].unique())

# "Others" are the autopsied patients that appear in none of the three categories.
all_autopsy_patients = set(nacc_csv_all_autopsy["NACCID"].unique())
others_patient_list = list(
    all_autopsy_patients
    - set(pure_ad_patient_list)
    - set(pure_lbd_patient_list)
    - set(mixed_patient_list)
)

In [11]:
# (all autopsied patients, pure AD, pure LBD, mixed, others) - the last four should sum to the first.
(
    len(nacc_csv_all_autopsy["NACCID"].unique()),
    len(pure_ad_patient_list),
    len(pure_lbd_patient_list),
    len(mixed_patient_list),
    len(others_patient_list),
)

(2512, 970, 53, 751, 738)

In [12]:
# Read in the validation and test datasets that were previously preprocessed for the first visit.

In [13]:
# Previously generated first-visit feature tables; used further down only as the
# source of the validation / test patient ids.
with open("./processed_data/processed_csv/first_visit_features/valid_first_visit_features.csv", mode="rt") as valid_file:
    valid_csv_before = pd.read_csv(valid_file)

with open("./processed_data/processed_csv/first_visit_features/test_first_visit_features.csv", mode="rt") as test_file:
    test_csv_before = pd.read_csv(test_file)

In [14]:
# (distinct valid patients, distinct test patients, valid rows, test rows)
(
    len(valid_csv_before["NACCID"].unique()),
    len(test_csv_before["NACCID"].unique()),
    valid_csv_before.shape[0],
    test_csv_before.shape[0],
)

(380, 380, 380, 380)

Note: we need to re-select the validation and test sets, since those patients can also have other records that are not their first early-stage visit.

In [15]:
# Build the train / valid / test frames from the autopsy cohort.
valid_ids = list(valid_csv_before["NACCID"])
test_ids = list(test_csv_before["NACCID"])

in_valid = nacc_csv_all_autopsy["NACCID"].isin(valid_ids)
in_test = nacc_csv_all_autopsy["NACCID"].isin(test_ids)
cdr_half_or_one = nacc_csv_all_autopsy["CDRGLOB"].isin([0.5, 1])

# Train: every row of every patient that is in neither held-out split.
train_csv = nacc_csv_all_autopsy.loc[~(in_valid | in_test)]

# Valid / test: for each held-out patient, the first row with CDRGLOB of 0.5 or 1.
# NOTE(review): head(1) takes the first row in the frame's current order; this
# presumably relies on the source CSV being sorted by visit date - confirm.
valid_csv = nacc_csv_all_autopsy.loc[in_valid & cdr_half_or_one].groupby("NACCID").head(1)
test_csv = nacc_csv_all_autopsy.loc[in_test & cdr_half_or_one].groupby("NACCID").head(1)

In [16]:
# Give each split a fresh 0..n-1 index, discarding the row labels inherited from nacc_csv.
train_csv, valid_csv, test_csv = (
    split_frame.reset_index(drop=True) for split_frame in (train_csv, valid_csv, test_csv)
)

In [17]:
# (distinct valid patients, distinct test patients, valid rows, test rows) - one row per patient is expected.
(
    len(valid_csv["NACCID"].unique()),
    len(test_csv["NACCID"].unique()),
    valid_csv.shape[0],
    test_csv.shape[0],
)

(380, 380, 380, 380)

In [18]:
### get the label dataframe from the original feature dataframe ###
def label_disease(row):
    """Map one record's autopsy codes to a 4-way class id.

    Args:
        row: Mapping-like record (for example a DataFrame row) that can be
            indexed with the "NPADNC" and "NACCLEWY" keys.

    Returns:
        int: 0 for pure AD (NPADNC in {2, 3} and NACCLEWY == 0),
        1 for pure LBD (NPADNC in {0, 1} and NACCLEWY == 3),
        2 for mixed AD + LBD (NPADNC in {2, 3} and NACCLEWY in {1, 2, 3}),
        3 for every other combination.
    """
    adnc_code = row["NPADNC"]

    if adnc_code in (2, 3):
        lewy_code = row["NACCLEWY"]
        if lewy_code == 0:
            return 0  # pure AD
        if lewy_code in (1, 2, 3):
            return 2  # mixed AD + LBD
        return 3  # others

    if adnc_code in (0, 1) and row["NACCLEWY"] == 3:
        return 1  # pure LBD

    return 3  # others
    
# Add a "label" column to every split by classifying each row with label_disease.
for split_frame in (train_csv, valid_csv, test_csv):
    split_frame["label"] = split_frame.apply(label_disease, axis=1)

In [19]:
# Label tables: patient id, record date and class label for each split.
train_labels, valid_labels, test_labels = (
    split_frame[["NACCID", "DATE", "label"]] for split_frame in (train_csv, valid_csv, test_csv)
)

In [20]:
# Persist the full feature frames (which include the "label" column) of every split as CSV.
# NOTE(review): the train file is named "first_visit" although train_csv is not
# restricted to one row per patient - presumably intentional; confirm.
feature_outputs = {
    "./data_augmentation/all_autopsy/csv/features/train_first_visit_features.csv": train_csv,
    "./data_augmentation/all_autopsy/csv/features/valid_first_visit_features.csv": valid_csv,
    "./data_augmentation/all_autopsy/csv/features/test_first_visit_features.csv": test_csv,
}
for feature_path, feature_frame in feature_outputs.items():
    feature_frame.to_csv(feature_path, index=False)

In [21]:
# Persist the label tables (NACCID, DATE, label) of every split as CSV.
# The label frames are written here, not the full feature frames: writing
# train_csv / valid_csv / test_csv would just duplicate the feature files
# under the labels directory.
train_labels.to_csv("./data_augmentation/all_autopsy/csv/labels/train_labels.csv", index=False)
valid_labels.to_csv("./data_augmentation/all_autopsy/csv/labels/valid_labels.csv", index=False)
test_labels.to_csv("./data_augmentation/all_autopsy/csv/labels/test_labels.csv", index=False)

In [22]:
# Training rows per class, in label order (0 pure AD, 1 pure LBD, 2 mixed, 3 others).
tuple(len(train_labels[train_labels["label"] == class_id]) for class_id in (0, 1, 2, 3))

(3311, 169, 2340, 2560)