Strategy: <br />
Add all stages (CDRGLOB is in \[0, 0.5, 1, 2, 3\]) in the broader dataset as training samples, and use the clinician diagnosis as the label if there is no autopsy.

In [1]:
import pickle
import pandas as pd

In [2]:
# read in nacc_csv file
with open("../../../data/data_with_date/nacc_csv_with_dates.csv", "rt") as fin:
    nacc_csv = pd.read_csv(fin, low_memory=False)

In [3]:
# read in valid and test set of preprocessed first visit set
with open("../../processed_data/processed_csv/first_visit_features/valid_first_visit_features.csv", "rt") as fin:
    valid_csv = pd.read_csv(fin, low_memory=False)

In [4]:
with open("../../processed_data/processed_csv/first_visit_features/test_first_visit_features.csv", "rt") as fin:
    test_csv = pd.read_csv(fin, low_memory=False)

In [5]:
# get valid and test patient lists
valid_patients_list = list(pd.unique(valid_csv["NACCID"]))
test_patients_list = list(pd.unique(test_csv["NACCID"]))

In [6]:
# remove all valid patients in the original dataset
nacc_csv_remove_valid = nacc_csv.loc[~(nacc_csv["NACCID"].isin(valid_patients_list))]

In [9]:
# remove all test patients in the original dataset
nacc_csv_remove_valid_test = nacc_csv_remove_valid.loc[~(nacc_csv_remove_valid["NACCID"].isin(test_patients_list))]

In [11]:
# get the number of column of valid_csv test_csv and nacc_csv_remove_valid_rest
len(pd.unique(nacc_csv_remove_valid_test.columns)), len(pd.unique(valid_csv.columns)), len(pd.unique(test_csv.columns))

(696, 697, 697)

In [12]:
# to get the values of NACCALZD and NACCLEWY
pd.unique(nacc_csv_remove_valid_test["NACCALZD"])

array([0, 1, 8])

In [13]:
pd.unique(nacc_csv_remove_valid_test["NACCLBDE"])

array([0, 1, 8])

In [14]:
# to test whether there is some intersection between train/valid, train/test, valid/test
set(list(pd.unique(nacc_csv_remove_valid_test["NACCID"]))).intersection(set(valid_patients_list))

set()

In [15]:
set(list(pd.unique(nacc_csv_remove_valid_test["NACCID"]))).intersection(set(test_patients_list))

set()

In [17]:
set(valid_patients_list).intersection(set(test_patients_list))

set()

In [18]:
# to get label for each line in nacc_csv_remove_valid_test
def label_disease(row):
    # if there is autopsy, we use autopsy as the label
    if row["NPADNC"] in [0, 1, 2, 3] and row["NACCLEWY"] in [0, 1, 2, 3]:
        # PURE AD
        if row["NPADNC"] in [2, 3] and row["NACCLEWY"] == 0:
            return 0
        # PURE LBD
        elif row["NPADNC"] in [0, 1] and row["NACCLEWY"] == 3:
            return 1
        # MIXED AD + LBD
        elif row["NPADNC"] in [2, 3] and row["NACCLEWY"] in [1, 2, 3]:
            return 2
        # OTHERS
        else:
            return 3
    # else use the clinician diagnosis
    else:
        # PURE AD
        if row["NACCALZD"]==1 and row["NACCLBDE"] in [0, 8]:
            return 0
        # PURE LBD
        elif row["NACCALZD"] in [0, 8] and row["NACCLBDE"]==1:
            return 1
        # MIXED AD + LBD
        elif row["NACCALZD"]==1 and row["NACCLBDE"]==1 :
            return 2
        # OTHERS
        else:
            return 3
    
nacc_csv_remove_valid_test["label"] = nacc_csv_remove_valid_test.apply (lambda row: label_disease(row), axis=1)  

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [19]:
nacc_csv_remove_valid_test[["NACCID", "DATE", "NPADNC", "NACCLEWY", "NACCALZD", "NACCLBDE", "label"]]

Unnamed: 0,NACCID,DATE,NPADNC,NACCLEWY,NACCALZD,NACCLBDE,label
0,NACC000011,2006-04-17,-4,-4,0,0,3
1,NACC000011,2007-06-18,-4,-4,0,0,3
2,NACC000011,2008-06-03,-4,-4,1,0,0
3,NACC000011,2009-08-03,-4,-4,0,0,3
4,NACC000034,2015-07-16,-4,-4,1,0,0
5,NACC000034,2016-11-01,-4,-4,1,0,0
6,NACC000034,2017-09-18,-4,-4,1,0,0
7,NACC000034,2018-11-05,-4,-4,1,0,0
8,NACC000067,2012-05-23,-4,-4,0,1,1
9,NACC000067,2013-05-15,-4,-4,0,1,1


In [20]:
# remove those records which are healthy based on clinician diagnosis
train_csv = nacc_csv_remove_valid_test

In [21]:
train_csv.loc[(train_csv["NACCALZD"]==8)&(train_csv["NACCLBDE"]==8)&(train_csv["label"]!=3)][["NACCID", "DATE", "NACCALZD", "NACCLBDE", "NPADNC", "NACCLEWY", "label"]]

Unnamed: 0,NACCID,DATE,NACCALZD,NACCLBDE,NPADNC,NACCLEWY,label
602,NACC004234,2007-08-17,8,8,2,2,2
603,NACC004234,2009-09-10,8,8,2,2,2
604,NACC004234,2013-04-09,8,8,2,2,2
963,NACC007130,2007-11-16,8,8,3,0,0
964,NACC007130,2008-11-13,8,8,3,0,0
965,NACC007130,2009-11-09,8,8,3,0,0
966,NACC007130,2010-11-08,8,8,3,0,0
967,NACC007130,2011-12-02,8,8,3,0,0
1273,NACC009307,2005-10-21,8,8,2,2,2
1274,NACC009307,2007-01-09,8,8,2,2,2


In [22]:
pd.unique(train_csv["NPADNC"])

array([-4,  3,  2,  1,  0,  8,  9])

In [23]:
drop_rows_index = train_csv.loc[(train_csv["NPADNC"].isin([-4, 8, 9])) & (train_csv["NACCLEWY"].isin([-4, 8, 9])) & (train_csv["NACCALZD"]==8) & (train_csv["NACCLBDE"]==8) & (train_csv["label"]==3)].index

In [24]:
train_csv.drop(drop_rows_index, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [25]:
train_csv.loc[(train_csv["NACCALZD"]==0) & (train_csv["NACCLBDE"]==0)][["NACCID", "DATE", "NPADNC", "NACCLEWY", "label"]]

Unnamed: 0,NACCID,DATE,NPADNC,NACCLEWY,label
0,NACC000011,2006-04-17,-4,-4,3
1,NACC000011,2007-06-18,-4,-4,3
3,NACC000011,2009-08-03,-4,-4,3
18,NACC000162,2011-06-01,-4,-4,3
26,NACC000184,2007-01-29,-4,-4,3
27,NACC000184,2008-02-20,-4,-4,3
28,NACC000225,2009-09-23,-4,-4,3
34,NACC000236,2008-09-29,-4,-4,3
45,NACC000304,2006-04-13,-4,-4,3
52,NACC000382,2005-09-27,-4,-4,3


In [26]:
train_csv.loc[(train_csv["NACCALZD"]==8)&(train_csv["NACCLBDE"]==8)&(train_csv["label"]!=3)][["NACCID", "DATE", "NACCALZD", "NACCLBDE", "NPADNC", "NACCLEWY", "label"]]

Unnamed: 0,NACCID,DATE,NACCALZD,NACCLBDE,NPADNC,NACCLEWY,label
602,NACC004234,2007-08-17,8,8,2,2,2
603,NACC004234,2009-09-10,8,8,2,2,2
604,NACC004234,2013-04-09,8,8,2,2,2
963,NACC007130,2007-11-16,8,8,3,0,0
964,NACC007130,2008-11-13,8,8,3,0,0
965,NACC007130,2009-11-09,8,8,3,0,0
966,NACC007130,2010-11-08,8,8,3,0,0
967,NACC007130,2011-12-02,8,8,3,0,0
1273,NACC009307,2005-10-21,8,8,2,2,2
1274,NACC009307,2007-01-09,8,8,2,2,2


In [27]:
len(pd.unique(train_csv["NACCID"])), len(train_csv)

(27926, 77595)

In [28]:
len(train_csv.loc[train_csv["NACCID"]=="NACC000011"])

4

In [29]:
# drop the index 
train_csv = train_csv.reset_index(drop=True)
valid_csv = valid_csv.reset_index(drop=True)
test_csv = test_csv.reset_index(drop=True)

In [30]:
len(train_csv), len(pd.unique(train_csv["NACCID"]))

(77595, 27926)

In [31]:
# get the labels for it
train_labels = train_csv[["NACCID", "DATE", "label"]]
valid_labels = valid_csv[["NACCID", "DATE", "label"]]
test_labels = test_csv[["NACCID", "DATE", "label"]]

In [33]:
# write them into csv files
with open("./csv/features/train_features.csv", "wt") as fout:
    train_csv.to_csv(fout, index=False)
with open("./csv/features/valid_features.csv", "wt") as fout:
    valid_csv.to_csv(fout, index=False)
with open("./csv/features/test_features.csv", "wt") as fout:
    test_csv.to_csv(fout, index=False)

In [34]:
# write the labels into csv files
with open("./csv/labels/train_labels.csv", "wt") as fout:
    train_labels.to_csv(fout, index=False)
with open("./csv/labels/valid_labels.csv", "wt") as fout:
    valid_labels.to_csv(fout, index=False)
with open("./csv/labels/test_labels.csv", "wt") as fout:
    test_labels.to_csv(fout, index=False)

In [35]:
# get the statistics for the training labels
label_list = ["PURE AD", "PURE LBD", "MIXED", "OTHERS"]
for index, label in enumerate(label_list):
    print("the number of {} is {}".format(label, len(train_labels.loc[train_labels["label"]==index])))

the number of PURE AD is 47684
the number of PURE LBD is 3270
the number of MIXED is 4066
the number of OTHERS is 22575
