Strategy: <br />
For each patient in the broader dataset, add the first early-stage visit (CDRGLOB equal to 0.5 or 1) as a training sample. Use the autopsy result as the label when it is available; otherwise fall back to the clinician diagnosis.

In [1]:
import pickle
import pandas as pd

In [2]:
# Load the full NACC table (the "with dates" export, going by its file name).
with open("../../../data/data_with_date/nacc_csv_with_dates.csv", mode="rt") as nacc_file:
    nacc_csv = pd.read_csv(nacc_file, low_memory=False)

In [3]:
# Load the preprocessed first-visit validation split (the test split is read in the next cell).
with open("../../processed_data/processed_csv/first_visit_features/valid_first_visit_features.csv", mode="rt") as valid_file:
    valid_csv = pd.read_csv(valid_file, low_memory=False)

In [4]:
# Load the preprocessed first-visit test split.
with open("../../processed_data/processed_csv/first_visit_features/test_first_visit_features.csv", mode="rt") as test_file:
    test_csv = pd.read_csv(test_file, low_memory=False)

In [5]:
# Distinct patient IDs held out for validation and for testing.
valid_patients_list = list(valid_csv["NACCID"].unique())
test_patients_list = list(test_csv["NACCID"].unique())

In [6]:
# Keep only visits whose patient is NOT part of the validation split.
nacc_csv_remove_valid = nacc_csv[~nacc_csv["NACCID"].isin(valid_patients_list)]

In [7]:
# Display the test-split patient IDs for a manual check.
test_patients_list

['NACC014159',
 'NACC017967',
 'NACC020729',
 'NACC021496',
 'NACC027247',
 'NACC027345',
 'NACC030229',
 'NACC031229',
 'NACC037296',
 'NACC047473',
 'NACC050056',
 'NACC053349',
 'NACC061943',
 'NACC063353',
 'NACC068046',
 'NACC073093',
 'NACC073194',
 'NACC074965',
 'NACC075828',
 'NACC077205',
 'NACC077658',
 'NACC079538',
 'NACC080364',
 'NACC080581',
 'NACC080709',
 'NACC081476',
 'NACC082277',
 'NACC083765',
 'NACC084374',
 'NACC092115',
 'NACC095374',
 'NACC101745',
 'NACC103600',
 'NACC107103',
 'NACC107456',
 'NACC107598',
 'NACC110086',
 'NACC110957',
 'NACC113665',
 'NACC117899',
 'NACC121171',
 'NACC122750',
 'NACC125288',
 'NACC128659',
 'NACC129197',
 'NACC132175',
 'NACC134274',
 'NACC135034',
 'NACC136227',
 'NACC137825',
 'NACC139670',
 'NACC143647',
 'NACC147329',
 'NACC147405',
 'NACC148139',
 'NACC151594',
 'NACC152181',
 'NACC155776',
 'NACC157282',
 'NACC157449',
 'NACC160100',
 'NACC164764',
 'NACC167363',
 'NACC171592',
 'NACC172579',
 'NACC174946',
 'NACC1780

In [8]:
# Additionally exclude every visit that belongs to a test-split patient.
nacc_csv_remove_valid_test = nacc_csv_remove_valid[
    ~nacc_csv_remove_valid["NACCID"].isin(test_patients_list)
]

In [9]:
# Display the validation-split patient IDs for a manual check.
valid_patients_list

['NACC000385',
 'NACC007563',
 'NACC009523',
 'NACC015874',
 'NACC017758',
 'NACC025427',
 'NACC025640',
 'NACC025848',
 'NACC026620',
 'NACC026968',
 'NACC029070',
 'NACC030247',
 'NACC033302',
 'NACC033913',
 'NACC034300',
 'NACC036265',
 'NACC050238',
 'NACC052909',
 'NACC053794',
 'NACC054296',
 'NACC054465',
 'NACC055050',
 'NACC055459',
 'NACC056886',
 'NACC057481',
 'NACC058984',
 'NACC059294',
 'NACC063230',
 'NACC064289',
 'NACC065400',
 'NACC070680',
 'NACC072682',
 'NACC073228',
 'NACC077112',
 'NACC077349',
 'NACC079032',
 'NACC080654',
 'NACC083033',
 'NACC085852',
 'NACC092461',
 'NACC093958',
 'NACC094822',
 'NACC095509',
 'NACC096116',
 'NACC096362',
 'NACC098998',
 'NACC103831',
 'NACC108805',
 'NACC109092',
 'NACC112218',
 'NACC116581',
 'NACC120808',
 'NACC124042',
 'NACC124673',
 'NACC125493',
 'NACC135801',
 'NACC137084',
 'NACC143228',
 'NACC143569',
 'NACC144344',
 'NACC144862',
 'NACC147188',
 'NACC153368',
 'NACC158324',
 'NACC160184',
 'NACC161199',
 'NACC1677

In [10]:
# Spot check: NACC031229 is one of the test patients, so no rows should come back.
nacc_csv_remove_valid_test[nacc_csv_remove_valid_test["NACCID"] == "NACC031229"]

Unnamed: 0,NACCID,DATE,NACCADC,PACKET,FORMVER,NACCVNUM,NACCAVST,NACCNVST,NACCDAYS,NACCFDYS,...,NPCPRION,NPPOTH1,NPCOTH1,NPOTH1X,NPPOTH2,NPCOTH2,NPOTH2X,NPPOTH3,NPCOTH3,NPOTH3X


In [11]:
# Sanity check: the remaining (training) patients must not overlap the validation patients.
# The train/test and test/valid overlaps are checked in the following cells.
set(nacc_csv_remove_valid_test["NACCID"].unique()) & set(valid_patients_list)

set()

In [12]:
# Sanity check: the remaining (training) patients must not overlap the test patients.
set(nacc_csv_remove_valid_test["NACCID"].unique()) & set(test_patients_list)

set()

In [13]:
# Sanity check: test and validation patients must not overlap either.
set(test_patients_list) & set(valid_patients_list)

set()

In [14]:
# Number of distinct columns in nacc_csv_remove_valid_test, valid_csv and test_csv (in that order).
tuple(
    len(pd.unique(frame.columns))
    for frame in (nacc_csv_remove_valid_test, valid_csv, test_csv)
)

(696, 697, 697)

In [15]:
# Distinct values of the clinician field NACCALZD (NACCLBDE is inspected in the next cell).
nacc_csv_remove_valid_test["NACCALZD"].unique()

array([0, 1, 8])

In [16]:
# Distinct values of the clinician field NACCLBDE.
nacc_csv_remove_valid_test["NACCLBDE"].unique()

array([0, 1, 8])

In [17]:
# to get label for each line in nacc_csv_remove_valid_test
def label_disease(row):
    """Map one visit row to a diagnosis class.

    Returns 0 (PURE AD), 1 (PURE LBD), 2 (MIXED AD + LBD) or 3 (OTHERS).

    The autopsy fields NPADNC and NACCLEWY decide the label when both hold a
    value in 0..3; otherwise the clinician fields NACCALZD and NACCLBDE are
    used instead.
    """
    graded = (0, 1, 2, 3)

    # Autopsy available: both pathology grades are present.
    if row["NPADNC"] in graded and row["NACCLEWY"] in graded:
        ad_pathology = row["NPADNC"] in (2, 3)
        lewy = row["NACCLEWY"]
        if ad_pathology:
            # AD pathology alone is PURE AD; with any Lewy grade 1..3 it is MIXED.
            return 0 if lewy == 0 else 2
        # No/low AD pathology: only Lewy grade 3 counts as PURE LBD.
        return 1 if lewy == 3 else 3

    # No usable autopsy: fall back to the clinician diagnosis.
    clinical_ad = row["NACCALZD"]
    if clinical_ad == 1:
        clinical_lbd = row["NACCLBDE"]
        if clinical_lbd in (0, 8):
            return 0  # PURE AD
        if clinical_lbd == 1:
            return 2  # MIXED AD + LBD
        return 3  # OTHERS
    if clinical_ad in (0, 8) and row["NACCLBDE"] == 1:
        return 1  # PURE LBD
    return 3  # OTHERS
    
# Attach one label per visit row. `assign` returns a new DataFrame rather than
# writing into the filtered slice, which avoids pandas' SettingWithCopyWarning
# and keeps this cell safe to re-run.
nacc_csv_remove_valid_test = nacc_csv_remove_valid_test.assign(
    label=nacc_csv_remove_valid_test.apply(label_disease, axis=1)
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [18]:
# Show the label next to the autopsy and clinician fields it was derived from.
nacc_csv_remove_valid_test.loc[
    :, ["NACCID", "DATE", "NPADNC", "NACCLEWY", "NACCALZD", "NACCLBDE", "label"]
]

Unnamed: 0,NACCID,DATE,NPADNC,NACCLEWY,NACCALZD,NACCLBDE,label
0,NACC000011,2006-04-17,-4,-4,0,0,3
1,NACC000011,2007-06-18,-4,-4,0,0,3
2,NACC000011,2008-06-03,-4,-4,1,0,0
3,NACC000011,2009-08-03,-4,-4,0,0,3
4,NACC000034,2015-07-16,-4,-4,1,0,0
5,NACC000034,2016-11-01,-4,-4,1,0,0
6,NACC000034,2017-09-18,-4,-4,1,0,0
7,NACC000034,2018-11-05,-4,-4,1,0,0
8,NACC000067,2012-05-23,-4,-4,0,1,1
9,NACC000067,2013-05-15,-4,-4,0,1,1


In [19]:
# Keep every visit whose CDRGLOB is 0.5 or 1; the first such visit per patient
# is selected further down.
train_csv = nacc_csv_remove_valid_test[
    nacc_csv_remove_valid_test["CDRGLOB"].isin([0.5, 1])
]

In [20]:
# Remove the records of patients who are healthy according to the clinician diagnosis
# (the rows are selected and dropped a few cells below).

In [21]:
# Visits where both clinician codes are 8 but the label is not OTHERS (3):
# label_disease can only produce such a label from the autopsy fields.
train_csv.loc[
    train_csv["NACCALZD"].eq(8) & train_csv["NACCLBDE"].eq(8) & train_csv["label"].ne(3),
    ["NACCID", "DATE", "NACCALZD", "NACCLBDE", "NPADNC", "NACCLEWY", "label"],
]

Unnamed: 0,NACCID,DATE,NACCALZD,NACCLBDE,NPADNC,NACCLEWY,label
965,NACC007130,2009-11-09,8,8,3,0,0
966,NACC007130,2010-11-08,8,8,3,0,0
2486,NACC018152,2013-01-18,8,8,3,0,0
3547,NACC025829,2008-08-28,8,8,2,0,0
5786,NACC041587,2008-05-06,8,8,3,0,0
5787,NACC041587,2009-05-12,8,8,3,0,0
11159,NACC079389,2014-09-18,8,8,2,0,0
14281,NACC101241,2014-01-14,8,8,2,0,0
24866,NACC177734,2012-11-28,8,8,2,2,2
24868,NACC177734,2014-11-19,8,8,2,2,2


In [22]:
# Distinct NPADNC values among the early-stage visits.
train_csv["NPADNC"].unique()

array([-4,  3,  2,  1,  0,  8,  9])

In [23]:
# Index of the rows to discard: no autopsy grade (NPADNC and NACCLEWY both in -4/8/9),
# clinician codes NACCALZD == 8 and NACCLBDE == 8, and label 3 (OTHERS).
drop_rows_index = train_csv[
    train_csv["NPADNC"].isin([-4, 8, 9])
    & train_csv["NACCLEWY"].isin([-4, 8, 9])
    & train_csv["NACCALZD"].eq(8)
    & train_csv["NACCLBDE"].eq(8)
    & train_csv["label"].eq(3)
].index

In [24]:
# Remove the selected rows. Rebinding the result instead of using inplace=True
# avoids mutating a filtered slice (SettingWithCopyWarning) and yields the same frame.
train_csv = train_csv.drop(drop_rows_index)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  errors=errors)


In [25]:
# Visits where both clinician codes are 0, shown with the autopsy fields and the label.
train_csv.loc[
    train_csv["NACCALZD"].eq(0) & train_csv["NACCLBDE"].eq(0),
    ["NACCID", "DATE", "NPADNC", "NACCLEWY", "label"],
]

Unnamed: 0,NACCID,DATE,NPADNC,NACCLEWY,label
0,NACC000011,2006-04-17,-4,-4,3
1,NACC000011,2007-06-18,-4,-4,3
3,NACC000011,2009-08-03,-4,-4,3
18,NACC000162,2011-06-01,-4,-4,3
26,NACC000184,2007-01-29,-4,-4,3
27,NACC000184,2008-02-20,-4,-4,3
34,NACC000236,2008-09-29,-4,-4,3
52,NACC000382,2005-09-27,-4,-4,3
53,NACC000382,2006-09-27,-4,-4,3
54,NACC000382,2007-09-25,-4,-4,3


In [26]:
# Re-check after the drop: visits with clinician codes 8/8 whose label is not
# OTHERS (3) should still be present.
train_csv.loc[
    train_csv["NACCALZD"].eq(8) & train_csv["NACCLBDE"].eq(8) & train_csv["label"].ne(3),
    ["NACCID", "DATE", "NACCALZD", "NACCLBDE", "NPADNC", "NACCLEWY", "label"],
]

Unnamed: 0,NACCID,DATE,NACCALZD,NACCLBDE,NPADNC,NACCLEWY,label
965,NACC007130,2009-11-09,8,8,3,0,0
966,NACC007130,2010-11-08,8,8,3,0,0
2486,NACC018152,2013-01-18,8,8,3,0,0
3547,NACC025829,2008-08-28,8,8,2,0,0
5786,NACC041587,2008-05-06,8,8,3,0,0
5787,NACC041587,2009-05-12,8,8,3,0,0
11159,NACC079389,2014-09-18,8,8,2,0,0
14281,NACC101241,2014-01-14,8,8,2,0,0
24866,NACC177734,2012-11-28,8,8,2,2,2
24868,NACC177734,2014-11-19,8,8,2,2,2


In [27]:
# (number of distinct patients, number of visit rows) before picking one visit per patient.
len(train_csv["NACCID"].unique()), len(train_csv)

(22458, 53509)

In [28]:
# Number of remaining visits for one example patient.
len(train_csv[train_csv["NACCID"] == "NACC000011"])

4

In [29]:
# Keep each patient's earliest visit with CDRGLOB in [0.5, 1].
# The rows are first ordered by DATE with a stable sort so that "first" means
# the earliest visit rather than whatever row happens to come first in the file.
# DATE appears to be a YYYY-MM-DD string (see the rows displayed above), so
# lexicographic order is chronological - TODO confirm for the full file.
# sort_index() then restores the original row order of the selected visits.
train_csv = (
    train_csv
    .sort_values("DATE", kind="mergesort")
    .groupby("NACCID", sort=False)
    .head(1)
    .sort_index()
)

In [30]:
# Both numbers should match: exactly one row per patient.
len(train_csv), len(train_csv["NACCID"].unique())

(22458, 22458)

In [31]:
# Label tables for each split: patient ID, visit date and class label.
train_labels, valid_labels, test_labels = (
    split[["NACCID", "DATE", "label"]] for split in (train_csv, valid_csv, test_csv)
)

In [32]:
# Persist the feature table of every split as CSV (row index not written).
for out_path, frame in (
    ("./csv/features/train_features.csv", train_csv),
    ("./csv/features/valid_features.csv", valid_csv),
    ("./csv/features/test_features.csv", test_csv),
):
    with open(out_path, "wt") as fout:
        frame.to_csv(fout, index=False)

In [33]:
# Persist the label table of every split as CSV (row index not written).
for out_path, frame in (
    ("./csv/labels/train_labels.csv", train_labels),
    ("./csv/labels/valid_labels.csv", valid_labels),
    ("./csv/labels/test_labels.csv", test_labels),
):
    with open(out_path, "wt") as fout:
        frame.to_csv(fout, index=False)

In [34]:
# Report how many training samples fall into each class; the position in
# label_list is the integer class code stored in the "label" column.
label_list = ["PURE AD", "PURE LBD", "MIXED", "OTHERS"]
for index in range(len(label_list)):
    label = label_list[index]
    n_samples = len(train_labels[train_labels["label"] == index])
    print("the number of {} is {}".format(label, n_samples))

the number of PURE AD is 13029
the number of PURE LBD is 1039
the number of MIXED is 748
the number of OTHERS is 7642
