In [2]:
import pandas as pd
import numpy as np
import os
import re
from icecream import ic
from scipy import stats
import toml

In [3]:
import warnings
warnings.filterwarnings("ignore")

#### data correction

In [5]:
read_base_dir = '/home/skowshik/ADRD_repo/data/'
save_base_dir = '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/'

def get_zip_files(df, fea):
    zip_files = []
    for fn in list(df[fea]):
        if isinstance(fn, str):
            if fn.endswith('.zip'):
                zip_files.append(fn)
            else:
                zip_files.append(np.NaN) 
        else:
            zip_files.append(np.NaN)
    return zip_files

def correction(filepath, fea):
    df = pd.read_csv(read_base_dir + filepath)
    # print(df[~df['filename'].isna()])
    df['mri_zip'] = get_zip_files(df, fea)
    df.drop('filename', axis=1, inplace=True)
    if 'path' in df:
        df.drop('path', axis=1, inplace=True)
    if 'filename_vit_emb' in df:
        df.drop('filename_vit_emb', axis=1, inplace=True)
    df['bat_OTRLARR'] = df['bat_OTRLARR'].replace({88: np.NaN, 95: np.NaN, 96: np.NaN, 97: np.NaN, 98: np.NaN, 888: np.NaN, 995: np.NaN, 996: np.NaN, 997: np.NaN, 998: np.NaN, -4: np.NaN, '-4':np.NaN}).astype(float)
    df['bat_OTRLALI'] = df['bat_OTRLALI'].replace({88: np.NaN, 95: np.NaN, 96: np.NaN, 97: np.NaN, 98: np.NaN, 888: np.NaN, 995: np.NaN, 996: np.NaN, 997: np.NaN, 998: np.NaN, -4: np.NaN, '-4':np.NaN}).astype(float)
    df['bat_OTRLBRR'] = df['bat_OTRLBRR'].replace({88: np.NaN, 95: np.NaN, 96: np.NaN, 97: np.NaN, 98: np.NaN, 888: np.NaN, 995: np.NaN, 996: np.NaN, 997: np.NaN, 998: np.NaN, -4: np.NaN, '-4':np.NaN}).astype(float)
    df['bat_OTRLBLI'] = df['bat_OTRLBLI'].replace({88: np.NaN, 95: np.NaN, 96: np.NaN, 97: np.NaN, 98: np.NaN, 888: np.NaN, 995: np.NaN, 996: np.NaN, 997: np.NaN, 998: np.NaN, -4: np.NaN, '-4':np.NaN}).astype(float)

    df.to_csv(save_base_dir + filepath, index=False)
    return df
    

df1 = correction('training_cohorts/new_nacc_revised_selection.csv', fea='filename')
df2 = correction('train_vld_test_split_updated/nacc_neuropath.csv', fea='filename')
df3 = correction('train_vld_test_split_updated/nacc_test_with_np_cli.csv', fea='mri_zip')
df4 = correction('train_vld_test_split_updated/nacc_neuropath_test.csv', fea='mri_zip')
df5 = correction('train_vld_test_split_updated/clinician_review_cases_test.csv', fea='mri_zip')
df6 = correction('train_vld_test_split_updated/radiologist_review_cases_test.csv', fea='mri_zip')

### load data

In [11]:
old_nacc = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/nacc.csv')
# Train set
nacc_all = pd.read_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/new_nacc_revised_selection.csv')

aibl = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/AIBL.csv')
nifd = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/NIFD.csv')
ppmi = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/PPMI.csv')
stanford = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/Stanford.csv')
oasis_ov = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/OASIS.csv') # has overlapping cases
oasis_wo_nacc = pd.read_excel('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/OASIS_without_NACC.xlsx')
oasis = oasis_ov[oasis_ov['filename'].isin(list(oasis_wo_nacc['filename']))]

# Test set
adni1 = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/ADNI1.csv')
adni2 = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/ADNI2.csv')
adni3 = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/ADNI3.csv')
adni_go = pd.read_csv('/home/skowshik/ADRD_repo/other_data/adni_aibl_nacc_nifd_oasis_ppmi/ADNIGO.csv')
# fhs = pd.read_csv('/data_1/csv_files/FHS.csv')
nacc_np = pd.read_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/train_vld_test_split_updated/nacc_neuropath.csv')
nacc_np = nacc_all[nacc_all['ID'].isin(nacc_np['ID'])]

cli = pd.read_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/train_vld_test_split_updated/clinician_review_cases_test.csv')
cli = nacc_all[nacc_all['ID'].isin(cli['ID'])]

nacc = nacc_all[(~nacc_all['ID'].isin(nacc_np['ID'])) & (~nacc_all['ID'].isin(cli['ID']))]
# FHS
# BMC

### process

In [7]:
# adni1['age'].value_counts()

In [13]:
# nacc['path'] = np.NaN
adni1['path'] = '/data_1/ADNI1/'
adni2['path'] = '/data_1/ADNI2/'
adni3['path'] = '/data_1/ADNI3/'
adni_go['path'] = '/data_1/ADNIGO/'
aibl['path'] = '/data_1/AIBL/'
nifd['path'] = '/data_1/NIFD/'
ppmi['path'] = '/data_1/PPMI/'
stanford['path'] = '/data_1/Stanford/npy/'
oasis['path'] = '/data_1/OASIS_/'

In [14]:
def lbd(row):
    if row['PDD'] == 1 or row['DLB'] == 1:
        return 1
    elif row['PDD'] == 0 and row['DLB'] == 0:
        return 0
    else:
        return np.NaN
    
def extract_id(row):
    if ('ID' not in dict(row).keys() ) | pd.isna(row['ID']):
        if 'adni' in row['path'].lower():
            return '_'.join(row['filename'].split('_')[0:4])
        elif 'aibl' in row['path'].lower():
            return '_'.join(row['filename'].split('_')[0:2])
        elif 'nifd' in row['path'].lower():
            return '_'.join(row['filename'].split('_')[0:4])
        elif 'oasis' in row['path'].lower():
            return '_'.join(row['filename'].split('_')[0:2])
        elif 'ppmi' in row['path'].lower():
            return '_'.join(row['filename'].split('_')[0:2])
        elif 'stanford' in row['path'].lower():
            return 'STANFORD_' + row['filename'].split('.')[0]
        else:
            return np.NaN
    else:
        return row['ID']

def convert_dataset(df):
    if 'ID' not in df.columns:
        df['ID'] = np.NaN
    df['ID'] = df.apply(extract_id, axis=1)
    if 'age' in df.columns:
        df['his_NACCAGE'] = df['age'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('age', axis=1, inplace=True)
    if 'gender' in df.columns:
        df['his_SEX'] = df['gender'].replace({-4: np.NaN, '-4':np.NaN})
        df.drop('gender', axis=1, inplace=True)
    if 'education' in df.columns:
        df['his_EDUC'] = df['education'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('education', axis=1, inplace=True)
    if 'hispanic' in df.columns:
        df['his_HISPANIC'] = df['hispanic'].replace({-4: np.NaN, '-4':np.NaN}).replace({1: 'yes', 2: 'no', 3: np.NaN, '1': 'yes', '2': 'no', '3': np.NaN, '1.0': 'yes', '2.0': 'no','3.0': np.NaN})
        df.drop('hispanic', axis=1, inplace=True)
    if 'race' in df.columns:
        df['his_NACCNIHR'] = df['race'].replace({-4: np.NaN, '-4':np.NaN}).replace({'mix': 'mul', 'ans': 'asi'})
        df.drop('race', axis=1, inplace=True)
    if 'mmse' in df.columns:
        df['bat_NACCMMSE'] = df['mmse'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('mmse', axis=1, inplace=True)
    if 'moca' in df.columns:
        df['bat_NACCMOCA'] = df['moca'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('moca', axis=1, inplace=True)
    if 'apoe' in df.columns:
        df['apoe_NACCNE4S'] = df['apoe'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('apoe', axis=1, inplace=True)
    if 'DLB' in df.columns and 'PDD' in df.columns:
        df['LBD'] = df.apply(lbd, axis=1)
    if 'trailA' in df.columns:
        df['bat_TRAILA'] = df['trailA'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('trailA', axis=1, inplace=True)
    if 'trailB' in df.columns:
        df['bat_TRAILB'] = df['trailB'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('trailB', axis=1, inplace=True)
    if 'boston' in df.columns:
        df['bat_BOSTON'] = df['boston'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('boston', axis=1, inplace=True)
    if 'digitB' in df.columns:
        df['bat_DIGIB'] = df['digitB'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('digitB', axis=1, inplace=True)
    if 'digitBL' in df.columns:
        df['bat_DIGIBLEN'] = df['digitBL'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('digitBL', axis=1, inplace=True)
    if 'digitF' in df.columns:
        df['bat_DIGIF'] = df['digitF'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('digitF', axis=1, inplace=True)
    if 'digitFL' in df.columns:
        df['bat_DIGIFLEN'] = df['digitFL'].replace({-4: np.NaN, '-4':np.NaN, 9: np.NaN}).astype(float)
        df.drop('digitFL', axis=1, inplace=True)
    if 'animal' in df.columns:
        df['bat_ANIMALS'] = df['animal'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('animal', axis=1, inplace=True)
    if 'his_Alcohol' in df.columns:
        df['his_ALCOHOL'] = df['his_Alcohol'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('his_Alcohol', axis=1, inplace=True)
    if 'gds' in df.columns:
        df['gds_NACCGDS'] = df['gds'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('gds', axis=1, inplace=True)
    if 'lm_imm' in df.columns:
        df['bat_LOGIMEM'] = df['lm_imm'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('lm_imm', axis=1, inplace=True)
    if 'lm_del' in df.columns:
        df['bat_MEMUNITS'] = df['lm_del'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('lm_del', axis=1, inplace=True)
    if 'his_TOBAC100' in df.columns:
        df['his_TOBAC100'] = df['his_TOBAC100'].replace({-4: np.NaN, '-4':np.NaN}).replace({2: np.NaN, 3: np.NaN}).astype(float)
    if 'cdr' in df.columns:
        df['cdr_CDRGLOB'] = df['cdr'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('cdr', axis=1, inplace=True)
    if 'cdrSum' in df.columns:
        df['cdr_CDRSUM'] = df['cdrSum'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
        df.drop('cdrSum', axis=1, inplace=True)

    if 'faq_BILLS' in df.columns:
        df['faq_BILLS'] = df['faq_BILLS'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_TAXES' in df.columns:
        df['faq_TAXES'] = df['faq_TAXES'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_SHOPPING' in df.columns:
        df['faq_SHOPPING'] = df['faq_SHOPPING'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_GAMES' in df.columns:
        df['faq_GAMES'] = df['faq_GAMES'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_STOVE' in df.columns:
        df['faq_STOVE'] = df['faq_STOVE'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_MEALPREP' in df.columns:
        df['faq_MEALPREP'] = df['faq_MEALPREP'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_EVENTS' in df.columns:
        df['faq_EVENTS'] = df['faq_EVENTS'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_PAYATTN' in df.columns:
        df['faq_PAYATTN'] = df['faq_PAYATTN'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_REMDATES' in df.columns:
        df['faq_REMDATES'] = df['faq_REMDATES'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    if 'faq_TRAVEL' in df.columns:
        df['faq_TRAVEL'] = df['faq_TRAVEL'].replace({-4: np.NaN, '-4':np.NaN}).replace({-1: np.NaN}).astype(float)
    # df['ODE'] = df['OTHER']
    df.drop('OTHER', axis=1, inplace=True)
    df.drop('PD', axis=1, inplace=True)
    df.drop('filename', axis=1, inplace=True)
    df.drop('path', axis=1, inplace=True)
    # df.drop('FTD', axis=1, inplace=True)
    
    
    return df

In [15]:
labels = ['NC', 'MCI', 'DE', 'AD', 'LBD', 'DLB', 'PDD', 'VD', 'PRD', 'FTD', 'NPH', 'SEF', 'PSY', 'TBI', 'ODE']

In [16]:
def save_convert_dataset(df, filepath):
    df = convert_dataset(df)
    df_lbl = df[[label for label in labels if label in df.columns]]
    df_lbl = df_lbl.dropna(how='all')
    print('Out of {} samples, {} are dropped due to complete label missing for {}.'.format(len(df), len(df) - len(df_lbl), filepath.split('/')[-1]))
    df = df[df.index.isin(df_lbl.index)]
    avail_columns = [col for col in df.columns if col in nacc.columns]
    intersect_columns = [label for label in labels if label in avail_columns]
    other_columns = list(set(avail_columns) - set(intersect_columns))
    df = df[intersect_columns + other_columns].reset_index(drop=True)
    # print(df)
    df.to_csv(filepath, index=False)
    print(len(df[(df['NC'] == 0) & (df['MCI'] == 0) & (df['DE'] == 0)]))
    print(len(df[df['NC'] == 1]) + len(df[df['MCI'] == 1]) + len(df[df['DE'] == 1]) == len(df))
    print(set(df[df['NC'] == 1]['ID']).intersection(set(df[df['MCI'] == 1]['ID'])))
    print(set(df[df['NC'] == 1]['ID']).intersection(set(df[df['DE'] == 1]['ID'])))
    print(set(df[df['DE'] == 1]['ID']).intersection(set(df[df['MCI'] == 1]['ID'])))
    return df

In [17]:
adni1 = save_convert_dataset(adni1, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/testing_cohorts/adni1_revised.csv')
adni2 = save_convert_dataset(adni2, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/testing_cohorts/adni2_revised.csv')
adni3 = save_convert_dataset(adni3, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/testing_cohorts/adni3_revised.csv')
adni_go = save_convert_dataset(adni_go, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/testing_cohorts/adni_go_revised.csv')
aibl = save_convert_dataset(aibl, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/aibl_revised.csv')
nifd = save_convert_dataset(nifd, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/nifd_revised.csv')
ppmi = save_convert_dataset(ppmi, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/ppmi_revised.csv')
stanford = save_convert_dataset(stanford, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/stanford_revised.csv')
oasis = save_convert_dataset(oasis, '/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/oasis_revised.csv')

Out of 819 samples, 0 are dropped due to complete label missing for adni1_revised.csv.
0
True
set()
set()
set()
Out of 722 samples, 0 are dropped due to complete label missing for adni2_revised.csv.
0
True
set()
set()
set()
Out of 145 samples, 0 are dropped due to complete label missing for adni3_revised.csv.
0
True
set()
set()
set()
Out of 135 samples, 0 are dropped due to complete label missing for adni_go_revised.csv.
0
True
set()
set()
set()
Out of 664 samples, 3 are dropped due to complete label missing for aibl_revised.csv.
0
True
set()
set()
set()
Out of 307 samples, 54 are dropped due to complete label missing for nifd_revised.csv.
0
True
set()
set()
set()
Out of 625 samples, 427 are dropped due to complete label missing for ppmi_revised.csv.
0
True
set()
set()
set()
Out of 182 samples, 0 are dropped due to complete label missing for stanford_revised.csv.
0
True
set()
set()
set()
Out of 491 samples, 0 are dropped due to complete label missing for oasis_revised.csv.
0
True
set()

### 4RTNI

In [18]:
rtni = pd.read_csv('/home/skowshik/ADRD_repo/other_data/4RTNI/4RTNI_DATA.csv')

In [19]:
print(list(rtni.columns))

['SUBID', 'DX', 'DDURATION', 'AUTOPSYDATE', 'AUTOPSYPX', 'SEX', 'AGE_AT_TP0', 'EDUCATION', 'RACE', 'LATINO', 'SCANID_0', 'BIOSPECIMENS_0_SERUM', 'BIOSPECIMENS_0_PLASMA', 'BIOSPECIMENS_0_URINE', 'BIOSPECIMENS_0_CSF', 'PSPRS_0_DATE', 'PSPRS_0_IMPUTED', 'PSPRS_0_TOTAL', 'PSPRS_0_SUBSCORE_HISTORY', 'PSPRS_0_SUBSCORE_MENTATION', 'PSPRS_0_SUBSCORE_BULBAR', 'PSPRS_0_SUBSCORE_OCULARMOTOR', 'PSPRS_0_SUBSCORE_LIMBMOTOR', 'PSPRS_0_SUBSCORE_GAITMIDLINE', 'PSPRS_0_MEDS', 'SEADL_0_DATE', 'SEADL_0', 'CGI_S_0_DATE', 'CGI_S_0', 'UPDRS_0_DATE', 'UPDRS_0_IMPUTED', 'UPDRS_0_TOTAL', 'UPDRS_0_PDNORMAL', 'MOCA_0_DATE', 'MOCA_0_MOCATOTWITHEDUC', 'MOCA_0_BEFAFTNP', 'MOCA_0_LNGTH', 'MMSE_0_DATE', 'MMSE_0_MMSETOT', 'CVLT_0_DATE', 'CVLT_0_TRCOTOT', 'CVLT_0_CORR30', 'CVLT_0_CORR10', 'CVLT_0_CUEDCOR', 'CVLT_0_RECOG', 'CVLT_0_CORRLONG', 'BENSON_0_DATE', 'BENSON_0_MODREY', 'BENSON_0_REY10M', 'BENSON_0_REYRECG', 'DSYM_0_DATE', 'DSYM_0', 'MTRAILS_0_DATE', 'MTRAILS_0_MTTIME', 'MTRAILS_0_MTCORR', 'DSPAN_0_DATE', 'DSPAN_0

In [20]:
len(set(rtni['SUBID']))

124

In [21]:
# def ftldvar(row):
#     if row['DX'] == 'PSP' or row['DX'] == 'CBS':
#         return 1
#     else:
#         return np.NaN

def rtni_labels(row):
    if row['CDR_0_CDRTOT'] == 0:
        row['NC'] = 1
    else:
        row['NC'] = 0
    if (row['CDR_0_CDRTOT'] >= 0.5) & (row['FAQ_0_FAQTOT'] < 9):
        row['MCI'] = 1
    else:
        row['MCI'] = 0
    if (row['CDR_0_CDRTOT'] >= 1.0) & (row['FAQ_0_FAQTOT'] >= 9):
        if row['DX'] == 'PSP' or row['DX'] == 'CBS':
            row['DE'] = 1
            row['FTD'] = 1
        else:
            row['DE'] = np.NaN
            row['FTD'] = np.NaN
    else:
        row['DE'] = 0
        row['FTD'] = 0
    return row

def extract_rtni_id(row):
    return '4RTNI_' + row['SUBID']

def data_convert_4rtni(df):
    if 'ID' not in df.columns:
        df['ID'] = np.NaN
    df['ID'] = df.apply(extract_rtni_id, axis=1)
    df['his_SEX'] = df['SEX'].replace({-4: np.NaN, '-4':np.NaN}).replace({'F': 'female', 'M': 'male'})
    df['his_NACCAGE'] = df['AGE_AT_TP0'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
    df['his_EDUC'] = df['EDUCATION'].replace({-4: np.NaN, '-4':np.NaN}).replace({99.0: np.NaN}).astype(float)
    df['his_NACCNIHR'] = df['RACE'].replace({-4: np.NaN, '-4':np.NaN}).replace({1:'whi', 2:'blk', 3:'asi', 4:'haw', 5:'mul', 6:np.NaN})
    df['his_HISPANIC'] = df['LATINO'].replace({-4: np.NaN, '-4':np.NaN}).replace({0: 'no', 1: 'yes', 2:np.NaN})
    df['bat_NACCMOCA'] = df['MOCA_0_MOCATOTWITHEDUC'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
    df['bat_NACCMMSE'] = df['MMSE_0_MMSETOT'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
    df['bat_TRAILA'] = df['UDSTRAILS_0_TRAILA'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
    df['bat_TRAILALI'] = df['UDSTRAILS_0_TRAILALI'].replace({-4: np.NaN, '-4':np.NaN, 71:np.NaN, 93:np.NaN}).astype(float)
    df['bat_TRAILB'] = df['UDSTRAILS_0_TRAILB'].replace({-4: np.NaN, '-4':np.NaN, 995: np.NaN}).astype(float)
    df['bat_TRAILBLI'] = df['UDSTRAILS_0_TRAILBLI'].replace({-4: np.NaN, '-4':np.NaN, 88:np.NaN, 300:np.NaN}).astype(float)
    df['gds_NACCGDS'] = df['GDS_0_GDS15TO'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
    df['updrs_PDNORMAL'] = df['UPDRS_0_PDNORMAL'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
    df['cdr_CDRGLOB'] = df['CDR_0_CDRTOT'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
    df['cdr_CDRSUM'] = df['CDR_0_BOXSCORE'].replace({-4: np.NaN, '-4':np.NaN}).astype(float)
    df['npiq_DEL'] = np.where(df['NPI_Q_0_DELUSN'] == 2, 0, df['NPI_Q_0_DELSEV']).astype(float)
    df['npiq_HALL'] = np.where(df['NPI_Q_0_HLCNTNS'] == 2, 0, df['NPI_Q_0_HALSEV']).astype(float)
    df['npiq_AGIT'] = np.where(df['NPI_Q_0_AGITATE'] == 2, 0, df['NPI_Q_0_AGSEV']).astype(float)
    df['npiq_DEPD'] = np.where(df['NPI_Q_0_DPRSSN'] == 2, 0, df['NPI_Q_0_DEPSEV']).astype(float)
    df['npiq_ANX'] = np.where(df['NPI_Q_0_ANXIETY'] == 2, 0, df['NPI_Q_0_ANXSEV']).astype(float)
    df['npiq_ELAT'] = np.where(df['NPI_Q_0_EUPHORIA'] == 2, 0, df['NPI_Q_0_EUPSEV']).astype(float)
    df['npiq_APA'] = np.where(df['NPI_Q_0_APATHY'] == 2, 0, df['NPI_Q_0_APTHSEV']).astype(float)
    df['npiq_DISN'] = np.where(df['NPI_Q_0_DISINHIBITION'] == 2, 0, df['NPI_Q_0_DISSEV']).astype(float)
    df['npiq_IRR'] = np.where(df['NPI_Q_0_IRRITBLE'] == 2, 0, df['NPI_Q_0_IRRSEV']).astype(float)
    df['npiq_MOT'] = np.where(df['NPI_Q_0_MOTOR'] == 2, 0, df['NPI_Q_0_MOTSEV']).astype(float)
    df['npiq_NITE'] = np.where(df['NPI_Q_0_SLEEP'] == 2, 0, df['NPI_Q_0_SLESEV']).astype(float)
    df['npiq_APP'] = np.where(df['NPI_Q_0_EAT'] == 2, 0, df['NPI_Q_0_EATSEV']).astype(float)
    df = df.apply(rtni_labels, axis=1)

    return df
    

In [22]:
rtni = data_convert_4rtni(rtni)[['ID', 'NC', 'MCI', 'DE', 'FTD', 'his_SEX', 'his_NACCAGE', 'his_EDUC', 'his_NACCNIHR', 'his_HISPANIC', 'bat_NACCMOCA', 'bat_NACCMMSE', 'bat_TRAILA', 'bat_TRAILALI', 'bat_TRAILB', 'bat_TRAILBLI', 'gds_NACCGDS', 'updrs_PDNORMAL', 'cdr_CDRGLOB', 'cdr_CDRSUM', 'npiq_DEL', 'npiq_HALL', 'npiq_AGIT', 'npiq_DEPD', 'npiq_ANX', 'npiq_ELAT', 'npiq_APA', 'npiq_DISN', 'npiq_IRR', 'npiq_MOT', 'npiq_NITE', 'npiq_APP', 'CDR_0_CDRTOT', 'FAQ_0_FAQTOT']]

In [24]:
rtni = rtni[(rtni['NC'] == 1) | (rtni['MCI'] == 1) | (rtni['DE'] == 1)]
rtni = rtni[~rtni['DE'].isna()]

In [19]:
rtni.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/rtni_revised.csv', index=False)

In [32]:
print(len(rtni[(rtni['NC'] == 0) & (rtni['MCI'] == 0) & (rtni['DE'] == 0)]))

0


### Combine cohorts

In [25]:
train_df = pd.concat([nacc, nifd, stanford, aibl, ppmi, oasis, rtni], axis=0)[nacc.columns]
adni_df = pd.concat([adni1, adni2, adni3, adni_go], axis=0)

##### ADNI further conversion

In [27]:
def extract_id(row):
    if ('ID' not in dict(row).keys() ) | pd.isna(row['ID']):
        if 'adni' in row['path'].lower():
            return '_'.join(row['filename'].split('_')[0:4])
        else:
            return np.NaN
    else:
        return row['ID']

def data_convert_adni(df):
    # if 'ID' not in df.columns:
    #     df['ID'] = np.NaN
    # df['ID'] = df.apply(extract_id, axis=1)
    df['faq_BILLS'] = df['faq_BILLS'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_TAXES'] = df['faq_TAXES'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_SHOPPING'] = df['faq_SHOPPING'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_GAMES'] = df['faq_GAMES'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_STOVE'] = df['faq_STOVE'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_MEALPREP'] = df['faq_MEALPREP'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_EVENTS'] = df['faq_EVENTS'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_PAYATTN'] = df['faq_PAYATTN'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_REMDATES'] = df['faq_REMDATES'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['faq_TRAVEL'] = df['faq_TRAVEL'].replace({1: 0, 2: 1, 3: 1, 4: 2, 5: 3}).astype(float)
    df['npiq_ANX'] = df['npiq_ANX'].replace({4: np.NaN}).astype(float)
    df.drop('his_PACKSPER', axis=1, inplace=True)

    return df

In [28]:
adni_df = data_convert_adni(adni_df)
adni1 = data_convert_adni(adni1)
adni2 = data_convert_adni(adni2)
adni3 = data_convert_adni(adni3)
adni_go = data_convert_adni(adni_go)

adni1.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/testing_cohorts/adni1_revised.csv', index=False)
adni2.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/testing_cohorts/adni2_revised.csv', index=False)
adni3.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/testing_cohorts/adni3_revised.csv', index=False)
adni_go.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/testing_cohorts/adni_go_revised.csv', index=False)

train_df.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/merged_data_nacc_nifd_stanford_aibl_ppmi_oasis_rtni_without_np_cli.csv', index=False)
adni_df.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/train_vld_test_split_updated/adni_revised_labels.csv', index=False)

In [30]:
# train_df.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/training_cohorts/merged_data_nacc_nifd_stanford_aibl_ppmi_oasis_rtni.csv', index=False)

### Adni neuropath

In [35]:
# adni_np = pd.read_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/train_vld_test_split_updated/adni_neuropath.csv')
# adni_df = pd.read_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/train_vld_test_split_updated/adni_revised_labels.csv')
# adni_df['faq_BILLS'].value_counts()

In [29]:
import pandas as pd
adni_prev_np = pd.read_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/train_vld_test_split_updated/adni_neuropath.csv')
all_adni = pd.read_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/train_vld_test_split_updated/adni_revised_labels.csv')
adni_np_orig = pd.read_csv('/home/skowshik/ADRD_repo/other_data/ADNI/Neuropathology_Results/NEUROPATH_02_06_23_25Jul2023.csv')
# adni_np_all['mri_to_yod'] = adni_np_all['NPDAGE'] - adni_np_all['MRIYR']

In [31]:
# prev_np_filenames = [fn.split('.')[0] + '.npy' for fn in list(all_prev_np[~all_prev_np['filename'].isna()]['filename'])]
adni_np = all_adni[all_adni['ID'].isin(adni_prev_np['ID'])]
adni_np_rid = [int(fn.split('_')[3]) for fn in list(adni_np['ID'])]
adni_np['RID'] = adni_np_rid
adni_np_orig_matched = adni_np_orig[adni_np_orig['RID'].isin(adni_np_rid)]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  adni_np['RID'] = adni_np_rid


In [32]:
merged_data = pd.merge(adni_np, adni_np_orig_matched, on=["RID"], how='left')
merged_data.to_csv('/home/skowshik/ADRD_repo/pipeline_v1_main/adrd_tool/data/train_vld_test_split_updated/adni_neuropath.csv', index=False)