In [6]:
import pandas as pd
import numpy as np

In [9]:
## load data

path = '/gpfs/commons/groups/knowles_lab/data/ADSP_reguloML/ADSP_vcf/phenotype_file/release_36K/'
manifest = pd.read_csv(path + "SampleManifest_DS_2022.08.18_ALL.txt", sep='\t') ## the one with mapping info
ADNI = pd.read_csv(path + "ADNIPhenotypes_DS_2022.08.18_ALL.txt", sep='\t')
family_based = pd.read_csv(path + "ADSPFamilyBasedPhenotypes_DS_2022.08.18_ALL.txt", sep='\t',encoding='cp1252')
case_control = pd.read_csv(path + "ADSPCaseControlPhenotypes_DS_2022.08.18_ALL.txt", sep='\t')
PSPCBD = pd.read_csv(path + "PSPCBDPhenotypes_DS_2022.08.18_ALL.txt", sep='\t')
SUBJ_drop = pd.read_excel(path + 'ng00067_subject_droplist_2022.08.18.xlsx', engine = 'openpyxl')

In [19]:
## SUBJID and SampleID are not 1-1 match

print(len(manifest.SUBJID.unique()))
print(len(manifest.SampleID.unique()))

51992
56863


In [40]:
print(ADNI.shape, family_based.shape, case_control.shape, PSPCBD.shape)

(1566, 31) (12858, 27) (45375, 31) (2803, 17)


In [43]:
print(ADNI.columns)
print(family_based.columns)
print(case_control.columns)
print(PSPCBD.columns)

Index(['SUBJID', 'Sex', 'PrevAD', 'IncAD', 'Age_current', 'Age_baseline',
       'Age_MCI_onset', 'Age_AD_onset', 'APOE_reported', 'APOE_WGS', 'AUTOPSY',
       'Braak', 'Race', 'Ethnicity', 'AD_last_visit', 'MCI_last_visit',
       'Duplicate_SUBJID', 'Comments', 'Latest_Update_Version', 'Base_PrevAD',
       'Base_IncAD', 'Base_Age', 'Base_AUTOPSY', 'Base_Braak', 'Base_AD',
       'Base_MCI', 'Base_Version', 'Update_baseline', 'Update_latest',
       'Update_Diagnosis', 'Correction'],
      dtype='object')
Index(['SUBJID', 'FamID', 'Father', 'Mother', 'Sex', 'AD', 'Age',
       'Age_baseline', 'APOE_reported', 'APOE_WGS', 'AUTOPSY', 'Braak', 'Race',
       'Ethnicity', 'FamGrp', 'Comments', 'Duplicate_SUBJID',
       'Latest_Update_Version', 'Base_AD', 'Base_Age', 'Base_AUTOPSY',
       'Base_Braak', 'Base_Version', 'Update_baseline', 'Update_latest',
       'Update_ADstatus', 'Correction'],
      dtype='object')
Index(['SUBJID', 'Sex', 'PrevAD', 'IncAD', 'Age', 'Age_baseline',
     

In [124]:
ADNI_processed = ADNI.loc[:,['SUBJID', 'Sex', 'Age_current', 'Age_AD_onset', 'Age_baseline','APOE_reported','Race', 'AD_last_visit']]

In [66]:
## There are two cases without age of onset

print(ADNI_processed[ADNI_processed.AD_last_visit == 1].Age_AD_onset.isnull().sum())
print(ADNI_processed[ADNI_processed.AD_last_visit == 1][ADNI_processed[ADNI_processed.AD_last_visit == 1].Age_AD_onset.isnull()])
print(ADNI_processed[ADNI_processed.AD_last_visit == 1].Age_baseline.isnull().sum())

## all controls have current age and the age they first entered the study!
ADNI_processed[ADNI_processed.AD_last_visit == 0].Age_current.isnull().sum()
ADNI_processed[ADNI_processed.AD_last_visit == 0].Age_baseline.isnull().sum()

2
              SUBJID  Sex Age_current Age_AD_onset Age_baseline  \
871  G-ADNI-AN000064    0          66          NaN           61   
975  G-ADNI-AN000170    0          76          NaN           73   

     APOE_reported  Race  AD_last_visit  
871             33   5.0              1  
975             34   5.0              1  


0

In [130]:
## ADNI
ADNI_processed = ADNI.loc[:,['SUBJID', 'Sex', 'Age_current', 'Age_AD_onset', 'Age_baseline','APOE_reported','Race', 'AD_last_visit']]
## fill the age of controls
ADNI_processed['Age_AD_onset'] = ADNI_processed['Age_AD_onset'].fillna(ADNI_processed['Age_current'])
ADNI_processed.rename(columns = {'Age_AD_onset':'Age', 'AD_last_visit':'Diagnosis'}, inplace = True)
ADNI_processed = ADNI_processed.drop(columns=['Age_current']).dropna()

In [152]:
## family_based

family_based['Diagnosis'] = 'NaN'
control = family_based[(family_based['AD'] == 0)| (family_based['AD'] == 10)].copy()
case = family_based[(family_based['AD'] >=1) & (family_based['AD'] <=4)].copy()

control.loc[:,["Diagnosis"]] = 0
case.loc[:,["Diagnosis"]] = 1
family_based = pd.concat([control,case])
family_based_processed = family_based.loc[:, ["SUBJID", "Sex", "Diagnosis", "APOE_reported","Age","Race", "Age_baseline"]]
print(family_based_processed.shape)

(6624, 7)


In [153]:
family_based_processed.isna().sum()


SUBJID              0
Sex                 0
Diagnosis           0
APOE_reported    2520
Age              2011
Race             1262
Age_baseline     2821
dtype: int64

In [182]:
family_based_processed = family_based_processed.dropna(subset = ['Age', 'APOE_reported', 'Race'])

In [172]:
## case_control

case_control_processed = case_control.loc[:,["SUBJID", "Sex", "AD", "APOE_reported","Age","Race", "Age_baseline"]]
case_control_processed = case_control_processed.rename(columns = {'AD':'Diagnosis'})

print(case_control_processed.isna().sum())
case_control_processed = case_control_processed.dropna(subset = ['Diagnosis','Age', 'APOE_reported', 'Race'])

SUBJID               0
Sex                  1
Diagnosis         5235
APOE_reported     5776
Age               4833
Race              2728
Age_baseline     18367
dtype: int64


In [171]:
print(case_control_processed.shape, case_control.shape)

(32644, 7) (45375, 31)


In [173]:
PSPCBD

Unnamed: 0,SUBJID,Sex,Diagnosis,AgeOnset,AgeDeath,Race,APOE_WGS,Duplicate_SUBJID,Comments,Latest_Update_Version,Base_Diagnosis,Base_Version,Update_Baseline,Update_Latest,Update_Diagnosis,Correction,Consent
0,P-ABN-CBD6614,1,2,67,73,5.0,,0,,ng00067.v2,2,ng00067.v2,0,0,0,0,GRU-IRB-PUB
1,P-ABN-PSP5079,1,1,71,76,,33.0,0,,ng00067.v5,1,ng00067.v5,0,1,0,0,GRU-IRB-PUB
2,P-ABN-PSP5080,1,1,68,73,,34.0,0,,ng00067.v5,1,ng00067.v5,0,1,0,0,GRU-IRB-PUB
3,P-ABN-PSP5081,0,1,72,75,,33.0,0,,ng00067.v5,1,ng00067.v5,0,1,0,0,GRU-IRB-PUB
4,P-ABN-PSP5082,1,1,68,73,,24.0,0,,ng00067.v5,1,ng00067.v5,0,1,0,0,GRU-IRB-PUB
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2798,P-WZB-PSP10181,0,1,74,82,5.0,33.0,0,,ng00067.v5,1,ng00067.v5,0,1,0,0,GRU-IRB-PUB
2799,P-WZB-PSP10182,0,1,79,76,5.0,33.0,0,,ng00067.v5,1,ng00067.v5,0,1,0,0,GRU-IRB-PUB
2800,P-WZB-PSP5049,0,1,63,70,5.0,33.0,0,,ng00067.v5,1,ng00067.v5,0,1,0,0,GRU-IRB-PUB
2801,P-WZB-PSP5050,0,1,56,62,5.0,34.0,0,,ng00067.v5,1,ng00067.v5,0,1,0,0,GRU-IRB-PUB


In [187]:
ADNI_processed["source"] = 'ADNI'
family_based_processed["source"] = 'Family_based'
case_control_processed["source"] = 'case_control'

In [188]:
pheno = pd.concat([ADNI_processed, family_based_processed, case_control_processed])

In [189]:
pheno.isna().sum()

SUBJID               0
Sex                  0
Age                  0
Age_baseline     14633
APOE_reported        0
Race                 0
Diagnosis            0
source               0
dtype: int64

In [197]:
pheno[pheno.SUBJID.duplicated()].shape

(112, 8)

In [203]:
duplicated_ID = pheno[pheno.SUBJID.duplicated()].SUBJID

In [205]:
len(duplicated_ID)

112

In [220]:
duplicates = pheno[pheno.SUBJID.isin(duplicated_ID)].sort_values('SUBJID')


In [242]:
duplicates.drop(columns = 'source').drop_duplicates().SUBJID.duplicated(keep=False)

2552     False
16133    False
2704     False
16135    False
2972     False
         ...  
11292    False
31383     True
31386    False
12243    False
11583    False
Name: SUBJID, Length: 144, dtype: bool

In [244]:
check

In [219]:
## The duplicates are between family based and case control, two as a pair
print(min(pheno[pheno.SUBJID.isin(duplicated_ID)].value_counts('SUBJID')))
print(max(pheno[pheno.SUBJID.isin(duplicated_ID)].value_counts('SUBJID')))

2
2


In [202]:
duplicated[duplicated.SUBJID=='A-CUHS-CU000194']

Unnamed: 0,SUBJID,Sex,Age,Age_baseline,APOE_reported,Race,Diagnosis,source
16134,A-CUHS-CU000194,1.0,73,65,33.0,6.0,0.0,case_control
