In [2]:
import scanpy as sc
import pandas as pd
import anndata as ad
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os

In [4]:
base_path = '/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/'
cell_types = ['Astro', 'Endo', 'Epend', 'Gaba', 'Glut', 'Micro', 'Olig', 'Opc', 'OtherImmune']
adatas = []

for cell_type in cell_types:
    file_path = os.path.join(base_path, f'counts_{cell_type}_filtered.h5ad')
    adata = sc.read_h5ad(file_path)
    adata.obs['cell_type'] = cell_type
    adatas.append(adata)

combined = ad.concat(adatas, axis=0)
combined.write(os.path.join(base_path, 'diverse_cohort.h5ad'))

In [5]:
combined

AnnData object with n_obs × n_vars = 1914581 × 22009
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'cell_type'

In [7]:
base_path = '/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/'
cell_types = ['Astro', 'Endo', 'Epend', 'Gaba', 'Glut', 'Micro', 'Olig', 'Opc', 'OtherImmune']
adatas = []

# Load the first file and store its var_names for reference
ref_adata = sc.read_h5ad(os.path.join(base_path, f'counts_{cell_types[0]}_filtered.h5ad'))
ref_var = ref_adata.var_names.copy()
adatas.append(ref_adata)

# Load and align the rest
for cell_type in cell_types[1:]:
    file_path = os.path.join(base_path, f'counts_{cell_type}_filtered.h5ad')
    adata = sc.read_h5ad(file_path)
    adata.obs['cell_type'] = cell_type

    # Ensure .var_names match exactly (and in the same order)
    adata = adata[:, ref_var]
    adatas.append(adata)

# Now concatenate
combined = ad.concat(adatas, axis=0, join="inner", merge="same")

In [12]:
combined.var_names = combined.var.features

In [14]:
combined.var = combined.var.drop(columns = '_index')

In [15]:
combined.write(os.path.join(base_path, 'diverse_cohort.h5ad'))

In [8]:
combined

AnnData object with n_obs × n_vars = 1914581 × 22009
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA'
    var: '_index', 'features'

## process metadata

In [44]:
cell_type = pd.read_csv('/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/rosmap_diversity_snmultiome_celltypes_metadata_20230410.csv')
individual = pd.read_csv('/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/AMP-AD_DiverseCohorts_individual_metadata.csv')
mapping = pd.read_csv('/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/AMP-AD_DiverseCohorts_Rush_projid_mapping.csv')

  cell_type = pd.read_csv('/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/rosmap_diversity_snmultiome_celltypes_metadata_20230410.csv')


In [45]:
merged_df = pd.merge(cell_type, individual, on='individualID')
merged_df = pd.merge(merged_df, mapping, on='individualID', how = 'left')

In [46]:
merged_df.head()

Unnamed: 0.1,Unnamed: 0,individualID,specimenID,region,batch,barcode,nCount_RNA,nFeature_RNA,mt_percent,broadcellclass,...,Braak,mayoDx,amyAny,bScore,reag,ADoutcome,derivedOutcomeBasedOnMayoDx,clinicalMetadataSource,individualID_AMPAD_1.0,projid
0,1,R5534945,R5534945_AC,caudate nucleus,VR001,AAACAGCCACCAGCAT-1,10849,3956,0.156696,Gaba,...,Stage IV,not applicable,0,Braak Stage III-IV,Low Likelihood,Other,False,AMP-AD_DiverseCohorts,,57342872.0
1,2,R7200691,R7200691_ST,superior temporal gyrus,VR001,AAACAGCCACTTACAG-1,5656,2745,0.106082,Gaba,...,Stage III,not applicable,1,Braak Stage III-IV,Intermediate Likelihood,Other,False,AMP-AD_DiverseCohorts,,71514280.0
2,3,R5534945,R5534945_AC,caudate nucleus,VR001,AAACAGCCAGGCATGA-1,19162,4987,0.031312,Gaba,...,Stage IV,not applicable,0,Braak Stage III-IV,Low Likelihood,Other,False,AMP-AD_DiverseCohorts,,57342872.0
3,4,R7200691,R7200691_ST,superior temporal gyrus,VR001,AAACATGCAGGCGATA-1,646,537,0.773994,Gaba,...,Stage III,not applicable,1,Braak Stage III-IV,Intermediate Likelihood,Other,False,AMP-AD_DiverseCohorts,,71514280.0
4,5,R5534945,R5534945_AC,caudate nucleus,VR001,AAACATGCAGGCTGTT-1,5388,2613,0.148478,Gaba,...,Stage IV,not applicable,0,Braak Stage III-IV,Low Likelihood,Other,False,AMP-AD_DiverseCohorts,,57342872.0


In [47]:
len(merged_df.individualID.unique())

167

In [48]:
merged_df['batch_barcode'] = merged_df.batch + '_' + merged_df.barcode

In [52]:
merged_df.drop(columns = 'Unnamed: 0', inplace = True)

## add metadata

In [5]:
combined = sc.read_h5ad('/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/counts_Gaba_filtered.h5ad')

In [6]:
combined.var

Unnamed: 0,_index,features
0,MIR1302-2HG,MIR1302-2HG
1,FAM138A,FAM138A
2,OR4F5,OR4F5
3,OR4F29,OR4F29
4,OR4F16,OR4F16
...,...,...
22004,DIP2A,DIP2A
22005,S100B,S100B
22006,PRMT2,PRMT2
22007,MAFIP,MAFIP


In [31]:
combined

AnnData object with n_obs × n_vars = 1914581 × 22009
    obs: 'orig.ident', 'nCount_RNA', 'nFeature_RNA', 'cell_type'

In [33]:
combined.X.data

array([1., 2., 8., ..., 1., 1., 1.])

In [32]:
combined.obs

Unnamed: 0,orig.ident,nCount_RNA,nFeature_RNA,cell_type
VR001_AAACAGCCAACTGGGA-1,0,4964.0,2279,Astro
VR001_AAACAGCCAGGCCATT-1,0,7364.0,2880,Astro
VR001_AAACAGCCATTAGCCA-1,0,6940.0,2704,Astro
VR001_AAACCGAAGCACCACA-1,0,4471.0,2152,Astro
VR001_AAACCGAAGCCTGTTC-1,0,4052.0,1957,Astro
...,...,...,...,...
VR134_TTTCCACCAGCTTAGC-1,0,1268.0,838,OtherImmune
VR134_TTTGCGACAGGCTTCG-1,0,6597.0,2322,OtherImmune
VR134_TTTGCGGAGGCAACAA-1,0,1040.0,793,OtherImmune
VR134_TTTGTCCCAAACTGCC-1,0,5017.0,1885,OtherImmune


In [53]:
metadata = pd.merge(combined.obs, merged_df, left_index = True, right_on = 'batch_barcode')

In [57]:
metadata.set_index('batch_barcode', inplace = True)

In [66]:
metadata.rename_axis(None, axis=0, inplace = True)

In [67]:
combined.obs = metadata

In [68]:
combined

AnnData object with n_obs × n_vars = 1914581 × 22009
    obs: 'orig.ident', 'nCount_RNA_x', 'nFeature_RNA_x', 'cell_type', 'individualID', 'specimenID', 'region', 'batch', 'barcode', 'nCount_RNA_y', 'nFeature_RNA_y', 'mt_percent', 'broadcellclass', 'subcellclass', 'cogdx_rush', 'braak_rush', 'cerad_rush', 'sex_x', 'allen_m1_azimuth_subclass', 'allen_m1_azimuth_subclass.score', 'allen_m1_azimuth_mapping.score', 'dataContributionGroup', 'cohort', 'sex_y', 'race', 'isHispanic', 'ageDeath', 'PMI', 'apoeGenotype', 'amyThal', 'amyA', 'amyCerad', 'Braak', 'mayoDx', 'amyAny', 'bScore', 'reag', 'ADoutcome', 'derivedOutcomeBasedOnMayoDx', 'clinicalMetadataSource', 'individualID_AMPAD_1.0', 'projid'

In [70]:
combined.obs = combined.obs.drop(columns = 'individualID_AMPAD_1.0')

In [71]:
combined.write_h5ad('/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/diverse_cohort.h5ad')

## check overlap with rosmap

In [2]:
adata = sc.read_h5ad('/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/diverse_cohort/diverse_cohort.h5ad')

In [3]:
adata

AnnData object with n_obs × n_vars = 1914581 × 22009
    obs: 'orig.ident', 'nCount_RNA_x', 'nFeature_RNA_x', 'cell_type', 'individualID', 'specimenID', 'region', 'batch', 'barcode', 'nCount_RNA_y', 'nFeature_RNA_y', 'mt_percent', 'broadcellclass', 'subcellclass', 'cogdx_rush', 'braak_rush', 'cerad_rush', 'sex_x', 'allen_m1_azimuth_subclass', 'allen_m1_azimuth_subclass.score', 'allen_m1_azimuth_mapping.score', 'dataContributionGroup', 'cohort', 'sex_y', 'race', 'isHispanic', 'ageDeath', 'PMI', 'apoeGenotype', 'amyThal', 'amyA', 'amyCerad', 'Braak', 'mayoDx', 'amyAny', 'bScore', 'reag', 'ADoutcome', 'derivedOutcomeBasedOnMayoDx', 'clinicalMetadataSource', 'projid'

In [22]:
adata = adata[~adata.obs.projid.isna()]

In [5]:
clinical = pd.read_csv('/lustre/groups/ml01/projects/2024_microglia_zihe.zheng/rosmap/metadata/ROSMAP_clinical.csv')

In [23]:
adata.obs['projid'].value_counts()

projid
37744074.0    18855
71514280.0    18534
10459674.0    18324
91684359.0    18167
54396902.0    18016
              ...  
20934591.0     3378
34748028.0     3152
49295635.0     2528
22202022.0     1887
25823625.0     1448
Name: count, Length: 90, dtype: int64

In [24]:
adata.obs['projid'].unique()

array([57342872., 10100150., 71514280., 63105434., 45307098., 33334378.,
       35913453., 29903274., 69982407., 86464152., 52692199., 45635549.,
        4871863., 34748028., 42560037., 10574690., 10275252., 70636113.,
       54056119., 20237131., 28363718., 86712535., 11609672., 54396902.,
       37744074.,  2602916., 37125649., 74024679., 49295635., 82441989.,
       10459674., 56607358., 49040290., 21131938., 25823625., 62367972.,
       74718818., 10288185., 37065652., 20917568., 22101716., 42589954.,
       87038802., 11150132., 95442315., 20046260., 19415550., 76496027.,
       53315923., 20849279., 20510687., 87101984., 62656124., 90613980.,
       81313710., 99126836., 68879403., 22200264., 11318248., 20912775.,
       42592599., 21196129., 33137549., 74564064., 11648421., 48111294.,
        3817943., 22100413., 57291516.,  3052480., 62578487., 47354365.,
       52225174., 16741095., 14266701., 18500138., 22202022., 91684359.,
       87567672., 22202446., 60278494.,  1026348., 

In [20]:
len(set(clinical.projid.unique()))

3584

In [26]:
len(set(adata.obs['projid'].unique()).intersection(clinical.projid.unique()))

51