### Create the metadata for the ArchR object by joining the SnapATAC2 metadata with the fragment file names

In [1]:
import pandas as pd
import os 
import scanpy as sc

In [2]:
fragment_files = pd.DataFrame(os.listdir("../donor_fragment_files/"), columns = ["fragment_file"])
len(fragment_files)

130

In [3]:
fragment_files['sample_id'] = fragment_files['fragment_file'].str.replace(".bed.gz", "")

In [4]:
fragment_files

Unnamed: 0,fragment_file,sample_id
0,ENCFF904ARJ.bed.gz,ENCFF904ARJ
1,ENCSR958FFE.bed.gz,ENCSR958FFE
2,ENCSR435CGY.bed.gz,ENCSR435CGY
3,10X_ATAC_CK389.bed.gz,10X_ATAC_CK389
4,ENCSR546WZB.bed.gz,ENCSR546WZB
...,...,...
125,GSM5495104_F19_v2.bed.gz,GSM5495104_F19_v2
126,ENCSR918VTH.bed.gz,ENCSR918VTH
127,ENCFF958UTA.bed.gz,ENCFF958UTA
128,ENCSR320QOP.bed.gz,ENCSR320QOP


In [5]:
%%time
# join with sample metadata
adata = sc.read_h5ad("../../07_final_ATAC.h5ad")

CPU times: user 4.7 s, sys: 32.5 s, total: 37.2 s
Wall time: 37.2 s


In [6]:
adata.obs.columns

Index(['ATAC_barcode', 'sample_id', 'leiden', 'donor_id', 'study',
       'age_status', 'age', 'sex', 'region', 'disease_binary', 'technology',
       'fragment_file', 'full_path', 'file', 'nfrag', 'tsse', 'cell_type',
       'tech_plus_study', 'age_group', 'decade', 'final_cell_type',
       'cell_or_nuclei', 'disease'],
      dtype='object')

In [7]:
adata_sample_metadata = adata.obs[["sample_id", "donor_id", "study", "age_status", "age", "age_group",
                                   "sex", "region", "disease_binary", "technology", "tech_plus_study"]].drop_duplicates().reset_index(drop=True)
adata_sample_metadata

Unnamed: 0,sample_id,donor_id,study,age_status,age,age_group,sex,region,disease_binary,technology,tech_plus_study
0,ENCSR556UHL,ENCODE v4 (Snyder):ENCSR556UHL,ENCODE v4 (Snyder),postnatal,61.0,old,male,LV,N,Multiome-v1,Multiome-v1_ENCODE v4 (Snyder)
1,ENCSR913OAS,ENCODE v4 (Snyder):ENCSR913OAS,ENCODE v4 (Snyder),postnatal,66.0,old,male,LV,N,Multiome-v1,Multiome-v1_ENCODE v4 (Snyder)
2,ENCSR495SMB,ENCODE v4 (Snyder):ENCSR455MGH,ENCODE v4 (Snyder),postnatal,32.0,young,female,LV,N,Multiome-v1,Multiome-v1_ENCODE v4 (Snyder)
3,ENCSR080TZR,ENCODE v4 (Snyder):ENCSR080TZR,ENCODE v4 (Snyder),postnatal,69.0,old,male,LV,N,Multiome-v1,Multiome-v1_ENCODE v4 (Snyder)
4,ENCSR450VTB,ENCODE v4 (Snyder):ENCSR008CVR,ENCODE v4 (Snyder),postnatal,67.0,old,male,LV,N,Multiome-v1,Multiome-v1_ENCODE v4 (Snyder)
...,...,...,...,...,...,...,...,...,...,...,...
125,K1727-LV-2,Penn:K1727,Penn,postnatal,40.0,middle,male,LV,N,10X_ATAC,10X_ATAC_Penn
126,ENCSR642ZIC,ENCODE v4 (Snyder):ENCSR489URW,ENCODE v4 (Snyder),postnatal,66.0,old,male,LV,N,Multiome-v1,Multiome-v1_ENCODE v4 (Snyder)
127,ENCSR003DSI,ENCODE v4 (Snyder):ENCSR056QLB,ENCODE v4 (Snyder),postnatal,50.0,middle,male,LV,N,Multiome-v1,Multiome-v1_ENCODE v4 (Snyder)
128,ENCFF851VTB,ENCODE v4 (Snyder):ENCFF775ANN,ENCODE v4 (Snyder),fetal,15.0,fetal,female,WH,N,Multiome-v1,Multiome-v1_ENCODE v4 (Snyder)


In [8]:
merged_df = fragment_files.merge(adata_sample_metadata, on = "sample_id")
merged_df.shape

(130, 12)

In [9]:
merged_df.to_csv("01_ArchR_sample_metadata.csv")