### Reformat original metadata

- `00_original_metadata.csv` contains metadata information from individually examining each study. We will reformat this to make sure that its metadata is concordant with the snRNA-seq aggregated analysis, especially with Multiome data

In [1]:
import pandas as pd
import numpy as np
import scanpy as sc
from collections import Counter
import os

In [2]:
metadata_df = pd.read_csv("00_original_metadata.csv", index_col = 0)
metadata_df = metadata_df[["sample_id", "donor_id", "study", "age_status", "age", 
                           "sex", "region", "disease_binary", "technology", "full_path"]].reset_index(drop=True)
metadata_df.head()

Unnamed: 0,sample_id,donor_id,study,age_status,age,sex,region,disease_binary,technology,full_path
0,10X_ATAC_CK166,P1,Kuppe,Postnatal,44.0,male,LV,non-diseased,10X_ATAC,10X_ATAC_CK166.tsv.gz
1,10X_ATAC_CK337,P7,Kuppe,Postnatal,55.0,female,LV,non-diseased,10X_ATAC,10X_ATAC_CK337.tsv.gz
2,10X_ATAC_CK338,P8,Kuppe,Postnatal,44.0,male,LV,non-diseased,10X_ATAC,10X_ATAC_CK338.tsv.gz
3,10X_ATAC_CK353,P17,Kuppe,Postnatal,61.0,male,LV,non-diseased,10X_ATAC,10X_ATAC_CK353.tsv.gz
4,10X_ATAC_CK381,P17,Kuppe,Postnatal,61.0,male,LV,non-diseased,10X_ATAC,10X_ATAC_CK381.tsv.gz


In [3]:
metadata_df['fragment_file'] = metadata_df['full_path'].str.split("/").str[-1]
metadata_df = metadata_df.drop(columns = "full_path")
metadata_df['full_path'] = metadata_df['fragment_file']

In [4]:
Counter(metadata_df.study)

Counter({'ENCODE': 72, 'Kuppe': 28, 'Penn': 22, 'Kanemaru': 7, 'Ameen': 3})

#### Add the directory to fragment file name to get the full path

In [5]:
study_to_base_path = {
    "Kuppe": "/mnt/data1/william/human_heart_project/Final_manuscript_analysis/ATAC/external_datasets/Kuppe/",
    "ENCODE": "/mnt/data1/william/human_heart_project/Final_manuscript_analysis/ATAC/external_datasets/ENCODE/",
    "Penn": "/mnt/data1/william/human_heart_project/Final_manuscript_analysis/ATAC/internal_datasets/fragment_files/",
    "Kanemaru": "/mnt/data1/william/human_heart_project/Final_manuscript_analysis/ATAC/external_datasets/Kanemaru/",
    "Ameen": "/mnt/data1/william/human_heart_project/Final_manuscript_analysis/ATAC/external_datasets/Ameen/",
}

In [6]:
for study, base_path in study_to_base_path.items():
    mask = metadata_df['study'] == study  # Create a mask for the current study
    metadata_df.loc[mask, 'full_path'] = base_path + metadata_df.loc[mask, 'full_path']

In [7]:
# check that file exists
metadata_df['file_exists'] = metadata_df['full_path'].apply(os.path.exists)

print(metadata_df[metadata_df['file_exists'] != True])

Empty DataFrame
Columns: [sample_id, donor_id, study, age_status, age, sex, region, disease_binary, technology, fragment_file, full_path, file_exists]
Index: []


#### Check the metadata and make it consistent with snRNA-seq

In [8]:
Counter(metadata_df.region)

Counter({'LV': 89,
         'WH': 14,
         'FZ': 5,
         'GT/IZ': 5,
         'RZ': 4,
         'RZ/BZ': 3,
         'OFT': 3,
         'LRV': 3,
         'FZ/GT': 2,
         'IZ': 2,
         'RZ/FZ': 1,
         'IZ/BZ': 1})

In [9]:
Counter(metadata_df.sample_id)

Counter({'10X_ATAC_CK166': 1,
         '10X_ATAC_CK337': 1,
         '10X_ATAC_CK338': 1,
         '10X_ATAC_CK353': 1,
         '10X_ATAC_CK381': 1,
         'K1485-run-1': 1,
         'K1485-run-2': 1,
         'K1488-run-1': 1,
         'K1488-run-2': 1,
         'K1584-run-1': 1,
         'K1584-run-2': 1,
         'K1647-run-1': 1,
         'K1647-run-2': 1,
         'K1727-LV-1': 1,
         'K1727-LV-2': 1,
         'NDRI15755_10k': 1,
         'NDRI15755_5k': 1,
         'HCAHeart9508627_HCAHeart9508819': 1,
         'HCAHeart9508629_HCAHeart9508821': 1,
         'HCAHeart9845431_HCAHeart9917173': 1,
         'HCAHeartST10773165_HCAHeartST10781062': 1,
         'HCAHeartST10773166_HCAHeartST10781063': 1,
         'HCAHeartST11064574_HCAHeartST11023239': 1,
         'HCAHeartST11064575_HCAHeartST11023240': 1,
         'ENCSR506ROZ': 1,
         'ENCSR088ZOL': 1,
         'ENCSR318ERF': 1,
         'ENCSR233KYH': 1,
         'ENCSR785BSP': 1,
         'ENCSR912XZH': 1,
         '

In [10]:
Counter(metadata_df.donor_id)

Counter({'P9': 5,
         'AH1': 4,
         'Penn_F2': 4,
         'P17': 2,
         'K1485': 2,
         'K1488': 2,
         'K1584': 2,
         'K1647': 2,
         'K1727': 2,
         'ND15755': 2,
         'P20': 2,
         'P3': 2,
         'P2': 2,
         'P14': 2,
         'P15': 2,
         'Penn_F1': 2,
         'Penn_F5': 2,
         'P1': 1,
         'P7': 1,
         'P8': 1,
         'D3': 1,
         'D7': 1,
         'D8': 1,
         'ENCSR506ROZ': 1,
         'ENCSR088ZOL': 1,
         'ENCSR318ERF': 1,
         'ENCSR233KYH': 1,
         'ENCSR785BSP': 1,
         'ENCSR912XZH': 1,
         'ENCSR594WKI': 1,
         'ENCSR305KUV': 1,
         'ENCSR627IOJ': 1,
         'ENCSR161IHV': 1,
         'ENCSR642ZIC': 1,
         'ENCSR003DSI': 1,
         'ENCSR270CUT': 1,
         'ENCSR701JAT': 1,
         'ENCSR686FZG': 1,
         'ENCSR079TCC': 1,
         'ENCSR617IKC': 1,
         'ENCSR718YPN': 1,
         'ENCSR951AID': 1,
         'ENCSR101LQQ': 1,
      

In [11]:
Counter(metadata_df.age_status)

Counter({'Postnatal': 110, 'Fetal': 22})

In [12]:
Counter(metadata_df.disease_binary)

Counter({'non-diseased': 109, 'diseased': 23})

In [13]:
metadata_df.loc[metadata_df.disease_binary == "non-diseased", 'disease_binary'] = "N"
metadata_df.loc[metadata_df.disease_binary == "diseased", 'disease_binary'] = "Y"
Counter(metadata_df.disease_binary)

Counter({'N': 109, 'Y': 23})

In [14]:
Counter(metadata_df.sex)

Counter({'male': 90, 'female': 42})

### Load in the RNA adata

In [15]:
%%time
RNA_adata = sc.read_h5ad("../../RNA/aggregated_analysis/03_combined_all_snRNA.h5ad")
RNA_adata

CPU times: user 14.9 s, sys: 1min 38s, total: 1min 53s
Wall time: 1min 53s


AnnData object with n_obs × n_vars = 2841357 × 13640
    obs: 'age', 'donor_id', 'sex', 'region', 'cell_type', 'disease', 'consistent_cell_type', 'study', 'technology', 'cell_or_nuclei', 'barcode', 'sample_id', 'age_status', 'tech_plus_study', 'disease_binary', 'age_group'
    layers: 'count', 'counts'

In [16]:
RNA_donors = set(RNA_adata.obs.donor_id)
RNA_donors

{'Chaffin 2022:P1290',
 'Chaffin 2022:P1300',
 'Chaffin 2022:P1304',
 'Chaffin 2022:P1358',
 'Chaffin 2022:P1371',
 'Chaffin 2022:P1422',
 'Chaffin 2022:P1425',
 'Chaffin 2022:P1430',
 'Chaffin 2022:P1437',
 'Chaffin 2022:P1447',
 'Chaffin 2022:P1462',
 'Chaffin 2022:P1472',
 'Chaffin 2022:P1479',
 'Chaffin 2022:P1504',
 'Chaffin 2022:P1508',
 'Chaffin 2022:P1510',
 'Chaffin 2022:P1515',
 'Chaffin 2022:P1516',
 'Chaffin 2022:P1539',
 'Chaffin 2022:P1540',
 'Chaffin 2022:P1547',
 'Chaffin 2022:P1549',
 'Chaffin 2022:P1558',
 'Chaffin 2022:P1561',
 'Chaffin 2022:P1582',
 'Chaffin 2022:P1600',
 'Chaffin 2022:P1602',
 'Chaffin 2022:P1603',
 'Chaffin 2022:P1606',
 'Chaffin 2022:P1610',
 'Chaffin 2022:P1617',
 'Chaffin 2022:P1622',
 'Chaffin 2022:P1630',
 'Chaffin 2022:P1631',
 'Chaffin 2022:P1678',
 'Chaffin 2022:P1685',
 'Chaffin 2022:P1702',
 'Chaffin 2022:P1707',
 'Chaffin 2022:P1718',
 'Chaffin 2022:P1722',
 'Chaffin 2022:P1726',
 'Chaffin 2022:P1735',
 'ENCODE v4 (Snyder):ENCFF248EWR',

### The ATAC donor_id and studies need to be consistent.

#### For study, we will add the year of the study

In [17]:
Counter(metadata_df['study'])

Counter({'ENCODE': 72, 'Kuppe': 28, 'Penn': 22, 'Kanemaru': 7, 'Ameen': 3})

In [18]:
metadata_df['revised_study'] = metadata_df['study'].map(
    {
        "Kuppe": "Kuppe 2022",
        "Kanemaru": "Kanemaru 2023",
        "ENCODE": "ENCODE v4 (Snyder)",
        "Penn": "Penn",
        "Ameen": "Ameen 2022"
    }
)

metadata_df['study'] = metadata_df['revised_study']
metadata_df = metadata_df.drop(columns = ['revised_study'])

In [19]:
Counter(metadata_df['study'])

Counter({'ENCODE v4 (Snyder)': 72,
         'Kuppe 2022': 28,
         'Penn': 22,
         'Kanemaru 2023': 7,
         'Ameen 2022': 3})

#### Now, add the study to the donor_id in the ATAC metadata

In [20]:
metadata_df['donor_id'] = metadata_df['study'] + ":" + metadata_df['donor_id']

In [21]:
# examine intersection between RNA and ATAC 
set(metadata_df['donor_id']) & RNA_donors

{'Kuppe 2022:P1',
 'Kuppe 2022:P10',
 'Kuppe 2022:P11',
 'Kuppe 2022:P12',
 'Kuppe 2022:P14',
 'Kuppe 2022:P15',
 'Kuppe 2022:P17',
 'Kuppe 2022:P18',
 'Kuppe 2022:P19',
 'Kuppe 2022:P2',
 'Kuppe 2022:P20',
 'Kuppe 2022:P3',
 'Kuppe 2022:P4',
 'Kuppe 2022:P6',
 'Kuppe 2022:P7',
 'Kuppe 2022:P8',
 'Kuppe 2022:P9',
 'Penn:K1485',
 'Penn:K1488',
 'Penn:K1584',
 'Penn:K1647',
 'Penn:K1727',
 'Penn:ND15755',
 'Penn:Penn_F1',
 'Penn:Penn_F2'}

In [22]:
metadata_df[metadata_df['study']  == "Kanemaru 2023"]

Unnamed: 0,sample_id,donor_id,study,age_status,age,sex,region,disease_binary,technology,fragment_file,full_path,file_exists
17,HCAHeart9508627_HCAHeart9508819,Kanemaru 2023:D3,Kanemaru 2023,Postnatal,57.5,male,LV,N,10X_Multiome,HCAHeart9508627_HCAHeart9508819_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
18,HCAHeart9508629_HCAHeart9508821,Kanemaru 2023:D7,Kanemaru 2023,Postnatal,62.5,male,LV,N,10X_Multiome,HCAHeart9508629_HCAHeart9508821_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
19,HCAHeart9845431_HCAHeart9917173,Kanemaru 2023:D8,Kanemaru 2023,Postnatal,47.5,male,LV,N,10X_Multiome,HCAHeart9845431_HCAHeart9917173_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
20,HCAHeartST10773165_HCAHeartST10781062,Kanemaru 2023:AH1,Kanemaru 2023,Postnatal,47.5,female,LV,N,10X_Multiome,HCAHeartST10773165_HCAHeartST10781062_fragment...,/mnt/data1/william/human_heart_project/Final_m...,True
21,HCAHeartST10773166_HCAHeartST10781063,Kanemaru 2023:AH1,Kanemaru 2023,Postnatal,47.5,female,LV,N,10X_Multiome,HCAHeartST10773166_HCAHeartST10781063_fragment...,/mnt/data1/william/human_heart_project/Final_m...,True
22,HCAHeartST11064574_HCAHeartST11023239,Kanemaru 2023:AH1,Kanemaru 2023,Postnatal,47.5,female,LV,N,10X_Multiome,HCAHeartST11064574_HCAHeartST11023239_fragment...,/mnt/data1/william/human_heart_project/Final_m...,True
23,HCAHeartST11064575_HCAHeartST11023240,Kanemaru 2023:AH1,Kanemaru 2023,Postnatal,47.5,female,LV,N,10X_Multiome,HCAHeartST11064575_HCAHeartST11023240_fragment...,/mnt/data1/william/human_heart_project/Final_m...,True


#### The Kanemaru donor_ids need to be renamed to be consistent with the names in the snRNA-seq dataset
RNA-seq: 
- 'Kanemaru 2023:AH1-Nuclei_Multiome-v1'
- 'Kanemaru 2023:D3-Nuclei_Multiome-v1'
- 'Kanemaru 2023:D7-Nuclei_Multiome-v1'
- 'Kanemaru 2023:D8-Nuclei_Multiome-v1',

currently ATAC-seq is just: Kanemaru 2023:D3, Kanemaru 2023:D7, Kanemaru 2023:D8, Kanemaru 2023:AH1

In [23]:
metadata_df.loc[metadata_df['donor_id'] == "Kanemaru 2023:D3", 'donor_id'] = "Kanemaru 2023:D3-Nuclei_Multiome-v1"
metadata_df.loc[metadata_df['donor_id'] == "Kanemaru 2023:D7", 'donor_id'] = "Kanemaru 2023:D7-Nuclei_Multiome-v1"
metadata_df.loc[metadata_df['donor_id'] == "Kanemaru 2023:D8", 'donor_id'] = "Kanemaru 2023:D8-Nuclei_Multiome-v1"
metadata_df.loc[metadata_df['donor_id'] == "Kanemaru 2023:AH1", 'donor_id'] = "Kanemaru 2023:AH1-Nuclei_Multiome-v1"

In [24]:
# examine intersection between RNA and ATAC 
set(metadata_df['donor_id']) & RNA_donors

{'Kanemaru 2023:AH1-Nuclei_Multiome-v1',
 'Kanemaru 2023:D3-Nuclei_Multiome-v1',
 'Kanemaru 2023:D7-Nuclei_Multiome-v1',
 'Kanemaru 2023:D8-Nuclei_Multiome-v1',
 'Kuppe 2022:P1',
 'Kuppe 2022:P10',
 'Kuppe 2022:P11',
 'Kuppe 2022:P12',
 'Kuppe 2022:P14',
 'Kuppe 2022:P15',
 'Kuppe 2022:P17',
 'Kuppe 2022:P18',
 'Kuppe 2022:P19',
 'Kuppe 2022:P2',
 'Kuppe 2022:P20',
 'Kuppe 2022:P3',
 'Kuppe 2022:P4',
 'Kuppe 2022:P6',
 'Kuppe 2022:P7',
 'Kuppe 2022:P8',
 'Kuppe 2022:P9',
 'Penn:K1485',
 'Penn:K1488',
 'Penn:K1584',
 'Penn:K1647',
 'Penn:K1727',
 'Penn:ND15755',
 'Penn:Penn_F1',
 'Penn:Penn_F2'}

#### It will be easier later if we use the ENCODE mapping file to change the names of the ENCODE donor_ids to their corresponding RNA donor_id

In [25]:
ENCODE_mapping_df = pd.read_csv("multiome_mapping_files/ENCODE_ATAC_RNA_mapping.txt", delim_whitespace=True)
ENCODE_mapping_df.head()

Unnamed: 0,RNA_directory_name,Multiomic_series,ATAC_directory_name,age_status
0,ENCSR906MRL,ENCSR100UGC,ENCSR686FZG,Postnatal
1,ENCSR084XKX,ENCSR870POU,ENCSR320QOP,Postnatal
2,ENCSR352DXB,ENCSR573DJW,ENCSR161IHV,Postnatal
3,ENCSR237HWJ,ENCSR371YIY,ENCSR101LQQ,Postnatal
4,ENCSR485GOL,ENCSR323AMA,ENCSR912XZH,Postnatal


In [26]:
# add the study name so that the donor_ids can be mapped to each other
ENCODE_mapping_df['RNA_directory_name'] = "ENCODE v4 (Snyder):" + ENCODE_mapping_df['RNA_directory_name']
ENCODE_mapping_df['ATAC_directory_name'] = "ENCODE v4 (Snyder):" + ENCODE_mapping_df['ATAC_directory_name']

Now, change the ATAC metadata_df's donor_id to the RNA_directory_name

In [27]:
merged_metadata_df = metadata_df.merge(
    ENCODE_mapping_df[['ATAC_directory_name', 'RNA_directory_name']],  
    left_on='donor_id',                                         
    right_on='ATAC_directory_name',                             
    how='left'                                                  
)

# update donor_id with 'RNA_directory_name' if 'RNA_directory_name' is not NaN
merged_metadata_df['donor_id'] = merged_metadata_df['RNA_directory_name'].where(
    merged_metadata_df['RNA_directory_name'].notna(),  
    merged_metadata_df['donor_id']                     
)

# drop merged df columns no longer needed
merged_metadata_df = merged_metadata_df.drop(columns = ["ATAC_directory_name", "RNA_directory_name"])

In [28]:
# now examine the intersection
set(merged_metadata_df['donor_id']) & RNA_donors

{'ENCODE v4 (Snyder):ENCFF248EWR',
 'ENCODE v4 (Snyder):ENCFF684YRB',
 'ENCODE v4 (Snyder):ENCFF727JRO',
 'ENCODE v4 (Snyder):ENCFF775ANN',
 'ENCODE v4 (Snyder):ENCFF776DQR',
 'ENCODE v4 (Snyder):ENCFF802AQC',
 'ENCODE v4 (Snyder):ENCFF805YRY',
 'ENCODE v4 (Snyder):ENCFF849ALE',
 'ENCODE v4 (Snyder):ENCSR002SMQ',
 'ENCODE v4 (Snyder):ENCSR008CVR',
 'ENCODE v4 (Snyder):ENCSR012APQ',
 'ENCODE v4 (Snyder):ENCSR056QLB',
 'ENCODE v4 (Snyder):ENCSR067BOK',
 'ENCODE v4 (Snyder):ENCSR076ZLE',
 'ENCODE v4 (Snyder):ENCSR084XKX',
 'ENCODE v4 (Snyder):ENCSR085XEW',
 'ENCODE v4 (Snyder):ENCSR093GXF',
 'ENCODE v4 (Snyder):ENCSR138JCM',
 'ENCODE v4 (Snyder):ENCSR157FDD',
 'ENCODE v4 (Snyder):ENCSR175TRJ',
 'ENCODE v4 (Snyder):ENCSR176WWW',
 'ENCODE v4 (Snyder):ENCSR190TRK',
 'ENCODE v4 (Snyder):ENCSR203YOV',
 'ENCODE v4 (Snyder):ENCSR204RHR',
 'ENCODE v4 (Snyder):ENCSR231FNL',
 'ENCODE v4 (Snyder):ENCSR237HWJ',
 'ENCODE v4 (Snyder):ENCSR259VOY',
 'ENCODE v4 (Snyder):ENCSR273JWD',
 'ENCODE v4 (Snyder)

In [29]:
set(merged_metadata_df['donor_id']) - RNA_donors

{'Ameen 2022:GSM5495102_F6_v2',
 'Ameen 2022:GSM5495103_F8_v2',
 'Ameen 2022:GSM5495104_F19_v2',
 'ENCODE v4 (Snyder):ENCFF069ATM',
 'ENCODE v4 (Snyder):ENCSR080TZR',
 'ENCODE v4 (Snyder):ENCSR277LDY',
 'ENCODE v4 (Snyder):ENCSR288RQI',
 'ENCODE v4 (Snyder):ENCSR409QLZ',
 'ENCODE v4 (Snyder):ENCSR556UHL',
 'ENCODE v4 (Snyder):ENCSR803QTN',
 'ENCODE v4 (Snyder):ENCSR886PUY',
 'ENCODE v4 (Snyder):ENCSR895LPX',
 'ENCODE v4 (Snyder):ENCSR913OAS',
 'Kuppe 2022:P5',
 'Penn:Penn_F3',
 'Penn:Penn_F4',
 'Penn:Penn_F5'}

### The non-intersecting donors are all accounted for:

- Ameen 2022 only performed snATAC-seq
- Penn F3 to F5 did not have corresponding RNA libraries
- ENCFF069ATM did not have very many high quality snRNA-seq nuclei after QC filtering
- There are 8 other ENCODE libraries for which only snATAC-seq was performed; they were NOT part of the Multiomics series
- ENCSR409QLZ is part of the multiomic series, but the Multiomics Series: ENCSR519BTE is accessed denied
- Kuppe P5 is the CAD,HF patient which we dropped as we cannot ascertain the patient's HF status

#### Based on this, we will drop the following patients that from the analysis as they lack a snRNA-seq correspondence or that snRNA-seq correspondence was removed: ENCODE v4 (Snyder):ENCSR409QLZ, Kuppe 2022:P5

#### Before doing this, indicate the number of donors per study 

In [30]:
merged_metadata_df[["study", "donor_id"]].drop_duplicates().groupby("study").count()

Unnamed: 0_level_0,donor_id
study,Unnamed: 1_level_1
Ameen 2022,3
ENCODE v4 (Snyder),72
Kanemaru 2023,4
Kuppe 2022,18
Penn,11


Remove those 2 samples

In [31]:
remove_samples = ["ENCODE v4 (Snyder):ENCSR409QLZ", "Kuppe 2022:P5"]

filt_metadata_df = merged_metadata_df[~merged_metadata_df.donor_id.isin(remove_samples)].reset_index(drop=True).copy()

# drop the file_exists column
filt_metadata_df = filt_metadata_df.drop(columns = ["file_exists"])

In [32]:
len(set(filt_metadata_df.donor_id) & RNA_donors)

91

In [33]:
len(set(filt_metadata_df.donor_id) - RNA_donors)

15

In [34]:
len(RNA_donors - set(filt_metadata_df.donor_id))

200

In [35]:
merged_metadata_df[["study", "technology", "donor_id"]].drop_duplicates().groupby(["study", "technology"]).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,donor_id
study,technology,Unnamed: 2_level_1
Ameen 2022,10X_ATAC,3
ENCODE v4 (Snyder),10X_ATAC,8
ENCODE v4 (Snyder),10X_Multiome,64
Kanemaru 2023,10X_Multiome,4
Kuppe 2022,10X_ATAC,18
Penn,10X_ATAC,11


#### Therefore, there are 200 donors for which there is only RNA, 91 for which there are both RNA + ATAC, and 15 for which there are only ATAC

In [36]:
#RNA_donors - set(filt_metadata_df.donor_id)

### Finally, inspect each study's metadata carefully one more time to confirm information

In [37]:
merged_metadata_df[merged_metadata_df['study'] == "Penn"].sort_values(by = "donor_id")

Unnamed: 0,sample_id,donor_id,study,age_status,age,sex,region,disease_binary,technology,fragment_file,full_path,file_exists
5,K1485-run-1,Penn:K1485,Penn,Postnatal,78.0,male,LV,N,10X_ATAC,LV-K1485-run-1_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
6,K1485-run-2,Penn:K1485,Penn,Postnatal,78.0,male,LV,N,10X_ATAC,LV-K1485-run-2_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
7,K1488-run-1,Penn:K1488,Penn,Postnatal,81.0,male,LV,N,10X_ATAC,LV-K1488-run-1_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
8,K1488-run-2,Penn:K1488,Penn,Postnatal,81.0,male,LV,N,10X_ATAC,LV-K1488-run-2_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
9,K1584-run-1,Penn:K1584,Penn,Postnatal,22.0,male,LV,N,10X_ATAC,LV-K1584-run-1_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
10,K1584-run-2,Penn:K1584,Penn,Postnatal,22.0,male,LV,N,10X_ATAC,LV-K1584-run-1_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
11,K1647-run-1,Penn:K1647,Penn,Postnatal,34.0,male,LV,N,10X_ATAC,LV-K1647-run-1_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
12,K1647-run-2,Penn:K1647,Penn,Postnatal,34.0,male,LV,N,10X_ATAC,LV-K1647-run-2_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
13,K1727-LV-1,Penn:K1727,Penn,Postnatal,40.0,male,LV,N,10X_ATAC,K1727-LV-1_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
14,K1727-LV-2,Penn:K1727,Penn,Postnatal,40.0,male,LV,N,10X_ATAC,K1727-LV-2_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True


In [38]:
merged_metadata_df[merged_metadata_df['study'] == "Kuppe 2022"].sort_values(by = "donor_id")

Unnamed: 0,sample_id,donor_id,study,age_status,age,sex,region,disease_binary,technology,fragment_file,full_path,file_exists
0,10X_ATAC_CK166,Kuppe 2022:P1,Kuppe 2022,Postnatal,44.0,male,LV,N,10X_ATAC,10X_ATAC_CK166.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
94,10X_ATAC_CK341,Kuppe 2022:P10,Kuppe 2022,Postnatal,38.0,male,IZ,Y,10X_ATAC,10X_ATAC_CK341.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
103,10X_ATAC_CK380,Kuppe 2022:P11,Kuppe 2022,Postnatal,60.0,female,RZ,Y,10X_ATAC,10X_ATAC_CK380.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
87,10X_ATAC_CK385,Kuppe 2022:P12,Kuppe 2022,Postnatal,40.0,male,RZ/BZ,Y,10X_ATAC,10X_ATAC_CK385.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
105,10X_ATAC_CK346,Kuppe 2022:P14,Kuppe 2022,Postnatal,59.0,male,FZ,Y,10X_ATAC,10X_ATAC_CK346.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
109,10X_ATAC_CK387,Kuppe 2022:P14,Kuppe 2022,Postnatal,59.0,male,FZ,Y,10X_ATAC,10X_ATAC_CK387.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
106,10X_ATAC_CK352,Kuppe 2022:P15,Kuppe 2022,Postnatal,43.0,female,IZ,Y,10X_ATAC,10X_ATAC_CK352.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
108,10X_ATAC_CK350,Kuppe 2022:P15,Kuppe 2022,Postnatal,43.0,female,GT/IZ,Y,10X_ATAC,10X_ATAC_CK350.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
4,10X_ATAC_CK381,Kuppe 2022:P17,Kuppe 2022,Postnatal,61.0,male,LV,N,10X_ATAC,10X_ATAC_CK381.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
3,10X_ATAC_CK353,Kuppe 2022:P17,Kuppe 2022,Postnatal,61.0,male,LV,N,10X_ATAC,10X_ATAC_CK353.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True


In [39]:
merged_metadata_df[merged_metadata_df['study'] == "Ameen 2022"].sort_values(by = "donor_id")

Unnamed: 0,sample_id,donor_id,study,age_status,age,sex,region,disease_binary,technology,fragment_file,full_path,file_exists
121,GSM5495102_F6_v2,Ameen 2022:GSM5495102_F6_v2,Ameen 2022,Fetal,6.0,female,WH,N,10X_ATAC,GSM5495102_F6_v2_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
120,GSM5495103_F8_v2,Ameen 2022:GSM5495103_F8_v2,Ameen 2022,Fetal,8.0,male,WH,N,10X_ATAC,GSM5495103_F8_v2_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
122,GSM5495104_F19_v2,Ameen 2022:GSM5495104_F19_v2,Ameen 2022,Fetal,19.0,female,WH,N,10X_ATAC,GSM5495104_F19_v2_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True


In [40]:
merged_metadata_df[merged_metadata_df['study'] == "Kanemaru 2023"].sort_values(by = "donor_id")

Unnamed: 0,sample_id,donor_id,study,age_status,age,sex,region,disease_binary,technology,fragment_file,full_path,file_exists
20,HCAHeartST10773165_HCAHeartST10781062,Kanemaru 2023:AH1-Nuclei_Multiome-v1,Kanemaru 2023,Postnatal,47.5,female,LV,N,10X_Multiome,HCAHeartST10773165_HCAHeartST10781062_fragment...,/mnt/data1/william/human_heart_project/Final_m...,True
21,HCAHeartST10773166_HCAHeartST10781063,Kanemaru 2023:AH1-Nuclei_Multiome-v1,Kanemaru 2023,Postnatal,47.5,female,LV,N,10X_Multiome,HCAHeartST10773166_HCAHeartST10781063_fragment...,/mnt/data1/william/human_heart_project/Final_m...,True
22,HCAHeartST11064574_HCAHeartST11023239,Kanemaru 2023:AH1-Nuclei_Multiome-v1,Kanemaru 2023,Postnatal,47.5,female,LV,N,10X_Multiome,HCAHeartST11064574_HCAHeartST11023239_fragment...,/mnt/data1/william/human_heart_project/Final_m...,True
23,HCAHeartST11064575_HCAHeartST11023240,Kanemaru 2023:AH1-Nuclei_Multiome-v1,Kanemaru 2023,Postnatal,47.5,female,LV,N,10X_Multiome,HCAHeartST11064575_HCAHeartST11023240_fragment...,/mnt/data1/william/human_heart_project/Final_m...,True
17,HCAHeart9508627_HCAHeart9508819,Kanemaru 2023:D3-Nuclei_Multiome-v1,Kanemaru 2023,Postnatal,57.5,male,LV,N,10X_Multiome,HCAHeart9508627_HCAHeart9508819_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
18,HCAHeart9508629_HCAHeart9508821,Kanemaru 2023:D7-Nuclei_Multiome-v1,Kanemaru 2023,Postnatal,62.5,male,LV,N,10X_Multiome,HCAHeart9508629_HCAHeart9508821_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
19,HCAHeart9845431_HCAHeart9917173,Kanemaru 2023:D8-Nuclei_Multiome-v1,Kanemaru 2023,Postnatal,47.5,male,LV,N,10X_Multiome,HCAHeart9845431_HCAHeart9917173_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True


In [41]:
merged_metadata_df[merged_metadata_df['study'] == "ENCODE v4 (Snyder)"].sort_values(by = "donor_id")

Unnamed: 0,sample_id,donor_id,study,age_status,age,sex,region,disease_binary,technology,fragment_file,full_path,file_exists
127,ENCFF575DGZ,ENCODE v4 (Snyder):ENCFF069ATM,ENCODE v4 (Snyder),Fetal,8.0,male,WH,N,10X_Multiome,ENCFF575DGZ_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
131,ENCFF908JHS,ENCODE v4 (Snyder):ENCFF248EWR,ENCODE v4 (Snyder),Fetal,15.0,male,WH,N,10X_Multiome,ENCFF908JHS_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
125,ENCFF658TEH,ENCODE v4 (Snyder):ENCFF684YRB,ENCODE v4 (Snyder),Fetal,10.0,female,WH,N,10X_Multiome,ENCFF658TEH_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
124,ENCFF904ARJ,ENCODE v4 (Snyder):ENCFF727JRO,ENCODE v4 (Snyder),Fetal,17.0,female,WH,N,10X_Multiome,ENCFF904ARJ_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
128,ENCFF851VTB,ENCODE v4 (Snyder):ENCFF775ANN,ENCODE v4 (Snyder),Fetal,15.0,female,WH,N,10X_Multiome,ENCFF851VTB_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
...,...,...,...,...,...,...,...,...,...,...,...,...
56,ENCSR769WLL,ENCODE v4 (Snyder):ENCSR919ENI,ENCODE v4 (Snyder),Postnatal,51.0,male,LV,N,10X_Multiome,ENCSR769WLL_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
71,ENCSR020FAW,ENCODE v4 (Snyder):ENCSR962JKS,ENCODE v4 (Snyder),Postnatal,21.0,male,LV,N,10X_Multiome,ENCSR020FAW_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
68,ENCSR546WZB,ENCODE v4 (Snyder):ENCSR980OCK,ENCODE v4 (Snyder),Postnatal,47.0,female,LV,N,10X_Multiome,ENCSR546WZB_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True
48,ENCSR173UUN,ENCODE v4 (Snyder):ENCSR991LHO,ENCODE v4 (Snyder),Postnatal,65.0,male,LV,N,10X_Multiome,ENCSR173UUN_fragments.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...,True


### Save this updated metadata

In [42]:
filt_metadata_df.head()

Unnamed: 0,sample_id,donor_id,study,age_status,age,sex,region,disease_binary,technology,fragment_file,full_path
0,10X_ATAC_CK166,Kuppe 2022:P1,Kuppe 2022,Postnatal,44.0,male,LV,N,10X_ATAC,10X_ATAC_CK166.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...
1,10X_ATAC_CK337,Kuppe 2022:P7,Kuppe 2022,Postnatal,55.0,female,LV,N,10X_ATAC,10X_ATAC_CK337.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...
2,10X_ATAC_CK338,Kuppe 2022:P8,Kuppe 2022,Postnatal,44.0,male,LV,N,10X_ATAC,10X_ATAC_CK338.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...
3,10X_ATAC_CK353,Kuppe 2022:P17,Kuppe 2022,Postnatal,61.0,male,LV,N,10X_ATAC,10X_ATAC_CK353.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...
4,10X_ATAC_CK381,Kuppe 2022:P17,Kuppe 2022,Postnatal,61.0,male,LV,N,10X_ATAC,10X_ATAC_CK381.tsv.gz,/mnt/data1/william/human_heart_project/Final_m...


In [43]:
filt_metadata_df.to_csv("00_updated_metadata.csv")