#### Performs `donor_id` (not `sample_id`) pseudobulking of ATAC-seq peaks
#### Uses the peak matrix (called by MACS3), not the tile matrix

In [1]:
import snapatac2 as snap
import scanpy as sc
import numpy as np
import tempfile
import os
from collections import Counter
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Resolve the system temporary directory path via tempfile.gettempdir().
# NOTE(review): tmp_dir is not referenced in the visible cells — confirm it is still needed.
tmp_dir = tempfile.gettempdir()
import statsmodels.api as sm
import shutil

In [2]:
# Relative path kept for later use; presumably the folder where paper figures are
# written — it is not used in the visible cells, TODO confirm.
plots_dir = "../../../paper_figures/"

Load the matrix with the merged peaks called by MACS3.

In [3]:
%%time
# Read the full AnnData object from disk (the recorded run took ~1.5 min wall time).
# `%%time` has to remain the first line of the cell, so comments go below it.
peak_mat = sc.read_h5ad("../07_final_ATAC.h5ad")
# Bare expression: display the AnnData summary (shape, obs/var columns, embeddings).
peak_mat

CPU times: user 4.77 s, sys: 41.7 s, total: 46.5 s
Wall time: 1min 25s


AnnData object with n_obs × n_vars = 690044 × 654221
    obs: 'ATAC_barcode', 'sample_id', 'leiden', 'donor_id', 'study', 'age_status', 'age', 'sex', 'region', 'disease_binary', 'technology', 'fragment_file', 'full_path', 'file', 'nfrag', 'tsse', 'cell_type', 'tech_plus_study', 'age_group', 'decade', 'final_cell_type', 'cell_or_nuclei', 'disease'
    var: 'count', 'selected'
    uns: 'age_status_colors', 'cell_type_colors', 'leiden', 'leiden_colors', 'neighbors', 'spectral_eigenvalue', 'study_colors'
    obsm: 'X_spectral', 'X_spectral_harmony', 'X_umap'
    obsp: 'connectivities', 'distances'

### Extract the relevant metadata for the design matrix

The design we will use is: 

- `accessibility ~ age_status + sex + disease_binary + tech_plus_study` (`tech_plus_study` accounts for the major batch effect)

In [4]:
# Donor-level covariates for the design matrix: take the per-cell annotations
# and collapse them to the distinct combinations of these columns.
metadata = (
    peak_mat.obs[["donor_id", "age_status", "age_group",
                  "sex", "disease_binary", "tech_plus_study",
                  "technology", "study"]]
    .drop_duplicates()
    .reset_index(drop=True)
)

# drop_duplicates() only removes rows that are identical across *all* selected
# columns, so a donor annotated inconsistently (e.g. two technologies) would
# survive as several rows. The per-donor merge done later needs exactly one
# row per donor, so fail here with a readable message instead.
if not metadata["donor_id"].is_unique:
    conflicting_donors = (
        metadata.loc[metadata["donor_id"].duplicated(keep=False), "donor_id"]
        .astype(str)
        .unique()
        .tolist()
    )
    raise ValueError(
        "Expected one metadata row per donor_id, but these donors have "
        f"conflicting annotations: {conflicting_donors}"
    )

metadata

Unnamed: 0,donor_id,age_status,age_group,sex,disease_binary,tech_plus_study,technology,study
0,ENCODE v4 (Snyder):ENCSR556UHL,postnatal,old,male,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)
1,ENCODE v4 (Snyder):ENCSR913OAS,postnatal,old,male,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)
2,ENCODE v4 (Snyder):ENCSR455MGH,postnatal,young,female,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)
3,ENCODE v4 (Snyder):ENCSR080TZR,postnatal,old,male,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)
4,ENCODE v4 (Snyder):ENCSR008CVR,postnatal,old,male,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)
...,...,...,...,...,...,...,...,...
101,ENCODE v4 (Snyder):ENCSR540DHJ,postnatal,middle,female,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)
102,ENCODE v4 (Snyder):ENCSR489URW,postnatal,old,male,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)
103,ENCODE v4 (Snyder):ENCSR056QLB,postnatal,middle,male,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)
104,ENCODE v4 (Snyder):ENCFF775ANN,fetal,fetal,female,N,Multiome-v1_ENCODE v4 (Snyder),Multiome-v1,ENCODE v4 (Snyder)


### Get the pseudobulked adata counts

In [5]:
# Folder that receives the per-cell-type count matrices and metadata tables.
directory_path = "pseudobulked_counts/"

# exist_ok=True makes this a no-op when the folder is already present; nothing
# inside an existing folder is deleted.
os.makedirs(directory_path, exist_ok=True)

status_message = "Directory '{}' created successfully (if it didn't already exist).".format(directory_path)
print(status_message)

Directory 'pseudobulked_counts/' created successfully (if it didn't already exist).


### Pseudobulk the counts per cell type per donor
- Keep only the peaks whose mean TPM (technically cuts per million) across donors is > 1

In [6]:
# Distinct cell-type labels present in the peak matrix; each one is pseudobulked separately.
cell_types_to_study = peak_mat.obs["cell_type"].unique()
cell_types_to_study

['Adipocyte', 'Cardiomyocyte', 'Endothelial', 'Epicardial', 'Fibroblast', ..., 'Myeloid', 'Lymphoid', 'Mast', 'Neuronal', 'vSMC']
Length: 11
Categories (11, object): ['Adipocyte', 'Cardiomyocyte', 'Endothelial', 'Epicardial', ..., 'Myeloid', 'Neuronal', 'Pericyte', 'vSMC']

In [7]:
# Minimum mean TPM a peak must exceed to be kept
# (for ATAC-seq this is technically cuts per million).
TPM_threshold = 1.0

# obs column whose values define the pseudobulk groups.
donor_key = "donor_id"

In [8]:
%%time

# For every cell type: sum the raw peak counts per donor, attach the donor-level
# covariates, keep the peaks whose mean TPM (cuts per million) across donors
# exceeds TPM_threshold, and write the counts and metadata to CSV.
for cell_type in cell_types_to_study:

    print(cell_type, flush=True)

    # Subset the peak matrix (peak_mat) to this cell type and sum counts per donor.
    # The summed counts are stored in layers['sum'].
    subset_adata = peak_mat[peak_mat.obs.cell_type == cell_type, :]
    pseudobulked_adata = sc.get.aggregate(subset_adata, by=donor_key, func="sum")

    # Add back the metadata info (donor_id, study, ...). merge() returns a fresh
    # 0..n-1 index, so the merged table lines up with the aggregated rows purely
    # by position: make sure the join neither dropped, duplicated nor reordered
    # donors before assigning it back.
    aggregated_donors = pseudobulked_adata.obs[donor_key].astype(str).to_numpy()
    merged_obs = pseudobulked_adata.obs.merge(metadata, on=donor_key, how="inner")
    merged_donors = merged_obs[donor_key].astype(str).to_numpy()
    if not np.array_equal(aggregated_donors, merged_donors):
        raise ValueError(
            f"Metadata merge for cell type '{cell_type}' does not match the "
            f"pseudobulked donors one-to-one ({len(aggregated_donors)} aggregated "
            f"rows vs {len(merged_donors)} merged rows); check that `metadata` "
            f"has exactly one row per {donor_key}."
        )
    pseudobulked_adata.obs = merged_obs
    # The merged index is integer-valued; obs_names are kept as strings.
    pseudobulked_adata.obs_names = pseudobulked_adata.obs_names.astype(str)

    # Calculate TPM per donor (row) with NumPy rather than a very wide DataFrame.
    summed_counts = pseudobulked_adata.layers["sum"]
    if hasattr(summed_counts, "toarray"):
        # Densify in case the layer is stored as a sparse matrix.
        summed_counts = summed_counts.toarray()
    summed_counts = np.asarray(summed_counts, dtype=np.float64)
    library_sizes = summed_counts.sum(axis=1, keepdims=True)
    with np.errstate(divide="ignore", invalid="ignore"):
        # A donor with zero total counts becomes an all-NaN row here ...
        TPM_values = summed_counts / library_sizes * 1e6

    # ... and is skipped by the NaN-aware mean, as the pandas mean did before.
    mean_TPM_values = np.nanmean(TPM_values, axis=0)
    TPM_values_greater_than_threshold = mean_TPM_values > TPM_threshold

    # Keep only those peaks with mean TPM greater than the threshold.
    filtered_pseudobulked_adata = pseudobulked_adata[:, TPM_values_greater_than_threshold]

    # Build the table to save from the raw summed counts (not the TPM values),
    # labelled with the peak names and the obs index.
    filtered_pseudobulked_counts = pd.DataFrame(
        filtered_pseudobulked_adata.layers["sum"],
        index=filtered_pseudobulked_adata.obs.index,
        columns=filtered_pseudobulked_adata.var.index,
    )

    # Save count matrix. Rows are in the same order as the metadata file below.
    filtered_pseudobulked_counts.to_csv(
        os.path.join(directory_path, f"{cell_type}_count_matrix.csv")
    )
    # Save metadata.
    filtered_pseudobulked_adata.obs.to_csv(
        os.path.join(directory_path, f"{cell_type}_metadata.csv")
    )

Adipocyte
Cardiomyocyte
Endothelial
Epicardial
Fibroblast
Pericyte
Myeloid
Lymphoid
Mast
Neuronal
vSMC
CPU times: user 6min 59s, sys: 7min 52s, total: 14min 51s
Wall time: 14min 56s
