In [1]:
import pandas as pd
import numpy as np
import scanpy as sc

In [2]:
%%time
# Load the AnnData object from the h5ad file. This is the slow step of the
# notebook (roughly half a minute of wall time in the recorded run).
peak_mat = sc.read_h5ad("07_final_ATAC.h5ad")
# Bare last expression so the notebook displays the AnnData summary
# (dimensions plus the available obs/var/uns/obsm/obsp keys).
peak_mat

CPU times: user 4.34 s, sys: 28.3 s, total: 32.7 s
Wall time: 32.7 s


AnnData object with n_obs × n_vars = 690044 × 654221
    obs: 'ATAC_barcode', 'sample_id', 'leiden', 'donor_id', 'study', 'age_status', 'age', 'sex', 'region', 'disease_binary', 'technology', 'fragment_file', 'full_path', 'file', 'nfrag', 'tsse', 'cell_type', 'tech_plus_study', 'age_group', 'decade', 'final_cell_type', 'cell_or_nuclei', 'disease'
    var: 'count', 'selected'
    uns: 'age_status_colors', 'cell_type_colors', 'leiden', 'leiden_colors', 'neighbors', 'spectral_eigenvalue', 'study_colors'
    obsm: 'X_spectral', 'X_spectral_harmony', 'X_umap'
    obsp: 'connectivities', 'distances'

In [3]:
def extract_genome_coordinates(peak_df, column_name):
    '''Split a "chr:start-end" column into separate chr, start and end columns.

    Parameters
    ----------
    peak_df : pandas.DataFrame
        Table with one genomic interval per row.
    column_name : str
        Name of the column whose values look like "chr1:794224-794725".

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of peak_df plus "chr", "start" and
        "end". The three new columns contain strings (no integer conversion
        is done). Rows whose value does not match "chr:start-end" get missing
        values in all three columns. peak_df itself is not modified.
    '''
    # The contig part is matched greedily, so contig names that themselves
    # contain ":" or "-" stay intact: only the last ":" and the "-" that
    # follows it act as separators.
    pattern = r"^(?P<chr>.+):(?P<start>[^:-]+)-(?P<end>[^:-]+)$"
    parts = peak_df[column_name].str.extract(pattern)

    # assign() returns a new DataFrame, so the caller's frame keeps its
    # original columns and no temporary helper column is left behind.
    return peak_df.assign(chr=parts["chr"], start=parts["start"], end=parts["end"])

In [4]:
# Build the per-peak table in a single assignment so that re-running the cell
# always starts from peak_mat.var and the name is never bound to an
# intermediate object.
# rename_axis() names the index before reset_index() turns it into a column,
# so the column is called "coordinates" whether or not the var index
# already carried a name.
peak_names_df = (
    peak_mat.var
    .rename_axis("coordinates")
    .reset_index()
    .pipe(extract_genome_coordinates, column_name="coordinates")
)

In [5]:
# Sanity check: the row count should equal the number of peaks (n_vars) of peak_mat.
peak_names_df.shape

(654221, 6)

In [6]:
# Preview the first rows to check that chr/start/end were parsed from "coordinates".
peak_names_df.head()

Unnamed: 0,coordinates,count,selected,chr,start,end
0,chr1:794224-794725,2242.0,False,chr1,794224,794725
1,chr1:794869-795370,5710.0,False,chr1,794869,795370
2,chr1:798148-798649,993.0,False,chr1,798148,798649
3,chr1:802009-802510,1454.0,False,chr1,802009,802510
4,chr1:806996-807497,1576.0,False,chr1,806996,807497


### Save the peak coordinates as a BED file

In [8]:
# Keep only the three coordinate columns, in chr/start/end order.
bed_columns = ["chr", "start", "end"]
peak_names_bed = peak_names_df.loc[:, bed_columns]

# Write tab-separated text without a header row and without the index column.
peak_names_bed.to_csv(
    "07B_snATAC_peaks.bed",
    sep="\t",
    header=False,
    index=False,
)