In [57]:
import gzip
import os
import scanpy as sc

In [58]:
# Configuration: which cells to extract and where to read/write.
# Change `cell_type` / `batch` here; the output path follows automatically.
cell_type = "Normoblast"
batch = "s1d1"

# AnnData file that is read for the per-cell annotations
input_path = "/s/project/transcription_factor_activity/data/neurips/anndata/neurips_atac_modified.h5ad"

# Output fragment file. Built from `batch` and `cell_type` so the file name can
# never drift out of sync with the selection above.
output_path = (
    "/s/project/transcription_factor_activity/data/neurips/processed/"
    f"{batch}/separated/{batch}_{cell_type}.tsv.gz"
)

# Barcodes of the selected cells; populated by a later cell
barcodes = set()

In [59]:
# Load the .h5ad file at `input_path` into an AnnData object
adata = sc.read_h5ad(filename=input_path)

In [68]:
# Get all cell barcodes corresponding to the requested batch and cell type.
# A boolean mask over `adata.obs` replaces the row-by-row `iterrows` loop.
obs = adata.obs
selected = (obs['batch'] == batch) & (obs['neurips21_cell_type'] == cell_type)

# Rebuild the set from scratch so that re-running this cell (e.g. after changing
# `cell_type` or `batch`) never keeps barcodes from a previous selection.
barcodes = set(obs.loc[selected, 'frag_file_bcs'])

In [62]:
# Copy every fragment whose cell barcode is in `barcodes` from the raw fragment
# file into `output_path` (gzip-compressed, tab-separated).

# Raw fragment file of the selected batch.
# NOTE(review): the original cell opened an undefined name `input_file`
# (NameError on a fresh kernel). The location below is taken from the path used
# in the debugging cell of this notebook — confirm it is the intended input.
fragments_path = f"/s/project/transcription_factor_activity/data/neurips/raw/{batch}/atac_fragments.tsv.gz"

# Create the output directory if it does not exist yet
output_dir = os.path.dirname(output_path)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

# Both files are managed by `with`, so the output is flushed and closed even if
# an error occurs while filtering.
with gzip.open(fragments_path, 'rt') as f, gzip.open(output_path, 'wt') as output_handle:
    for line in f:
        # Skip header lines starting with #. Handling them inside the main loop
        # keeps the first data line, which the previous skip-then-break loop
        # consumed and dropped.
        if line.startswith('#'):
            continue

        fields = line.strip().split('\t')
        # Skip blank or malformed lines that have no fourth column
        if len(fields) < 4:
            continue

        cell_barcode = fields[3]  # Assuming cell barcode is in the fourth column

        # Keep the fragment only if it belongs to one of the selected cells
        if cell_barcode in barcodes:
            output_handle.write(line)

In [72]:
# Print lines for debugging

# Define the input TSV.gz file path
# (assign `output_path` here instead to inspect the filtered output file)
path = '/s/project/transcription_factor_activity/data/neurips/raw/s1d1/atac_fragments.tsv.gz'

# Open file using gzip and print at most the first 100 lines.
# Pairing the file iterator with range(100) stops at end-of-file, so a file
# shorter than 100 lines no longer produces trailing empty prints.
with gzip.open(path, 'rt') as f:
    for _, line in zip(range(100), f):
        print(line.strip())

# id=libraries_neurips_site1_donor1
# description=libraries_neurips_site1_donor1
#
# pipeline_name=cellranger-arc
# pipeline_version=cellranger-arc-2.0.0
#
# reference_path=/mnt/15e1f33a-e47d-42f0-967d-b954cff90843/genome/reference/refdata-cellranger-arc-GRCh38-2020-A-2.0.0
# reference_fasta_hash=b6f131840f9f337e7b858c3d1e89d7ce0321b243
# reference_gtf_hash=3b4c36ca3bade222a5b53394e8c07a18db7ebb11
# reference_version=2020-A
# mkref_version=cellranger-arc-2.0.0
#
# primary_contig=chr1
# primary_contig=chr10
# primary_contig=chr11
# primary_contig=chr12
# primary_contig=chr13
# primary_contig=chr14
# primary_contig=chr15
# primary_contig=chr16
# primary_contig=chr17
# primary_contig=chr18
# primary_contig=chr19
# primary_contig=chr2
# primary_contig=chr20
# primary_contig=chr21
# primary_contig=chr22
# primary_contig=chr3
# primary_contig=chr4
# primary_contig=chr5
# primary_contig=chr6
# primary_contig=chr7
# primary_contig=chr8
# primary_contig=chr9
# primary_contig=chrX
# primary_cont