### Step 1: Extract TSS Regions from GTF File

In [None]:
import pandas as pd

# Load the GTF annotation - replace with your actual file path.
# GTF is tab-separated with nine unnamed columns; lines starting with '#'
# are skipped via comment='#'.
gtf_df = pd.read_csv(
    'mm39.ncbiRefSeq.gtf',
    sep='\t',
    comment='#',
    header=None,
    names=['seqname', 'source', 'feature', 'start', 'end', 'score', 'strand', 'frame', 'attribute'],
)

# Keep only gene/transcript records; their start/end are used to derive the TSS.
# .copy() makes tss_df an independent frame, so columns assigned to it later
# are not written into a slice of gtf_df (avoids SettingWithCopyWarning).
tss_df = gtf_df[gtf_df['feature'].isin(['gene', 'transcript'])].copy()

# Define a function to extract gene name and TSS based on the strand
def extract_tss_info(row):
    """Return the gene name and strand-aware TSS coordinate of one GTF record.

    Args:
        row: Mapping-like GTF record (e.g. a DataFrame row) providing the
            'attribute', 'start', 'end' and 'strand' fields.

    Returns:
        pd.Series indexed by 'gene_name' and 'tss_start'. 'gene_name' is the
        unquoted value of the gene_name attribute ('' when absent);
        'tss_start' is row['start'] on the '+' strand and row['end'] for any
        other strand value.
    """
    attributes = {}
    for item in row['attribute'].split(';'):
        # Items after the first carry a leading space ('; key "value"'), so
        # strip before splitting, and split only on the first space so that
        # values containing spaces stay intact.
        key, _, value = item.strip().partition(' ')
        if key:  # skip the empty fragment after the trailing ';'
            attributes[key] = value.strip()
    gene_name = attributes.get('gene_name', '').replace('"', '')
    tss_start = row['start'] if row['strand'] == '+' else row['end']
    return pd.Series([gene_name, tss_start], index=['gene_name', 'tss_start'])

# Apply function to get gene name and strand-aware TSS position per record.
# assign() returns a new frame, so nothing is written into a slice of gtf_df.
tss_info = tss_df.apply(extract_tss_info, axis=1)
tss_df = tss_df.assign(
    gene_name=tss_info['gene_name'],
    tss_start=tss_info['tss_start'].astype(int),
)

# Calculate TSS regions: 2000 bp upstream AND downstream of the TSS.
# A symmetric window is the same for both strands, so no per-strand branch
# is needed.
TSS_FLANK_BP = 2000
tss_position = tss_df['tss_start']  # 1-based TSS coordinate taken from the GTF

# GTF coordinates are 1-based inclusive, BED is 0-based half-open: the single
# base p is [p - 1, p). Clip at 0 so windows near a contig start stay valid.
tss_df['tss_start'] = (tss_position - 1 - TSS_FLANK_BP).clip(lower=0)
tss_df['tss_end'] = tss_position + TSS_FLANK_BP

# The window start is always below its end by construction
tss_df['start'] = tss_df['tss_start']
tss_df['end'] = tss_df['tss_end']

# Create BED file format (BED6 column order)
tss_bed = tss_df[['seqname', 'start', 'end', 'gene_name', 'score', 'strand']]

# Write BED file - replace with your actual file path
tss_bed.to_csv('tss_regions.bed', sep='\t', header=False, index=False)


### Step 2: Run BEDTools to Calculate Read Counts

In [None]:
# TSS windows in BED format (written by Step 1)
TSS_BED="tss_regions.bed"

# BAM files of the CV and GF H3K9ac samples
BAM_FILES=(
    "CV-H3K9ac_1.sort.bam"
    "CV-H3K9ac_2.sort.bam"
    "CV-H3K9ac_3.sort.bam"
    "CV-H3K9ac_4.sort.bam"
    "CV-H3K9ac_5.sort.bam"
    "CV-H3K9ac_6.sort.bam"
    "GF-H3K9ac_1.sort.bam"
    "GF-H3K9ac_2.sort.bam"
    "GF-H3K9ac_3.sort.bam"
    "GF-H3K9ac_4.sort.bam"
)

# Run bedtools coverage once per BAM file against the TSS windows; each
# result is written next to its input as <bam name>_tss_counts.txt
for BAM_FILE in "${BAM_FILES[@]}"; do
    bedtools coverage -a "$TSS_BED" -b "$BAM_FILE" > "${BAM_FILE}_tss_counts.txt"
done


### Step 3: Compile Results into a Table

In [None]:
# BAM files processed in Step 2. BAM_FILES there is a shell array and is not
# visible to Python, so the same names are listed here.
BAM_FILES = [
    "CV-H3K9ac_1.sort.bam",
    "CV-H3K9ac_2.sort.bam",
    "CV-H3K9ac_3.sort.bam",
    "CV-H3K9ac_4.sort.bam",
    "CV-H3K9ac_5.sort.bam",
    "CV-H3K9ac_6.sort.bam",
    "GF-H3K9ac_1.sort.bam",
    "GF-H3K9ac_2.sort.bam",
    "GF-H3K9ac_3.sort.bam",
    "GF-H3K9ac_4.sort.bam",
]

# The six BED columns written in Step 1; bedtools echoes them before its own
# output columns.
REGION_COLUMNS = ['chrom', 'start', 'end', 'gene_name', 'score', 'strand']

# Collect all counts into a dictionary (one DataFrame per BAM file)
counts = {}
for bam_file in BAM_FILES:
    counts[bam_file] = pd.read_csv(
        f"{bam_file}_tss_counts.txt",
        sep='\t',
        header=None,
        # bedtools coverage appends further columns after the read count by
        # default; read only the BED columns plus the count so the names line
        # up with the right fields.
        usecols=range(len(REGION_COLUMNS) + 1),
        names=REGION_COLUMNS + ['count'],
    )

# Build one table: region columns followed by one count column per BAM file.
# Every output file lists the regions of the same BED file in the same order,
# so the rows line up by position.
regions = counts[BAM_FILES[0]][REGION_COLUMNS]
per_sample_counts = [df['count'].rename(bam_file) for bam_file, df in counts.items()]
all_counts_df = pd.concat([regions, *per_sample_counts], axis=1)

# Pivot table to have one row per gene and one column
