# Peak analysis from MACS2 output

This notebook analyzes peaks from MACS2 output.

1. Calculate average peak position from replicates

2. Identify the ORF nearest to each peak and plot the distribution of peak-to-ORF distances

3. Cross-reference peaks with the RNA-seq table and assign an "activated" or "repressed" type to each peak

In [1]:
import os
import re
import subprocess
import sys

# Directory that holds the helper scripts used by this analysis.
SCRIPTS = "/Users/yunfei/GeisingerLab/2023ChipSeq/src"

# Show which Python interpreter the kernel is running, for provenance.
print(sys.version)


3.11.3 (main, May 15 2023, 10:43:03) [Clang 14.0.6 ]


In [24]:
%pip install pandas
%pip install numpy

Collecting pandas
  Downloading pandas-2.0.2-cp311-cp311-macosx_10_9_x86_64.whl (11.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m36.8 MB/s[0m eta [36m0:00:00[0m00:01[0m0:01[0m
Collecting pytz>=2020.1 (from pandas)
  Downloading pytz-2023.3-py2.py3-none-any.whl (502 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m502.3/502.3 kB[0m [31m31.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tzdata>=2022.1 (from pandas)
  Downloading tzdata-2023.3-py2.py3-none-any.whl (341 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m341.8/341.8 kB[0m [31m25.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting numpy>=1.21.0 (from pandas)
  Downloading numpy-1.25.0-cp311-cp311-macosx_10_9_x86_64.whl (20.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m20.0/20.0 MB[0m [31m33.6 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
Installing collected packages: pytz, tzdata, numpy, pandas
Successfully inst

## Calculate average peak position from replicates

### Use bedtools multiinter to identify intersected peaks

In [33]:
# Intersect the MACS2 narrowPeak replicates of every (seed, sample) pair with
# `bedtools multiinter`, writing one "<sample>_seed<seed>.intersect.bed" file
# per pair into OUTDIR.
INDIR = "/Users/yunfei/2023_ChipSeq/MACS2_output_500k_ext200_input28-2/narrowPeak"
OUTDIR = "/Users/yunfei/2023_ChipSeq/bed_multiinter"

# Pure-Python equivalent of `mkdir -p`.
os.makedirs(OUTDIR, exist_ok=True)

# os.listdir() returns entries in arbitrary order; sort once so the replicate
# order (and therefore the column order of the bedtools output) is stable.
narrowpeak_files = sorted(os.listdir(INDIR))

for seed in [1, 4, 7]:
    for sample in [28, 49]:
        # For sample 28 only replicate 2 is included; every other sample uses
        # all single-digit replicates.
        replicate_id = "2" if sample == 28 else r"\d"
        # The unescaped "." before "seed" deliberately stays a wildcard so the
        # same file names match as before.
        pattern = re.compile(
            "^.+" + str(sample) + "-" + replicate_id + ".seed" + str(seed) + ".+$"
        )
        replicates = [
            os.path.join(INDIR, file)
            for file in narrowpeak_files
            if pattern.match(file)
        ]
        if not replicates:
            raise FileNotFoundError(
                "no narrowPeak file for sample " + str(sample)
                + " seed " + str(seed) + " in " + INDIR
            )
        if len(replicates) == 1:
            # A lone replicate is listed twice so multiinter receives two inputs.
            # NOTE(review): the earlier comment said "3 x replicate 2", but the
            # code has always duplicated the file once - confirm which is intended.
            replicates += replicates
        outfile = os.path.join(
            OUTDIR, "BfmR-ChIP-" + str(sample) + "_seed" + str(seed) + ".intersect.bed"
        )
        # 23Feb2023: added -header option so the output contains file path
        cmd = ["bedtools", "multiinter", "-cluster", "-header", "-i", *replicates]
        # Argument list + explicit stdout file avoids shell quoting problems, and
        # check=True stops the notebook instead of leaving an empty BED file
        # behind when bedtools fails.
        with open(outfile, "w") as bed_out:
            subprocess.run(cmd, stdout=bed_out, check=True)

### Calculate average peak position


In [2]:
# Run find_peak_summit.py on every "*.intersect.bed" file in INDIR, writing a
# matching "<name>.average_summit.bed" file into OUTDIR.
INDIR = "/Users/yunfei/2023_ChipSeq/bed_multiinter"
OUTDIR = "/Users/yunfei/2023_ChipSeq/average_peak_summit"
# Built from SCRIPTS (defined in the first cell) so the script directory is
# configured in one place only.
SCRIPT = os.path.join(SCRIPTS, "find_peak_summit.py")
os.makedirs(OUTDIR, exist_ok=True)

intersect_suffix = ".intersect.bed"

# Sorted so files are processed (and reported) in the same order on every run.
for file in sorted(os.listdir(INDIR)):
    if not file.endswith(intersect_suffix):
        continue
    infile = os.path.join(INDIR, file)
    # Strip the known suffix rather than splitting on the first ".", so a dot
    # elsewhere in the file name cannot truncate the output name.
    basename = file[: -len(intersect_suffix)]
    outfile = os.path.join(OUTDIR, basename + ".average_summit.bed")
    # sys.executable is the kernel's own interpreter (the one %pip installs
    # into); a bare "python" from PATH may be a different environment.
    # check=True raises if the script fails instead of silently continuing.
    subprocess.run([sys.executable, SCRIPT, infile, outfile], check=True)
    print("new output: " + outfile)

new output: /Users/yunfei/2023_ChipSeq/average_peak_summit/BfmR-ChIP-28_seed4.average_summit.bed
new output: /Users/yunfei/2023_ChipSeq/average_peak_summit/BfmR-ChIP-49_seed1.average_summit.bed
new output: /Users/yunfei/2023_ChipSeq/average_peak_summit/BfmR-ChIP-28_seed7.average_summit.bed
new output: /Users/yunfei/2023_ChipSeq/average_peak_summit/BfmR-ChIP-49_seed7.average_summit.bed
new output: /Users/yunfei/2023_ChipSeq/average_peak_summit/BfmR-ChIP-28_seed1.average_summit.bed
new output: /Users/yunfei/2023_ChipSeq/average_peak_summit/BfmR-ChIP-49_seed4.average_summit.bed
