### Compare the number of enhancers overlapped by at least one called snATAC-seq peak against null sets of snATAC-seq peaks generated with `bedtools shuffle`

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
import subprocess
import os

In [2]:
import matplotlib

# Embed fonts as Type 42 (TrueType) in both PDF and PostScript output.
matplotlib.rcParams.update({
    'pdf.fonttype': 42,
    'ps.fonttype': 42,
})

In [3]:
# Output directory for saved figures; created on first run, left alone if it
# already exists.
plots_dir = "plots/"
os.makedirs(name=plots_dir, exist_ok=True)

In [4]:
# Load the null-distribution results: a header-less, single-column file that
# is labelled 'num_intersecting' here.
null_intersections = (
    pd.read_csv("null_dist_results.txt", header=None)
    .set_axis(['num_intersecting'], axis="columns")
)
null_intersections.head()

Unnamed: 0,num_intersecting
0,19474
1,19392
2,19305
3,19442
4,19348


In [5]:
# Load the enhancer / snATAC-seq peak intersection table (whitespace-delimited,
# no header row). The four columns are named: chr, start, end, and
# n_intersect_with_snATAC_peak.
# `sep=r"\s+"` replaces `delim_whitespace=True`, which is deprecated in
# pandas >= 2.2 and emits a FutureWarning.
Spurrell_peak_intersection_df = pd.read_csv(
    "enhancer_snATAC_peak_intersection.bed",
    sep=r"\s+",
    header=None,
)
Spurrell_peak_intersection_df.columns = ["chr", "start", "end", "n_intersect_with_snATAC_peak"]
Spurrell_peak_intersection_df.head()

Unnamed: 0,chr,start,end,n_intersect_with_snATAC_peak
0,chr1,836153,837500,0
1,chr1,864654,866686,1
2,chr1,1004411,1006406,1
3,chr1,1022693,1026419,3
4,chr1,1079279,1081182,3


In [6]:
# Enhancers from Spurrell et al. 2022 that intersect at least one snATAC-seq peak.
# NOTE: despite its name, `num_overlapping_enhancers` holds the filtered rows
# (a DataFrame), not a count; the count is its number of rows.
has_overlap = Spurrell_peak_intersection_df['n_intersect_with_snATAC_peak'].gt(0)
num_overlapping_enhancers = Spurrell_peak_intersection_df[has_overlap]

n_overlapping = len(num_overlapping_enhancers)
n_enhancers = len(Spurrell_peak_intersection_df)
print(f"Number of enhancers overlapping with snATAC-seq peaks: {n_overlapping}")

# Fraction of all enhancers with at least one overlapping peak.
prop_overlapping_enhancers = n_overlapping / n_enhancers
print(f"Proportion of enhancers overlapping with snATAC-seq peaks: {prop_overlapping_enhancers}")

Number of enhancers overlapping with snATAC-seq peaks: 32525
Proportion of enhancers overlapping with snATAC-seq peaks: 0.9762283518924273


### Compare actual overlap with random shuffles

#### Calculate Z-score

In [8]:
# Null distribution: the overlap counts in the 'num_intersecting' column.
# Reduce the named column directly so both statistics are plain scalars.
# Calling np.mean / np.std on the whole DataFrame depends on pandas'
# version-specific axis=None behaviour, and the old `np.std(...)[0]` relied on
# deprecated positional indexing of a label-indexed Series.
null_counts = null_intersections['num_intersecting']
mean_null = null_counts.mean()
# ddof=0 (population standard deviation) matches np.std's default.
std_null = null_counts.std(ddof=0)

# Observed value: number of enhancers with at least one overlapping snATAC-seq peak.
actual_overlap = num_overlapping_enhancers.shape[0]
print(f"Actual number of Spurrell enhancers with at least 1 snATAC-seq peak overlapping {actual_overlap}")

# Distance of the observed overlap from the null mean, in null standard deviations.
z_score = (actual_overlap - mean_null) / std_null

print(f"Z-score: {z_score}")

Actual number of Spurrell enhancers with at least 1 snATAC-seq peak overlapping 32525
Z-score: 161.65132712832488


  std_null = np.std(null_intersections)[0]


In [None]:
# Histogram of the null overlap counts, with the observed overlap marked by a
# dashed red line.
fig, ax = plt.subplots()

# Plot the count column explicitly rather than handing matplotlib the DataFrame.
ax.hist(null_intersections["num_intersecting"])
ax.axvline(actual_overlap, color="red", linestyle="--")

# Label the observed-overlap line with the Z-score. The label is anchored to
# the line itself (x in data coordinates, y at 90% of the axes height) and
# shifted a few points to the left. Its position therefore no longer depends on
# a hand-tuned multiple of the null maximum, which could land off-axis or on
# top of the bars when the data change.
ax.annotate(
    f"Z-score: {z_score:.2f}",
    xy=(actual_overlap, 0.9),
    xycoords=("data", "axes fraction"),
    xytext=(-8, 0),
    textcoords="offset points",
    ha="right",
    va="center",
    fontsize=12,
    color="blue",
)

ax.set_title("actual number of bulk H3K27ac peaks (Spurrell et al. 2022) \n intersecting with snATAC-seq peaks, \n versus null distribution", size=10)
ax.set_xlabel("number of intersecting peaks")
ax.set_ylabel("num simulations")

# bbox_inches="tight" keeps the three-line title from being clipped in the PDF.
fig.savefig(plots_dir + "actual_Spurrell_intersection_vs_null.pdf", bbox_inches="tight")
plt.show()

#### Conclusion: The Spurrell enhancers overlap snATAC-seq open-chromatin regions far more often than expected by chance from shuffled peak sets (Z-score ≈ 161.7)