In [None]:
from cooltools import snipping
import cooler
import pandas as pd
from scipy.stats import ttest_ind
import numpy as np
import pybedtools
import bioframe
import seaborn as sns
import matplotlib.pyplot as plt
import glob
import os


In [None]:
# Global matplotlib setting: font size 24 for every figure in this notebook.
plt.rcParams['font.size'] = 24

# Bin size (bp) and flank (bp) kept as notebook-level settings.
binsize = 5000
flank = 0

# Chromosome sizes returned by bioframe for 'mm10', the chromosome names, and one
# (chrom, 0, size) tuple per chromosome.
chromsizes = bioframe.fetch_chromsizes('mm10')
chromosomes = chromsizes.index.tolist()
supports = [(name, 0, chromsizes[name]) for name in chromosomes]

# Table of genomic bins (chrom, start, end); every signal track is left-joined onto it.
# The bin file can be made with cooler makebins, binsize = 5000.
binsDf = pd.read_csv('../data/genomeInfo/mm10bin5kb.bed', sep='\t', header=None, names=['chrom', 'start', 'end'])

# glob returns paths in arbitrary, filesystem-dependent order; sorting makes the
# column order of signalDf identical on every run and every machine.
signalFiles = sorted(glob.glob('../data/binnedBedgraphs/*'))
signalDf = binsDf.copy()
for signalFile in signalFiles:
    # Column name is the file name up to its first '.', e.g. 'ES_E1.bg' -> 'ES_E1'.
    trackName = os.path.basename(signalFile).split('.')[0]
    trackDf = pd.read_csv(signalFile, sep="\t", header=None, names=['chrom', 'start', 'end', trackName])
    signalDf = signalDf.merge(trackDf, how='left', on=['chrom', 'start', 'end'])
    # A track listing the same bin more than once multiplies rows in the join;
    # keep a single row (the first) per bin.
    signalDf = signalDf.drop_duplicates(subset=['chrom', 'start', 'end'])

# chromHMM state number -> label shown on the boxplot x axis.
chromHMMdict = dict(enumerate(
    ['H3K4me3',
     'H3K4me1/3',
     'H3K4me1',
     'H3K4me1+\nH3K36me3',
     'H3K36me3',
     'Unmarked',
     'H3K27me3'],
    start=1))

# Seven RGB tuples handed to seaborn as the boxplot palette
# (presumably one per chromHMM state, in state order).
palette = [
    (1, 0, 0),
    (1, .6, 0),
    (1, 1, 0),
    (.6, .8, 0),
    (0, .6, .4),
    (.6, .6, .6),
    (0, 0, 1),
]

In [None]:
def getPeakDf(peakFile):
    """Read a headerless, tab-separated peak file into a DataFrame.

    The columns are named, in order: chrom, start, end, name, score, strand,
    signalValue, pvalue, qvalue, peak.
    """
    peakColumns = [
        'chrom', 'start', 'end', 'name', 'score',
        'strand', 'signalValue', 'pvalue', 'qvalue', 'peak',
    ]
    peakTable = pd.read_csv(peakFile, sep="\t", header=None, names=peakColumns)
    return peakTable

def plotCompartmentFractions(countDf,saveName=None):
    """Draw a bar plot of the 'peak fraction' column of ``countDf``.

    A new figure is opened, 3 units high and 0.5 units wide per row of ``countDf``.
    If ``countDf`` has a 'ChIP-seq peaks' column, that column is the x axis (tick
    labels rotated by 90 degrees) and bars are split by 'compartment'; otherwise
    'compartment' is the x axis. The current figure is saved to ``saveName`` when
    it is not None.
    """
    figWidth = 0.5 * countDf.shape[0]
    plt.figure(figsize=(figWidth, 3))

    hasPeakSets = 'ChIP-seq peaks' in countDf.columns
    if not hasPeakSets:
        sns.barplot(data=countDf, x='compartment', y='peak fraction')
    else:
        sns.barplot(data=countDf, x='ChIP-seq peaks', y='peak fraction', hue='compartment')
        plt.xticks(rotation=90)

    if saveName is None:
        return
    plt.savefig(saveName)
        
def compartmentCountPeakDf(peakDf,eigVecFile='../data/binnedBedgraphs/ES_E1.bg',labelString=None):
    """Fraction of peak centres that fall in compartment 'A' versus 'B'.

    Each peak is reduced to a 1 bp interval at its midpoint and intersected
    (pybedtools ``intersect`` with ``wb=True``) with the intervals of ``eigVecFile``.
    A peak centre is labelled 'A' when the value carried over from ``eigVecFile``
    is > 0 and 'B' otherwise.

    Parameters
    ----------
    peakDf : DataFrame
        Must provide 'chrom', 'start' and 'end' columns.
    eigVecFile : str
        Interval file whose value ends up in the column pybedtools names
        'thickStart' after the intersect (for the default file this is the 4th
        column, E1).
    labelString : str, optional
        When given, stored in a 'ChIP-seq peaks' column of the result.

    Returns
    -------
    DataFrame
        One row per compartment observed, with columns 'compartment' and
        'peak fraction' (plus 'ChIP-seq peaks' if ``labelString`` was given).
        The denominator is the number of intersect rows, so peak centres that
        overlap no interval of ``eigVecFile`` are not counted.
    """
    midpoints = (peakDf.start + peakDf.end) // 2
    peakCenterDf = pd.DataFrame({'chrom': peakDf['chrom'], 'start': midpoints, 'end': midpoints + 1})

    peakBt = pybedtools.bedtool.BedTool.from_dataframe(peakCenterDf)
    eigVecBt = pybedtools.BedTool(eigVecFile)
    intersectDf = (
        peakBt.intersect(eigVecBt, wb=True)
        .to_dataframe()[['chrom', 'start', 'end', 'thickStart']]
        .rename(columns={'thickStart': 'E1'})
    )

    # NOTE(review): the test is `> 0`, so E1 == 0 and missing (NaN) E1 are labelled 'B'.
    intersectDf['compartment'] = np.where(intersectDf.E1 > 0, 'A', 'B')

    peakCount = intersectDf.shape[0]
    fractions = intersectDf['compartment'].value_counts() / peakCount
    # Name the index and the values explicitly instead of renaming whatever
    # reset_index() produces: pandas < 2 yields columns ('index', 'compartment'),
    # pandas >= 2 yields ('compartment', 'count'), and renaming by the old names
    # silently mislabels the columns on the newer versions.
    countDf = fractions.rename_axis('compartment').rename('peak fraction').reset_index()

    if labelString is not None:
        countDf['ChIP-seq peaks'] = labelString
    return countDf

def chromhmmLabelPeakDf(peakDf,chromHmmFile='../data/chromhmm/ENCFF215ZKG_mm9mm10liftOver.bed'):
    """Attach a 'chromHMM' value to every peak centre that overlaps ``chromHmmFile``.

    Peaks are collapsed to 1 bp intervals at their midpoint and intersected
    (pybedtools ``intersect`` with ``wb=True``) with the intervals in ``chromHmmFile``.
    The result has columns 'chrom', 'start', 'end' (the peak centre) and 'chromHMM',
    which is the column pybedtools names 'thickStart' after the intersect
    (presumably the state column of ``chromHmmFile`` - confirm against the file layout).
    """
    centrePos = (peakDf.start + peakDf.end) // 2
    centres = pd.DataFrame({'chrom': peakDf['chrom'], 'start': centrePos, 'end': centrePos + 1})

    centresBt = pybedtools.bedtool.BedTool.from_dataframe(centres)
    statesBt = pybedtools.BedTool(chromHmmFile)

    overlaps = centresBt.intersect(statesBt, wb=True).to_dataframe()
    labelled = overlaps[['chrom', 'start', 'end', 'thickStart']]
    return labelled.rename(columns={'thickStart': 'chromHMM'})

def mergeChromhmmPeaksToSignal(peakDfChromHMM,signalCol, binsize=5000,flank=0,supports=supports, signalDf=signalDf):
    """Look up one signal column for the bin-aligned window of every labelled peak.

    Steps:
      1. build a bin-aligned window around each peak midpoint with
         ``snipping.make_bin_aligned_windows`` (``flank`` is passed as ``flank_bp``);
      2. join the 'chromHMM' column of ``peakDfChromHMM`` onto the windows by index;
      3. keep windows with ``start > 0`` and ``end < chromsize`` (sizes taken from
         ``supports``), then drop duplicate rows;
      4. drop windows on chrX, chrY and chrM;
      5. left-merge ``signalDf[signalCol]`` on ('chrom', 'start', 'end').

    The ``supports`` and ``signalDf`` defaults are the notebook-level objects as
    they existed when this function was defined.
    """
    peakMidpoints = (peakDfChromHMM['start'] + peakDfChromHMM['end']) // 2
    windows = snipping.make_bin_aligned_windows(
        binsize,
        peakDfChromHMM['chrom'],
        peakMidpoints,
        flank_bp=flank)
    windows = windows.join(peakDfChromHMM['chromHMM'])

    chromTable = pd.DataFrame(supports, columns=['chrom', 'zero', 'chromsize'])
    windows = windows.merge(chromTable, how='left', on='chrom')

    # NOTE(review): `start > 0` also removes windows that begin exactly at position 0
    # - confirm that excluding the first bin of each chromosome is intended.
    insideChrom = (windows.start > 0) & (windows.end < windows.chromsize)
    windows = windows[insideChrom].drop_duplicates()

    onExcludedChrom = windows.chrom.isin(['chrX', 'chrY', 'chrM'])
    windows = windows.loc[~onExcludedChrom, :]

    signalTable = signalDf[['chrom', 'start', 'end', signalCol]]
    return windows.merge(signalTable, how='left', on=['chrom', 'start', 'end'])

def makeChromhmmBoxPlot(plotDf,plotCol='yin2019_crossoversCast1C_binned_weightedScore_medianNormalized', yLabel='Crossover score', legendLabel='median across DSB peak bins', palette=palette,chromHMMdict=chromHMMdict,saveName=None):
    """Box plot of ``plotDf[plotCol]`` grouped by the 'chromHMM' column.

    Parameters
    ----------
    plotDf : DataFrame
        Must contain 'chromHMM' (values convertible with ``int``) and ``plotCol``.
    plotCol : str
        Column drawn on the y axis.
    yLabel : str
        y-axis label.
    legendLabel : str
        Legend entry for the dashed horizontal line drawn at the median of
        ``plotDf[plotCol]`` over all rows.
    palette : seaborn palette
        If it is a list/tuple with one entry per key of ``chromHMMdict``, entry i
        is bound to the i-th smallest state number, so a state keeps its colour
        even when other states are absent from ``plotDf``. Any other palette is
        handed to seaborn unchanged.
    chromHMMdict : dict
        Maps state number to the x tick label.
    saveName : str, optional
        If given, the figure is saved to this path.
    """
    plt.figure()

    # States actually present, in numeric order; used for box order, colours and labels.
    states = sorted(plotDf['chromHMM'].dropna().unique(), key=int)

    # Without this, seaborn assigns palette entries positionally to the states that
    # happen to be present, so colours shift whenever a state is missing.
    boxColors = palette
    if states and isinstance(palette, (list, tuple)) and len(palette) == len(chromHMMdict):
        colorByState = dict(zip(sorted(chromHMMdict), palette))
        if all(int(state) in colorByState for state in states):
            boxColors = [colorByState[int(state)] for state in states]

    ax1 = sns.boxplot(
        x='chromHMM',
        y=plotCol,
        data=plotDf,
        order=states or None,
        palette=boxColors,
        showfliers=False,
        showcaps=False,
        whiskerprops={'color': (0.5, 0.5, 0.5), 'linestyle': 'dotted'})
    ax1.set_xticklabels([chromHMMdict[int(state)] for state in states], rotation=90)

    plt.axhline(y=plotDf[plotCol].median(), linestyle='--', color='grey', label=legendLabel)

    plt.ylabel(yLabel)
    plt.legend()
    if saveName is not None:
        plt.savefig(saveName)

In [None]:
# Directory prefix used in the saveName of every figure below.
saveDir = '../outputs/'

# Read every file in the peak directory; keys are the file names up to the first '.'.
peakFiles = glob.glob('../data/peakBeds/*')
peakDfDict = {
    os.path.basename(peakFile).split('.')[0]: getPeakDf(peakFile)
    for peakFile in peakFiles
}

# Reference row set for the plots: fraction of eigenvector bins in each compartment.
# Eigenvector file ES_E1.bg was generated using getHiCtracks.py on ES Hi-C.
eigVecDf = pd.read_csv('../data/binnedBedgraphs/ES_E1.bg', header=None, names=['chrom', 'start', 'end', 'E1'], sep="\t")

# 'A' when E1 > 0, otherwise 'B' (this includes E1 == 0 and missing E1).
eigVecDf['compartment'] = np.where(eigVecDf.E1 > 0, 'A', 'B')

# Name the index and the values explicitly: the column names produced by
# value_counts().reset_index() differ between pandas < 2 ('index', 'compartment')
# and pandas >= 2 ('compartment', 'count'), so renaming by the old names mislabels
# the columns on newer pandas.
compartmentCoverageDf = (
    (eigVecDf['compartment'].value_counts() / eigVecDf.shape[0])
    .rename_axis('compartment')
    .rename('peak fraction')
    .reset_index()
)
compartmentCoverageDf['ChIP-seq peaks'] = 'genome\ncoverage'

In [None]:
# A/B compartment fractions for the PRDM9 and DSB peak sets, followed by the
# genome-wide coverage rows. Each pair is (key in peakDfDict, label shown on the plot).
# The B6 class 1 label previously read 'PRDM9 B6Class 1'; the missing space is
# restored so it matches the 'PRDM9 B6 Class 2' label.
compartmentCountDfs = [
    compartmentCountPeakDf(peakDfDict[peakKey], labelString=plotLabel)
    for peakKey, plotLabel in (
        ('Baker2015_prdm9_B6xCAST_peaks', 'PRDM9 B6xCAST (Baker 2015)'),
        ('grey2017orig_PRDM9_B6_class1_peaks', 'PRDM9 B6 Class 1 (Grey 2017)'),
        ('grey2017orig_PRDM9_RJ2_class1_peaks', 'PRDM9 CAST Class 1 (Grey 2017)'),
        ('grey2017orig_PRDM9_B6_class2_peaks', 'PRDM9 B6 Class 2 (Grey 2017)'),
        ('grey2017orig_PRDM9_RJ2_class2_peaks', 'PRDM9 CAST Class 2 (Grey 2017)'),
        ('smagulova2016_B6xCAST_DSB_fraglen1000_peaks', 'DMC1-SSDS\n(DSB)'),
    )
]
compartmentCountDfs.append(compartmentCoverageDf)
compartmentCountDf = pd.concat(compartmentCountDfs)
plotCompartmentFractions(compartmentCountDf, saveName=f"{saveDir}/PRDM9_DSB_ABcompFracs.pdf")
compartmentCountDf


In [None]:
# A/B compartment fractions for the RAD21 / CTCF / RNAPII / RAD21L / REC8 peak sets,
# followed by the genome-wide coverage rows.
# Each pair is (key in peakDfDict, label shown on the plot).
compartmentCountDfs = [
    compartmentCountPeakDf(peakDfDict[peakKey], labelString=plotLabel)
    for peakKey, plotLabel in (
        ('Nitzsche2011_ESC_RAD21_peaks', 'ES RAD21'),
        ('Nitzsche2011_ESC_CTCF_peaks', 'ES CTCF'),
        ('ES_RNAP2_peaks', 'ES RNAPII'),
        ('vara2019_PDrad21l_peaks', 'Meiotic RAD21L'),
        ('vara2019_PDrec8_peaks', 'Meiotic REC8'),
        ('vara2019_PDctcf_peaks', 'Meiotic CTCF'),
        ('margolin2014RNAPII16dpp_peaks', 'Meiotic RNAPII'),
    )
]
compartmentCountDfs.append(compartmentCoverageDf)
compartmentCountDf = pd.concat(compartmentCountDfs)
plotCompartmentFractions(compartmentCountDf, saveName=f"{saveDir}/RNAPcohesinCTCF_ABcompFracs.pdf")
compartmentCountDf

In [None]:
# DMC1-SSDS (DSB) peaks: crossover score of each peak bin, grouped by chromHMM value.
signalCol = 'yin2019_crossoversCast1C_binned_weightedScore_medianNormalized'
peakDf = peakDfDict['smagulova2016_B6xCAST_DSB_fraglen1000_peaks']
plotDf = mergeChromhmmPeaksToSignal(chromhmmLabelPeakDf(peakDf), signalCol=signalCol)
# plotCol, yLabel and legendLabel are passed explicitly (same values as the function
# defaults) so the plotted column always follows signalCol, as in the other boxplot cells.
makeChromhmmBoxPlot(
    plotDf,
    plotCol=signalCol,
    yLabel='Crossover score',
    legendLabel='median across DSB peak bins',
    saveName=f"{saveDir}/DSBtoCO_chromhmmBoxplot.pdf")

In [None]:
# Baker 2015 PRDM9 peaks: crossover score of each peak bin, grouped by chromHMM value.
signalCol = 'yin2019_crossoversCast1C_binned_weightedScore_medianNormalized'
peakDf = peakDfDict['Baker2015_prdm9_B6xCAST_peaks']
plotDf = mergeChromhmmPeaksToSignal(
    chromhmmLabelPeakDf(peakDf),
    signalCol=signalCol)
makeChromhmmBoxPlot(
    plotDf,
    plotCol=signalCol,
    yLabel='Crossover score',
    legendLabel='median across PRDM9 peak bins',
    saveName=f"{saveDir}/PRDM9toCO_chromhmmBoxplot.pdf")

In [None]:
# Baker 2015 PRDM9 peaks: DMC1 ChIP-seq (DSB) signal of each peak bin, grouped by chromHMM value.
signalCol = 'smagulova2016_B6xCAST_DSB_fraglen1000'
peakDf = peakDfDict['Baker2015_prdm9_B6xCAST_peaks']
plotDf = mergeChromhmmPeaksToSignal(
    chromhmmLabelPeakDf(peakDf),
    signalCol=signalCol)
makeChromhmmBoxPlot(
    plotDf,
    plotCol=signalCol,
    yLabel='DMC1 ChIP-seq\n(DSB) score',
    legendLabel='median across PRDM9 peak bins',
    saveName=f"{saveDir}/PRDM9toDSB_chromhmmBoxplot.pdf")