In [1]:
import os 
import torch
import warnings
import numpy as np
import pandas as pd
import torch.nn as nn
from TrainAndTest import *
from Preprocessing import *
from MotifExtraction import *
from sklearn import metrics
import matplotlib.pyplot as plt
import torch.nn.functional as F
from prettytable import PrettyTable
from torch.utils.data import Dataset, DataLoader


warnings.filterwarnings("ignore")

In [2]:
model_dir = 'model'
results_dir = 'results'
data_dir = 'Data/'

In [3]:
boundaryLogPath = data_dir + 'Boundaries_Log.bed'

boundaryLogData = pd.read_csv(boundaryLogPath, delimiter='\t')
boundaryLogData.columns=['Chr', 'PeakStart', 'PeakEnd', 'Tag', 'I']
boundaryLogData

Unnamed: 0,Chr,PeakStart,PeakEnd,Tag,I
0,chrI,20801,21000,104|sacCer3|chrI:20801-21000,0.637253
1,chrI,45401,45600,227|sacCer3|chrI:45401-45600,0.773369
2,chrI,71601,71800,358|sacCer3|chrI:71601-71800,0.707931
3,chrI,93001,93200,465|sacCer3|chrI:93001-93200,0.540502
4,chrI,100801,101000,504|sacCer3|chrI:100801-101000,0.479720
...,...,...,...,...,...
752,chrXVI,860601,860800,59927|sacCer3|chrXVI:860601-860800,0.460258
753,chrXVI,901201,901400,60130|sacCer3|chrXVI:901201-901400,0.955603
754,chrXVI,911601,911800,60182|sacCer3|chrXVI:911601-911800,0.421940
755,chrXVI,920201,920400,60225|sacCer3|chrXVI:920201-920400,0.529056


In [4]:
index = 0
for chrom in boundaryLogData['Chr']:
    num = chrom[chrom.index('r') + 1:]
    curChrom = str('[chromosome=') + str(num) + ']'
    boundaryLogData.iat[index,0] = curChrom
    index += 1

boundaryLogData



Unnamed: 0,Chr,PeakStart,PeakEnd,Tag,I
0,[chromosome=I],20801,21000,104|sacCer3|chrI:20801-21000,0.637253
1,[chromosome=I],45401,45600,227|sacCer3|chrI:45401-45600,0.773369
2,[chromosome=I],71601,71800,358|sacCer3|chrI:71601-71800,0.707931
3,[chromosome=I],93001,93200,465|sacCer3|chrI:93001-93200,0.540502
4,[chromosome=I],100801,101000,504|sacCer3|chrI:100801-101000,0.479720
...,...,...,...,...,...
752,[chromosome=XVI],860601,860800,59927|sacCer3|chrXVI:860601-860800,0.460258
753,[chromosome=XVI],901201,901400,60130|sacCer3|chrXVI:901201-901400,0.955603
754,[chromosome=XVI],911601,911800,60182|sacCer3|chrXVI:911601-911800,0.421940
755,[chromosome=XVI],920201,920400,60225|sacCer3|chrXVI:920201-920400,0.529056


In [5]:
chromSeqsPath = data_dir + 'S288C_reference_sequence_R64-3-1_20210421.fsa'
chromSeqsData = genome_data(chromSeqsPath)

index = 0
for chromSeq in chromSeqsData:
    chromSeqsData[index] = chromSeqsData[index].replace('\n', '')
    index += 1

chromSeqsData[0]

'ref|NC_001133| [org=Saccharomyces cerevisiae] [strain=S288C] [moltype=genomic] [chromosome=I]CCACACCACACCCACACACCCACACACCACACCACACACCACACCACACCCACACACACACATCCTAACACTACCCTAACACAGCCCTAATCTAACCCTGGCCAACCTGTCTCTCAACTTACCCTCCATTACCCTGCCTCCACTCGTTACCCTGTCCCATTCAACCATACCACTCCGAACCACCATCCATCCCTCTACTTACTACCACTCACCCACCGTTACCCTCCAATTACCCATATCCAACCCACTGCCACTTACCCTACCATTACCCTACCATCCACCATGACCTACTCACCATACTGTTCTTCTACCCACCATATTGAAACGCTAACAAATGATCGTAAATAACACACACGTGCTTACCCTACCACTTTATACCACCACCACATGCCATACTCACCCTCACTTGTATACTGATTTTACGTACGCACACGGATGCTACAGTATATACCATCTCAAACTTACCCTACTCTCAGATTCCACTTCACTCCATGGCCCATCTCTCACTGAATCAGTACCAAATGCACTCACATCATTATGCACGGCACTTGCCTCAGCGGTCTATACCCTGTGCCATTTACCCATAACGCCCATCATTATCCACATTTTGATATCTATATCTCATTCGGCGGTCCCAAATATTGTATAACTGCCCTTAATACATACGTTATACCACTTTTGCACCATATACTTACCACTCCATTTATATACACTTATGTCAATATTACAGAAAAATCCCCACAAAAATCACCTAAACATAAAAATATTCTACTTTTCAACAATAATACATAAACATATTGGCTTGTGGTAGCAACACTATCATGGTATCACTAACGTAAAAGTTCCTCAATATTGCAATTTGCTTGAACGGATGCTATTTCAGAATATTTCGTACTTACACAGGCC

In [6]:
chromList = []

index = 0
for chromSeq in chromSeqsData:
    if 'chromosome' in chromSeq:
        chromStripStart = chromSeq.index('chromosome=') - 1
        chromStripEndSub = chromSeq.index('chromosome=') + 50
        subString = chromSeq[chromStripStart:chromStripEndSub]
        chromStripEnd = subString.index(']')
        chromosome = chromSeq[chromStripStart:chromStripStart + chromStripEnd + 1]
        chromosomeSequence = chromSeq[chromStripStart+chromStripEnd+1:]
        index += 1
        chromList.append([chromosome, chromosomeSequence])

chromSeqsDataFrame = pd.DataFrame(chromList, columns=['Chromosome', 'Sequence'])
chromSeqsDataFrame

Unnamed: 0,Chromosome,Sequence
0,[chromosome=I],CCACACCACACCCACACACCCACACACCACACCACACACCACACCA...
1,[chromosome=II],AAATAGCCCTCATGTACGTCTCCTCCAAGCCCTGTTGTCTCTTACC...
2,[chromosome=III],CCCACACACCACACCCACACCACACCCACACACCACACACACCACA...
3,[chromosome=IV],ACACCACACCCACACCACACCCACACACACCACACCCACACACCAC...
4,[chromosome=V],CGTCTCCTCCAAGCCCTGTTGTCTCTTACCCGGATGTTCAACCAAA...
5,[chromosome=VI],GATCTCGCAAGTGCATTCCTAGACTTAATTCATATCTGCTCCTCAA...
6,[chromosome=VII],CCACACCCACACACACCACACCCACACCCACACACTACCCTAACAC...
7,[chromosome=VIII],CCCACACACACCACACCCACACACCACACCCACACTTTTCACATCT...
8,[chromosome=IX],CACACACACCACACCCACACCACACCACACCACACCCACACCCACA...
9,[chromosome=X],CCCACACACACACCACACCCACACCCACACACACCACACCCACACA...


In [7]:
#df.index[df['column_name']==value].tolist()

peakSeq = chromSeqsDataFrame['Chromosome'].at[0]
peakSeq

chromSeqsDataFrame.index[chromSeqsDataFrame['Chromosome']==peakSeq][0]

0

In [8]:
boundaryBindingRegionsList = []

index = 0
for peak in range(len(boundaryLogData)):
    chromosome = boundaryLogData.iat[index,0]
    newChrom = boundaryLogData.iat[index,0] + str('.') + str(index)
    peakStart = int(boundaryLogData.iat[index,1])
    peakEnd = int(boundaryLogData.iat[index,2])
    dfIndex = chromSeqsDataFrame.index[chromSeqsDataFrame['Chromosome']==chromosome][0]
    peakSeq = chromSeqsDataFrame.iloc[dfIndex,1][peakStart:peakEnd + 1]
    index += 1
    boundaryBindingRegionsList.append([newChrom,peakSeq])

boundaryBindingRegionsDF = pd.DataFrame(boundaryBindingRegionsList, columns=['Chromosome', 'PeakSeq'])
boundaryBindingRegionsDF
    

Unnamed: 0,Chromosome,PeakSeq
0,[chromosome=I].0,CGAAGAAAAGTTTAATTAACTTTCAAATGCCAGAACTAAAGATTGA...
1,[chromosome=I].1,CACGTGAACATTTTTAGGGGATGGAGAGTGCTACGCCGTTCGTCCG...
2,[chromosome=I].2,GTAACTTTTCATTTCCAAGAACCTCTTTTTTCCAGTTATATCATGG...
3,[chromosome=I].3,AAGCAAATGATTGACGCTGGAACACCAGATGTTGGCCACAAATCTA...
4,[chromosome=I].4,AATTTTGTAAATCTCTGTTTACACTTATGACGGAAAAACTTGCTCC...
...,...,...
752,[chromosome=XVI].752,TCCACTAGTTATCCGACTAGCAAGGCAACTAGTCAAGATGAAACCG...
753,[chromosome=XVI].753,CCTTTGACCATCGCCTCCGACCCCATGAGCGGGTTCTTCGGTCTAC...
754,[chromosome=XVI].754,GGCAAGAATTTTAGGTACAAGAGCCCTACAAATTTCCATGAATGCG...
755,[chromosome=XVI].755,TAAACTTGATAAATTCACTGACGGCGGCCTATTTACTCTGTTTGTA...


In [9]:
boundaryBindingRegionsDF.to_csv('Data/Boundaries_Log_PeakSeqs.csv')

In [10]:
boundaryLogPath = data_dir + 'Boundaries_Log.bed'

boundaryLogData = pd.read_csv(boundaryLogPath, delimiter='\t')
boundaryLogData.columns=['Chr', 'PeakStart', 'PeakEnd', 'Tag', 'I']
boundaryLogData

Unnamed: 0,Chr,PeakStart,PeakEnd,Tag,I
0,chrI,20801,21000,104|sacCer3|chrI:20801-21000,0.637253
1,chrI,45401,45600,227|sacCer3|chrI:45401-45600,0.773369
2,chrI,71601,71800,358|sacCer3|chrI:71601-71800,0.707931
3,chrI,93001,93200,465|sacCer3|chrI:93001-93200,0.540502
4,chrI,100801,101000,504|sacCer3|chrI:100801-101000,0.479720
...,...,...,...,...,...
752,chrXVI,860601,860800,59927|sacCer3|chrXVI:860601-860800,0.460258
753,chrXVI,901201,901400,60130|sacCer3|chrXVI:901201-901400,0.955603
754,chrXVI,911601,911800,60182|sacCer3|chrXVI:911601-911800,0.421940
755,chrXVI,920201,920400,60225|sacCer3|chrXVI:920201-920400,0.529056


In [44]:
boundaryQPath = data_dir + 'Boundaries_Q.bed'

boundaryQData = pd.read_csv(boundaryQPath, delimiter='\t')
boundaryQData.columns=['Chr', 'PeakStart', 'PeakEnd', 'Tag', 'I']
boundaryQData

Unnamed: 0,Chr,PeakStart,PeakEnd,Tag,I
0,chrI,26601,26800,133|sacCer3|chrI:26601-26800,0.565811
1,chrI,32601,32800,163|sacCer3|chrI:32601-32800,0.600932
2,chrI,45401,45600,227|sacCer3|chrI:45401-45600,1.454368
3,chrI,70201,70400,351|sacCer3|chrI:70201-70400,1.276343
4,chrI,101201,101400,506|sacCer3|chrI:101201-101400,0.400332
...,...,...,...,...,...
850,chrXVI,860801,861000,59928|sacCer3|chrXVI:860801-861000,0.838822
851,chrXVI,875401,875600,60001|sacCer3|chrXVI:875401-875600,0.521443
852,chrXVI,901401,901600,60131|sacCer3|chrXVI:901401-901600,1.105070
853,chrXVI,920601,920800,60227|sacCer3|chrXVI:920601-920800,1.004107


In [45]:
peakLogFilePath = data_dir + 'Condensin_peaks_Log.bed'
peakLogFileData = pd.read_csv(peakLogFilePath, delimiter='\t')
peakLogFileData.columns=['Chr', 'Peak', 'PeakEnd', 'Tag', 'I']
peakLogFileData = peakLogFileData.drop(['PeakEnd'], axis=1)
peakLogFileData

Unnamed: 0,Chr,Peak,Tag,I
0,I,1233,Brn1L__peak_2,5.70204
1,I,70693,Brn1L__peak_3,2.98313
2,I,72763,Brn1L__peak_4,12.02753
3,I,73568,Brn1L__peak_5,2.16854
4,I,139187,Brn1L__peak_6,9.92922
...,...,...,...,...
1000,pombeIII,2442132,Brn1L__peak_1002,4.70638
1001,pombeIII,2443332,Brn1L__peak_1003,6.17034
1002,pombeIII,2446043,Brn1L__peak_1004,11.86138
1003,pombeIII,2447349,Brn1L__peak_1005,7.15511


In [46]:
peakQFilePath = data_dir + 'Condensin_peaks_quiescence.bed'
peakQFileData = pd.read_csv(peakQFilePath, delimiter='\t')
peakQFileData.columns=['Chr', 'Peak', 'PeakEnd', 'Tag', 'I']
peakQFileData = peakQFileData.drop(['PeakEnd'], axis=1)
peakQFileData

Unnamed: 0,Chr,Peak,Tag,I
0,I,2557,Brn1Q_all__peak_2,3.93538
1,I,20936,Brn1Q_all__peak_3,16.59925
2,I,29936,Brn1Q_all__peak_4,6.21531
3,I,31272,Brn1Q_all__peak_5,7.41001
4,I,33022,Brn1Q_all__peak_6,4.44991
...,...,...,...,...
1682,pombeIII,2442304,Brn1Q_all__peak_1684,8.14362
1683,pombeIII,2443323,Brn1Q_all__peak_1685,13.46661
1684,pombeIII,2446041,Brn1Q_all__peak_1686,20.33775
1685,pombeIII,2447820,Brn1Q_all__peak_1687,8.36264


In [47]:
workingPeaks = peakQFileData

workingPeaks.iloc[0,1]

2557

In [86]:
#Detect peaks within boundaries.

def Peaks_Within_Boundary(peakFile, boundaryFile, width=0):
    
    chroms = ['I', 'II', 'III', 'IV', 'V', 'VI', 'VII', 'VIII', 'IX', 'X', 'XI', 'XII', 'XIII', 'XIV', 'XV', 'XVI']
    bChroms = ['chrI','chrII','chrIII','chrIV', 'chrV', 'chrVI', 'chrVII', 'chrVIII', 'chrIX','chrX', 'chrXI', 'chrXII', 'chrXIII', 'chrXIV', 'chrXV', 'chrXVI']

    curPeaksWithin = 0
    peaksPerChrom = []
    pIndex = 0
    bIndex = 0

    for chrom in chroms:
        workingPeaks = peakFile[peakFile['Chr'] == chrom]
        peakLen = len(workingPeaks)
    for chrom in bChroms:
        workingBound = boundaryFile[boundaryFile['Chr'] == chrom]
        #print(workingBound)
        for peak in workingPeaks['Peak']:
            for boundary in workingBound['PeakStart']:
                if peak >= boundary - width and peak <= (boundary + 199 + width):
                    #print(peak)
                    #print(boundary)
                    curPeaksWithin += 1
        peaksPerChrom.append([str(chrom), curPeaksWithin, round(curPeaksWithin/peakLen, 3)])
        curPeaksWithin = 0
            
    
    peaksPerChrom = pd.DataFrame(peaksPerChrom, columns=['Chr', 'PeaksWithin', 'Ratio'])
    return peaksPerChrom

LogPeaksWithin = Peaks_Within_Boundary(peakQFileData, boundaryQData, width=0)
LogPeaksWithin



Unnamed: 0,Chr,PeaksWithin,Ratio
0,chrI,0,0.0
1,chrII,2,0.016
2,chrIII,1,0.008
3,chrIV,0,0.0
4,chrV,0,0.0
5,chrVI,1,0.008
6,chrVII,1,0.008
7,chrVIII,1,0.008
8,chrIX,0,0.0
9,chrX,0,0.0


In [87]:
QPeaksWithin = Peaks_Within_Boundary(peakLogFileData, boundaryLogData, width=0)
QPeaksWithin

Unnamed: 0,Chr,PeaksWithin,Ratio
0,chrI,0,0.0
1,chrII,2,0.03
2,chrIII,0,0.0
3,chrIV,1,0.015
4,chrV,0,0.0
5,chrVI,0,0.0
6,chrVII,3,0.045
7,chrVIII,1,0.015
8,chrIX,0,0.0
9,chrX,0,0.0
