# Program name: ChIP-seq peak collection of gene set from RNA-seq experiment

- Version: 2023-02-20
- Author: Vera Laub
- Stage: complete
- Input: .txt with RNA-seq data (Ensembl ID, Gene ID, Fold change), .txt files with ChIP-seq data (Gene ID, peak chr, peak start, peak stop, summit chr, summit start, summit stop)
- Output: .bed files with ChIP-seq peaks of all genes defined in RNA-seq file (peak chr, peak start, peak stop, Gene ID); peaks for up/downregulated genes are saved in separate files; whole peaks and 200nt summit of peaks saved in separate files for subsequent analysis
- Description: This program collects the genes listed in an RNA-seq file (1. Input) and gathers the ChIP-seq peaks assigned to them (2. Input), e.g. PBX1 ChIP-seq peaks of genes that are differentially regulated in Pbx1 kd RNA-seq in aNS cells (but it can be used on any kind of ChIP/RNA-seq data, if the necessary data is provided). Code meant to be flexible :)!

In [1]:
# Set up list of genes used for collection of peaks

# Retrieve the RNA-seq gene list (one line per gene: Ensembl ID, Gene ID, fold change).
# The with-block closes the file even if reading fails.
with open("pbx1_chip_rna_pbx1kd/rna-seq_aNS_pbx1_kd.txt", "r") as file:
    rna_seq_genes = file.readlines()

#print(rna_seq_genes)

['ENSMUSG00000039997\tIfi203\t3.47\n', 'ENSMUSG00000097767\tMiat\t3.06\n', 'ENSMUSG00000010797\tWnt2\t2.32\n', 'ENSMUSG00000024697\tGna14\t2.16\n', 'ENSMUSG00000032291\tCrabp1\t2.08\n', 'ENSMUSG00000044313\tMab21l3\t2.08\n', 'ENSMUSG00000035783\tActa2\t2.04\n', 'ENSMUSG00000002633\tShh\t2.04\n', 'ENSMUSG00000024526\tCidea\t2.03\n', 'ENSMUSG00000050808\tMuc15\t2.03\n', 'ENSMUSG00000063297\tLuzp2\t1.98\n', 'ENSMUSG00000049892\tRasd1\t1.97\n', 'ENSMUSG00000035864\tSyt1\t1.94\n', 'ENSMUSG00000053846\tLipg\t1.94\n', 'ENSMUSG00000048834\tVstm2a\t1.94\n', 'ENSMUSG00000056158\tCar10\t1.92\n', 'ENSMUSG00000050711\tScg2\t1.92\n', 'ENSMUSG00000090061\tNwd2\t1.87\n', 'ENSMUSG00000001333\tSync\t1.84\n', 'ENSMUSG00000009734\tPou6f2\t1.83\n', 'ENSMUSG00000091956\tC2cd4b\t1.81\n', 'ENSMUSG00000030092\tCntn6\t1.81\n', 'ENSMUSG00000039717\tRalyl\t1.79\n', 'ENSMUSG00000050359\tSprr1a\t1.77\n', 'ENSMUSG00000032085\tTagln\t1.70\n', 'ENSMUSG00000068614\tActc1\t1.67\n', 'ENSMUSG00000055026\tGabrg3\t1.67\n', 

In [2]:
# Replace every raw RNA-seq line by its whitespace-separated fields and
# convert the third field (fold change) from text to a float
for idx, raw_gene in enumerate(rna_seq_genes):
    fields = raw_gene.split()  # split() without arguments also drops surrounding whitespace
    rna_seq_genes[idx] = fields
    fields[2] = float(fields[2])
    

# Initiate 2 separate lists of up-/downregulated genes
genes_id_up = []
genes_id_down = []

# Sort every gene by the sign of its fold change. Iterating over the rows
# directly (instead of advancing a manual counter only inside the branches)
# guarantees that a gene with a fold change of exactly 0 is simply left out
# and does not stall the processing of all genes that follow it.
for gene_row in rna_seq_genes:
    gene_id, fold_change = gene_row[1], gene_row[2]
    if fold_change > 0:
        genes_id_up.append(gene_id)
    elif fold_change < 0:
        genes_id_down.append(gene_id)


print("Upregulated genes used for peak collection:", genes_id_up[:10])
print("Downregulated genes used for peak collection:", genes_id_down[:10])

Upregulated genes used for peak collection: ['Ifi203', 'Miat', 'Wnt2', 'Gna14', 'Crabp1', 'Mab21l3', 'Acta2', 'Shh', 'Cidea', 'Muc15']
Downregulated genes used for peak collection: ['Ccnd2', 'Atad2', 'Trak1', 'Xiap', 'Add3', 'Zfp652', 'Ptar1', 'Klf12', 'Bbx', 'Dcp2']


In [3]:
# Set up list of ChIP-seq peaks to collect from

# Retrieve the ChIP-seq peaks file (one line per peak).
# The with-block closes the file even if reading fails.
with open("pbx1_chip_rna_pbx1kd/chip_seq_pbx1_aNS.txt", "r") as file:
    chip_seq_peaks = file.readlines()

print(chip_seq_peaks[:10])

['Gm10655\tchr9\t61368487\t61372608\tchr9\t61370468\t61370668\n', 'Xpo1\tchr11\t23255159\t23256311\tchr11\t23255740\t23255940\n', 'D930016D06Rik\tchr5\t104570442\t104572056\tchr5\t104571465\t104571665\n', 'Slc39a3\tchr10\t81036891\t81037851\tchr10\t81037299\t81037499\n', 'Sp2\tchr11\t96975685\t96978462\tchr11\t96977668\t96977868\n', 'Actr2\tchr11\t20112098\t20113457\tchr11\t20112899\t20113099\n', 'Dcaf4\tchr12\t83520026\t83520919\tchr12\t83520367\t83520567\n', 'Acap2\tchr16\t31199390\t31201701\tchr16\t31201137\t31201337\n', 'Nek7\tchr1\t138617295\t138618621\tchr1\t138618062\t138618262\n', 'Fbxl19\tchr7\t127768548\t127770250\tchr7\t127768904\t127769104\n']


In [4]:
# Replace every raw ChIP-seq line by the list of its whitespace-separated fields
for idx, raw_peak in enumerate(chip_seq_peaks):
    # split() without arguments also drops leading/trailing whitespace
    chip_seq_peaks[idx] = raw_peak.split()

print(chip_seq_peaks[:10])

[['Gm10655', 'chr9', '61368487', '61372608', 'chr9', '61370468', '61370668'], ['Xpo1', 'chr11', '23255159', '23256311', 'chr11', '23255740', '23255940'], ['D930016D06Rik', 'chr5', '104570442', '104572056', 'chr5', '104571465', '104571665'], ['Slc39a3', 'chr10', '81036891', '81037851', 'chr10', '81037299', '81037499'], ['Sp2', 'chr11', '96975685', '96978462', 'chr11', '96977668', '96977868'], ['Actr2', 'chr11', '20112098', '20113457', 'chr11', '20112899', '20113099'], ['Dcaf4', 'chr12', '83520026', '83520919', 'chr12', '83520367', '83520567'], ['Acap2', 'chr16', '31199390', '31201701', 'chr16', '31201137', '31201337'], ['Nek7', 'chr1', '138617295', '138618621', 'chr1', '138618062', '138618262'], ['Fbxl19', 'chr7', '127768548', '127770250', 'chr7', '127768904', '127769104']]


In [32]:
# Function to retrieve ChIP-seq peak information of defined gene set
def construct_peak_collection(genes, chip_peaks, output_list):
    """Collect the ChIP-seq peaks annotated to the given genes.

    Args:
        genes: Gene IDs to look up. A gene listed more than once contributes
            its peaks once per occurrence.
        chip_peaks: Peak rows; each row holds the gene ID at index 0 followed
            by three location fields at indices 1-3.
        output_list: List that receives the results; it is extended in place.

    Returns:
        The same ``output_list`` object, with one new
        ``[field 1, field 2, field 3, gene ID]`` entry appended per matching
        peak. Entries are ordered by the order of ``genes`` and, within one
        gene, by the order of ``chip_peaks``.
    """
    # Group the peaks by gene ID once, so each gene is a single lookup
    # instead of a scan over the whole peak list.
    peaks_by_gene = {}
    for peak in chip_peaks:
        peaks_by_gene.setdefault(peak[0], []).append(peak)

    for gene in genes:
        for peak in peaks_by_gene.get(gene, ()):
            # Move the gene ID to the last column (BED-like column order)
            output_list.append([peak[1], peak[2], peak[3], peak[0]])
    return output_list

In [36]:
# Reduce every ChIP-seq row to its whole-peak columns: Gene ID, chr, peak start, peak stop
chip_seq_whole_peaks = [
    [peak[0], peak[1], peak[2], peak[3]] for peak in chip_seq_peaks
]
    
#print(chip_seq_whole_peaks[:10])


# Collect the ChIP-seq (whole) peaks of upregulated genes
chip_seq_peaks_Pbx1kd_up = construct_peak_collection(genes_id_up, chip_seq_whole_peaks, [])


def reconstruct_peak_file(chr_peaks_kd_targets):
    """Write peaks as tab-separated lines to the upregulated whole-peak file.

    Writes to the module-level handle ``chip_seq_peaks_Pbx1kd_up_output``,
    which has to be open for writing when this function is called.

    Args:
        chr_peaks_kd_targets: Peak rows; the first four fields of each row
            (all strings) are written as one line.

    Returns:
        The output file handle that was written to.
    """
    chip_seq_peaks_Pbx1kd_up_output.writelines(
        "\t".join((peak[0], peak[1], peak[2], peak[3])) + "\n"
        for peak in chr_peaks_kd_targets
    )
    return chip_seq_peaks_Pbx1kd_up_output


# Save information on (whole) peaks of upregulated genes in output file;
# the with-block closes the file even if writing fails
with open("pbx1_chip_rna_pbx1kd/chip_seq_whole_peaks_Pbx1kd_up.bed", "w") as chip_seq_peaks_Pbx1kd_up_output:
    reconstruct_peak_file(chip_seq_peaks_Pbx1kd_up)



# Collect the ChIP-seq (whole) peaks of downregulated genes
chip_seq_peaks_Pbx1kd_down = construct_peak_collection(genes_id_down, chip_seq_whole_peaks, [])


def reconstruct_peak_file(chr_peaks_kd_targets):
    """Write peaks as tab-separated lines to the downregulated whole-peak file.

    Writes to the module-level handle ``chip_seq_peaks_Pbx1kd_down_output``,
    which has to be open for writing when this function is called.

    Args:
        chr_peaks_kd_targets: Peak rows; the first four fields of each row
            (all strings) are written as one line.

    Returns:
        The output file handle that was written to.
    """
    chip_seq_peaks_Pbx1kd_down_output.writelines(
        "\t".join((peak[0], peak[1], peak[2], peak[3])) + "\n"
        for peak in chr_peaks_kd_targets
    )
    return chip_seq_peaks_Pbx1kd_down_output


# Save information on (whole) peaks of downregulated genes in output file;
# the with-block closes the file even if writing fails
with open("pbx1_chip_rna_pbx1kd/chip_seq_whole_peaks_Pbx1kd_down.bed", "w") as chip_seq_peaks_Pbx1kd_down_output:
    reconstruct_peak_file(chip_seq_peaks_Pbx1kd_down)

In [44]:
# Reduce every ChIP-seq row to its 200nt summit columns: Gene ID, summit chr, summit start, summit stop
chip_seq_200nt_peaks = [
    [peak[0], peak[4], peak[5], peak[6]] for peak in chip_seq_peaks
]

print(chip_seq_200nt_peaks[:10])


# Collect the ChIP-seq (200nt summit) peaks of upregulated genes
chip_seq_peaks_200nt_Pbx1kd_up = construct_peak_collection(genes_id_up, chip_seq_200nt_peaks, [])
print(chip_seq_peaks_200nt_Pbx1kd_up[:10])


def reconstruct_peak_file(chr_peaks_kd_targets):
    """Write peaks as tab-separated lines to the upregulated 200nt-summit file.

    Writes to the module-level handle
    ``chip_seq_peaks_200nt_Pbx1kd_up_output``, which has to be open for
    writing when this function is called.

    Args:
        chr_peaks_kd_targets: Peak rows; the first four fields of each row
            (all strings) are written as one line.

    Returns:
        The output file handle that was written to.
    """
    chip_seq_peaks_200nt_Pbx1kd_up_output.writelines(
        "\t".join((peak[0], peak[1], peak[2], peak[3])) + "\n"
        for peak in chr_peaks_kd_targets
    )
    return chip_seq_peaks_200nt_Pbx1kd_up_output


# Save information on 200nt summit peaks of upregulated genes in output file;
# the with-block closes the file even if writing fails
with open("pbx1_chip_rna_pbx1kd/chip_seq_200nt_peaks_Pbx1kd_up.bed", "w") as chip_seq_peaks_200nt_Pbx1kd_up_output:
    reconstruct_peak_file(chip_seq_peaks_200nt_Pbx1kd_up)


# Collect the ChIP-seq (200nt summit) peaks of downregulated genes
chip_seq_peaks_200nt_Pbx1kd_down = construct_peak_collection(genes_id_down, chip_seq_200nt_peaks, [])


def reconstruct_peak_file(chr_peaks_kd_targets):
    """Write peaks as tab-separated lines to the downregulated 200nt-summit file.

    Writes to the module-level handle
    ``chip_seq_peaks_200nt_Pbx1kd_down_output``, which has to be open for
    writing when this function is called.

    Args:
        chr_peaks_kd_targets: Peak rows; the first four fields of each row
            (all strings) are written as one line.

    Returns:
        The output file handle that was written to.
    """
    chip_seq_peaks_200nt_Pbx1kd_down_output.writelines(
        "\t".join((peak[0], peak[1], peak[2], peak[3])) + "\n"
        for peak in chr_peaks_kd_targets
    )
    return chip_seq_peaks_200nt_Pbx1kd_down_output


# Save information on 200nt summit peaks of downregulated genes in output file;
# the with-block closes the file even if writing fails
with open("pbx1_chip_rna_pbx1kd/chip_seq_200nt_peaks_Pbx1kd_down.bed", "w") as chip_seq_peaks_200nt_Pbx1kd_down_output:
    reconstruct_peak_file(chip_seq_peaks_200nt_Pbx1kd_down)

[['Gm10655', 'chr9', '61370468', '61370668'], ['Xpo1', 'chr11', '23255740', '23255940'], ['D930016D06Rik', 'chr5', '104571465', '104571665'], ['Slc39a3', 'chr10', '81037299', '81037499'], ['Sp2', 'chr11', '96977668', '96977868'], ['Actr2', 'chr11', '20112899', '20113099'], ['Dcaf4', 'chr12', '83520367', '83520567'], ['Acap2', 'chr16', '31201137', '31201337'], ['Nek7', 'chr1', '138618062', '138618262'], ['Fbxl19', 'chr7', '127768904', '127769104']]
[['chr1', '173942666', '173942866', 'Ifi203'], ['chr6', '17968913', '17969113', 'Wnt2'], ['chr6', '18029846', '18030046', 'Wnt2'], ['chr6', '18031674', '18031874', 'Wnt2'], ['chr19', '16593447', '16593647', 'Gna14'], ['chr19', '16484858', '16485058', 'Gna14'], ['chr19', '16540895', '16541095', 'Gna14'], ['chr19', '16598958', '16599158', 'Gna14'], ['chr19', '16442097', '16442297', 'Gna14'], ['chr19', '16529349', '16529549', 'Gna14']]
