## Program name: pbx1_chip-seq_atac-seq_rna-seq_combination

- Version: 2023-02-20
- Author: Vera Laub
- Stage: Currently only copy of PBX1 ChIP-seq + ATAC-seq, RNA-seq not included yet!
- Input: 
1. .txt with PBX1 ChIP-seq motifs (MOTIF_ID, ALT_ID, CONSENSUS, WIDTH, SITES, E-VALUE)
2. .txt with ATAC-seq footprinting motifs (ID, Alt ID, p-value, E-value, TP Thresh, TP #, TP (%), FP #, FP (%))
3. .txt with RNA-seq aNS WT -> keep working on this!
- Output: .txt with combination of information in all 3 files (MOTIF_ID, PBX1 MEME-ChIP ALT_ID, PBX1 MEME-ChIP CONSENSUS, PBX1 MEME-ChIP WIDTH, PBX1 MEME-ChIP SITES, PBX1 MEME-ChIP E-VALUE, ATAC Footprint p-value, ATAC Footprint E-value, ATAC Footprint TP Thresh, ATAC Footprint TP (%), ATAC Footprint FP (%), RNA-seq ENSEMBLE IDs, RNA-seq WT Mean reads)

In [18]:
# Set up list of PBX1 ChIP-seq motifs

# Retrieve PBX1 ChIP-seq motifs
file = open("pbx1_chip_atac_footprinting/PBX1_ChIP-seq_MA_IDs_only.txt", "r")
chip_seq_motifs = file.readlines()
file.close()

print(chip_seq_motifs[:10])

['MA0782.2\tPKNOX1\tNNTGAGTGACAGVNN\t15\t74780\t5.4e-9768\n', 'MA1114.1\tPBX3\tNNNTGAGTGACAGGBNN\t17\t73416\t6.1e-9236\n', 'MA0774.1\tMEIS2\tTTGACAGS\t8\t81692\t2.8e-8927\n', 'MA0498.2\tMEIS1\tHTGACAD\t7\t92434\t1.5e-7946\n', 'MA0775.1\tMEIS3\tDTGACAGS\t8\t87467\t7.3e-7020\n', 'MA1572.1\tTGIF2LY\tTGACAGCTGTCA\t12\t42050\t6.5e-2486\n', 'MA1571.1\tTGIF2LX\tTGACAGCTGTCA\t12\t34834\t3.6e-2311\n', 'MA0797.1\tTGIF2\tTGACAGSTGTCA\t12\t17903\t8.8e-1701\n', 'MA1644.1\tNFYC\tRDCCAATCASN\t11\t58768\t1.1e-1035\n', 'MA0060.3\tNFYA\tRRCCAATCAGM\t11\t43266\t2.5e-1024\n']


In [19]:
# Iterate over PBX1 ChIP-seq motifs and store information in list, process data to remove whitepsace    
for n in range(0, len(chip_seq_motifs)):
    chip_seq_motifs[n] = chip_seq_motifs[n].strip()
    chip_seq_motifs[n] = chip_seq_motifs[n].split()
    chip_seq_motifs[n][3] = int(chip_seq_motifs[n][3])
    chip_seq_motifs[n][4] = int(chip_seq_motifs[n][4])
    chip_seq_motifs[n][5] = float(chip_seq_motifs[n][5])

print(chip_seq_motifs[:10])

[['MA0782.2', 'PKNOX1', 'NNTGAGTGACAGVNN', 15, 74780, 0.0], ['MA1114.1', 'PBX3', 'NNNTGAGTGACAGGBNN', 17, 73416, 0.0], ['MA0774.1', 'MEIS2', 'TTGACAGS', 8, 81692, 0.0], ['MA0498.2', 'MEIS1', 'HTGACAD', 7, 92434, 0.0], ['MA0775.1', 'MEIS3', 'DTGACAGS', 8, 87467, 0.0], ['MA1572.1', 'TGIF2LY', 'TGACAGCTGTCA', 12, 42050, 0.0], ['MA1571.1', 'TGIF2LX', 'TGACAGCTGTCA', 12, 34834, 0.0], ['MA0797.1', 'TGIF2', 'TGACAGSTGTCA', 12, 17903, 0.0], ['MA1644.1', 'NFYC', 'RDCCAATCASN', 11, 58768, 0.0], ['MA0060.3', 'NFYA', 'RRCCAATCAGM', 11, 43266, 0.0]]


In [49]:
# Set up list of ATAC-seq footprinting motifs

# Retrieve ATAC-seq footprinting motifs
file = open("pbx1_chip_atac_footprinting/ATAC-seq_aNS_footprinting_MA_IDs_only.txt", "r")
atac_seq_motifs = file.readlines()
file.close()

print(atac_seq_motifs[:10])

['MA0084.1\tSRY\t5.52e-2497\t4.19e-2494\t1.01\t46775 (56.7%)\t484 (4.8%)\n', 'MA0703.1\tLMX1B\t5.25e-2469\t3.98e-2466\t1.01\t42889 (52.0%)\t240 (2.4%)\n', 'MA0893.1\tGSX2\t1.88e-2394\t1.43e-2391\t1.01\t42498 (51.5%)\t265 (2.6%)\n', 'MA1152.1\tSOX15\t1.34e-2272\t1.02e-2269\t1.01\t51271 (62.2%)\t1066 (10.6%)\n', 'MA0892.1\tGSX1\t1.05e-2240\t7.94e-2238\t1.01\t40399 (49.0%)\t241 (2.4%)\n', 'MA0722.1\tVAX1\t1.67e-2235\t1.26e-2232\t1.01\t40643 (49.3%)\t259 (2.6%)\n', 'MA0879.1\tDlx1\t2.30e-2224\t1.74e-2221\t1.01\t40467 (49.1%)\t256 (2.5%)\n', 'MA0675.1\tNKX6-2\t6.51e-2202\t4.94e-2199\t1.01\t40412 (49.0%)\t268 (2.7%)\n', 'MA0109.1\tHLTF\t1.24e-2162\t9.41e-2160\t1\t42230 (51.2%)\t414 (4.1%)\n', 'MA0902.1\tHOXB2\t2.00e-2109\t1.52e-2106\t1.01\t39078 (47.4%)\t252 (2.5%)\n']


In [50]:
# Iterate over ATAC-seq footprinting motifs and store information in list, process data to remove whitepsace    
for n in range(0, len(atac_seq_motifs)):
    atac_seq_motifs[n] = atac_seq_motifs[n].strip()
    atac_seq_motifs[n] = atac_seq_motifs[n].split()
    #atac_seq_motifs[n][2] = float(atac_seq_motifs[n][2])
    #atac_seq_motifs[n][3] = float(atac_seq_motifs[n][3])
    #atac_seq_motifs[n][4] = float(atac_seq_motifs[n][4])
   # atac_seq_motifs[n][5] = float(atac_seq_motifs[n][5])

print(atac_seq_motifs[:10])

[['MA0084.1', 'SRY', '5.52e-2497', '4.19e-2494', '1.01', '46775', '(56.7%)', '484', '(4.8%)'], ['MA0703.1', 'LMX1B', '5.25e-2469', '3.98e-2466', '1.01', '42889', '(52.0%)', '240', '(2.4%)'], ['MA0893.1', 'GSX2', '1.88e-2394', '1.43e-2391', '1.01', '42498', '(51.5%)', '265', '(2.6%)'], ['MA1152.1', 'SOX15', '1.34e-2272', '1.02e-2269', '1.01', '51271', '(62.2%)', '1066', '(10.6%)'], ['MA0892.1', 'GSX1', '1.05e-2240', '7.94e-2238', '1.01', '40399', '(49.0%)', '241', '(2.4%)'], ['MA0722.1', 'VAX1', '1.67e-2235', '1.26e-2232', '1.01', '40643', '(49.3%)', '259', '(2.6%)'], ['MA0879.1', 'Dlx1', '2.30e-2224', '1.74e-2221', '1.01', '40467', '(49.1%)', '256', '(2.5%)'], ['MA0675.1', 'NKX6-2', '6.51e-2202', '4.94e-2199', '1.01', '40412', '(49.0%)', '268', '(2.7%)'], ['MA0109.1', 'HLTF', '1.24e-2162', '9.41e-2160', '1', '42230', '(51.2%)', '414', '(4.1%)'], ['MA0902.1', 'HOXB2', '2.00e-2109', '1.52e-2106', '1.01', '39078', '(47.4%)', '252', '(2.5%)']]


In [52]:
# Function to combine motif information of PBX1 ChIP-seq and ATAC-seq footprinting motifs 

def motif_combination(chip_motifs, atac_motifs, output_list):
    k = 0
    for line in range(len(chip_motifs)):
        i = 0
        for line in range(len(atac_motifs)):
            if chip_motifs[k][0] == atac_motifs[i][0]:
                output_list.append([chip_motifs[k][0], \
                                    chip_motifs[k][1], \
                                    chip_motifs[k][2], \
                                    chip_motifs[k][3], \
                                    chip_motifs[k][4], \
                                    chip_motifs[k][5], \
                                    atac_motifs[i][2], \
                                    atac_motifs[i][3], \
                                    atac_motifs[i][4], \
                                    atac_motifs[i][5], \
                                    atac_motifs[i][6], \
                                    atac_motifs[i][7], \
                                    atac_motifs[i][8]])
            i += 1
        k += 1
    return output_list
                                    

# Define output list for collected motifs and store information in this list (of lists)
combined_motif_list = [["MOTIF_ID", "PBX1 MEME-ChIP ALT_ID", "PBX1 MEME-ChIP CONSENSUS", "PBX1 MEME-ChIP WIDTH",\
                       "PBX1 MEME-ChIP SITES", "PBX1 MEME-ChIP E-VALUE", "ATAC Footprint p-value", \
                       "ATAC Footprint E-value", "ATAC Footprint TP Threshold", "ATAC Footprint TP #", \
                       "ATAC Footprint TP (%)", "ATAC Footprint FP #", "ATAC Footprint FP (%)"]]
combined_motif_list = motif_combination(chip_seq_motifs, atac_seq_motifs, combined_motif_list)

print(combined_motif_list[:10])

[['MOTIF_ID', 'PBX1 MEME-ChIP ALT_ID', 'PBX1 MEME-ChIP CONSENSUS', 'PBX1 MEME-ChIP WIDTH', 'PBX1 MEME-ChIP SITES', 'PBX1 MEME-ChIP E-VALUE', 'ATAC Footprint p-value', 'ATAC Footprint E-value', 'ATAC Footprint TP Threshold', 'ATAC Footprint TP #', 'ATAC Footprint TP (%)', 'ATAC Footprint FP #', 'ATAC Footprint FP (%)'], ['MA1114.1', 'PBX3', 'NNNTGAGTGACAGGBNN', 17, 73416.0, 0.0, '5.29E-67', '4.01E-64', '1.03', '25990', '(31.5%)', '2316', '(23.0%)'], ['MA0774.1', 'MEIS2', 'TTGACAGS', 8, 81692.0, 0.0, '0.00E+00', '0.00E+00', '1.01', '31860', '(38.6%)', '2006', '(19.9%)'], ['MA0498.2', 'MEIS1', 'HTGACAD', 7, 92434.0, 0.0, '1.74e-1139', '1.32e-1136', '1', '42576', '(51.6%)', '1528', '(15.2%)'], ['MA0775.1', 'MEIS3', 'DTGACAGS', 8, 87467.0, 0.0, '9.45e-543', '7.18e-540', '1.01', '34023', '(41.2%)', '1693', '(16.8%)'], ['MA0797.1', 'TGIF2', 'TGACAGSTGTCA', 12, 17903.0, 0.0, '3.10E-121', '2.35E-118', '1.04', '9155', '(11.1%)', '422', '(4.2%)'], ['MA0070.1', 'PBX1', 'HCATCAATCAAW', 12, 25644.0,

In [53]:
# Save combined information on PBX1 ChIP-seq and ATAC-seq footprint motifs in output file
pbx1_chip_seq_atac_seq_footprint_output = open("pbx1_chip_atac_footprinting/pbx1_chip_seq_atac_seq_footprint_aNS.txt", "w")

def construct_combination_motif_file(combined_motifs):
    k = 0
    for line in range(len(combined_motifs)):
        pbx1_chip_seq_atac_seq_footprint_output.write(str(combined_motifs[k][0]) + "\t" + \
                                                      str(combined_motifs[k][1]) + "\t" + \
                                                      str(combined_motifs[k][2]) + "\t" + \
                                                      str(combined_motifs[k][3]) + "\t" + \
                                                      str(combined_motifs[k][4]) + "\t" + \
                                                      str(combined_motifs[k][5]) + "\t" + \
                                                      str(combined_motifs[k][6]) + "\t" + \
                                                      str(combined_motifs[k][7]) + "\t" + \
                                                      str(combined_motifs[k][8]) + "\t" + \
                                                      str(combined_motifs[k][9]) + "\t" + \
                                                      str(combined_motifs[k][10]) + "\t" + \
                                                      str(combined_motifs[k][11]) + "\t" + \
                                                      str(combined_motifs[k][12]) + "\n")
        k += 1
    return pbx1_chip_seq_atac_seq_footprint_output

construct_combination_motif_file(combined_motif_list)
pbx1_chip_seq_atac_seq_footprint_output.close()