# Probe Designer


## Environment


In [4]:
# basic env
import os
import pandas as pd
import time
import json
from tqdm import tqdm

# data process of file from ncbi
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import MeltingTemp as mt

# # get gene data from ncbi
# from Bio import Entrez

# # blast and xml file process
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

# add package to sys var
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("../lib")

# --- locations ---------------------------------------------------------------
# Root folder of this data set; created on first use.
workdir = './dataset/2024.1.23_Sindy_marker_genes/'
os.makedirs(workdir, exist_ok=True)

# Each run writes into its own results folder, named after the local time at
# which this cell was executed (YYYYmmdd_HHMMSS).
current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)
output = os.path.join(workdir, 'results', formatted_time)
os.makedirs(output, exist_ok=True)

# Folder for the pre-binding FASTA files. Only the path is built here; the
# directory itself is not created by this cell.
pre_binding_dir = os.path.join(output, "pre_binding")

# --- file names --------------------------------------------------------------
gene_name_list_tosearch = "gene_name_list_tosearch.txt"
pre_binding_file_suffix = "_pre_binding.fasta"
total_pre_binding_file_name = "_total.fasta"

# intermediate files written under `output`
pre_binding_num_file = "pre_binding_num.json"
blast_results_file = "blast_results.xml"

In [5]:
# Organism label stored with every binding-site row and FASTA description in
# later cells.
organism = 'mouse'

# Marker-gene table: sheet 'cell_type_list' of the workbook inside `workdir`.
gene_info = pd.read_excel(
    os.path.join(workdir, "cell_type_marker_list.xlsx"),
    sheet_name='cell_type_list',
)

# Unique gene symbols in order of first appearance. Blank spreadsheet cells
# are read as NaN; they are dropped so a float NaN is never passed to the
# sequence lookup or to the `.upper()` call applied to every entry later on.
gene_list = list(gene_info['gene'].dropna().unique())

## Get sequences from the Ensembl database

In [None]:
# from lib.database_interaction import ensembl_id_to_seqs
# import time

# skip = 0
# trial = 0

# tmp_isoform_list = isoform_list[skip:]
# tmp_id_list = id_list[len(sequences_of_all)+skip:]

# for i in range(len(tmp_isoform_list)):
#     isoform = tmp_isoform_list[i]
#     id = tmp_id_list[i]
#     sequences_of_all[f'{id}_{isoform}'] = dict()
#     sequences = ensembl_id_to_seqs(gene=isoform, gene_id=id.split('.')[0], seq_type='cds')
#     for desc, sequence in sequences.items():
#         sequences_of_all[f'{id}_{isoform}'][desc] = sequence

In [12]:
from lib.database_interaction import ensembl_name_to_seqs
import time

# Fetch the CDS sequences of every isoform of every gene in `gene_list`.
#
# Each gene gets its own retry budget: up to MAX_ATTEMPTS lookups with
# RETRY_DELAY_S seconds between them. A gene that still fails keeps an empty
# entry in `sequences_of_all` (so downstream cells simply find no isoform for
# it) and is listed in `failed_genes` and reported instead of being dropped
# silently.
MAX_ATTEMPTS = 4
RETRY_DELAY_S = 5

sequences_of_all = dict()
failed_genes = []

for gene in gene_list:
    sequences_of_all[gene] = dict()
    for attempt in range(1, MAX_ATTEMPTS + 1):
        try:
            # NOTE: species is the literal 'mouse', the same value held by the
            # notebook-level `organism` variable; keep the two in sync.
            sequences = ensembl_name_to_seqs(gene=gene, species='mouse', seq_type='cds')
            # Copying inside the try block means a malformed return value is
            # retried as well, not only a failed request.
            sequences_of_all[gene] = dict(sequences.items())
        except Exception as err:
            # `Exception` rather than a bare except, so Ctrl-C / kernel
            # interrupt still stops the cell.
            print(f"Gene {gene}: attempt {attempt}/{MAX_ATTEMPTS} failed ({err!r})")
            if attempt < MAX_ATTEMPTS:
                time.sleep(RETRY_DELAY_S)
        else:
            break
    else:
        # Every attempt raised: give up on this gene and move on.
        failed_genes.append(gene)

if failed_genes:
    print(f"No sequences retrieved for {len(failed_genes)} gene(s): {failed_genes}")

with open(os.path.join(output, 'sequence_of_all.json'), 'w') as file:
    json.dump(sequences_of_all, file)

Gene:	Ttr: 100%|██████████| 1/1 [00:00<00:00,  2.06it/s]
Gene:	Tmem72: 100%|██████████| 2/2 [00:01<00:00,  1.87it/s]
Gene:	Rgs22: 100%|██████████| 6/6 [00:03<00:00,  1.89it/s]
Gene:	Spag17: 100%|██████████| 12/12 [00:06<00:00,  1.99it/s]
Gene:	Dnah12: 100%|██████████| 2/2 [00:01<00:00,  1.75it/s]
Gene:	Adamtsl3: 100%|██████████| 6/6 [00:03<00:00,  1.95it/s]
Gene:	Bnc2: 100%|██████████| 23/23 [00:11<00:00,  2.02it/s]
Gene:	Ptprb: 100%|██████████| 6/6 [00:03<00:00,  1.73it/s]
Gene:	Adgrl4: 100%|██████████| 9/9 [00:04<00:00,  1.88it/s]
Gene:	Hexb: 100%|██████████| 8/8 [00:04<00:00,  1.67it/s]
Gene:	Tgfbr1: 100%|██████████| 5/5 [00:03<00:00,  1.64it/s]
Gene:	F13a1: 100%|██████████| 7/7 [00:04<00:00,  1.56it/s]
Gene:	Mrc1: 100%|██████████| 3/3 [00:01<00:00,  1.65it/s]
Gene:	Mobp: 100%|██████████| 8/8 [00:04<00:00,  1.64it/s]
Gene:	Prr5l: 100%|██████████| 11/11 [00:06<00:00,  1.82it/s]
Gene:	Pdgfra: 100%|██████████| 12/12 [00:06<00:00,  1.78it/s]
Gene:	Megf11: 100%|██████████| 10/10 [00:05<0

In [15]:
# `pprint` stays imported so the name remains available to other cells.
from pprint import pp, pprint

# Show the nested {gene: {isoform description: sequence}} mapping with the
# dictionaries in insertion order instead of sorted by key.
pp(sequences_of_all)

{'Ttr': {'ENSMUST00000075312|Ttr-201|protein_coding': 'ATGGCTTCCCTTCGACTCTTCCTCCTTTGCCTCGCTGGACTGGTATTTGTGTCTGAAGCTGGCCCCGCGGGTGCTGGAGAATCCAAATGTCCTCTGATGGTCAAAGTCCTGGATGCTGTCCGAGGCAGCCCTGCTGTAGACGTGGCTGTAAAAGTGTTCAAAAAGACCTCTGAGGGATCCTGGGAGCCCTTTGCCTCTGGGAAGACCGCGGAGTCTGGAGAGCTGCACGGGCTCACCACAGATGAGAAGTTTGTAGAAGGAGTGTACAGAGTAGAACTGGACACCAAATCGTACTGGAAGACACTTGGCATTTCCCCGTTCCATGAATTCGCGGATGTGGTTTTCACAGCCAACGACTCTGGCCATCGCCACTACACCATCGCAGCCCTGCTCAGCCCATACTCCTACAGCACCACGGCTGTCGTCAGCAACCCCCAGAATTGA'},
 'Tmem72': {'ENSMUST00000056623|Tmem72-201|protein_coding': 'ATGAAGCTCCAGGTATTCTGGACTGGACTGGAGTATACCTGCCGGCTCTTGGGCATCGCTACGGCTGCAGTGTTGATTGGAGTGGGCACCGAGACCTTCCTCCGGGGGCGGTTCAAAAGCCTGGCCTTCTATCTGCTGTTTACAGGAGTCACCATCTCTGTGTGTGAAGGGACCTACTTTGTGGCTCAACTCTTGGCCATCTGCTTCAAGTGCCAGCCGGGGTCTCTGGCACACAGAGCGAAGGAGAGGGCCCACTGGCTGGGCTGCTTCCAGAAGTTCCTCGCCTACATGCTGCTGTCAGTGGCCTGCTTCCTCCACCCTGTCCTGGTCTGGCATGTGACCATTCCAGGCTCCATGTTAATCATCACTGGCCTGGCCTACTTCCTGCTGAGCAAGCGAAAGAAAAAAAAGGCTGCTCCAGAGGTGGCACCCCCCA

In [17]:
# For every gene keep only the isoform with the longest sequence, keyed by the
# isoform description. Genes with no isoforms, or whose isoforms are all
# empty sequences, contribute nothing.
longest_isoforms = {}

for gene, isoforms in sequences_of_all.items():
    # max() yields the first key among equally long candidates, i.e. the
    # earliest isoform wins a tie; default=None covers a gene without isoforms.
    longest_isoform = max(
        isoforms,
        key=lambda name: len(isoforms[name]),
        default=None,
    )
    if longest_isoform and len(isoforms[longest_isoform]) > 0:
        longest_isoforms[longest_isoform] = isoforms[longest_isoform]

with open(os.path.join(output, 'longest_isoforms.json'), 'w') as file:
    json.dump(longest_isoforms, file)

## Binding site Searcher


In [19]:
from lib.search_binding import step_by_step, find_max_min_difference_fixed_length_subsequence, seq_minus

# Search candidate binding sites on the longest isoform of every gene.
#
# Results:
#   * FOI                - one row per candidate site (alignment columns are
#                          left empty here and filled by the BLAST cell)
#   * <gene>_pre_binding.fasta inside `pre_binding_dir` - sites of one gene
#   * combined FASTA     - sites of all genes, see `total_pre_binding_path`
#   * pre_binding_num    - {"<accession>_<gene>": number of sites}, saved as JSON

# Columns describing a binding site, followed by the alignment columns.
binding_site_FOIs = [
    "accession",
    "gene_name",
    "mol_type",
    "organism",
    "pos_on_seq",
    "binding",
    "Tm_l",
    "Tm_r",
    "wanted",
]
align_FOIs = ["align_num", "align_accession", "align_descrip", "plus/minus"]
FOI = pd.DataFrame(columns=binding_site_FOIs + align_FOIs)

# Output folder for the per-gene FASTA files (parents are created if needed).
file_out_dir = pre_binding_dir
os.makedirs(file_out_dir, exist_ok=True)

pre_binding_num = {}

# The combined FASTA sits next to the pre_binding folder, at
# "<pre_binding_dir>_total.fasta" (plain string concatenation). That is the
# path the records were appended to so far, and the path the (commented-out)
# BLAST submission cell reads. It is opened once in "w" mode so re-running this
# cell starts from an empty file instead of appending duplicate records.
total_pre_binding_path = file_out_dir + total_pre_binding_file_name

with open(total_pre_binding_path, "w") as total_handle:
    for desc, seq in longest_isoforms.items():
        # Description format: "<accession>|<gene name>|<mol type>".
        # Named `accession` rather than `id` to avoid shadowing the builtin
        # for every later cell of the notebook.
        accession, gene_name, mol_type = desc.split('|')

        # presumably the complementary strand the probe must match - confirm
        # in lib.search_binding.seq_minus
        minus_seq = seq_minus(seq)

        # Parameter meanings are defined by lib.search_binding.step_by_step;
        # the captured output suggests BDS_num is the requested number of
        # sites - TODO confirm the remaining thresholds there.
        Tm_l, Tm_r, selected_substrings, pos_on_seq = step_by_step(
            minus_seq, gene=gene_name,
            BDS_len=40, BDS_num=50, min_gap=1, better_gap=40,
            G_min=0.25, G_max=0.7, G_consecutive=5, Tm_low=50, Tm_high=65,)

        site_count = len(selected_substrings)

        record_list = [
            SeqRecord(
                Seq(pre_binding_tmp),
                id="pre_binding" + str(i),
                description="|".join([accession, gene_name, organism, mol_type]),
            )
            for i, pre_binding_tmp in enumerate(selected_substrings)
        ]

        # add information about binding sites to FOI
        add = pd.DataFrame(
            {
                "accession": [accession] * site_count,
                "gene_name": [gene_name] * site_count,
                "mol_type": [mol_type] * site_count,
                "organism": [organism] * site_count,
                "binding": selected_substrings,
                "Tm_l": Tm_l,
                "Tm_r": Tm_r,
                "pos_on_seq": pos_on_seq,
            }
        )
        FOI = pd.concat([FOI, add], ignore_index=True)

        # write pre_binding to the per-gene file and to the combined file
        file_out = os.path.join(file_out_dir, gene_name + pre_binding_file_suffix)
        with open(file_out, "w") as f:
            SeqIO.write(record_list, f, "fasta")
        SeqIO.write(record_list, total_handle, "fasta")

        # record the num of pre_binding for each gene
        pre_binding_num[f"{accession}_{gene_name}"] = site_count

with open(os.path.join(output, pre_binding_num_file), "w") as f:
    json.dump(pre_binding_num, f)

position_searching_Ttr-201: 100%|██████████| 316/316 [00:00<00:00, 31685.39it/s]
  FOI = pd.concat([FOI, add], ignore_index=True)


Gene Ttr-201: 	Not enough pos for 50 binding sites.
Gene Ttr-201: 	condition too harsh, loose to get better results
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 30, 82, 152, 153, 154, 155, 156, 157, 158, 159, 160, 162, 165, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 189, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 248, 249, 250, 251, 252, 253, 254, 255, 256, 257, 258, 259, 260, 261, 262, 263, 264, 265, 267, 268, 311, 312, 313, 314, 315]


position_searching_Tmem72-201: 100%|██████████| 624/624 [00:00<00:00, 23173.77it/s]


Gene Tmem72-201: 	condition too harsh, loose to get better results
[9, 23, 77, 90, 95, 100, 110, 115, 120, 125, 130, 135, 140, 145, 150, 157, 177, 195, 237, 242, 248, 253, 258, 263, 268, 273, 326, 340, 345, 368, 388, 393, 398, 403, 408, 413, 418, 423, 428, 434, 441, 446, 451, 476, 497, 502, 507, 513, 521, 526, 541, 607, 617, 622]


position_searching_Rgs22-203: 100%|██████████| 2983/2983 [00:00<00:00, 51568.11it/s]


Gene Rgs22-203: 	condition too harsh, loose to get better results
[93, 546, 553, 560, 567, 574, 585, 918, 925, 938, 1072, 1079, 1498, 1550, 1557, 1711, 1718, 1725, 1732, 1739, 1746, 1753, 1760, 1774, 1781, 1794, 1801, 1808, 1815, 1822, 1941, 2001, 2171, 2180, 2187, 2194, 2203, 2386, 2395, 2402, 2409, 2417, 2483, 2530, 2680, 2691, 2843, 2850, 2857, 2864, 2923]


position_searching_Spag17-204: 100%|██████████| 5531/5531 [00:00<00:00, 54370.37it/s]


Gene Spag17-204: 	condition too harsh, loose to get better results
[275, 333, 616, 748, 908, 936, 987, 1131, 1158, 1418, 1451, 1478, 1505, 1597, 1665, 1692, 1832, 1901, 1990, 2020, 2047, 2150, 2245, 2298, 2325, 2415, 2481, 2665, 2692, 3218, 3262, 3824, 3858, 4009, 4036, 4063, 4153, 4274, 4310, 4381, 4526, 4583, 4645, 4672, 4699, 4818, 4855, 5014, 5071, 5098, 5125, 5440, 5467]


position_searching_Dnah12-201: 100%|██████████| 9467/9467 [00:00<00:00, 72461.15it/s]


Gene Dnah12-201: 	condition too harsh, loose to get better results
[27, 238, 550, 859, 903, 926, 1131, 1197, 1265, 1600, 1748, 1768, 1790, 1958, 2413, 2497, 2517, 2612, 2827, 2847, 2909, 3301, 3452, 3524, 3599, 4271, 4579, 4950, 5173, 5318, 5384, 5600, 5967, 5987, 6364, 6458, 6633, 6802, 6823, 6849, 7329, 7419, 7439, 7791, 7811, 7837, 7857, 8283, 8776, 9111, 9410]


position_searching_Adamtsl3-203: 100%|██████████| 4057/4057 [00:00<00:00, 37665.41it/s]
position_searching_Bnc2-201: 100%|██████████| 2668/2668 [00:00<00:00, 30888.74it/s]


Gene Bnc2-201: 	condition too harsh, loose to get better results
[2, 31, 59, 109, 137, 166, 213, 241, 308, 336, 466, 544, 586, 619, 647, 675, 745, 773, 869, 933, 982, 1044, 1072, 1100, 1128, 1158, 1186, 1262, 1291, 1391, 1477, 1687, 1751, 1789, 1911, 1939, 2049, 2229, 2257, 2285, 2313, 2350, 2378, 2407, 2436, 2464, 2492, 2595, 2624, 2652]


position_searching_Ptprb-203: 100%|██████████| 5448/5448 [00:00<00:00, 31944.85it/s]
position_searching_Adgrl4-201: 100%|██████████| 1736/1736 [00:00<00:00, 37839.94it/s]


Gene Adgrl4-201: 	Not enough pos for 50 binding sites.
Gene Adgrl4-201: 	condition too harsh, loose to get better results
[307, 308, 309, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 455, 457, 516, 536, 627, 628, 629, 630, 631, 632, 633, 634, 635, 636, 637, 638, 639, 640, 641, 642, 643, 644, 645, 646, 647, 648, 649, 650, 651, 652, 653, 655, 657, 658, 659, 660, 661, 662, 663, 664, 665, 666, 669, 670, 672, 717, 734, 735, 966, 967, 968, 969, 970, 971, 972, 976, 1128, 1130, 1131, 1287, 1288, 1289, 1290, 1291, 1293, 1294, 1295, 1296, 1297, 1300, 1307, 1308, 1516, 1536, 1548, 1549, 1550, 1551, 1552, 1554, 1557, 1558, 1561, 1563, 1564, 1565, 1566, 1567, 1568, 1569, 1570, 1571, 1572, 1573, 1575, 1686, 1698, 1699, 1700, 1716, 1717, 1718, 1719, 1720, 1721, 1722, 1723, 1725]


position_searching_Gfm2-208: 100%|██████████| 1832/1832 [00:00<00:00, 40816.14it/s]


Gene Gfm2-208: 	condition too harsh, loose to get better results
[0, 30, 47, 72, 90, 107, 132, 152, 169, 186, 203, 220, 241, 280, 297, 314, 342, 359, 379, 403, 420, 503, 551, 576, 620, 638, 708, 725, 742, 759, 899, 923, 943, 1018, 1036, 1054, 1071, 1109, 1126, 1148, 1288, 1305, 1322, 1529, 1546, 1563, 1580, 1640, 1701, 1718, 1830]


position_searching_Tgfbr1-201: 100%|██████████| 1170/1170 [00:00<00:00, 55868.66it/s]


Gene Tgfbr1-201: 	Not enough pos for 50 binding sites.
Gene Tgfbr1-201: 	condition too harsh, loose to get better results
[211, 212, 213, 214, 215, 216, 217, 218, 219, 220, 221, 382, 402, 403, 855, 856, 857, 871, 872, 873, 880, 884, 951, 952, 955, 956, 961, 962, 963, 964, 966, 967, 968, 969, 970, 971, 972, 973, 974, 975, 976, 977, 978, 981, 1028, 1029, 1057, 1058, 1059, 1060]


position_searching_F13a1-201: 100%|██████████| 1721/1721 [00:00<00:00, 38347.39it/s]


Gene F13a1-201: 	condition too harsh, loose to get better results
[40, 50, 103, 113, 124, 138, 162, 172, 182, 214, 275, 289, 448, 461, 577, 588, 598, 612, 705, 715, 725, 735, 745, 755, 765, 775, 819, 829, 844, 856, 882, 922, 948, 1032, 1084, 1094, 1104, 1178, 1188, 1198, 1210, 1261, 1272, 1435, 1445, 1455, 1594, 1604, 1614, 1624, 1675]


position_searching_Mrc1-201: 100%|██████████| 3457/3457 [00:00<00:00, 50235.28it/s]


Gene Mrc1-201: 	condition too harsh, loose to get better results
[96, 116, 131, 218, 235, 250, 370, 661, 905, 987, 1084, 1099, 1114, 1129, 1518, 1538, 1553, 1733, 1958, 1974, 1991, 2006, 2021, 2135, 2150, 2341, 2419, 2481, 2547, 2577, 2792, 2807, 2822, 2837, 2969, 2986, 3003, 3018, 3073, 3088, 3121, 3136, 3151, 3167, 3182, 3319, 3334, 3349, 3366, 3422]


position_searching_Mobp-201: 100%|██████████| 371/371 [00:00<00:00, 24801.36it/s]


Gene Mobp-201: 	condition too harsh, loose to get better results
[45, 49, 53, 57, 62, 66, 70, 75, 82, 86, 90, 95, 99, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 157, 161, 165, 169, 173, 177, 181, 185, 189, 198, 202, 206, 210, 218, 222, 226, 230, 234, 238, 242, 246, 250, 254, 307, 314, 318, 322, 327, 332, 343, 347, 352, 363, 367]


position_searching_Prr5l-201: 100%|██████████| 851/851 [00:00<00:00, 30473.69it/s]


Gene Prr5l-201: 	condition too harsh, loose to get better results
[12, 62, 68, 74, 80, 86, 92, 98, 105, 111, 118, 127, 154, 162, 177, 183, 189, 197, 203, 218, 224, 238, 244, 250, 257, 263, 269, 275, 281, 293, 299, 320, 363, 369, 375, 381, 393, 449, 488, 494, 500, 507, 513, 519, 525, 531, 551, 576, 585, 596, 815, 821, 834, 840]


position_searching_Pdgfra-201: 100%|██████████| 2576/2576 [00:00<00:00, 33273.98it/s]


Gene Pdgfra-201: 	condition too harsh, loose to get better results
[3, 69, 95, 147, 173, 235, 283, 312, 338, 446, 562, 631, 699, 815, 847, 933, 1009, 1035, 1081, 1153, 1179, 1206, 1232, 1281, 1340, 1366, 1392, 1418, 1446, 1472, 1588, 1614, 1763, 1824, 1874, 1900, 1938, 1964, 2077, 2127, 2187, 2273, 2336, 2362, 2388, 2415, 2441, 2467, 2534, 2565]


position_searching_Megf11-210: 100%|██████████| 2592/2592 [00:00<00:00, 22213.08it/s]
position_searching_Bcas1-201: 100%|██████████| 1482/1482 [00:00<00:00, 29137.21it/s]


Gene Bcas1-201: 	condition too harsh, loose to get better results
[48, 66, 81, 133, 151, 166, 243, 266, 286, 461, 481, 496, 539, 554, 571, 592, 612, 668, 688, 710, 737, 757, 774, 842, 861, 969, 984, 999, 1014, 1029, 1044, 1059, 1075, 1137, 1157, 1172, 1187, 1211, 1230, 1245, 1264, 1279, 1294, 1309, 1325, 1343, 1358, 1373, 1420, 1472]


position_searching_Enpp6-201: 100%|██████████| 1019/1019 [00:00<00:00, 31929.56it/s]


Gene Enpp6-201: 	condition too harsh, loose to get better results
[48, 68, 73, 78, 83, 341, 346, 351, 356, 362, 367, 372, 377, 382, 387, 392, 397, 460, 465, 481, 486, 491, 539, 544, 553, 558, 563, 573, 580, 585, 590, 596, 608, 616, 626, 798, 810, 815, 820, 825, 830, 835, 840, 845, 864, 884, 889, 894, 899, 905, 911, 916, 921, 926, 949, 984]


position_searching_Gpc5-203: 100%|██████████| 1512/1512 [00:00<00:00, 36095.64it/s]


Gene Gpc5-203: 	condition too harsh, loose to get better results
[43, 50, 62, 69, 76, 83, 90, 172, 286, 307, 314, 321, 562, 571, 578, 642, 660, 667, 678, 685, 693, 700, 707, 715, 735, 755, 762, 771, 779, 794, 803, 820, 827, 846, 856, 1098, 1105, 1112, 1160, 1168, 1245, 1305, 1433, 1440, 1450, 1458, 1465, 1477, 1484, 1497, 1504]


position_searching_Prex2-201: 100%|██████████| 3799/3799 [00:00<00:00, 49469.61it/s]


Gene Prex2-201: 	condition too harsh, loose to get better results
[10, 211, 226, 246, 474, 487, 501, 543, 587, 622, 635, 648, 661, 674, 687, 745, 758, 836, 925, 1001, 1224, 1299, 1312, 1325, 1339, 1359, 1448, 1519, 1534, 1547, 1562, 1575, 1588, 1758, 1772, 1789, 1886, 2017, 2064, 2083, 2096, 2109, 2188, 2252, 2307, 2320, 3027, 3616, 3716, 3729, 3743, 3757]


position_searching_Gfap-201: 100%|██████████| 995/995 [00:00<00:00, 24941.63it/s]


Gene Gfap-201: 	condition too harsh, loose to get better results
[39, 54, 67, 91, 113, 126, 154, 172, 185, 198, 211, 224, 237, 250, 263, 276, 289, 302, 315, 328, 341, 355, 415, 432, 452, 465, 478, 491, 507, 622, 657, 677, 692, 705, 718, 739, 757, 775, 795, 808, 821, 834, 847, 860, 873, 886, 903, 960, 975, 988]


position_searching_Tmem132b-201: 100%|██████████| 2551/2551 [00:00<00:00, 26924.32it/s]


Gene Tmem132b-201: 	condition too harsh, loose to get better results
[0, 58, 88, 130, 171, 223, 253, 283, 315, 345, 375, 433, 463, 495, 603, 646, 676, 798, 828, 858, 888, 918, 961, 991, 1094, 1124, 1179, 1247, 1278, 1338, 1368, 1398, 1471, 1548, 1617, 1699, 1729, 1777, 1808, 1838, 1944, 1983, 2020, 2053, 2083, 2113, 2143, 2201, 2249, 2279, 2467, 2524]


position_searching_Cables1-202: 100%|██████████| 1389/1389 [00:00<00:00, 28422.57it/s]


Gene Cables1-202: 	condition too harsh, loose to get better results
[2, 12, 22, 38, 71, 91, 108, 221, 287, 335, 365, 388, 409, 419, 430, 440, 457, 479, 571, 622, 632, 771, 782, 792, 808, 834, 854, 864, 874, 898, 918, 928, 938, 948, 958, 972, 992, 1035, 1045, 1055, 1120, 1130, 1140, 1150, 1160, 1205, 1215, 1273, 1283, 1367]


position_searching_Nwd1-201: 100%|██████████| 3714/3714 [00:00<00:00, 28426.88it/s]


## Blast and extract blast results

In [None]:
# with open(file_out_dir + total_pre_binding_file_name, "r") as f:
#     fasta_string = f.read()
# txid = [2697049]  # organism

# # Submit BLAST search and get handle object
# handle = NCBIWWW.qblast(
#     program="blastn",
#     megablast="yes",
#     database="refseq_rna",
#     sequence=fasta_string,
#     url_base="https://blast.ncbi.nlm.nih.gov/Blast.cgi",
#     format_object="Alignment",
#     format_type="Xml",
# )

# # read handle object and save to a file
# with open(os.path.join(os.path.join(output, blast_results_file)), "w") as f:
#     f.write(handle.read())

In [20]:
# Extract the information of interest from the BLAST XML results.
#
# One BLAST record is expected per row of FOI, in the same order. For each
# record this fills:
#   align_num        number of aligned hits
#   align_accession  "|"-joined field 3 of every hit title
#   align_descrip    "|"-joined last field of every hit title
#   plus/minus       ","-joined frame[1] of the first HSP of every hit
#                    (the subject-side frame/strand per Biopython's HSP.frame
#                    tuple - confirm for the BLAST program used)
from Bio.Blast import NCBIXML


align_num = []
align_accession_column = []
align_descrip_column = []
plus_minus_column = []

with open(os.path.join(output, blast_results_file), "r") as blast_output:
    for blast_record in NCBIXML.parse(blast_output):
        # get align num of each binding site
        hit_count = len(blast_record.alignments)
        align_num.append(hit_count)

        # get accession and descrip of each align seq
        # assumes "|"-separated titles with at least four fields, such as
        # "gi|<n>|ref|<accession>|<description>" - TODO confirm against the
        # titles in the XML actually used
        align_accession = []
        align_descrip_list = []
        for hit_index in range(hit_count):
            title_fields = blast_record.descriptions[hit_index].title.split("|")
            align_accession.append(title_fields[3])
            align_descrip_list.append(title_fields[-1])
        align_accession_column.append("|".join(str(_) for _ in align_accession))
        align_descrip_column.append("|".join(str(_) for _ in align_descrip_list))

        # get plus/minus of each align seq (a record without hits gives "")
        p_m = [alignment.hsps[0].frame[1] for alignment in blast_record.alignments]
        plus_minus_column.append(",".join(str(_) for _ in p_m))

# Writing row by row with FOI.loc would silently append new rows when the XML
# holds more records than FOI has binding sites; refuse a mismatch instead.
if len(align_num) != len(FOI):
    raise ValueError(
        f"BLAST results contain {len(align_num)} record(s) "
        f"but FOI has {len(FOI)} binding site(s)"
    )

FOI["align_accession"] = align_accession_column
FOI["align_descrip"] = align_descrip_column
FOI["plus/minus"] = plus_minus_column
FOI["align_num"] = align_num

## Select wanted binding site


In [21]:
# Start with every binding site marked as wanted; the sieve below only ever
# switches entries to False.
FOI["wanted"] = True

In [22]:
# Sieve for suitable binding sites, then pick the final probes per gene.
#
# A row keeps wanted == True only if it passes all four checks, in order:
#   1. its gene symbol (the part of gene_name before '-') is in gene_list
#   2. its mol_type is "protein_coding"
#   3. no BLAST hit description names the organism without naming the gene
#   4. at least one BLAST hit has plus/minus value "-1"
# Rows are only ever switched to False, never back to True.
gene_name_list = [_.upper() for _ in gene_list]
# Genes for which no binding-site row was seen; symbols are removed as found.
gene_name_list_out = list(gene_name_list)
wanted_symbols = set(gene_name_list)

for i in FOI.index:
    # gene_name looks like "<symbol>-<isoform number>", e.g. "Ttr-201"
    gene_symbol = FOI.loc[i, "gene_name"].split('-')[0]
    symbol_upper = gene_symbol.upper()

    # check gene_name
    if symbol_upper not in wanted_symbols:
        FOI.loc[i, "wanted"] = False
    elif symbol_upper in gene_name_list_out:
        # compare the upper-cased symbol, the form stored in the list
        gene_name_list_out.remove(symbol_upper)

    # rows already rejected (or never initialised) skip the remaining checks
    still_wanted = FOI.loc[i, "wanted"]
    if pd.isnull(still_wanted) or not still_wanted:
        continue

    # check DNA or mRNA type
    if FOI.loc[i, "mol_type"] != "protein_coding":
        FOI.loc[i, "wanted"] = False
        print(FOI.loc[i, "mol_type"])
        continue

    # check gene_organism name
    # NOTE(review): the organism value is compared as a plain substring; if
    # hit descriptions use the scientific species name rather than the label
    # stored in the "organism" column, this check never rejects anything -
    # confirm against real BLAST output.
    spe_ori = FOI.loc[i, "organism"]
    align_descrip = FOI.loc[i, "align_descrip"]
    # a missing description is treated as "no hits" instead of crashing
    descrip = [] if pd.isnull(align_descrip) else align_descrip.split("|")
    if any(gene_symbol not in des and spe_ori in des for des in descrip):
        FOI.loc[i, "wanted"] = False
        continue

    # check plus/minus
    plus_minus = FOI.loc[i, "plus/minus"]
    if pd.isnull(plus_minus) or "-1" not in plus_minus.split(","):
        FOI.loc[i, "wanted"] = False

# write the whole information of interest to an Excel file in the output dir
FOI.to_excel(os.path.join(output, "probes_sieve.xlsx"))

# For every gene, keep only the rows whose position is returned by
# find_max_min_difference_fixed_length_subsequence (presumably `length`
# well-spread positions per gene - confirm in lib.search_binding).
out_tmp = FOI[FOI["wanted"].eq(True)]
selected_subsets = []
for gene in out_tmp.gene_name.unique():
    out_subset = out_tmp[out_tmp.gene_name == gene]
    pos_of_True = list(out_subset["pos_on_seq"])
    best_pos = find_max_min_difference_fixed_length_subsequence(
        pos_of_True,
        length=3,
        min_gap=40,
        better_gap=80,
        gene=gene,
    )
    selected_subsets.append(out_subset[out_subset["pos_on_seq"].isin(best_pos)])

# a single concat; an empty frame is kept when no row survived the sieve
output_df = pd.concat(selected_subsets) if selected_subsets else pd.DataFrame()

# write the output to a xlsx file
output_df.to_excel(os.path.join(output, "probes_wanted.xlsx"))