# Probe Designer


## Environment


In [1]:
# basic env
import os
import sys
import pandas as pd
import time
import json

# parsing and processing of sequence files from NCBI
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import MeltingTemp as mt

# get gene data from ncbi
from Bio import Entrez

# blast and xml file process
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

# make the helper modules under ../lib importable
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
sys.path.append("../lib")

# every run works inside the per-project dataset directory (created on demand)
project_name = "2023.11.4_Embryo_100"
project_dir = f"../dataset/{project_name}"
os.makedirs(project_dir, exist_ok=True)
os.chdir(project_dir)

# each run gets its own time-stamped result directory:
#   ./results/<timestamp>/                  final output
#   ./results/<timestamp>/tmp/              intermediate files
#   ./results/<timestamp>/tmp/pre_binding/  candidate binding-site fasta files
current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)
output = "./results/" + formatted_time + "/"
tmp = output + "tmp/"
pre_binding_dir = tmp + "pre_binding/"
# creating tmp also creates the enclosing result directory
os.makedirs(tmp, exist_ok=True)

# names of input files and fasta outputs
gene_name_list_tosearch = "gene_name_list_tosearch.txt"
pre_binding_file_suffix = "_pre_binding.fasta"
total_pre_binding_file_name = "_total_pre_binding.fasta"

# intermediate files, numbered in the order the pipeline produces them
gene_name_list_file = "1_gene_name_list.txt"
gene_id_name_file = "2_id_list.txt"
gene_seq_in_file = "3_gene_seq_in_file.gb"
pre_binding_num_file = "4_pre_binding_num.json"
blast_results_file = "5_blast_results.xml"

## Get the GenBank file of each gene from the NCBI dataset

https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch


In [2]:
# Build one NCBI search term per gene of interest
## Each term has the form "<gene name>, <organism>, <molecule type>"
organism_of_interest = "Homo sapiens"
n_type_of_interest = "mRNA"

# one gene name per line in the input file
with open(tmp + gene_name_list_file) as f:
    gene_name_list = f.read().splitlines()

# the organism / molecule-type part is the same for every gene
search_suffix = f", {organism_of_interest}, {n_type_of_interest}"
gene_search_list = [name + search_suffix for name in gene_name_list]

In [161]:
## Get gene id list using Entrez.esearch
# the contact address is the same for every request, so set it once
Entrez.email = "1418767067@qq.com"

id_list = []
for gene_search in gene_search_list:
    handle = Entrez.esearch(db="nuccore", term=gene_search)
    try:
        record = Entrez.read(handle)
    finally:
        # release the connection even when the response cannot be parsed
        handle.close()
    hits = record["IdList"][:1]  # set number of search results to read
    if not hits:
        # make missing genes visible instead of silently shortening id_list
        print(f"No nuccore record found for: {gene_search}")
    id_list += hits

# keep the ids on disk so later cells can be re-run without searching again
with open(tmp + gene_id_name_file, "w") as f:
    f.write("\n".join(id_list))

In [3]:
## Read id_list from existing file
with open(tmp + gene_id_name_file, "r") as f:
    # skip blank lines: an empty file or a trailing newline must not
    # produce "" entries that would later be sent to efetch as ids
    id_list = [line.strip() for line in f if line.strip()]

In [8]:
# Get the genbank file of each gene by id list
# ids are requested in small batches; every batch is appended to one .gb file
fetch_per_round = 3
# ceiling division: number of requests needed to cover the whole id_list
# (named n_rounds so the builtin round() is not shadowed)
n_rounds = -(-len(id_list) // fetch_per_round)

# the contact address is the same for every request, so set it once
Entrez.email = "1418767067@qq.com"

# opening with "w" starts from an empty gb file; the file stays open for all rounds
with open(tmp + gene_seq_in_file, "w") as f:
    for i in range(n_rounds):
        id_list_per_round = id_list[i * fetch_per_round : (i + 1) * fetch_per_round]
        handle = Entrez.efetch(
            db="nuccore",
            strand=1,  # plus if strand=1
            id=id_list_per_round,
            rettype="gbwithparts",
            retmode="text",
        )
        try:
            seq_record = handle.read()
        finally:
            # release the connection even when the download fails midway
            handle.close()
        # progress: finished round number and percentage of all rounds
        print(i + 1, "{:.1f} %".format((i + 1) / n_rounds * 100))
        f.write(seq_record)

1 12.5 %
2 25.0 %
3 37.5 %
4 50.0 %
5 62.5 %
6 75.0 %
7 87.5 %
8 100.0 %


## Binding site Searcher


In [4]:
# make the helper modules under ../lib importable (same entry as appended in the environment cell)
sys.path.append("../lib")

In [5]:
from search_method import step_by_step, find_max_min_difference_fixed_length_subsequence, gb_extract

# Initiation of array
# FOI collects one row per candidate binding site; the align_* columns are
# filled in later from the BLAST results
binding_site_FOIs = [
    "accession",
    "gene_name",
    "mol_type",
    "organism",
    "pos_on_seq",
    "binding",
    "Tm_l",
    "Tm_r",
    "wanted",
]
align_FOIs = ["align_num", "align_accession", "align_descrip", "plus/minus"]
FOI = pd.DataFrame(columns=binding_site_FOIs + align_FOIs)

# Search binding sites on mRNA sequence
file_in = tmp + gene_seq_in_file
file_out_dir = pre_binding_dir
# exist_ok tolerates re-runs, while real failures (e.g. permissions) still raise
os.makedirs(file_out_dir, exist_ok=True)

# number of candidate binding sites found per "<accession>_<gene name>"
pre_binding_num = {}

# the combined fasta file is (re)created empty and receives every gene's candidates
with open(file_out_dir + total_pre_binding_file_name, "w") as total_handle:
    for record in SeqIO.parse(file_in, "genbank"):
        # "accession" rather than "id" so the builtin id() is not shadowed
        accession, gene_name, mol_type, organism, seq = gb_extract(record, CDS=True)

        # search settings are passed straight through to search_method.step_by_step;
        # see that helper for the exact meaning of each parameter
        Tm_l, Tm_r, selected_substrings, pos_on_seq = step_by_step(
            seq,
            BDS_len=40,
            BDS_num=50,
            min_gap=1,
            better_gap=40,
            gene=gene_name,
            G_min=0.25,
            G_max=0.7,
            G_consecutive=5,
            Tm_low=50,
            Tm_high=65,
        )
        n_sites = len(selected_substrings)

        # one fasta record per candidate; the description carries its origin
        description = "|".join([accession, gene_name, organism, mol_type])
        record_list = [
            SeqRecord(
                Seq(pre_binding_tmp),
                id="pre_binding" + str(i),
                description=description,
            )
            for i, pre_binding_tmp in enumerate(selected_substrings)
        ]

        # add information about binding sites to FOI
        add = pd.DataFrame(
            {
                "accession": [accession] * n_sites,
                "gene_name": [gene_name] * n_sites,
                "mol_type": [mol_type] * n_sites,
                "organism": [organism] * n_sites,
                "binding": selected_substrings,
                "Tm_l": Tm_l,
                "Tm_r": Tm_r,
                "pos_on_seq": pos_on_seq,
            }
        )
        FOI = pd.concat([FOI, add], ignore_index=True)

        # write pre_binding to the per-gene file and to the combined file
        file_out = file_out_dir + gene_name + pre_binding_file_suffix
        with open(file_out, "w") as f:
            SeqIO.write(record_list, f, "fasta")
        SeqIO.write(record_list, total_handle, "fasta")

        # record the num of pre_binding for each gene
        pre_binding_num[f"{accession}_{gene_name}"] = n_sites

with open(tmp + pre_binding_num_file, "w") as f:
    json.dump(pre_binding_num, f)

position_searching_Arhgap36:  48%|████▊     | 1143/2401 [00:00<00:00, 11415.94it/s]

position_searching_Arhgap36: 100%|██████████| 2401/2401 [00:00<00:00, 12506.72it/s]


Gene Arhgap36: 	condition too harsh, loose to get better results
[27, 47, 153, 261, 361, 442, 518, 562, 628, 647, 825, 847, 890, 909, 928, 947, 966, 1098, 1117, 1136, 1155, 1232, 1254, 1275, 1336, 1484, 1505, 1570, 1604, 1624, 1643, 1670, 1690, 1926, 1950, 1969, 1988, 2007, 2046, 2065, 2084, 2201, 2220, 2239, 2258, 2278, 2330, 2349, 2373, 2393]


position_searching_Cacna2d3: 100%|██████████| 2928/2928 [00:00<00:00, 13930.72it/s]


Gene Cacna2d3: 	condition too harsh, loose to get better results
[5, 41, 63, 142, 164, 227, 250, 272, 316, 338, 360, 429, 460, 489, 683, 705, 785, 807, 846, 901, 971, 993, 1015, 1037, 1075, 1101, 1181, 1214, 1236, 1258, 1291, 1319, 1603, 1702, 1789, 1816, 1838, 1861, 1898, 1950, 2023, 2105, 2127, 2149, 2175, 2197, 2263, 2285, 2380, 2402, 2479, 2690]


position_searching_Cnpy1: 100%|██████████| 2968/2968 [00:00<00:00, 25549.88it/s]


Gene Cnpy1: 	condition too harsh, loose to get better results
[255, 275, 343, 347, 611, 615, 625, 629, 635, 645, 939, 1601, 1606, 1703, 1707, 1711, 1718, 2131, 2135, 2139, 2143, 2147, 2151, 2155, 2159, 2170, 2176, 2180, 2184, 2188, 2192, 2196, 2200, 2206, 2436, 2440, 2444, 2505, 2509, 2593, 2597, 2613, 2617, 2801, 2805, 2809, 2813, 2817, 2870, 2874, 2878, 2882, 2886, 2890, 2894, 2945, 2949]


position_searching_Dkk2: 100%|██████████| 2925/2925 [00:00<00:00, 17936.04it/s]


Gene Dkk2: 	condition too harsh, loose to get better results
[547, 633, 710, 1026, 1819, 1839, 1856, 1920, 1940, 1957, 2017, 2073, 2090, 2110, 2130, 2147, 2164, 2182, 2199, 2224, 2275, 2292, 2310, 2342, 2361, 2394, 2411, 2428, 2459, 2479, 2499, 2516, 2533, 2551, 2568, 2585, 2614, 2634, 2651, 2670, 2763, 2780, 2797, 2814, 2833, 2850, 2867, 2886, 2906, 2923]


position_searching_Nan: 100%|██████████| 3320/3320 [00:00<00:00, 33196.07it/s]


Gene Nan: 	Not enough pos for 50 binding sites.
Gene Nan: 	condition too harsh, loose to get better results
[517, 529, 530, 531, 532, 533, 534, 537, 595, 1702, 1703, 1704, 1705, 1706, 1707, 1708, 1709, 1710, 1711, 1712, 1713, 1714, 1715, 1716, 1717, 1718, 1719, 1720, 1721, 1722, 1723, 1724, 1725, 1726, 1727, 1728, 1729, 1730, 1731, 1732, 1733, 1734, 1735, 1736, 1737, 1738, 1739, 1878, 1879, 1880, 1881, 1882, 1883, 1884, 1885, 1886, 1887, 1888, 1889, 1890, 1891, 1892, 1893, 1894, 1895, 1896, 1897, 1898, 1899, 1900, 1901, 1902, 1903, 1904, 1906, 1916, 1917, 2097, 2098, 2099, 2100, 2101, 2247, 2248, 2249, 2329, 2332, 2638, 2640, 2641, 2642, 2643, 2644, 2645, 2646, 2647, 2648, 2649, 2650, 2651, 2652, 2654, 2658, 2659, 2660, 2793, 2794, 2798, 3131, 3143, 3150, 3151]


position_searching_Ednrb: 100%|██████████| 3157/3157 [00:00<00:00, 16272.66it/s]


Gene Ednrb: 	condition too harsh, loose to get better results
[600, 607, 614, 621, 628, 635, 642, 649, 658, 665, 672, 805, 812, 891, 996, 1102, 1109, 1116, 1183, 1214, 1760, 1805, 1813, 1823, 1860, 2015, 2022, 2066, 2087, 2099, 2289, 2296, 2303, 2310, 2317, 2324, 2331, 2341, 2378, 2385, 2600, 2607, 2658, 2665, 2807, 2814, 2862, 2921, 2942, 3002, 3011, 3097, 3104, 3147, 3154]


position_searching_Gabrp: 100%|██████████| 2584/2584 [00:00<00:00, 11666.21it/s]


Gene Gabrp: 	condition too harsh, loose to get better results
[59, 151, 181, 201, 214, 234, 255, 268, 288, 301, 314, 328, 480, 493, 515, 528, 543, 556, 570, 587, 934, 1151, 1164, 1180, 1194, 1725, 1740, 1757, 1788, 1805, 1839, 1852, 1865, 1964, 2061, 2120, 2205, 2218, 2302, 2315, 2328, 2388, 2404, 2417, 2435, 2448, 2461, 2475, 2488, 2507, 2527, 2540, 2555]


position_searching_Gjb2: 100%|██████████| 1884/1884 [00:00<00:00, 14797.69it/s]


Gene Gjb2: 	condition too harsh, loose to get better results
[326, 334, 342, 350, 417, 451, 471, 479, 500, 520, 839, 861, 932, 940, 1040, 1048, 1056, 1121, 1129, 1143, 1211, 1347, 1355, 1366, 1375, 1476, 1485, 1493, 1501, 1509, 1605, 1613, 1621, 1629, 1637, 1645, 1653, 1661, 1669, 1677, 1694, 1703, 1714, 1736, 1819, 1828, 1836, 1844, 1852, 1860, 1868, 1876]


position_searching_Grid2: 100%|██████████| 5850/5850 [00:00<00:00, 14031.74it/s]
position_searching_Hs3st4: 100%|██████████| 2544/2544 [00:00<00:00, 12007.38it/s]


Gene Hs3st4: 	condition too harsh, loose to get better results
[124, 184, 200, 279, 295, 314, 374, 476, 527, 549, 565, 1056, 1077, 1101, 1303, 1319, 1337, 1353, 1369, 1431, 1458, 1529, 1549, 1575, 1591, 1609, 1629, 1647, 1667, 1683, 1725, 1741, 1849, 1870, 1888, 1904, 1921, 1942, 1958, 1974, 2093, 2240, 2268, 2284, 2351, 2368, 2388, 2404, 2420, 2438, 2466]


position_searching_Htr2c: 100%|██████████| 3773/3773 [00:00<00:00, 20249.69it/s]


Gene Htr2c: 	condition too harsh, loose to get better results
[147, 426, 619, 940, 945, 964, 2232, 2237, 2364, 2373, 2385, 2571, 2686, 2701, 2721, 2726, 2731, 2736, 2741, 2746, 2751, 2756, 2814, 2836, 3071, 3085, 3128, 3133, 3138, 3143, 3258, 3263, 3273, 3420, 3425, 3441, 3543, 3549, 3557, 3562, 3567, 3682, 3687, 3694, 3701, 3707, 3714, 3719, 3724, 3737, 3742, 3757, 3762]


position_searching_Lef1: 100%|██████████| 2865/2865 [00:00<00:00, 10215.74it/s]


Gene Lef1: 	condition too harsh, loose to get better results
[24, 87, 427, 464, 507, 545, 582, 670, 723, 760, 797, 834, 871, 918, 1005, 1074, 1111, 1230, 1267, 1317, 1404, 1448, 1485, 1531, 1575, 1612, 1658, 1713, 1750, 1792, 1829, 1866, 1903, 1963, 2004, 2041, 2095, 2132, 2194, 2235, 2272, 2309, 2347, 2385, 2435, 2487, 2526, 2609, 2706, 2857]


position_searching_Myh6: 100%|██████████| 4825/4825 [00:00<00:00, 11312.49it/s]
position_searching_Myl2: 100%|██████████| 1097/1097 [00:00<00:00, 9328.82it/s]


Gene Myl2: 	condition too harsh, loose to get better results
[19, 33, 47, 58, 69, 83, 261, 272, 288, 304, 315, 326, 344, 369, 387, 408, 435, 455, 468, 479, 490, 501, 512, 523, 552, 563, 576, 587, 605, 616, 677, 688, 699, 711, 722, 733, 745, 756, 773, 786, 798, 809, 820, 836, 891, 939, 971, 1069, 1082, 1093]


position_searching_Nppa: 100%|██████████| 648/648 [00:00<00:00, 10496.61it/s]


Gene Nppa: 	condition too harsh, loose to get better results
[58, 63, 68, 74, 79, 85, 90, 95, 100, 105, 113, 121, 128, 136, 187, 192, 197, 204, 209, 214, 260, 265, 270, 275, 282, 287, 293, 314, 321, 343, 363, 368, 373, 379, 390, 396, 401, 410, 415, 420, 432, 439, 505, 555, 570, 601, 621, 626, 631, 636, 643]


position_searching_Nppc: 100%|██████████| 794/794 [00:00<00:00, 12507.85it/s]


Gene Nppc: 	condition too harsh, loose to get better results
[121, 319, 326, 333, 340, 347, 354, 361, 368, 376, 384, 398, 405, 412, 425, 432, 439, 446, 453, 460, 480, 489, 496, 503, 510, 518, 526, 533, 546, 553, 560, 567, 574, 581, 588, 603, 610, 673, 694, 702, 709, 716, 723, 730, 737, 744, 751, 758, 765, 772, 779, 786, 793]


position_searching_Opcml: 100%|██████████| 5020/5020 [00:00<00:00, 16908.66it/s]


Gene Opcml: 	condition too harsh, loose to get better results
[317, 535, 811, 823, 847, 869, 1020, 1032, 1119, 1290, 1546, 1619, 1773, 1791, 1803, 1874, 1886, 1898, 1934, 2092, 2104, 2413, 2425, 2669, 2681, 2758, 3019, 3031, 3214, 3271, 3283, 3295, 3397, 3447, 4131, 4202, 4214, 4607, 4619, 4631, 4670, 4682, 4694, 4706, 4860, 4872, 4886, 4982, 4994, 5014]


position_searching_Pax3: 100%|██████████| 3056/3056 [00:00<00:00, 14671.81it/s]


Gene Pax3: 	condition too harsh, loose to get better results
[18, 544, 1094, 1210, 1334, 1517, 1673, 1694, 1744, 1790, 1815, 1841, 1864, 1885, 1906, 1927, 1948, 2004, 2025, 2066, 2087, 2120, 2141, 2162, 2183, 2219, 2248, 2274, 2296, 2318, 2339, 2360, 2381, 2402, 2475, 2496, 2617, 2638, 2744, 2765, 2787, 2808, 2831, 2860, 2881, 2902, 2923, 2944, 2966, 2987, 3008, 3031]


position_searching_Ptprz1: 100%|██████████| 4368/4368 [00:00<00:00, 18279.46it/s]


Gene Ptprz1: 	condition too harsh, loose to get better results
[219, 287, 308, 329, 453, 611, 872, 962, 995, 1019, 1040, 1127, 1179, 1200, 1388, 1409, 1435, 1456, 1477, 1498, 1521, 1543, 1779, 1800, 1948, 1969, 1990, 2277, 2371, 2432, 2453, 2508, 2540, 2598, 2619, 2640, 2717, 2846, 2867, 3038, 3059, 3200, 3305, 3413, 3574, 3639, 3660, 3681, 3920, 4177]


position_searching_Syt4: 100%|██████████| 3081/3081 [00:00<00:00, 28609.49it/s]


Gene Syt4: 	condition too harsh, loose to get better results
[148, 301, 690, 1191, 1196, 1685, 1694, 1699, 1968, 1974, 1991, 1996, 2003, 2252, 2311, 2317, 2322, 2328, 2380, 2546, 2612, 2679, 2684, 2689, 2694, 2699, 2704, 2709, 2714, 2792, 2798, 2803, 2808, 2814, 2819, 2827, 2834, 2839, 2859, 2887, 2893, 2907, 2912, 2917, 2924, 2929, 2934, 2939, 2944, 2952, 2963, 2968, 2983, 2988, 2993]


position_searching_Ttn: 100%|██████████| 81300/81300 [00:04<00:00, 17658.73it/s]
position_searching_Wwp2: 100%|██████████| 3964/3964 [00:00<00:00, 10014.90it/s]


## Blast and extract blast results

NCBIXML: https://homolog.us/Biopython/Bio.Blast.NCBIXML.html#read/0

BlastRecord: https://biopython.org/docs/1.75/api/Bio.Blast.Record.html

XML viewer: https://codebeautify.org/xmlviewer#


In [None]:
# all candidate binding sites are submitted to BLAST in a single query
with open(file_out_dir + total_pre_binding_file_name, "r") as f:
    fasta_string = f.read()
txid = [2697049]  # organism
# NOTE(review): txid is not passed to qblast below, so the search is not
# restricted by organism — confirm whether a filter was intended

# Submit BLAST search and get handle object
handle = NCBIWWW.qblast(
    program="blastn",
    megablast="yes",
    database="refseq_rna",
    sequence=fasta_string,
    url_base="https://blast.ncbi.nlm.nih.gov/Blast.cgi",
    format_object="Alignment",
    format_type="Xml",
)

# read handle object and save to a file
try:
    blast_xml = handle.read()
finally:
    # close the result handle like the Entrez handles elsewhere in this notebook
    handle.close()
with open(tmp + blast_results_file, "w") as f:
    f.write(blast_xml)

In [6]:
# Extract interested information from blast_results
# number of aligned sequences per query, in query order
align_num = []

# read the id/plus-minus part/align_num
# the n-th BLAST record is written to row n of FOI, i.e. records are expected
# in the same order as the binding sites were submitted
with open(tmp + blast_results_file, "r") as blast_output:
    for loca, blast_record in enumerate(NCBIXML.parse(blast_output)):
        alignments = blast_record.alignments
        # get align num of each binding site
        length = len(alignments)
        align_num.append(length)

        # get accession and descrip of each align seq
        # assumes a pipe-delimited hit title with the accession as 4th field
        # and the free-text description last — TODO confirm for the BLAST output used
        align_accession = []
        align_descrip_list = []
        for i in range(length):
            descrip = blast_record.descriptions[i].title.split("|")
            align_accession.append(descrip[3])
            align_descrip_list.append(descrip[-1])
        FOI.loc[loca, "align_accession"] = "|".join(align_accession)

        # add align_descrip to df
        FOI.loc[loca, "align_descrip"] = "|".join(align_descrip_list)

        # get plus/minus of each align seq (taken from the first HSP of each hit)
        p_m = [alignment.hsps[0].frame[1] for alignment in alignments]

        # add plus/minus to df
        # (joining str() values cannot fail, so no fallback branch is needed)
        FOI.loc[loca, "plus/minus"] = ",".join(str(strand) for strand in p_m)

# raises if the number of BLAST records differs from the number of FOI rows
FOI["align_num"] = align_num

## Select wanted binding site


In [7]:
# every candidate binding site starts out as wanted; the sieve cell clears the flag
FOI["wanted"] = [True] * FOI.shape[0]

In [8]:
# sieve for the suitable binding site
# gene names are compared case-insensitively (everything is upper-cased)
gene_name_list = [_.upper() for _ in gene_name_list]
# requested genes that have not shown up in FOI yet; what remains after the
# loop are the genes without any candidate binding site
gene_name_list_out = list(gene_name_list)
requested_genes = set(gene_name_list)

# FOI was built with ignore_index=True, so row labels are 0..len(FOI)-1
for i in range(len(FOI)):
    # check gene_name
    gene_name = FOI.loc[i, "gene_name"]
    gene_key = gene_name.upper()
    if gene_key not in requested_genes:
        FOI.loc[i, "wanted"] = False
        continue
    # remove the upper-cased name: the list holds upper-cased entries, so
    # removing the original-case name would fail for mixed-case gene names
    if gene_key in gene_name_list_out:
        gene_name_list_out.remove(gene_key)

    # rows that are already rejected are not examined further
    if not FOI.loc[i, "wanted"]:
        continue

    # check DNA or mRNA type
    if FOI.loc[i, "mol_type"] != "mRNA":
        FOI.loc[i, "wanted"] = False
        continue

    # check gene_organism name: reject when any hit description mentions the
    # same organism but not the gene itself
    spe_ori, gene_ori = FOI.loc[i, "organism"], FOI.loc[i, "gene_name"]
    descrip = FOI.loc[i, "align_descrip"].split("|")
    if any(gene_ori not in des and spe_ori in des for des in descrip):
        FOI.loc[i, "wanted"] = False
        continue

    # check plus/minus: keep the row only if at least one hit reports "-1"
    plus_minus = FOI.loc[i, "plus/minus"]
    if pd.isnull(plus_minus) or "-1" not in plus_minus.split(","):
        FOI.loc[i, "wanted"] = False

# write the whole information of interest to an excel file in tmp dir
FOI.to_excel(tmp + "probes_sieve.xlsx")

# for every gene keep only the binding sites at the positions chosen by
# find_max_min_difference_fixed_length_subsequence
out_tmp = FOI[FOI["wanted"] == True]
selected_subsets = []
for gene in out_tmp.gene_name.unique():
    gene_rows = out_tmp[out_tmp.gene_name == gene]
    pos_of_True = list(gene_rows["pos_on_seq"])
    best_pos = find_max_min_difference_fixed_length_subsequence(
        pos_of_True,
        length=3,
        min_gap=40,
        better_gap=80,
        gene=gene,
    )
    selected_subsets.append(gene_rows[gene_rows["pos_on_seq"].isin(best_pos)])
# pd.concat rejects an empty list, so fall back to an empty frame
output_df = pd.concat(selected_subsets) if selected_subsets else pd.DataFrame()

# write the output to a xlsx file
output_df.to_excel(output + "probes_wanted.xlsx")