# Probe Designer


## Environment


In [1]:
# basic env
import os
import sys
import pandas as pd
import time
import json
from tqdm import tqdm

# processing of sequence files from ncbi
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import MeltingTemp as mt

# # get gene data from ncbi
# from Bio import Entrez

# # blast and xml file process
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

# add package to sys var
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("../lib")

# File names of the inputs and intermediate artefacts used by later cells
gene_name_list_file = "gene_list.xlsx"
gene_name_list_tosearch = "gene_name_list_tosearch.txt"
pre_binding_num_file = "pre_binding_num.json"
blast_results_file = "blast_results.xml"

# Naming scheme of the binding-site FASTA files
pre_binding_file_suffix = "_pre_binding.fasta"
total_pre_binding_file_name = "_total.fasta"

# Working directory of this dataset
workdir = './dataset/2023.12.15_Sindy_Isoforms/'
os.makedirs(workdir, exist_ok=True)

# Every run writes into its own timestamped folder below <workdir>/results
current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)
output = os.path.join(workdir, 'results', formatted_time)
os.makedirs(output, exist_ok=True)

# Only the path is defined here; the directory itself is not created by this cell
pre_binding_dir = os.path.join(output, "pre_binding")

In [7]:
# Organism label attached to every binding-site record
organism = 'mouse'

# Read the gene table shipped with the dataset
gene_list_path = os.path.join(workdir, gene_name_list_file)
gene_info = pd.read_excel(gene_list_path, sheet_name='Sheet1')
# gene_list = list(gene_info['gene_name'].unique())

# Only the first row of the table is processed
isoform_list = list(gene_info['isoform'])[:1]
id_list = list(gene_info['ensembl_id'])[:1]

# Maps a sequence description to its sequence; filled by the fetch cell
sequences_of_all = {}

## Get seq from ensembl dataset

In [8]:
from lib.database_interaction import ensembl_id_to_seqs
import time

# Retry policy for the Ensembl lookup.
max_trials = 4   # same threshold as the previous `trial > 3` check
retry_delay = 5  # seconds to wait between two attempts

skip = 0        # number of transcripts given up on after max_trials failures
trial = 0       # failed attempts for the transcript currently being fetched
sequences = {}  # result of the most recent successful lookup

# Walk the isoform names and transcript IDs in lockstep so that a name is
# always paired with its own ID, and retry each transcript independently.
for isoform, ensembl_id in zip(isoform_list, id_list):
    trial = 0
    while trial < max_trials:
        try:
            sequences = ensembl_id_to_seqs(gene=isoform, gene_id=ensembl_id, seq_type='cds')
        except Exception as err:
            # Report the failure instead of silently swallowing it.
            trial += 1
            print(f"Gene {isoform} ({ensembl_id}): attempt {trial}/{max_trials} failed: {err!r}")
            if trial < max_trials:
                time.sleep(retry_delay)
            continue
        if not sequences:
            # An empty result is not an error, but it means nothing will be
            # designed for this transcript, so make it visible.
            print(f"Gene {isoform} ({ensembl_id}): lookup returned no sequence")
        sequences_of_all.update(sequences.items())
        break
    else:
        # All attempts failed: move on to the next transcript.
        skip += 1
        print(f"Gene {isoform} ({ensembl_id}): skipped after {max_trials} failed attempts")

Gene:	Gm16024-201: 100%|██████████| 1/1 [00:00<?, ?it/s]


In [11]:
# Show the transcript IDs selected for this run
id_list

['ENSMUST00000128841.1']

In [13]:
import requests

# Query the Ensembl REST `overlap/id` endpoint for the transcript features
# associated with a single, hard-coded transcript ID.
transcripts_url = f"http://rest.ensembl.org/overlap/id/{'ENSMUST00000128841'}?feature=transcript;content-type=application/json"
# A timeout keeps the cell from hanging forever on an unresponsive server.
response = requests.get(url=transcripts_url, timeout=30)
# Fail loudly on an HTTP error instead of trying to parse an error body.
response.raise_for_status()
transcripts = response.json()
transcripts

[{'transcript_id': 'ENSMUST00000105608',
  'transcript_support_level': '5 (assigned to previous version 8)',
  'biotype': 'protein_coding',
  'logic_name': 'ensembl_havana_transcript_mus_musculus',
  'seq_region_name': '4',
  'is_canonical': 1,
  'assembly_name': 'GRCm39',
  'description': None,
  'ccdsid': 'CCDS38995.1',
  'source': 'ensembl_havana',
  'external_name': 'Slc35e2-202',
  'end': 155707797,
  'feature_type': 'transcript',
  'id': 'ENSMUST00000105608',
  'strand': 1,
  'start': 155685873,
  'version': 9,
  'Parent': 'ENSMUSG00000042202',
  'tag': 'basic'},
 {'start': 155686068,
  'strand': 1,
  'id': 'ENSMUST00000043829',
  'tag': 'basic',
  'version': 11,
  'Parent': 'ENSMUSG00000042202',
  'ccdsid': 'CCDS38995.1',
  'assembly_name': 'GRCm39',
  'description': None,
  'feature_type': 'transcript',
  'end': 155704547,
  'external_name': 'Slc35e2-201',
  'source': 'havana',
  'transcript_support_level': '1 (assigned to previous version 10)',
  'transcript_id': 'ENSMUST00000

In [10]:
# Show the result of the most recent ensembl_id_to_seqs call
sequences

{}

In [6]:
# Show the result of the most recent ensembl_id_to_seqs call
sequences

{}

In [5]:
# Number of sequences collected in sequences_of_all so far
len(sequences_of_all)

0

In [None]:
import json

# Store every collected sequence as JSON inside this run's results folder
sequences_json_path = os.path.join(output, 'sequences.json')
with open(sequences_json_path, "w") as sequences_handle:
    json.dump(sequences_of_all, sequences_handle)

## Binding site Searcher


In [27]:
from lib.search_binding import step_by_step, find_max_min_difference_fixed_length_subsequence, seq_minus

# Initiation of array
binding_site_FOIs = [
    "accession",
    "gene_name",
    "mol_type",
    "organism",
    "pos_on_seq",
    "binding",
    "Tm_l",
    "Tm_r",
    "wanted",
]
align_FOIs = ["align_num", "align_accession", "align_descrip", "plus/minus"]
FOI = pd.DataFrame(columns=binding_site_FOIs + align_FOIs)

# Search binding sites on mRNA sequence
file_out_dir = pre_binding_dir
os.makedirs(file_out_dir, exist_ok=True)

# Combined FASTA holding the candidates of every gene. The path is built with
# os.path.join so the file lands INSIDE file_out_dir; plain string
# concatenation would produce a sibling file named "<dir>_total.fasta".
total_file_out = os.path.join(file_out_dir, total_pre_binding_file_name)

pre_binding_num = {}
binding_site_frames = []  # one DataFrame per gene, concatenated once at the end

# Opening in "w" mode starts from an empty combined file; the handle stays
# open for the whole loop so every gene is appended to the same file.
with open(total_file_out, "w") as total_handle:
    for desc, seq in sequences_of_all.items():
        # desc is expected to have exactly three '|'-separated fields
        accession, gene_name, mol_type = desc.split('|')
        minus_seq = seq_minus(seq)

        Tm_l, Tm_r, selected_substrings, pos_on_seq = step_by_step(
            minus_seq,
            BDS_len=40,
            BDS_num=50,
            min_gap=1,
            better_gap=40,
            gene=gene_name,
            G_min=0.25,
            G_max=0.7,
            G_consecutive=5,
            Tm_low=50,
            Tm_high=65,
        )

        # one FASTA record per candidate binding site
        record_list = [
            SeqRecord(
                Seq(pre_binding_tmp),
                id="pre_binding" + str(i),
                description="|".join([accession, gene_name, organism, mol_type]),
            )
            for i, pre_binding_tmp in enumerate(selected_substrings)
        ]

        # add information about binding sites to FOI
        binding_site_frames.append(
            pd.DataFrame(
                {
                    "accession": [accession] * len(selected_substrings),
                    "gene_name": [gene_name] * len(selected_substrings),
                    "mol_type": [mol_type] * len(selected_substrings),
                    "organism": [organism] * len(selected_substrings),
                    "binding": selected_substrings,
                    "Tm_l": Tm_l,
                    "Tm_r": Tm_r,
                    "pos_on_seq": pos_on_seq,
                }
            )
        )

        file_out = os.path.join(file_out_dir, gene_name + pre_binding_file_suffix)

        # write pre_binding to the per-gene file and to the combined file
        with open(file_out, "w") as f:
            SeqIO.write(record_list, f, "fasta")
        SeqIO.write(record_list, total_handle, "fasta")

        # record the num of pre_binding for each gene
        pre_binding_num[f"{accession}_{gene_name}"] = len(selected_substrings)

# Single concatenation instead of growing FOI inside the loop; FOI stays first
# so the column order of the result is unchanged.
FOI = pd.concat([FOI, *binding_site_frames], ignore_index=True)

with open(os.path.join(output, pre_binding_num_file), "w") as f:
    json.dump(pre_binding_num, f)

position_searching_CDX2-201: 100%|██████████| 714/714 [00:00<00:00, 23093.97it/s]
  FOI = pd.concat([FOI, add], ignore_index=True)


Gene CDX2-201: 	condition too harsh, loose to get better results
[0, 6, 12, 18, 24, 30, 36, 42, 51, 59, 69, 75, 149, 158, 164, 170, 178, 184, 192, 207, 232, 282, 288, 294, 301, 309, 315, 321, 329, 343, 349, 363, 411, 417, 423, 429, 436, 449, 456, 469, 475, 481, 487, 497, 503, 512, 521, 532, 541, 547, 604, 612, 654, 665, 683, 703]


position_searching_GATA3-202: 100%|██████████| 1029/1029 [00:00<00:00, 23686.62it/s]


Gene GATA3-202: 	condition too harsh, loose to get better results
[8, 28, 101, 155, 167, 179, 191, 203, 215, 227, 239, 251, 264, 276, 288, 300, 320, 332, 344, 357, 371, 383, 404, 416, 497, 509, 521, 533, 551, 563, 575, 587, 599, 611, 624, 636, 682, 694, 706, 718, 730, 743, 800, 819, 831, 898, 910, 925, 937, 949, 961, 973, 985, 997, 1009]


position_searching_YAP1-210: 100%|██████████| 1183/1183 [00:00<00:00, 26359.09it/s]


Gene YAP1-210: 	condition too harsh, loose to get better results
[0, 13, 33, 59, 96, 109, 121, 133, 159, 246, 283, 295, 321, 341, 353, 369, 381, 432, 444, 456, 468, 482, 499, 511, 524, 571, 586, 645, 657, 682, 731, 743, 755, 893, 905, 944, 956, 986, 1001, 1013, 1026, 1038, 1050, 1063, 1075, 1087, 1100, 1112, 1124, 1137, 1149, 1168, 1180]


position_searching_GATA2-207: 100%|██████████| 1341/1341 [00:00<00:00, 22556.99it/s]


Gene GATA2-207: 	condition too harsh, loose to get better results
[48, 78, 110, 131, 154, 175, 198, 219, 240, 261, 282, 303, 327, 350, 385, 406, 436, 457, 478, 499, 520, 541, 562, 611, 652, 674, 704, 726, 753, 774, 848, 869, 890, 918, 950, 971, 992, 1030, 1051, 1072, 1104, 1125, 1146, 1167, 1188, 1209, 1233, 1277, 1298, 1319]


position_searching_TFAP2C-201: 100%|██████████| 1043/1043 [00:00<00:00, 29050.13it/s]


Gene TFAP2C-201: 	condition too harsh, loose to get better results
[0, 50, 62, 67, 72, 82, 87, 142, 313, 318, 323, 428, 448, 453, 458, 484, 505, 513, 521, 526, 531, 536, 541, 555, 560, 565, 573, 578, 583, 641, 646, 651, 656, 662, 667, 672, 677, 682, 687, 692, 741, 746, 751, 756, 761, 951, 956, 965, 970, 975, 980, 985, 990, 995, 1034, 1039]


position_searching_MSX2-201: 100%|██████████| 604/604 [00:00<00:00, 22430.23it/s]


Gene MSX2-201: 	condition too harsh, loose to get better results
[0, 9, 15, 35, 55, 61, 67, 73, 115, 121, 136, 142, 156, 169, 175, 181, 189, 231, 237, 244, 250, 256, 264, 270, 276, 282, 288, 294, 300, 306, 312, 318, 324, 330, 336, 344, 350, 383, 401, 407, 450, 464, 512, 520, 528, 536, 542, 548, 554, 560, 566, 572, 578]


position_searching_WNT3-201: 100%|██████████| 816/816 [00:00<00:00, 23377.61it/s]


Gene WNT3-201: 	condition too harsh, loose to get better results
[9, 20, 31, 42, 53, 64, 75, 126, 146, 170, 203, 232, 245, 258, 271, 305, 316, 327, 338, 349, 361, 376, 402, 422, 452, 497, 510, 521, 533, 544, 556, 567, 578, 592, 605, 616, 627, 638, 649, 660, 676, 696, 707, 719, 731, 755, 775, 786, 797, 808]


position_searching_BMP4-201: 100%|██████████| 943/943 [00:00<00:00, 23638.00it/s]


Gene BMP4-201: 	condition too harsh, loose to get better results
[18, 31, 44, 57, 70, 87, 100, 115, 136, 150, 164, 177, 190, 207, 220, 237, 253, 266, 279, 296, 319, 339, 359, 372, 385, 401, 414, 438, 461, 474, 487, 500, 567, 588, 605, 618, 632, 694, 707, 720, 733, 750, 770, 827, 840, 853, 873, 893, 906, 919, 932]


position_searching_ITGA5-201: 100%|██████████| 2480/2480 [00:00<00:00, 26568.81it/s]


Gene ITGA5-201: 	condition too harsh, loose to get better results
[5, 41, 75, 109, 143, 177, 211, 248, 282, 327, 368, 402, 442, 560, 594, 628, 668, 702, 736, 772, 806, 864, 925, 1026, 1078, 1112, 1236, 1278, 1312, 1346, 1419, 1470, 1514, 1569, 1642, 1676, 1710, 1774, 1811, 1849, 1918, 2014, 2048, 2103, 2174, 2217, 2294, 2328, 2362, 2427]


position_searching_ITGB1-232: 100%|██████████| 1944/1944 [00:00<00:00, 108286.10it/s]


Gene ITGB1-232: 	condition too harsh, loose to get better results
[22, 147, 161, 163, 166, 168, 170, 176, 178, 180, 182, 386, 389, 391, 393, 395, 397, 399, 401, 403, 453, 455, 457, 459, 461, 463, 465, 468, 799, 801, 803, 805, 1380, 1384, 1577, 1580, 1582, 1584, 1586, 1588, 1590, 1592, 1597, 1599, 1601, 1603, 1605, 1608, 1628, 1878, 1904, 1906, 1908]


position_searching_EOMES-202: 100%|██████████| 1656/1656 [00:00<00:00, 25944.15it/s]


Gene EOMES-202: 	condition too harsh, loose to get better results
[25, 45, 56, 67, 125, 136, 147, 167, 251, 262, 273, 285, 296, 309, 320, 331, 342, 353, 407, 427, 643, 680, 745, 861, 884, 895, 907, 918, 941, 1023, 1046, 1057, 1068, 1079, 1091, 1143, 1165, 1176, 1272, 1283, 1331, 1351, 1401, 1416, 1461, 1472, 1483, 1613, 1628, 1639]


position_searching_DAB2-201: 100%|██████████| 1811/1811 [00:00<00:00, 33626.62it/s]


Gene DAB2-201: 	condition too harsh, loose to get better results
[52, 72, 137, 157, 173, 183, 193, 203, 213, 302, 315, 325, 335, 345, 383, 393, 466, 476, 527, 537, 547, 575, 602, 622, 632, 642, 652, 664, 718, 732, 743, 758, 779, 794, 804, 814, 836, 847, 857, 870, 890, 937, 958, 978, 990, 1000, 1066, 1251, 1269, 1793, 1803]


position_searching_DNMT3L-201: 100%|██████████| 892/892 [00:00<00:00, 26305.27it/s]


Gene DNMT3L-201: 	condition too harsh, loose to get better results
[1, 26, 40, 52, 66, 78, 90, 102, 114, 126, 138, 150, 178, 204, 224, 241, 253, 265, 277, 289, 304, 461, 473, 494, 513, 525, 540, 552, 564, 576, 588, 600, 612, 642, 668, 688, 700, 712, 724, 737, 749, 762, 775, 789, 813, 826, 838, 850, 862, 880]


position_searching_CGB5-201: 100%|██████████| 360/360 [00:00<00:00, 22559.45it/s]


Gene CGB5-201: 	condition too harsh, loose to get better results
[13, 57, 62, 77, 82, 87, 91, 97, 101, 105, 109, 117, 121, 125, 129, 133, 137, 152, 157, 172, 176, 180, 184, 190, 194, 198, 202, 207, 243, 250, 254, 258, 262, 267, 271, 275, 279, 283, 287, 304, 308, 324, 328, 332, 336, 340, 344, 348, 352, 356]


position_searching_CGB8-201: 100%|██████████| 360/360 [00:00<00:00, 21233.40it/s]


Gene CGB8-201: 	condition too harsh, loose to get better results
[13, 57, 62, 77, 82, 87, 91, 97, 101, 105, 109, 117, 121, 125, 129, 133, 137, 152, 157, 172, 176, 180, 184, 190, 194, 198, 202, 207, 243, 250, 254, 258, 262, 267, 271, 275, 279, 283, 287, 304, 308, 324, 328, 332, 336, 340, 344, 348, 352, 356]


position_searching_MOB3B-201: 100%|██████████| 481/481 [00:00<00:00, 30144.19it/s]

Gene MOB3B-201: 	condition too harsh, loose to get better results
[21, 24, 28, 30, 33, 35, 37, 43, 45, 47, 49, 91, 227, 229, 231, 233, 248, 290, 294, 296, 298, 300, 302, 304, 306, 308, 310, 313, 320, 322, 325, 332, 334, 419, 430, 432, 434, 436, 438, 440, 442, 444, 448, 450, 452, 454, 456, 458, 460, 462, 464, 466, 468, 470, 472, 474, 476, 478, 480]



position_searching_NLRP9-201: 100%|██████████| 2342/2342 [00:00<00:00, 46965.68it/s]


Gene NLRP9-201: 	condition too harsh, loose to get better results
[167, 173, 179, 185, 191, 197, 233, 240, 246, 252, 336, 344, 350, 356, 362, 368, 411, 431, 437, 443, 449, 540, 546, 553, 559, 566, 573, 709, 715, 729, 735, 1193, 1199, 1205, 1219, 1239, 1249, 1255, 1261, 1267, 1476, 1488, 1581, 1910, 2042, 2050, 2056, 2062, 2070, 2215, 2221, 2227, 2233, 2315]


position_searching_ZSCAN4-201: 100%|██████████| 1002/1002 [00:00<00:00, 55817.84it/s]


Gene ZSCAN4-201: 	Not enough pos for 50 binding sites.
Gene ZSCAN4-201: 	condition too harsh, loose to get better results
[2, 4, 5, 6, 9, 71, 72, 274, 275, 277, 281, 282, 283, 284, 285, 286, 287, 288, 289, 290, 291, 292, 293, 294, 295, 296, 297, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 309, 310, 316, 317, 319, 320, 321, 322, 323, 324, 325, 326, 327, 630, 632, 633, 634, 635, 636, 637, 638, 639, 640, 644, 645, 646, 647, 648, 649, 650, 651, 652, 661, 662, 743, 744, 745, 746, 747, 748, 749, 750, 758, 925, 926, 927, 928]


position_searching_ATP1B3-201: 100%|██████████| 632/632 [00:00<00:00, 70428.83it/s]


Gene ATP1B3-201: 	Not enough pos for 50 binding sites.
Gene ATP1B3-201: 	condition too harsh, loose to get better results
[27, 357, 358, 359, 485, 486]


position_searching_NANOG-201: 100%|██████████| 696/696 [00:00<00:00, 24064.86it/s]


Gene NANOG-201: 	condition too harsh, loose to get better results
[13, 23, 36, 44, 56, 62, 85, 105, 111, 117, 123, 129, 135, 141, 147, 153, 159, 165, 171, 177, 183, 189, 195, 201, 207, 215, 221, 227, 233, 239, 245, 251, 271, 291, 297, 303, 309, 386, 392, 399, 407, 413, 419, 427, 433, 439, 503, 570, 578, 584, 595, 601, 646, 666]


position_searching_SOX2-201: 100%|██████████| 724/724 [00:00<00:00, 21350.46it/s]


Gene SOX2-201: 	condition too harsh, loose to get better results
[0, 10, 18, 105, 113, 121, 139, 147, 159, 167, 176, 184, 192, 200, 209, 217, 225, 233, 241, 249, 257, 265, 273, 291, 311, 320, 330, 338, 346, 354, 362, 370, 470, 489, 498, 513, 522, 531, 539, 547, 557, 565, 614, 624, 632, 640, 648, 657, 672, 692, 700, 711, 719]


position_searching_POU5F1-201: 100%|██████████| 827/827 [00:00<00:00, 36053.31it/s]


Gene POU5F1-201: 	condition too harsh, loose to get better results
[2, 8, 15, 22, 28, 35, 41, 89, 99, 105, 147, 153, 159, 165, 171, 177, 183, 189, 210, 254, 260, 266, 272, 278, 284, 392, 399, 405, 411, 448, 454, 461, 467, 479, 485, 491, 498, 510, 518, 530, 536, 545, 551, 557, 588, 594, 600, 606, 619, 625, 671, 677, 683, 693, 703, 713]


position_searching_KLF4-201: 100%|██████████| 1112/1112 [00:00<00:00, 20272.20it/s]


Gene KLF4-201: 	condition too harsh, loose to get better results
[1, 15, 29, 47, 67, 81, 99, 116, 130, 147, 161, 175, 192, 206, 223, 243, 259, 273, 287, 308, 322, 378, 422, 436, 493, 507, 523, 537, 551, 565, 579, 639, 653, 667, 684, 698, 712, 819, 837, 851, 869, 889, 903, 917, 974, 991, 1005, 1019, 1071, 1091]


position_searching_PRDM14-201: 100%|██████████| 1334/1334 [00:00<00:00, 32985.13it/s]


Gene PRDM14-201: 	condition too harsh, loose to get better results
[4, 14, 24, 69, 82, 191, 340, 352, 362, 372, 387, 438, 451, 506, 516, 526, 541, 752, 858, 873, 883, 893, 907, 918, 929, 939, 949, 959, 969, 979, 989, 1017, 1037, 1048, 1101, 1115, 1131, 1141, 1151, 1161, 1171, 1181, 1191, 1201, 1211, 1221, 1231, 1241, 1251, 1264, 1277, 1287, 1297]


position_searching_TBX3-201: 100%|██████████| 1746/1746 [00:00<00:00, 24314.56it/s]


Gene TBX3-201: 	condition too harsh, loose to get better results
[14, 35, 55, 75, 96, 116, 135, 155, 174, 194, 213, 233, 263, 284, 304, 322, 340, 359, 379, 397, 416, 434, 452, 473, 544, 597, 661, 679, 741, 761, 781, 799, 825, 845, 863, 908, 927, 953, 973, 1045, 1306, 1408, 1432, 1452, 1472, 1621, 1666, 1686, 1716, 1738]


position_searching_GDF3-201: 100%|██████████| 837/837 [00:00<00:00, 25431.44it/s]


Gene GDF3-201: 	condition too harsh, loose to get better results
[11, 17, 65, 80, 132, 156, 212, 216, 220, 224, 229, 233, 237, 242, 246, 250, 254, 260, 280, 300, 313, 404, 425, 430, 445, 488, 492, 496, 506, 512, 520, 524, 528, 532, 536, 540, 544, 558, 562, 579, 583, 606, 716, 734, 752, 758, 773, 777, 781, 785]


position_searching_NODAL-201: 100%|██████████| 796/796 [00:00<00:00, 24186.75it/s]


Gene NODAL-201: 	condition too harsh, loose to get better results
[6, 14, 21, 34, 46, 54, 63, 89, 96, 103, 110, 117, 124, 131, 139, 146, 168, 190, 201, 220, 227, 277, 284, 302, 337, 419, 439, 446, 453, 460, 534, 544, 552, 559, 582, 602, 609, 616, 623, 630, 639, 646, 655, 662, 670, 733, 740, 747, 759, 766, 779, 786, 793]


position_searching_LEFTY2-201: 100%|██████████| 841/841 [00:00<00:00, 16865.05it/s]


Gene LEFTY2-201: 	condition too harsh, loose to get better results
[0, 10, 20, 30, 43, 94, 166, 183, 203, 213, 223, 233, 243, 311, 321, 332, 352, 366, 416, 426, 436, 446, 456, 466, 476, 486, 496, 506, 516, 526, 545, 596, 609, 619, 629, 639, 650, 670, 680, 690, 700, 710, 720, 730, 740, 785, 797, 807, 817, 827, 838]


position_searching_UTF1-201: 100%|██████████| 782/782 [00:00<00:00, 23759.63it/s]


Gene UTF1-201: 	condition too harsh, loose to get better results
[0, 2, 12, 14, 16, 18, 20, 22, 32, 34, 54, 202, 204, 209, 213, 216, 277, 279, 281, 283, 285, 287, 289, 291, 293, 296, 298, 303, 305, 309, 318, 366, 370, 372, 374, 377, 462, 466, 482, 484, 486, 488, 490, 492, 494, 511, 514, 541, 596, 617, 620, 637, 655, 657, 659, 661, 665, 737, 748, 766]


position_searching_ZIC2-201: 100%|██████████| 1241/1241 [00:00<00:00, 17282.52it/s]


Gene ZIC2-201: 	condition too harsh, loose to get better results
[72, 129, 149, 165, 181, 197, 213, 229, 249, 265, 281, 297, 313, 342, 361, 380, 396, 412, 428, 444, 465, 485, 503, 519, 539, 559, 575, 592, 612, 629, 645, 666, 684, 704, 732, 775, 791, 823, 843, 869, 889, 957, 1046, 1062, 1078, 1094, 1125, 1172, 1195, 1216, 1236]


position_searching_FGF4-201: 100%|██████████| 457/457 [00:00<00:00, 17623.94it/s]


Gene FGF4-201: 	condition too harsh, loose to get better results
[0, 4, 8, 13, 19, 23, 27, 31, 35, 80, 84, 88, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 160, 168, 172, 180, 188, 192, 198, 214, 234, 240, 245, 254, 258, 262, 266, 270, 274, 279, 283, 288, 292, 296, 304, 312, 316, 364, 378]


position_searching_TBXT-203: 100%|██████████| 1009/1009 [00:00<00:00, 23528.00it/s]


Gene TBXT-203: 	condition too harsh, loose to get better results
[0, 41, 67, 87, 98, 109, 120, 131, 142, 153, 164, 175, 186, 200, 281, 292, 303, 314, 325, 336, 347, 372, 392, 403, 414, 425, 436, 447, 463, 589, 604, 615, 626, 641, 694, 705, 726, 746, 757, 768, 779, 790, 847, 860, 871, 882, 893, 905, 916, 927, 976, 991, 1002]


position_searching_GSC-201: 100%|██████████| 580/580 [00:00<00:00, 24230.77it/s]


Gene GSC-201: 	condition too harsh, loose to get better results
[9, 20, 27, 32, 38, 43, 77, 83, 88, 108, 128, 133, 139, 144, 149, 154, 159, 165, 170, 175, 192, 197, 212, 217, 222, 227, 232, 237, 242, 247, 252, 257, 262, 267, 272, 277, 282, 287, 292, 297, 302, 413, 427, 432, 438, 453, 509, 514, 521, 531, 536, 541, 556, 561, 566, 576]


position_searching_GATA4-206: 100%|██████████| 1026/1026 [00:00<00:00, 18704.47it/s]


Gene GATA4-206: 	condition too harsh, loose to get better results
[0, 24, 44, 55, 66, 77, 88, 99, 110, 121, 132, 143, 154, 165, 176, 195, 215, 238, 249, 260, 271, 282, 293, 304, 315, 330, 341, 352, 363, 374, 385, 396, 407, 418, 429, 440, 452, 463, 481, 492, 503, 514, 618, 679, 690, 701, 716, 853, 864, 875, 1006]


position_searching_SOX17-201: 100%|██████████| 957/957 [00:00<00:00, 24602.21it/s]


Gene SOX17-201: 	condition too harsh, loose to get better results
[26, 34, 42, 50, 141, 149, 156, 163, 214, 222, 234, 242, 254, 266, 335, 342, 393, 403, 410, 423, 430, 437, 444, 451, 458, 467, 477, 484, 492, 499, 508, 550, 563, 680, 699, 706, 713, 720, 733, 740, 757, 767, 805, 812, 819, 826, 833, 844, 878, 885, 892, 899]


position_searching_PDGFRA-201: 100%|██████████| 2576/2576 [00:00<00:00, 39736.84it/s]


Gene PDGFRA-201: 	condition too harsh, loose to get better results
[2, 147, 163, 174, 232, 259, 318, 340, 351, 449, 460, 699, 824, 835, 846, 930, 954, 1027, 1074, 1336, 1348, 1360, 1371, 1382, 1402, 1413, 1424, 1435, 1448, 1469, 1480, 1492, 1510, 1609, 1632, 1879, 1894, 1948, 2036, 2083, 2337, 2356, 2367, 2378, 2395, 2406, 2417, 2441, 2540, 2565]


position_searching_PODXL-202: 100%|██████████| 1303/1303 [00:00<00:00, 22292.11it/s]


Gene PODXL-202: 	condition too harsh, loose to get better results
[8, 28, 47, 68, 97, 116, 216, 276, 298, 317, 337, 386, 405, 424, 480, 500, 519, 538, 557, 576, 595, 614, 633, 652, 698, 717, 736, 772, 793, 813, 832, 883, 902, 943, 962, 981, 1002, 1039, 1058, 1077, 1097, 1116, 1135, 1156, 1176, 1195, 1214, 1233, 1252, 1271, 1290]


position_searching_LAMA1-201: 100%|██████████| 7344/7344 [00:00<00:00, 36136.92it/s]
position_searching_AMOTL1-203: 100%|██████████| 2257/2257 [00:00<00:00, 31184.78it/s]


Gene AMOTL1-203: 	condition too harsh, loose to get better results
[0, 41, 121, 154, 185, 216, 370, 401, 432, 464, 509, 551, 582, 613, 699, 784, 815, 848, 974, 1005, 1075, 1106, 1174, 1254, 1327, 1358, 1389, 1421, 1482, 1513, 1545, 1585, 1616, 1647, 1683, 1714, 1745, 1776, 1808, 1841, 1894, 1929, 1964, 1995, 2026, 2073, 2104, 2183, 2214, 2245]


position_searching_ANXA3-201: 100%|██████████| 738/738 [00:00<00:00, 123307.83it/s]


Gene ANXA3-201: 	Not enough pos for 50 binding sites.
Gene ANXA3-201: 	condition too harsh, loose to get better results
[87, 88, 89, 90, 91, 93, 94, 95, 96, 482, 574, 594, 595, 596, 597, 598, 605, 606, 607, 608, 609, 610, 611, 616, 617, 618, 625, 626, 710, 711]


position_searching_OTX2-209: 100%|██████████| 676/676 [00:00<00:00, 33892.96it/s]


Gene OTX2-209: 	condition too harsh, loose to get better results
[77, 86, 90, 94, 98, 112, 116, 120, 149, 230, 235, 245, 249, 258, 269, 279, 289, 293, 299, 303, 309, 313, 317, 321, 329, 333, 337, 341, 381, 414, 418, 422, 426, 462, 466, 470, 474, 478, 482, 562, 566, 570, 577, 582, 586, 591, 595, 599, 605, 611, 615, 624, 636]


position_searching_FOXA2-202: 100%|██████████| 1074/1074 [00:00<00:00, 21538.15it/s]


Gene FOXA2-202: 	condition too harsh, loose to get better results
[0, 10, 20, 30, 40, 50, 60, 78, 98, 108, 118, 285, 299, 314, 443, 454, 464, 474, 484, 494, 504, 514, 528, 538, 548, 570, 590, 601, 611, 621, 632, 642, 652, 668, 678, 688, 698, 716, 736, 746, 756, 766, 814, 894, 994, 1004, 1015, 1025, 1035, 1060]


position_searching_BMP2-201: 100%|██████████| 913/913 [00:00<00:00, 28604.08it/s]


Gene BMP2-201: 	condition too harsh, loose to get better results
[60, 82, 151, 171, 174, 177, 184, 187, 190, 220, 281, 284, 293, 370, 373, 376, 379, 383, 388, 393, 396, 399, 403, 475, 480, 483, 490, 500, 503, 596, 616, 619, 724, 727, 741, 744, 748, 773, 776, 793, 796, 805, 866, 869, 872, 875, 878, 881, 884, 889, 892, 895, 898, 901, 904, 909, 912]


position_searching_COL4A1-201: 100%|██████████| 3968/3968 [00:00<00:00, 34597.23it/s]
position_searching_TP63-201: 100%|██████████| 1595/1595 [00:00<00:00, 28058.55it/s]

Gene TP63-201: 	condition too harsh, loose to get better results
[2, 18, 65, 113, 132, 257, 273, 289, 305, 323, 339, 355, 375, 395, 412, 497, 546, 676, 777, 793, 809, 934, 951, 988, 1052, 1068, 1085, 1101, 1123, 1143, 1159, 1175, 1192, 1209, 1229, 1249, 1267, 1283, 1302, 1318, 1342, 1362, 1378, 1394, 1412, 1432, 1448, 1464, 1480, 1496, 1512, 1528]



position_searching_KRT7-201: 100%|██████████| 1088/1088 [00:00<00:00, 29485.06it/s]


Gene KRT7-201: 	condition too harsh, loose to get better results
[57, 76, 85, 94, 104, 113, 122, 134, 143, 161, 181, 198, 226, 242, 251, 260, 273, 282, 293, 302, 327, 390, 399, 414, 462, 471, 480, 489, 498, 507, 517, 526, 537, 546, 555, 564, 582, 800, 810, 819, 828, 838, 847, 858, 867, 887, 896, 919, 966, 980, 1046]


position_searching_CGB3-201: 100%|██████████| 360/360 [00:00<00:00, 21235.49it/s]


Gene CGB3-201: 	condition too harsh, loose to get better results
[13, 57, 62, 77, 82, 87, 91, 97, 101, 105, 109, 117, 121, 125, 129, 133, 137, 152, 157, 172, 176, 180, 184, 190, 194, 198, 202, 207, 243, 250, 254, 258, 262, 267, 271, 275, 279, 283, 287, 304, 308, 324, 328, 332, 336, 340, 344, 348, 352, 356]


position_searching_CGA-202: 100%|██████████| 316/316 [00:00<00:00, 21123.93it/s]


Gene CGA-202: 	condition too harsh, loose to get better results
[93, 96, 107, 113, 116, 130, 132, 134, 136, 138, 140, 142, 144, 150, 152, 164, 166, 168, 170, 172, 184, 189, 203, 209, 212, 223, 227, 229, 231, 233, 235, 237, 239, 243, 247, 249, 251, 253, 255, 257, 259, 263, 268, 273, 275, 277, 279, 281, 284, 288, 293, 295]


position_searching_PLAC1-201: 100%|██████████| 473/473 [00:00<00:00, 26347.39it/s]


Gene PLAC1-201: 	Not enough pos for 50 binding sites.
Gene PLAC1-201: 	condition too harsh, loose to get better results
[2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 81, 82, 138, 139, 143, 144, 145, 146, 158, 159, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 254, 256, 259, 291, 304, 305, 306, 313, 333, 351, 441, 453, 454, 455, 456, 457, 458]


position_searching_SYN1-201: 100%|██████████| 1656/1656 [00:00<00:00, 31930.89it/s]

Gene SYN1-201: 	condition too harsh, loose to get better results
[169, 179, 187, 295, 343, 352, 360, 561, 598, 651, 659, 674, 691, 699, 707, 715, 725, 769, 857, 882, 954, 963, 974, 982, 990, 998, 1006, 1014, 1022, 1030, 1050, 1076, 1084, 1093, 1199, 1209, 1273, 1288, 1301, 1309, 1317, 1342, 1420, 1436, 1510, 1518, 1528, 1536, 1617, 1637, 1648]



position_searching_SYN2-206: 100%|██████████| 1361/1361 [00:00<00:00, 28429.94it/s]


Gene SYN2-206: 	condition too harsh, loose to get better results
[17, 219, 232, 245, 258, 271, 284, 297, 319, 339, 443, 460, 494, 516, 529, 543, 577, 597, 652, 665, 678, 691, 704, 717, 730, 743, 756, 769, 782, 795, 867, 921, 942, 963, 977, 990, 1003, 1016, 1029, 1043, 1059, 1072, 1085, 1165, 1178, 1191, 1204, 1286, 1299, 1320]


position_searching_IGF2-207: 100%|██████████| 529/529 [00:00<00:00, 21216.98it/s]


Gene IGF2-207: 	condition too harsh, loose to get better results
[0, 6, 12, 18, 24, 30, 38, 44, 89, 108, 128, 134, 140, 146, 152, 158, 164, 170, 212, 218, 224, 230, 239, 247, 253, 261, 267, 273, 279, 285, 291, 297, 303, 309, 315, 321, 327, 368, 377, 384, 393, 407, 413, 427, 446, 452, 458, 466, 472, 478, 486, 496]


position_searching_HAND1-201: 100%|██████████| 480/480 [00:00<00:00, 20052.85it/s]


Gene HAND1-201: 	condition too harsh, loose to get better results
[0, 5, 9, 14, 68, 72, 76, 80, 84, 88, 92, 96, 100, 104, 108, 112, 116, 120, 124, 128, 132, 136, 140, 144, 148, 152, 159, 165, 169, 173, 179, 185, 189, 193, 197, 201, 205, 209, 213, 217, 228, 232, 236, 246, 254, 262, 272, 276, 282, 349, 440, 444, 448, 452, 456, 460, 464, 468, 479]


position_searching_HLA-G-204: 100%|██████████| 786/786 [00:00<00:00, 23180.28it/s]


Gene HLA-G-204: 	condition too harsh, loose to get better results
[0, 24, 55, 113, 133, 181, 189, 197, 211, 234, 242, 299, 317, 325, 377, 385, 393, 401, 410, 419, 427, 435, 443, 451, 459, 476, 489, 503, 511, 523, 531, 539, 547, 555, 563, 571, 579, 587, 595, 603, 611, 619, 627, 698, 706, 714, 722, 730, 738, 746, 754, 770]


position_searching_PGF-202: 100%|██████████| 494/494 [00:00<00:00, 15978.05it/s]


Gene PGF-202: 	condition too harsh, loose to get better results
[9, 17, 24, 30, 37, 44, 50, 56, 63, 72, 84, 96, 104, 116, 122, 128, 134, 140, 146, 152, 176, 196, 204, 210, 216, 224, 231, 237, 243, 251, 257, 263, 270, 276, 282, 288, 309, 320, 326, 332, 340, 351, 357, 363, 371, 377, 383, 389, 395, 411, 431, 445, 469, 488]


position_searching_ENG-202: 100%|██████████| 1543/1543 [00:00<00:00, 21311.36it/s]


Gene ENG-202: 	condition too harsh, loose to get better results
[0, 23, 46, 75, 98, 121, 180, 203, 227, 260, 292, 315, 338, 363, 386, 413, 436, 468, 524, 547, 582, 610, 654, 685, 708, 731, 754, 786, 809, 832, 857, 880, 944, 967, 1011, 1034, 1116, 1139, 1162, 1188, 1214, 1237, 1285, 1321, 1344, 1389, 1414, 1438, 1470, 1499, 1523]


position_searching_HTRA4-201: 100%|██████████| 1105/1105 [00:00<00:00, 42617.59it/s]


Gene HTRA4-201: 	condition too harsh, loose to get better results
[104, 108, 112, 381, 385, 389, 393, 409, 415, 468, 622, 673, 677, 693, 700, 704, 714, 739, 745, 749, 764, 768, 787, 799, 834, 876, 882, 886, 890, 894, 898, 906, 910, 929, 949, 954, 958, 963, 967, 971, 975, 980, 1041, 1045, 1049, 1053, 1061, 1065, 1069, 1073, 1077, 1097]


position_searching_MMP2-201: 100%|██████████| 1547/1547 [00:00<00:00, 26291.20it/s]


Gene MMP2-201: 	condition too harsh, loose to get better results
[58, 103, 123, 198, 218, 248, 268, 367, 413, 433, 453, 490, 511, 532, 552, 572, 592, 612, 632, 654, 674, 694, 742, 762, 782, 802, 834, 854, 882, 902, 922, 942, 971, 991, 1011, 1161, 1181, 1201, 1230, 1283, 1303, 1323, 1343, 1366, 1389, 1409, 1431, 1452, 1472, 1492, 1512]


position_searching_ISL1-201: 100%|██████████| 800/800 [00:00<00:00, 25066.06it/s]


Gene ISL1-201: 	condition too harsh, loose to get better results
[24, 39, 48, 67, 76, 91, 100, 109, 118, 127, 136, 161, 179, 192, 243, 257, 266, 275, 296, 306, 315, 324, 333, 342, 351, 360, 369, 380, 389, 400, 419, 439, 448, 457, 466, 475, 485, 495, 505, 522, 531, 540, 549, 560, 569, 578, 587, 596, 641, 653, 673, 682]


position_searching_PAX2-212: 100%|██████████| 1001/1001 [00:00<00:00, 25093.83it/s]


Gene PAX2-212: 	condition too harsh, loose to get better results
[0, 45, 59, 71, 83, 144, 156, 169, 181, 193, 205, 256, 268, 280, 300, 322, 334, 354, 368, 380, 392, 433, 445, 573, 585, 597, 609, 621, 634, 646, 660, 712, 724, 744, 767, 779, 791, 809, 822, 869, 881, 893, 905, 917, 929, 941, 953, 965, 977, 989]


position_searching_HOXA11-201: 100%|██████████| 714/714 [00:00<00:00, 19886.93it/s]

Gene HOXA11-201: 	condition too harsh, loose to get better results
[2, 8, 64, 71, 77, 83, 91, 97, 103, 109, 116, 125, 131, 145, 151, 157, 166, 258, 345, 355, 361, 367, 375, 381, 401, 421, 428, 434, 489, 495, 501, 507, 559, 565, 571, 577, 583, 623, 629, 636, 642, 648, 656, 662, 668, 674, 687, 693, 707, 713]



position_searching_HOXA13-202: 100%|██████████| 895/895 [00:00<00:00, 21367.11it/s]


Gene HOXA13-202: 	condition too harsh, loose to get better results
[78, 84, 90, 96, 102, 108, 114, 120, 126, 135, 141, 147, 155, 161, 167, 173, 191, 211, 217, 224, 230, 279, 285, 291, 297, 303, 309, 319, 325, 331, 341, 347, 354, 361, 367, 381, 387, 401, 407, 413, 454, 463, 477, 483, 497, 503, 509, 516, 536, 556, 564, 725, 737, 757]


position_searching_AQP1-208: 100%|██████████| 608/608 [00:00<00:00, 20182.32it/s]


Gene AQP1-208: 	condition too harsh, loose to get better results
[0, 10, 24, 44, 65, 77, 97, 112, 122, 132, 143, 153, 163, 173, 183, 223, 233, 243, 253, 263, 273, 295, 315, 325, 335, 345, 355, 365, 375, 386, 396, 422, 435, 445, 455, 465, 475, 485, 495, 506, 516, 526, 536, 546, 556, 566, 577, 587, 597, 607]


position_searching_TFAP2A-203: 100%|██████████| 1016/1016 [00:00<00:00, 24251.71it/s]


Gene TFAP2A-203: 	condition too harsh, loose to get better results
[0, 19, 28, 39, 48, 58, 68, 77, 88, 97, 106, 127, 160, 229, 240, 249, 258, 271, 284, 293, 304, 313, 406, 418, 427, 436, 445, 454, 463, 472, 481, 490, 499, 508, 529, 549, 600, 609, 620, 671, 680, 689, 698, 708, 717, 728, 737, 746, 755, 767, 776, 787, 796, 805, 824, 845]


position_searching_GABRP-203: 100%|██████████| 1019/1019 [00:00<00:00, 31930.76it/s]


Gene GABRP-203: 	condition too harsh, loose to get better results
[251, 256, 264, 284, 288, 340, 344, 546, 550, 555, 559, 563, 567, 575, 649, 653, 657, 661, 720, 724, 731, 739, 743, 747, 751, 759, 763, 783, 826, 830, 834, 838, 842, 847, 851, 855, 859, 871, 888, 892, 896, 901, 906, 910, 914, 918, 922, 978, 982, 993, 997]


position_searching_AFP-201: 100%|██████████| 1457/1457 [00:00<00:00, 44269.87it/s]


Gene AFP-201: 	Not enough pos for 50 binding sites.
Gene AFP-201: 	condition too harsh, loose to get better results
[35, 45, 133, 134, 135, 136, 152, 153, 154, 155, 156, 173, 174, 175, 176, 276, 277, 278, 279, 280, 281, 282, 292, 293, 294, 295, 296, 298, 299, 300, 301, 302, 305, 306, 307, 308, 309, 310, 311, 312, 313, 314, 315, 316, 318, 319, 320, 321, 322, 323, 324, 325, 326, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 432, 433, 434, 435, 745, 746, 747, 879, 880, 881]


position_searching_PRDM1-208: 100%|██████████| 1976/1976 [00:00<00:00, 24460.34it/s]


Gene PRDM1-208: 	condition too harsh, loose to get better results
[89, 133, 172, 202, 227, 253, 280, 310, 335, 360, 412, 437, 463, 498, 523, 604, 641, 668, 695, 723, 749, 774, 799, 825, 861, 890, 918, 978, 1013, 1041, 1066, 1095, 1120, 1145, 1170, 1195, 1243, 1279, 1304, 1333, 1416, 1475, 1507, 1548, 1573, 1616, 1676, 1701, 1869, 1967]


position_searching_LIN28A-201: 100%|██████████| 464/464 [00:00<00:00, 38769.61it/s]


Gene LIN28A-201: 	condition too harsh, loose to get better results
[7, 12, 14, 27, 29, 31, 33, 35, 37, 39, 41, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 67, 69, 71, 75, 77, 79, 81, 83, 85, 87, 89, 91, 101, 216, 221, 223, 225, 227, 229, 231, 233, 235, 237, 367, 379, 381, 383, 385, 387, 389, 393, 395, 416, 425, 427, 429, 431, 433, 436]


position_searching_KIT-209: 100%|██████████| 2308/2308 [00:00<00:00, 56444.81it/s]


Gene KIT-209: 	condition too harsh, loose to get better results
[6, 10, 115, 279, 283, 338, 342, 346, 403, 626, 630, 635, 714, 720, 724, 728, 745, 751, 755, 759, 1072, 1218, 1228, 1232, 1236, 1250, 1270, 1320, 1324, 1328, 1332, 1336, 1345, 1353, 1357, 1461, 1587, 2003, 2066, 2070, 2086, 2100, 2104, 2108, 2137, 2164, 2168, 2185, 2205, 2209, 2213, 2217, 2221, 2225, 2229, 2234, 2238, 2242]


position_searching_DAZ1-201: 100%|██████████| 1749/1749 [00:00<00:00, 33723.49it/s]


Gene DAZ1-201: 	condition too harsh, loose to get better results
[521, 532, 537, 542, 547, 552, 557, 562, 567, 586, 902, 915, 920, 928, 934, 939, 948, 954, 965, 974, 994, 1009, 1027, 1032, 1037, 1042, 1047, 1052, 1057, 1062, 1081, 1397, 1410, 1415, 1423, 1429, 1434, 1443, 1449, 1460, 1469, 1489, 1504, 1522, 1527, 1532, 1537, 1542, 1547, 1552, 1557, 1576]


position_searching_DDX4-204: 100%|██████████| 1701/1701 [00:00<00:00, 142124.57it/s]


Gene DDX4-204: 	Not enough pos for 50 binding sites.
Gene DDX4-204: 	condition too harsh, loose to get better results
[520, 521, 522, 523, 524, 525, 526, 527, 924, 929, 930, 931, 1092, 1213, 1409, 1410, 1411, 1412, 1417, 1418, 1558]


position_searching_NANOS3-201: 100%|██████████| 425/425 [00:00<00:00, 19370.18it/s]


Gene NANOS3-201: 	condition too harsh, loose to get better results
[28, 34, 40, 46, 52, 58, 64, 70, 76, 82, 88, 94, 101, 107, 113, 119, 127, 133, 140, 148, 154, 160, 166, 172, 178, 184, 200, 206, 220, 226, 232, 239, 245, 261, 267, 281, 287, 293, 299, 305, 311, 317, 324, 330, 337, 347, 353, 359, 367, 373, 379, 385]


position_searching_PAX6-230: 100%|██████████| 1009/1009 [00:00<00:00, 34885.69it/s]


Gene PAX6-230: 	condition too harsh, loose to get better results
[22, 27, 32, 37, 42, 48, 53, 58, 63, 102, 108, 113, 121, 126, 131, 136, 141, 146, 151, 156, 161, 166, 171, 179, 199, 224, 286, 291, 571, 590, 595, 600, 605, 610, 615, 620, 625, 630, 712, 717, 722, 822, 842, 896, 905, 911, 916, 923, 928, 997, 1002, 1007]


position_searching_NES-201: 100%|██████████| 3854/3854 [00:00<00:00, 47899.82it/s]


Gene NES-201: 	condition too harsh, loose to get better results
[0, 29, 152, 181, 225, 327, 357, 457, 486, 638, 672, 718, 747, 782, 871, 1004, 1083, 1112, 1162, 1210, 1292, 1466, 1816, 1983, 2271, 2338, 2481, 2512, 2674, 2739, 2931, 2960, 2990, 3042, 3094, 3123, 3152, 3214, 3265, 3294, 3323, 3358, 3387, 3416, 3445, 3474, 3503, 3532, 3652, 3762]


position_searching_NEUROD1-201: 100%|██████████| 817/817 [00:00<00:00, 22756.08it/s]


Gene NEUROD1-201: 	condition too harsh, loose to get better results
[15, 118, 127, 139, 147, 165, 172, 185, 192, 201, 221, 241, 248, 257, 264, 271, 279, 287, 297, 304, 362, 369, 382, 389, 396, 403, 412, 419, 438, 458, 465, 472, 479, 486, 494, 501, 508, 519, 526, 568, 575, 582, 589, 596, 608, 615, 622, 671, 678, 691, 698, 798, 805]


position_searching_SOX9-201: 100%|██████████| 1184/1184 [00:00<00:00, 24227.03it/s]


Gene SOX9-201: 	condition too harsh, loose to get better results
[7, 18, 29, 40, 51, 62, 73, 84, 95, 106, 117, 128, 147, 167, 189, 209, 350, 397, 408, 522, 533, 609, 659, 706, 717, 728, 739, 830, 841, 852, 863, 879, 912, 930, 941, 952, 963, 974, 985, 1005, 1023, 1035, 1046, 1057, 1068, 1079, 1090, 1101, 1112, 1123, 1134, 1145, 1166, 1179]


position_searching_FZD7-201: 100%|██████████| 1341/1341 [00:00<00:00, 18419.32it/s]


Gene FZD7-201: 	condition too harsh, loose to get better results
[3, 26, 51, 72, 93, 119, 140, 162, 197, 219, 255, 276, 298, 330, 351, 373, 394, 416, 437, 485, 512, 535, 565, 586, 607, 628, 649, 671, 692, 718, 739, 760, 782, 819, 845, 923, 974, 1042, 1063, 1084, 1105, 1126, 1147, 1168, 1189, 1255, 1276, 1297, 1318, 1339]


position_searching_MESP1-201: 100%|██████████| 607/607 [00:00<00:00, 20988.64it/s]


Gene MESP1-201: 	condition too harsh, loose to get better results
[2, 22, 36, 41, 46, 50, 57, 184, 187, 193, 196, 199, 202, 258, 269, 272, 275, 278, 281, 284, 287, 290, 293, 296, 299, 302, 305, 308, 311, 314, 317, 320, 323, 326, 329, 332, 335, 338, 341, 359, 379, 382, 385, 390, 412, 426, 431, 435, 438, 441, 571, 585, 589, 593, 596, 601, 604]


position_searching_TBX6-202: 100%|██████████| 1009/1009 [00:00<00:00, 24088.32it/s]


Gene TBX6-202: 	condition too harsh, loose to get better results
[1, 40, 49, 57, 101, 109, 119, 127, 139, 196, 216, 224, 232, 240, 248, 263, 285, 344, 354, 362, 381, 389, 399, 419, 439, 459, 479, 487, 495, 503, 511, 519, 527, 586, 602, 610, 618, 632, 640, 762, 778, 786, 794, 802, 810, 879, 887, 899, 907, 915, 929, 1005]


position_searching_NKX2-5-201: 100%|██████████| 741/741 [00:00<00:00, 21851.32it/s]


Gene NKX2-5-201: 	condition too harsh, loose to get better results
[5, 27, 35, 42, 49, 65, 72, 85, 92, 101, 109, 116, 129, 136, 148, 155, 168, 175, 182, 189, 197, 204, 265, 272, 285, 292, 299, 306, 317, 324, 331, 340, 381, 388, 395, 402, 409, 416, 423, 524, 531, 538, 545, 552, 559, 570, 609, 616, 624, 642, 705, 712, 719, 740]


position_searching_MEOX1-201: 100%|██████████| 573/573 [00:00<00:00, 24977.77it/s]


Gene MEOX1-201: 	condition too harsh, loose to get better results
[20, 24, 29, 33, 37, 41, 52, 57, 61, 72, 79, 90, 94, 99, 114, 120, 128, 132, 136, 141, 145, 149, 171, 262, 272, 283, 366, 377, 414, 419, 423, 427, 439, 447, 467, 471, 483, 487, 491, 497, 501, 505, 509, 513, 518, 528, 532, 536, 540, 548, 552, 556, 563, 572]


position_searching_PDGFRB-201: 100%|██████████| 2617/2617 [00:00<00:00, 25725.29it/s]
position_searching_MYOD1-201: 100%|██████████| 731/731 [00:00<00:00, 21557.19it/s]


Gene MYOD1-201: 	condition too harsh, loose to get better results
[11, 19, 27, 71, 79, 87, 95, 103, 152, 160, 168, 219, 227, 235, 243, 251, 259, 267, 361, 369, 378, 386, 396, 404, 412, 420, 428, 436, 445, 453, 465, 473, 490, 510, 518, 530, 538, 546, 554, 610, 655, 663, 671, 679, 687, 695, 703, 711, 719, 728]


position_searching_RUNX2-209: 100%|██████████| 1265/1265 [00:00<00:00, 22252.12it/s]


Gene RUNX2-209: 	condition too harsh, loose to get better results
[94, 114, 127, 139, 151, 179, 199, 264, 276, 288, 300, 312, 324, 336, 384, 404, 424, 444, 457, 469, 483, 495, 507, 519, 531, 543, 561, 574, 588, 639, 651, 665, 679, 703, 757, 769, 1012, 1024, 1036, 1048, 1060, 1072, 1084, 1096, 1108, 1120, 1132, 1144, 1162, 1252, 1264]


position_searching_SNAI2-201: 100%|██████████| 607/607 [00:00<00:00, 28981.88it/s]


Gene SNAI2-201: 	condition too harsh, loose to get better results
[20, 42, 65, 68, 98, 101, 118, 121, 124, 133, 153, 173, 176, 180, 183, 186, 256, 259, 262, 265, 268, 271, 274, 367, 406, 409, 412, 415, 418, 421, 424, 474, 494, 500, 503, 506, 509, 512, 515, 518, 521, 524, 527, 530, 533, 536, 539, 542, 545, 548, 551, 554, 557, 560, 563, 568, 571, 576]


position_searching_HNF4A-201: 100%|██████████| 1101/1101 [00:00<00:00, 20071.93it/s]


Gene HNF4A-201: 	condition too harsh, loose to get better results
[12, 26, 49, 69, 166, 214, 228, 243, 258, 272, 286, 300, 314, 331, 348, 362, 410, 430, 444, 458, 478, 492, 515, 536, 550, 568, 583, 598, 647, 663, 725, 740, 780, 794, 808, 822, 836, 853, 905, 919, 933, 962, 978, 992, 1006, 1020, 1034, 1048, 1065, 1085, 1100]


position_searching_NKX2-1-201: 100%|██████████| 926/926 [00:00<00:00, 19521.82it/s]


Gene NKX2-1-201: 	condition too harsh, loose to get better results
[45, 56, 118, 176, 184, 193, 201, 210, 218, 228, 298, 318, 328, 336, 348, 356, 366, 374, 382, 394, 402, 414, 424, 432, 445, 453, 461, 469, 487, 507, 521, 575, 583, 595, 603, 611, 619, 627, 639, 647, 655, 666, 674, 682, 690, 698, 711, 723, 731, 756, 776, 800]


position_searching_GATA6-201: 100%|██████████| 1392/1392 [00:00<00:00, 17317.29it/s]


Gene GATA6-201: 	condition too harsh, loose to get better results
[159, 177, 197, 210, 223, 237, 250, 263, 276, 289, 302, 315, 333, 346, 359, 402, 415, 428, 441, 454, 511, 524, 543, 563, 582, 595, 608, 621, 638, 658, 782, 795, 809, 894, 985, 998, 1011, 1024, 1091, 1111, 1129, 1142, 1155, 1168, 1181, 1194, 1207, 1220, 1233, 1246, 1261, 1310, 1323, 1338]


position_searching_HHEX-201: 100%|██████████| 611/611 [00:00<00:00, 36037.29it/s]


Gene HHEX-201: 	condition too harsh, loose to get better results
[131, 163, 180, 184, 188, 192, 198, 202, 206, 210, 257, 261, 266, 270, 274, 278, 282, 286, 290, 296, 309, 313, 317, 321, 325, 329, 333, 337, 375, 379, 383, 387, 391, 395, 399, 403, 407, 411, 422, 429, 456, 471, 475, 479, 483, 487, 491, 495, 499, 503]


position_searching_FGF8-201: 100%|██████████| 549/549 [00:00<00:00, 21172.45it/s]


Gene FGF8-201: 	condition too harsh, loose to get better results
[35, 40, 45, 50, 55, 60, 80, 100, 105, 110, 115, 120, 125, 130, 135, 140, 149, 182, 187, 192, 222, 231, 242, 247, 252, 267, 287, 293, 298, 303, 316, 321, 326, 336, 341, 346, 351, 356, 361, 367, 372, 377, 382, 387, 392, 397, 402, 407, 412, 417, 422, 430, 508, 513, 528]


position_searching_MIXL1-202: 100%|██████████| 539/539 [00:00<00:00, 20784.50it/s]


Gene MIXL1-202: 	condition too harsh, loose to get better results
[18, 22, 31, 35, 39, 64, 68, 72, 76, 86, 98, 102, 106, 126, 130, 134, 158, 162, 174, 178, 182, 186, 190, 194, 198, 207, 256, 260, 264, 270, 274, 280, 284, 290, 294, 299, 303, 307, 311, 316, 320, 325, 329, 333, 337, 341, 345, 349, 353, 357, 361, 365, 369, 373, 470]


position_searching_FZR1-202: 100%|██████████| 1153/1153 [00:00<00:00, 19932.62it/s]


Gene FZR1-202: 	condition too harsh, loose to get better results
[0, 20, 38, 58, 76, 94, 112, 130, 149, 167, 227, 245, 265, 283, 301, 324, 344, 362, 413, 431, 450, 470, 524, 543, 561, 628, 646, 664, 682, 702, 722, 740, 758, 776, 797, 851, 870, 890, 908, 934, 954, 972, 991, 1009, 1027, 1045, 1064, 1083, 1109, 1151]


position_searching_SNAI1-201: 100%|██████████| 597/597 [00:00<00:00, 19952.98it/s]


Gene SNAI1-201: 	condition too harsh, loose to get better results
[0, 19, 39, 48, 57, 66, 75, 84, 93, 102, 111, 120, 129, 138, 147, 156, 165, 174, 183, 194, 203, 212, 222, 231, 242, 251, 260, 269, 278, 287, 296, 305, 316, 325, 412, 428, 440, 451, 460, 469, 478, 487, 496, 507, 516, 525, 534, 543, 552, 561, 574, 583, 594]


position_searching_SUSD2-201: 100%|██████████| 1937/1937 [00:00<00:00, 23121.51it/s]


Gene SUSD2-201: 	condition too harsh, loose to get better results
[1, 36, 70, 110, 162, 193, 224, 255, 287, 318, 378, 409, 440, 471, 502, 535, 587, 622, 673, 704, 772, 808, 839, 893, 977, 1047, 1082, 1113, 1144, 1185, 1216, 1270, 1312, 1343, 1379, 1411, 1445, 1476, 1513, 1566, 1598, 1629, 1660, 1691, 1768, 1799, 1832, 1863, 1894, 1933]


position_searching_TACSTD2-201: 100%|██████████| 738/738 [00:00<00:00, 20554.58it/s]


Gene TACSTD2-201: 	condition too harsh, loose to get better results
[0, 10, 20, 30, 41, 79, 97, 107, 117, 132, 152, 164, 174, 184, 194, 204, 214, 224, 234, 244, 254, 264, 274, 284, 294, 304, 314, 325, 335, 345, 369, 389, 407, 417, 427, 440, 451, 461, 471, 484, 494, 505, 522, 532, 542, 552, 562, 573, 583, 628, 638, 648, 658, 680, 730]


position_searching_KRT19-201: 100%|██████████| 923/923 [00:00<00:00, 19690.27it/s]


Gene KRT19-201: 	condition too harsh, loose to get better results
[29, 40, 51, 62, 74, 88, 107, 140, 152, 163, 174, 185, 196, 212, 232, 243, 254, 265, 291, 309, 320, 331, 342, 353, 457, 472, 487, 498, 509, 521, 532, 549, 560, 580, 595, 606, 618, 632, 648, 660, 671, 682, 699, 732, 757, 777, 788, 799, 810, 821, 832, 852, 899, 916]


position_searching_BAMBI-201: 100%|██████████| 587/587 [00:00<00:00, 32700.98it/s]


Gene BAMBI-201: 	condition too harsh, loose to get better results
[15, 18, 21, 79, 82, 85, 100, 105, 120, 125, 128, 212, 215, 218, 221, 224, 227, 230, 322, 327, 330, 333, 347, 350, 353, 398, 407, 411, 414, 417, 420, 423, 426, 429, 432, 435, 438, 441, 444, 447, 450, 453, 456, 459, 462, 465, 468, 471, 474, 477, 480, 581]


position_searching_KCNMA1-210: 100%|██████████| 3055/3055 [00:00<00:00, 36905.63it/s]

Gene KCNMA1-210: 	condition too harsh, loose to get better results
[0, 141, 161, 181, 201, 234, 276, 319, 339, 359, 379, 399, 488, 508, 528, 548, 568, 634, 873, 893, 913, 933, 953, 1018, 1104, 1124, 1163, 1183, 1259, 1279, 1299, 1353, 1750, 1770, 1790, 1847, 1867, 1962, 1982, 2003, 2171, 2320, 2348, 2450, 2470, 2520, 2933, 2962, 2982, 3043]



position_searching_FABP5-201: 100%|██████████| 288/288 [00:00<00:00, 289193.09it/s]


Gene FABP5-201: 	Not enough pos for 50 binding sites.
Gene FABP5-201: 	condition too harsh, loose to get better results
[103, 104]


position_searching_VTCN1-202: 100%|██████████| 648/648 [00:00<00:00, 59090.12it/s]


Gene VTCN1-202: 	Not enough pos for 50 binding sites.
Gene VTCN1-202: 	condition too harsh, loose to get better results
[169, 170, 171, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 239, 240, 241, 242, 243, 244, 245, 246, 247, 248, 249, 250, 252, 378, 379, 380, 381, 382, 383, 388, 389, 390, 613, 614, 615, 625, 627, 628, 629, 630, 631, 633]


position_searching_IGFBP3-201: 100%|██████████| 676/676 [00:00<00:00, 21864.87it/s]


Gene IGFBP3-201: 	condition too harsh, loose to get better results
[0, 54, 69, 114, 142, 148, 155, 167, 175, 197, 203, 217, 226, 286, 292, 298, 304, 310, 316, 323, 331, 352, 358, 368, 374, 391, 397, 403, 412, 418, 424, 430, 436, 482, 493, 499, 505, 513, 519, 525, 533, 543, 552, 563, 610, 618, 626, 632, 639, 646, 652, 659, 665, 671]


position_searching_PRKD1-202: 100%|██████████| 2171/2171 [00:00<00:00, 48868.34it/s]

Gene PRKD1-202: 	condition too harsh, loose to get better results
[169, 174, 179, 184, 313, 786, 791, 796, 801, 809, 814, 819, 824, 829, 834, 839, 844, 892, 916, 1127, 1132, 1137, 1142, 1147, 1215, 1270, 1275, 1280, 1285, 1290, 1295, 1300, 1487, 1493, 1501, 1582, 1587, 1593, 1598, 1603, 1763, 1771, 1781, 1786, 1803, 1808, 1823, 1828, 1833, 1846, 1851, 1871, 2109, 2130]



position_searching_COL5A1-201: 100%|██████████| 4375/4375 [00:00<00:00, 34005.43it/s]
position_searching_TEAD1-206: 100%|██████████| 985/985 [00:00<00:00, 35274.24it/s]


Gene TEAD1-206: 	condition too harsh, loose to get better results
[410, 424, 430, 441, 447, 485, 490, 494, 498, 502, 506, 510, 514, 518, 522, 526, 530, 535, 539, 543, 548, 587, 591, 595, 611, 615, 631, 635, 639, 643, 647, 654, 671, 691, 696, 703, 707, 711, 723, 727, 731, 738, 742, 746, 750, 754, 758, 766, 770, 774, 778, 790, 794, 886, 977, 984]


position_searching_NR2F2-201: 100%|██████████| 957/957 [00:00<00:00, 20860.03it/s]


Gene NR2F2-201: 	condition too harsh, loose to get better results
[33, 45, 109, 134, 146, 160, 172, 185, 197, 215, 235, 247, 259, 287, 307, 319, 344, 358, 370, 382, 394, 406, 419, 431, 443, 488, 500, 512, 524, 536, 580, 592, 604, 623, 680, 692, 704, 716, 728, 740, 752, 764, 776, 794, 806, 818, 830, 842, 868, 880]


## Blast and extract blast results

In [None]:
# NOTE: this whole cell is commented out. When enabled it reads the pooled
# pre-binding FASTA, submits it to NCBI BLAST (blastn / megablast against
# refseq_rna) and saves the XML response as `blast_results_file` in `output`.
# The extraction cell that follows opens that XML file, so the file must
# already exist in `output` while this cell stays disabled.
# NOTE(review): `txid` is assigned but never passed to `qblast`, so the search
# is not restricted by organism; the value also looks unrelated to the
# `organism` set at the top of the notebook - confirm before re-enabling.
# with open(file_out_dir + total_pre_binding_file_name, "r") as f:
#     fasta_string = f.read()
# txid = [2697049]  # organism

# # Submit BLAST search and get handle object
# handle = NCBIWWW.qblast(
#     program="blastn",
#     megablast="yes",
#     database="refseq_rna",
#     sequence=fasta_string,
#     url_base="https://blast.ncbi.nlm.nih.gov/Blast.cgi",
#     format_object="Alignment",
#     format_type="Xml",
# )

# # read handle object and save to a file
# with open(os.path.join(os.path.join(output, blast_results_file)), "w") as f:
#     f.write(handle.read())

In [45]:
# Extract the fields of interest from the saved BLAST results.
#
# For the k-th query record in the BLAST XML this fills row label k of FOI with
#   align_accession : "|"-joined 4th "|"-separated field of every hit title
#   align_descrip   : "|"-joined last "|"-separated field of every hit title
#   plus/minus      : ","-joined frame[1] of every hit's first HSP
# and, once all records are read, stores the hit count per record in
# FOI["align_num"].
# NOTE(review): assumes the records come back in the same order as the rows of
# FOI - confirm against the cell that wrote the pooled FASTA.
from Bio.Blast import NCBIXML


align_num = []
with open(os.path.join(output, blast_results_file), "r") as blast_output:
    for loca, blast_record in enumerate(NCBIXML.parse(blast_output)):
        # `.loc` with an unknown label would silently append a new, mostly
        # empty row; fail loudly instead when the XML has more records than
        # FOI has rows.
        if loca not in FOI.index:
            raise ValueError(
                f"BLAST record {loca} has no matching row in FOI "
                f"({len(FOI)} rows); the results file and FOI are out of sync"
            )

        hits = blast_record.alignments
        length = len(hits)
        # number of aligned sequences found for this binding site
        align_num.append(length)

        # Hit titles are "|"-separated; field 3 is used as the accession and
        # the last field as the free-text description.
        title_fields = [
            blast_record.descriptions[k].title.split("|") for k in range(length)
        ]
        align_accession = [fields[3] for fields in title_fields]
        align_descrip_list = [fields[-1] for fields in title_fields]
        FOI.loc[loca, "align_accession"] = "|".join(align_accession)
        FOI.loc[loca, "align_descrip"] = "|".join(align_descrip_list)

        # frame[1] of the first HSP of each hit; the sieve cell later looks
        # for the value "-1" in this list.
        p_m = [hit.hsps[0].frame[1] for hit in hits]
        try:
            FOI.loc[loca, "plus/minus"] = ",".join(str(frame) for frame in p_m)
        except Exception:
            # Best-effort fallback kept from the original cell, but no longer
            # a bare `except:` that would also swallow KeyboardInterrupt.
            FOI.loc[loca, "plus/minus"] = "NAN"

# Same exception type pandas would raise on the assignment below, with a
# message that names the actual problem.
if len(align_num) != len(FOI):
    raise ValueError(
        f"BLAST results contain {len(align_num)} records but FOI has "
        f"{len(FOI)} rows"
    )
FOI["align_num"] = align_num

## Select wanted binding site


In [46]:
# Mark every candidate binding site as wanted to begin with; the sieve that
# follows only ever switches rows to False.
FOI["wanted"] = True

In [47]:
# Sieve for suitable binding sites, then keep a spaced subset per gene.
#
# A row of FOI stays "wanted" only if, in this order:
#   1. its gene symbol is one of the requested genes (`gene_list`),
#   2. its mol_type is "protein_coding",
#   3. no aligned description mentions the row's organism without also
#      mentioning the row's gene symbol,
#   4. at least one alignment has "-1" in the plus/minus list.
# The full table is written to probes_sieve.xlsx and the final per-gene
# selection to probes_wanted.xlsx.


def _gene_symbol(isoform_name):
    """Return the gene symbol of an isoform name such as "NKX2-5-201".

    Only the trailing "-<suffix>" is dropped, so hyphenated symbols
    ("NKX2-5", "NKX2-1") stay whole instead of being cut at the first "-".
    """
    return isoform_name.rsplit("-", 1)[0]


gene_name_list = [_.upper() for _ in gene_list]
# genes for which no row of FOI has been seen yet
gene_name_list_out = list(gene_name_list)
requested_symbols = set(gene_name_list)

for i in range(len(FOI)):
    gene_name = FOI.loc[i, "gene_name"]

    # check gene_name
    # Prefer the suffix-stripped symbol; fall back to the first "-" token
    # (what this cell matched on before) so a gene_list built from such
    # prefixes keeps working.
    gene_ori = next(
        (
            candidate
            for candidate in (_gene_symbol(gene_name), gene_name.split("-")[0])
            if candidate.upper() in requested_symbols
        ),
        None,
    )
    if gene_ori is None:
        FOI.loc[i, "wanted"] = False
        continue
    # The list holds upper-cased symbols, so remove the matched symbol rather
    # than the raw isoform name (which could never match).
    if gene_ori.upper() in gene_name_list_out:
        gene_name_list_out.remove(gene_ori.upper())

    # rows already rejected (e.g. by an earlier run of this cell) stay rejected
    if not FOI.loc[i, "wanted"]:
        continue

    # check DNA or mRNA type
    mol_type = FOI.loc[i, "mol_type"]
    if mol_type != "protein_coding":
        FOI.loc[i, "wanted"] = False
        print(mol_type)
        continue

    # check gene_organism name
    spe_ori = FOI.loc[i, "organism"]
    descriptions = FOI.loc[i, "align_descrip"].split("|")
    if any(gene_ori not in des and spe_ori in des for des in descriptions):
        FOI.loc[i, "wanted"] = False
        continue

    # check plus/minus
    strands = FOI.loc[i, "plus/minus"]
    if pd.isnull(strands) or "-1" not in strands.split(","):
        FOI.loc[i, "wanted"] = False

# write the whole information of interest to a excel file in tmp dir
FOI.to_excel(os.path.join(output, "probes_sieve.xlsx"))

out_tmp = FOI[FOI["wanted"] == True]  # noqa: E712 - element-wise comparison
selected_rows = []
for gene in out_tmp.gene_name.unique():
    gene_rows = out_tmp[out_tmp.gene_name == gene]
    # NOTE(review): helper is defined elsewhere; presumably it picks `length`
    # positions spaced by at least `min_gap` (ideally `better_gap`) - confirm.
    best_pos = find_max_min_difference_fixed_length_subsequence(
        list(gene_rows["pos_on_seq"]),
        length=3,
        min_gap=40,
        better_gap=80,
        gene=gene,
    )
    selected_rows.append(gene_rows[gene_rows["pos_on_seq"].isin(best_pos)])

# Concatenate once instead of growing the frame inside the loop; pd.concat
# refuses an empty list, so keep the empty-frame result for that case.
output_df = pd.concat(selected_rows) if selected_rows else pd.DataFrame()

# write the output to a xlsx file
output_df.to_excel(os.path.join(output, "probes_wanted.xlsx"))

Gene FABP5-201: 	Not enough pos for 3 binding sites.
Gene FABP5-201: 	condition too harsh, loose to get better results
[143, 144]
