# Probe Designer


## Environment


In [1]:
# basic env
import os
import sys
import pandas as pd
import time
import json
from tqdm import tqdm

# data process of file from ncbi
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import MeltingTemp as mt

# # get gene data from ncbi
# from Bio import Entrez

# # blast and xml file process
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

# add package to sys var
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("../lib")

# ---- file names used by the later cells --------------------------------
# basic variables
gene_name_list_tosearch = "gene_name_list_tosearch.txt"
pre_binding_file_suffix = "_pre_binding.fasta"
total_pre_binding_file_name = "_total.fasta"

# tmp file
gene_name_list_file = "gene_list.xlsx"
pre_binding_num_file = "pre_binding_num.json"
blast_results_file = "blast_results.xml"

# ---- directories ---------------------------------------------------------
# Dataset root; every input and result path below is built on top of it.
workdir = './dataset/2023.12.15_Sindy_Isoforms/'

# Each run gets its own results folder, named after the local start time.
current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)

output = os.path.join(workdir, 'results', formatted_time)
# Only the path is defined here; this block does not create the directory.
pre_binding_dir = os.path.join(output, "pre_binding")

# Create the dataset root first, then the per-run results folder.
for _directory in (workdir, output):
    os.makedirs(_directory, exist_ok=True)

In [18]:
# Organism label attached to every binding-site record in the later cells.
organism = 'mouse'

# Gene table for this dataset; the columns read below are
# 'gene_name', 'ensembl_id' and 'isoform'.
gene_info = pd.read_excel(
    os.path.join(workdir, gene_name_list_file),
    sheet_name='Sheet1',
)

# Distinct gene names (first-seen order), plus the per-row id / isoform columns.
gene_list = [*gene_info['gene_name'].unique()]
id_list = [*gene_info['ensembl_id']]
isoform_list = [*gene_info['isoform']]

# Filled by the sequence-download cells.
sequences_of_all = {}

## Get sequences from the Ensembl database

In [14]:
from lib.database_interaction import ensembl_id_to_seqs
import time

# Number of leading rows of the gene table to ignore.
skip = 0
trial = 0  # not used by this cell; kept so the variable stays defined

# Both columns are sliced with the SAME offset so that row k of one list
# always belongs to row k of the other. (Offsetting only the id list by
# len(sequences_of_all) paired isoforms with the wrong ids on a rerun.)
tmp_isoform_list = isoform_list[skip:]
tmp_id_list = id_list[skip:]

for isoform, ensembl_id in zip(tmp_isoform_list, tmp_id_list):
    key = f'{ensembl_id}_{isoform}'

    # Resume support: entries that already hold sequences are not fetched
    # again when the cell is rerun after an interruption.
    if sequences_of_all.get(key):
        continue

    # The part of the id before the first '.' is what gets queried.
    sequences = ensembl_id_to_seqs(
        gene=isoform,
        gene_id=ensembl_id.split('.')[0],
        seq_type='cds',
    )

    # Store only after the download succeeded, so a failed request does not
    # leave an empty placeholder entry behind.
    sequences_of_all[key] = dict(sequences.items())

Gene:	Gm16024-201: 100%|██████████| 7/7 [00:05<00:00,  1.23it/s]
Gene:	Gm47551-201: 100%|██████████| 10/10 [00:07<00:00,  1.26it/s]
Gene:	Gm26822-201: 100%|██████████| 1/1 [00:00<00:00,  1.93it/s]
Gene:	Chek2-202: 100%|██████████| 3/3 [00:03<00:00,  1.07s/it]
Gene:	Malat1-204: 100%|██████████| 1/1 [00:00<?, ?it/s]
Gene:	Brms1l-204: 100%|██████████| 6/6 [00:04<00:00,  1.25it/s]
Gene:	Glipr1-205: 100%|██████████| 4/4 [00:02<00:00,  1.52it/s]
Gene:	Malat1-203: 100%|██████████| 17/17 [00:10<00:00,  1.60it/s]
Gene:	Gm11032-201: 100%|██████████| 1/1 [00:00<?, ?it/s]
Gene:	Gm48610-201: 100%|██████████| 1/1 [00:00<00:00,  1.57it/s]
Gene:	Tbk1-202: 100%|██████████| 4/4 [00:02<00:00,  1.91it/s]
Gene:	Cep57-202: 100%|██████████| 11/11 [00:07<00:00,  1.38it/s]
Gene:	Cisd3-204: 100%|██████████| 4/4 [00:05<00:00,  1.29s/it]
Gene:	Gm47777-201: 100%|██████████| 1/1 [00:00<00:00,  2.00it/s]
Gene:	Malat1-201: 100%|██████████| 23/23 [00:15<00:00,  1.51it/s]
Gene:	Churc1-202: 100%|██████████| 6/6 [00:03<0

In [8]:
from lib.database_interaction import ensembl_name_to_seqs
import time

# Number of leading isoforms to ignore.
skip = 0
trial = 0

# Retry policy: each isoform is attempted up to `max_trials` times, waiting
# `retry_delay` seconds between attempts; after that it is recorded as failed
# and the loop moves on to the next isoform.
max_trials = 4
retry_delay = 5
failed_isoforms = []

tmp_isoform_list = isoform_list[skip:]

for isoform in tmp_isoform_list:
    for trial in range(1, max_trials + 1):
        try:
            sequences = ensembl_name_to_seqs(gene=isoform, species='mouse', seq_type='cds')
        except Exception as err:
            # `Exception` (not a bare except) so Ctrl-C still stops the cell.
            print(f"{isoform}: attempt {trial}/{max_trials} failed: {err!r}")
            if trial < max_trials:
                time.sleep(retry_delay)
        else:
            # Flat layout: one entry per returned description.
            for desc, sequence in sequences.items():
                sequences_of_all[desc] = sequence
            break
    else:
        # All attempts failed for this isoform.
        failed_isoforms.append(isoform)

if failed_isoforms:
    print(f"No sequences retrieved for {len(failed_isoforms)} isoform(s): {failed_isoforms}")

Gene:	Gm16024-201: 100%|██████████| 1/1 [00:00<?, ?it/s]


## Binding site Searcher


In [None]:
from lib.search_binding import step_by_step, find_max_min_difference_fixed_length_subsequence, seq_minus

# Initiation of array
binding_site_FOIs = [
    "accession",
    "gene_name",
    "mol_type",
    "organism",
    "pos_on_seq",
    "binding",
    "Tm_l",
    "Tm_r",
    "wanted",
]
align_FOIs = ["align_num", "align_accession", "align_descrip", "plus/minus"]
FOI = pd.DataFrame(columns=binding_site_FOIs + align_FOIs)

# Search binding sites on mRNA sequence
file_out_dir = pre_binding_dir
# makedirs also creates missing parents and is a no-op if the dir exists.
os.makedirs(file_out_dir, exist_ok=True)

# One path for the combined FASTA, used for both truncating and writing.
# (Previously the file was initialised via os.path.join but appended to via
# plain string concatenation, i.e. two different files.)
total_pre_binding_path = os.path.join(file_out_dir, total_pre_binding_file_name)

pre_binding_num = {}
binding_frames = []


def _iter_sequences(seqs):
    """Yield ``(desc, seq)`` pairs from ``seqs``.

    Accepts both layouts produced by the download cells: a flat mapping
    ``desc -> seq`` and a one-level nested mapping ``key -> {desc: seq}``.
    """
    for key, value in seqs.items():
        if isinstance(value, dict):
            yield from value.items()
        else:
            yield key, value


# Opening the combined file once in "w" mode truncates it for this run and
# keeps it open while every gene's records are written.
with open(total_pre_binding_path, "w") as total_handle:
    for desc, seq in _iter_sequences(sequences_of_all):
        # desc must have exactly three '|'-separated fields.
        accession, gene_name, mol_type = desc.split('|')
        # NOTE(review): seq_minus is a project helper; presumably it returns
        # the opposite strand of `seq` - confirm in lib.search_binding.
        minus_seq = seq_minus(seq)

        Tm_l, Tm_r, selected_substrings, pos_on_seq = step_by_step(
            minus_seq,
            BDS_len=40,
            BDS_num=50,
            min_gap=1,
            better_gap=40,
            gene=gene_name,
            G_min=0.25,
            G_max=0.7,
            G_consecutive=5,
            Tm_low=50,
            Tm_high=65,
        )
        n_sites = len(selected_substrings)

        record_list = [
            SeqRecord(
                Seq(pre_binding_tmp),
                id="pre_binding" + str(i),
                description="|".join([accession, gene_name, organism, mol_type]),
            )
            for i, pre_binding_tmp in enumerate(selected_substrings)
        ]

        # add information about binding sites to FOI (concatenated once below)
        binding_frames.append(
            pd.DataFrame(
                {
                    "accession": [accession] * n_sites,
                    "gene_name": [gene_name] * n_sites,
                    "mol_type": [mol_type] * n_sites,
                    "organism": [organism] * n_sites,
                    "binding": selected_substrings,
                    "Tm_l": Tm_l,
                    "Tm_r": Tm_r,
                    "pos_on_seq": pos_on_seq,
                }
            )
        )

        # write pre_binding to files
        # NOTE(review): the per-gene file is keyed by gene_name only, so two
        # sequences with the same gene_name overwrite each other - confirm
        # that gene_name is unique per sequence.
        file_out = os.path.join(file_out_dir, gene_name + pre_binding_file_suffix)
        with open(file_out, "w") as f:
            SeqIO.write(record_list, f, "fasta")
        SeqIO.write(record_list, total_handle, "fasta")

        # record the num of pre_binding for each gene
        pre_binding_num[f"{accession}_{gene_name}"] = n_sites

# Single concat instead of growing FOI inside the loop; the empty FOI goes
# first so the full column set and column order are kept.
if binding_frames:
    FOI = pd.concat([FOI, *binding_frames], ignore_index=True)

with open(os.path.join(output, pre_binding_num_file), "w") as f:
    json.dump(pre_binding_num, f)

## Blast and extract blast results

In [None]:
# with open(file_out_dir + total_pre_binding_file_name, "r") as f:
#     fasta_string = f.read()
# txid = [2697049]  # organism

# # Submit BLAST search and get handle object
# handle = NCBIWWW.qblast(
#     program="blastn",
#     megablast="yes",
#     database="refseq_rna",
#     sequence=fasta_string,
#     url_base="https://blast.ncbi.nlm.nih.gov/Blast.cgi",
#     format_object="Alignment",
#     format_type="Xml",
# )

# # read handle object and save to a file
# with open(os.path.join(os.path.join(output, blast_results_file)), "w") as f:
#     f.write(handle.read())

In [45]:
# Extract interested information from blast_results
from Bio.Blast import NCBIXML


def _accession_and_description(title):
    """Split a BLAST hit title into ``(accession, description)``.

    For titles with at least four '|'-separated fields the accession is
    field 3 (0-based) and the description is the last field. Shorter titles
    fall back to the field just before the description instead of raising
    IndexError.
    NOTE(review): the fallback assumes GI-less titles look like
    'ref|ACCESSION| description' - confirm against the actual XML.
    """
    fields = title.split("|")
    if len(fields) > 3:
        accession = fields[3]
    elif len(fields) > 1:
        accession = fields[-2]
    else:
        accession = fields[0]
    return accession, fields[-1]


# read the id/plus-minus part/align_num
with open(os.path.join(output, blast_results_file), "r") as blast_output:
    blast_records = list(NCBIXML.parse(blast_output))

# Record k is written to FOI row k, so the counts have to agree. Checking up
# front avoids silently appending extra rows to FOI or failing late with an
# unrelated-looking length error.
if len(blast_records) != len(FOI):
    raise ValueError(
        f"BLAST result has {len(blast_records)} records but FOI has "
        f"{len(FOI)} binding sites; they must match one-to-one."
    )

align_num = []
align_accession_column = []
align_descrip_column = []
plus_minus_column = []

for blast_record in blast_records:
    # get align num of each binding site
    align_num.append(len(blast_record.alignments))

    accessions = []
    descriptions = []
    strands = []
    for alignment, description in zip(blast_record.alignments, blast_record.descriptions):
        # get accession and descrip of each align seq
        accession, descrip_text = _accession_and_description(description.title)
        accessions.append(accession)
        descriptions.append(descrip_text)
        # get plus/minus of each align seq: second element of the first
        # HSP's frame tuple
        strands.append(alignment.hsps[0].frame[1])

    align_accession_column.append("|".join(str(_) for _ in accessions))
    align_descrip_column.append("|".join(str(_) for _ in descriptions))
    # A record without alignments yields an empty string here.
    plus_minus_column.append(",".join(str(_) for _ in strands))

# Positional column assignment (lengths were validated above).
FOI["align_accession"] = align_accession_column
FOI["align_descrip"] = align_descrip_column
FOI["plus/minus"] = plus_minus_column
FOI["align_num"] = align_num

## Select wanted binding site


In [46]:
# Every binding site starts out as wanted; the sieve cell then clears the flag
# on rows that fail one of its checks.
FOI["wanted"] = [True for _ in range(len(FOI))]

In [None]:
# sieve for the suitable binding site


def _match_gene_symbol(transcript_name, wanted_upper):
    """Return the gene symbol of ``transcript_name`` if it is a wanted gene.

    Candidates are tried in this order and compared case-insensitively
    against ``wanted_upper`` (a set of upper-cased gene symbols):

    1. the name as given,
    2. the name without a trailing all-digit ``-NNN`` suffix, so hyphenated
       symbols survive (``H2-D1-201`` -> ``H2-D1``),
    3. the text before the first '-' (the original rule, kept as fallback).

    Returns the first matching candidate, or ``None`` if none matches.
    """
    name = str(transcript_name)
    base, sep, suffix = name.rpartition("-")
    stripped = base if sep and suffix.isdigit() else name
    for candidate in (name, stripped, name.split("-")[0]):
        if candidate.upper() in wanted_upper:
            return candidate
    return None


def _hits_other_gene(align_descrip, gene_symbol, organism_name):
    """True if any hit description names the organism but not the gene.

    NOTE(review): these are plain substring tests; if the BLAST descriptions
    use a different organism spelling than ``organism_name`` (e.g. a
    scientific name vs. 'mouse') this check never fires - confirm.
    """
    if pd.isnull(align_descrip):
        return False
    return any(
        gene_symbol not in des and organism_name in des
        for des in str(align_descrip).split("|")
    )


def _has_minus_strand_hit(plus_minus):
    """True if the comma-separated strand list contains a ``-1`` entry."""
    if pd.isnull(plus_minus):
        return False
    return "-1" in str(plus_minus).split(",")


gene_name_list = [_.upper() for _ in gene_list]
# Genes for which no binding site has been seen yet (shrinks in the loop).
gene_name_list_out = list(gene_name_list)
wanted_symbols = set(gene_name_list)

# Iterate over the real index labels rather than assuming a 0..n-1 index.
for i in FOI.index:
    # check gene_name
    gene_symbol = _match_gene_symbol(FOI.loc[i, "gene_name"], wanted_symbols)
    if gene_symbol is None:
        FOI.loc[i, "wanted"] = False
        continue
    # Remove the upper-cased symbol, which is the form the list stores.
    if gene_symbol.upper() in gene_name_list_out:
        gene_name_list_out.remove(gene_symbol.upper())

    # Rows that are already unwanted (or unset) are not examined further.
    current = FOI.loc[i, "wanted"]
    if pd.isnull(current) or not current:
        continue

    # check DNA or mRNA type
    mol_type = FOI.loc[i, "mol_type"]
    if mol_type != "protein_coding":
        FOI.loc[i, "wanted"] = False
        print(mol_type)
        continue

    # check gene_organism name
    if _hits_other_gene(FOI.loc[i, "align_descrip"], gene_symbol, FOI.loc[i, "organism"]):
        FOI.loc[i, "wanted"] = False
        continue

    # check plus/minus
    if not _has_minus_strand_hit(FOI.loc[i, "plus/minus"]):
        FOI.loc[i, "wanted"] = False

# write the whole information of interest to an excel file in tmp dir
FOI.to_excel(os.path.join(output, "probes_sieve.xlsx"))

out_tmp = FOI[FOI["wanted"].eq(True)]
selected_subsets = []
for gene in out_tmp["gene_name"].unique():
    gene_rows = out_tmp[out_tmp["gene_name"] == gene]
    best_pos = find_max_min_difference_fixed_length_subsequence(
        list(gene_rows["pos_on_seq"]),
        length=3,
        min_gap=40,
        better_gap=80,
        gene=gene,
    )
    selected_subsets.append(gene_rows[gene_rows["pos_on_seq"].isin(best_pos)])

# pd.concat refuses an empty list, so fall back to an empty frame.
output_df = pd.concat(selected_subsets) if selected_subsets else pd.DataFrame()

# write the output to a xlsx file
output_df.to_excel(os.path.join(output, "probes_wanted.xlsx"))