# Probe Designer


## Environment


In [None]:
# basic env
import os
import sys
import pandas as pd
import time
import json
from tqdm import tqdm

# data process of file from ncbi
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import MeltingTemp as mt

# get gene data from ncbi
from Bio import Entrez

# blast and xml file process
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

# add package to sys var
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("../lib")

# Dataset working directory (created on first use).
workdir = "./dataset/2024.3.27_mousebrain_HP_projection/"
os.makedirs(workdir, exist_ok=True)

# Every run writes into its own timestamped folder under <workdir>/results:
#   output          -> final results of this run
#   tmp             -> intermediate files of this run (lives inside `output`)
#   pre_binding_dir -> per-gene candidate binding-site FASTA files
current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)
output = os.path.join(workdir, "results", formatted_time)
tmp = os.path.join(output, "tmp")
pre_binding_dir = os.path.join(tmp, "pre_binding")
# Creating `tmp` also creates `output`, because `tmp` is nested inside it.
os.makedirs(tmp, exist_ok=True)

# File names of the intermediate files written to / read from `tmp`.
gene_name_list_file = "gene_name_list.txt"
gene_id_name_file = "gene_id_list.txt"
gene_seq_in_file = "gene_seq_in_file.gb"
pre_binding_num_file = "pre_binding_num.json"
blast_results_file = "blast_results.xml"

# Names used for the gene search list and the pre-binding FASTA outputs.
gene_name_list_tosearch = "gene_name_list_tosearch.txt"
pre_binding_file_suffix = "_pre_binding.fasta"
total_pre_binding_file_name = "_total_pre_binding.fasta"

## Get genbank file of each gene from ncbi dataset

https://www.ncbi.nlm.nih.gov/books/NBK25499/#chapter4.EFetch


In [None]:
# Get gene id and other information from ncbi dataset(api)
## Generate gene_search_list from gene_name_list
organism_of_interest = "Mus musculus"
n_type_of_interest = "mRNA"

# NOTE(review): `tmp` is inside a freshly timestamped results folder, so both
# input files below must already have been placed there for this run - confirm
# that this is the intended location.

# One gene name per line. Surrounding whitespace and blank lines are dropped so
# that later name comparisons are not broken by stray spaces or empty entries.
with open(os.path.join(tmp, gene_name_list_file)) as f:
    gene_name_list = [line.strip() for line in f if line.strip()]

## Read id_list from existing file
# One id per line. Blank lines (e.g. the one produced by a trailing newline)
# are skipped, otherwise an empty id would be passed on to the fetch step.
with open(os.path.join(tmp, gene_id_name_file), "r") as f:
    id_list = [line.strip() for line in f if line.strip()]

In [None]:
# ## Get gene id list using Entrez.esearch
# gene_search_list = [", ".join([name, organism_of_interest, n_type_of_interest])
#     for name in gene_name_list]
# id_list = []
# for gene_search in gene_search_list:
#     Entrez.email = "1418767067@qq.com"
#     handle = Entrez.esearch(db="nuccore", term=gene_search)
#     record = Entrez.read(handle)
#     handle.close()
#     id_list += record["IdList"][:1]  # set number of search results to read
# with open(tmp + gene_id_name_file, "w") as f:
#     f.write("\n".join(id_list))

In [None]:
# Get the genbank file of each gene by id list
# The ids are fetched in batches of `fetch_per_round` and the returned GenBank
# text of every batch is written, in order, to one file in `tmp`.
fetch_per_round = 3
# Ceiling division: number of batches needed to cover the whole id list.
# (Named `n_rounds` so that the builtin `round` is not shadowed.)
n_rounds = -(-len(id_list) // fetch_per_round)

# The contact e-mail only has to be set once, not once per batch.
Entrez.email = "1418767067@qq.com"

# Opening with "w" truncates any previous content; the file then stays open
# for the whole download instead of being reopened in append mode per batch.
with open(os.path.join(tmp, gene_seq_in_file), "w") as f:
    for i in tqdm(range(n_rounds)):
        id_list_per_round = id_list[i * fetch_per_round : (i + 1) * fetch_per_round]
        handle = Entrez.efetch(
            db="nuccore",
            strand=1,  # plus if strand=1
            id=id_list_per_round,
            rettype="gbwithparts",
            retmode="text",
        )
        # Close the handle even when reading the response fails.
        try:
            seq_record = handle.read()
        finally:
            handle.close()
        f.write(seq_record)

## Binding site Searcher


In [None]:
from lib.search_binding import step_by_step, find_max_min_difference_fixed_length_subsequence, gb_extract

# Initiation of array
# FOI collects one row per candidate binding site; the alignment columns and
# "wanted" are left empty here.
binding_site_FOIs = [
    "accession",
    "gene_name",
    "mol_type",
    "organism",
    "pos_on_seq",
    "binding",
    "Tm_l",
    "Tm_r",
    "wanted",
]
align_FOIs = ["align_num", "align_accession", "align_descrip", "plus/minus"]
FOI = pd.DataFrame(columns=binding_site_FOIs + align_FOIs)

# Search binding sites on mRNA sequence
file_in = os.path.join(tmp, gene_seq_in_file)
file_out_dir = pre_binding_dir
# Create the output directory if needed; unlike a bare `except: pass`, real
# errors (e.g. missing permissions) are still reported.
os.makedirs(file_out_dir, exist_ok=True)

# Maps "<accession>_<gene name>" to the number of binding sites found.
pre_binding_num = {}

# Per-gene frames are collected and concatenated once after the loop. The
# empty FOI goes first so that its column layout is kept.
binding_site_frames = [FOI]

# The combined FASTA file is opened once ("w" truncates old content) and
# receives the records of every gene in turn.
total_pre_binding_path = os.path.join(file_out_dir, total_pre_binding_file_name)
with open(total_pre_binding_path, "w") as total_handle:
    for record in SeqIO.parse(file_in, "genbank"):
        # The first returned value is stored in the "accession" column below.
        # It is named `accession` so that the builtin `id` is not shadowed.
        accession, gene_name, mol_type, organism, minus_seq = gb_extract(record, CDS=True)
        Tm_l, Tm_r, selected_substrings, pos_on_seq = step_by_step(
            minus_seq, gene=gene_name,
            BDS_len=40, BDS_num=50, min_gap=1, better_gap=40,
            G_min=0.25, G_max=0.7, G_consecutive=5, Tm_low=50, Tm_high=65,
        )
        n_sites = len(selected_substrings)

        # One FASTA record per selected site, numbered within its gene.
        record_list = [
            SeqRecord(
                Seq(pre_binding_tmp),
                id="pre_binding" + str(i),
                description="|".join([accession, gene_name, organism, mol_type]),
            )
            for i, pre_binding_tmp in enumerate(selected_substrings)
        ]

        # add information about binding sites to FOI
        binding_site_frames.append(
            pd.DataFrame(
                {
                    "accession": [accession] * n_sites,
                    "gene_name": [gene_name] * n_sites,
                    "mol_type": [mol_type] * n_sites,
                    "organism": [organism] * n_sites,
                    "binding": selected_substrings,
                    "Tm_l": Tm_l,
                    "Tm_r": Tm_r,
                    "pos_on_seq": pos_on_seq,
                }
            )
        )

        file_out = os.path.join(file_out_dir, gene_name + pre_binding_file_suffix)

        # write pre_binding to files: once per gene and once into the total file
        with open(file_out, "w") as f:
            SeqIO.write(record_list, f, "fasta")
        SeqIO.write(record_list, total_handle, "fasta")

        # record the num of pre_binding for each gene
        pre_binding_num[f"{accession}_{gene_name}"] = n_sites

FOI = pd.concat(binding_site_frames, ignore_index=True)

with open(os.path.join(tmp, pre_binding_num_file), "w") as f:
    json.dump(pre_binding_num, f)

## Blast and extract blast results

NCBIXML: https://homolog.us/Biopython/Bio.Blast.NCBIXML.html#read/0

BlastRecord: https://biopython.org/docs/1.75/api/Bio.Blast.Record.html

XMLReader: https://codebeautify.org/xmlviewer#


In [None]:
# with open(file_out_dir + total_pre_binding_file_name, "r") as f:
#     fasta_string = f.read()
# txid = [2697049]  # organism

# # Submit BLAST search and get handle object
# handle = NCBIWWW.qblast(
#     program="blastn",
#     megablast="yes",
#     database="refseq_rna",
#     sequence=fasta_string,
#     url_base="https://blast.ncbi.nlm.nih.gov/Blast.cgi",
#     format_object="Alignment",
#     format_type="Xml",
# )

# # read handle object and save to a file
# with open(tmp + blast_results_file, "w") as f:
#     f.write(handle.read())

In [None]:
# Extract interested information from blast_results
# One BLAST record is expected per row of FOI, in the same order. The values
# are collected per record first and only written to FOI once the record count
# has been checked, so a mismatch can never silently append extra rows.
align_num = []
align_accession_col = []
align_descrip_col = []
plus_minus_col = []

# read the id/plus-minus part/align_num
with open(os.path.join(tmp, blast_results_file), "r") as blast_output:
    for blast_record in NCBIXML.parse(blast_output):
        alignments = blast_record.alignments

        # get align num of each binding site
        length = len(alignments)
        align_num.append(length)

        # get accession and descrip of each align seq
        # Titles are split on "|": field 3 is taken as the accession and the
        # last field as the description (assumes a
        # "gi|<gi>|<db>|<accession>|<description>" style title - TODO confirm).
        titles = [
            description.title.split("|")
            for description in blast_record.descriptions[:length]
        ]
        align_accession_col.append("|".join(str(fields[3]) for fields in titles))
        align_descrip_col.append("|".join(str(fields[-1]) for fields in titles))

        # get plus/minus of each align seq (second frame value of the first HSP)
        # The fallback guards the lookup itself: an alignment without HSPs or
        # without a second frame value yields "NAN" for the whole record.
        try:
            plus_minus = ",".join(
                str(alignment.hsps[0].frame[1]) for alignment in alignments
            )
        except (IndexError, TypeError):
            plus_minus = "NAN"
        plus_minus_col.append(plus_minus)

if len(align_num) != len(FOI):
    raise ValueError(
        f"BLAST results contain {len(align_num)} records but FOI has "
        f"{len(FOI)} binding sites; they must correspond one-to-one."
    )

# add the alignment information to df
FOI["align_accession"] = align_accession_col
FOI["align_descrip"] = align_descrip_col
FOI["plus/minus"] = plus_minus_col
FOI["align_num"] = align_num

## Select wanted binding site


In [None]:
# Start with every binding site marked as wanted; the sieve below clears the
# flag for the rows that fail a check.
FOI["wanted"] = [True for _ in range(len(FOI))]

In [None]:
# sieve for the suitable binding site
# Gene names are compared in upper case, so the requested names are
# normalised once up front.
gene_name_list = [name.upper() for name in gene_name_list]
# Requested genes for which no binding-site row has been seen yet.
gene_name_list_out = list(gene_name_list)
# Set copy for fast membership tests inside the loop.
wanted_gene_names = set(gene_name_list)

for i in FOI.index:
    # check gene_name
    gene_name = FOI.loc[i, "gene_name"]
    gene_name_upper = gene_name.upper()
    if gene_name_upper not in wanted_gene_names:
        FOI.loc[i, "wanted"] = False
    elif gene_name_upper in gene_name_list_out:
        # The list holds upper-case names, so the upper-case form must be
        # removed; removing the raw name never matched mixed-case gene names.
        gene_name_list_out.remove(gene_name_upper)

    # `== True` rather than plain truthiness: a NaN left in the column must
    # count as "not wanted".
    if not (FOI.loc[i, "wanted"] == True):
        continue

    # check DNA or mRNA type
    if FOI.loc[i, "mol_type"] != "mRNA":
        FOI.loc[i, "wanted"] = False
        continue

    # check gene_organism name
    # Reject the site when any aligned description mentions the same organism
    # but not the gene name.
    spe_ori, gene_ori = FOI.loc[i, "organism"], FOI.loc[i, "gene_name"]
    align_descrip = FOI.loc[i, "align_descrip"]
    # A missing description is treated as "no aligned sequences" instead of
    # crashing on `.split`; such rows are still rejected by the check below
    # when their plus/minus value is missing as well.
    descrip = [] if pd.isnull(align_descrip) else align_descrip.split("|")
    if any(gene_ori not in des and spe_ori in des for des in descrip):
        FOI.loc[i, "wanted"] = False
        continue

    # check plus/minus
    # At least one alignment must report "-1" (presumably a minus-strand hit -
    # TODO confirm against the BLAST frame convention).
    plus_minus = FOI.loc[i, "plus/minus"]
    if pd.isnull(plus_minus) or "-1" not in plus_minus.split(","):
        FOI.loc[i, "wanted"] = False

In [None]:
# write the whole information of interest to an Excel file in tmp dir
FOI.to_excel(os.path.join(tmp, "probes_sieve.xlsx"))

# Keep only the rows that passed the sieve, then choose positions per gene.
out_tmp = FOI[FOI["wanted"] == True]

# The per-gene selections are collected and concatenated once at the end,
# rather than growing a DataFrame (starting from an empty one) inside the loop.
selected_subsets = []
for gene in out_tmp["gene_name"].unique():
    # Filter the rows of this gene once and reuse them below.
    gene_rows = out_tmp[out_tmp["gene_name"] == gene]
    pos_of_True = list(gene_rows["pos_on_seq"])
    best_pos = find_max_min_difference_fixed_length_subsequence(
        pos_of_True,
        length=3,
        min_gap=40,
        better_gap=80,
        gene=gene,
    )
    # Keep the rows whose position is among those returned by the helper.
    selected_subsets.append(gene_rows[gene_rows["pos_on_seq"].isin(best_pos)])

# pd.concat raises on an empty list, so fall back to an empty frame when no
# binding site is wanted at all.
output_df = pd.concat(selected_subsets) if selected_subsets else pd.DataFrame()

# write the output to a xlsx file
output_df.to_excel(os.path.join(output, "probes_wanted.xlsx"))