# Probe Designer


## Environment


In [None]:
# basic env
import os
import pandas as pd
import time
import json
from tqdm import tqdm

# data process of file from ncbi
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.SeqUtils import MeltingTemp as mt

# # get gene data from ncbi
# from Bio import Entrez

# # blast and xml file process
from Bio.Blast import NCBIWWW
from Bio.Blast import NCBIXML

# add package to sys var
# os.chdir(os.path.dirname(os.path.abspath(__file__)))
# sys.path.append("../lib")

# --- directories -----------------------------------------------------------
# Each run writes into its own timestamped folder: <workdir>/results/<YYYYmmdd_HHMMSS>.
workdir = './dataset/2024.3.27_mousebrain_HP_projection/'
os.makedirs(workdir, exist_ok=True)

current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)

output = os.path.join(workdir, 'results', formatted_time)
os.makedirs(output, exist_ok=True)
# Only the path is defined here; this cell does not create the directory.
pre_binding_dir = os.path.join(output, "pre_binding")

# --- file names ------------------------------------------------------------
# Intermediate files written under `output`.
pre_binding_num_file, blast_results_file = "pre_binding_num.json", "blast_results.xml"

# Naming pieces for the gene list and the pre-binding FASTA files.
gene_name_list_tosearch = "gene_name_list_tosearch.txt"
total_pre_binding_file_name = "_total.fasta"
pre_binding_file_suffix = "_pre_binding.fasta"

In [None]:
# Species label attached to every candidate binding site further down.
organism = 'mouse'

# Gene panel: the distinct values of the 'gene_name' column of Sheet1.
gene_info = pd.read_excel(
    os.path.join(workdir, "Gene list_mouse.xlsx"),
    sheet_name='Sheet1',
)
gene_name_column = gene_info['gene_name']
gene_list = list(gene_name_column.unique())

## Get seq from ensembl dataset

In [None]:
# from lib.database_interaction import ensembl_id_to_seqs
# import time

# skip = 0
# trial = 0

# tmp_isoform_list = isoform_list[skip:]
# tmp_id_list = id_list[len(sequences_of_all)+skip:]

# for i in range(len(tmp_isoform_list)):
#     isoform = tmp_isoform_list[i]
#     id = tmp_id_list[i]
#     sequences_of_all[f'{id}_{isoform}'] = dict()
#     sequences = ensembl_id_to_seqs(gene=isoform, gene_id=id.split('.')[0], seq_type='cds')
#     for desc, sequence in sequences.items():
#         sequences_of_all[f'{id}_{isoform}'][desc] = sequence

In [None]:
from lib.database_interaction import ensembl_name_to_seqs
import time


# Fetch the CDS sequences of every gene in `gene_list` through
# `ensembl_name_to_seqs`, retrying failed requests, then dump the collected
# result to <output>/sequence_of_all.json.
max_trial = 3
sequences_of_all = dict()
error_messages = {gene: [] for gene in gene_list}

with tqdm(total=len(gene_list), desc="total_process", position=0) as pbar_total:
    for gene in gene_list:
        # Placeholder that stays in place when every attempt fails, so the
        # gene still has an entry in the JSON dump.
        sequences_of_all[gene] = {}
        last_error = None

        for trial in range(1, max_trial + 1):  # Retrying up to max_trial times
            try:
                # Attempt to retrieve sequences
                sequences_of_all[gene] = ensembl_name_to_seqs(
                    gene=gene,
                    species='mouse',
                    seq_type='cds',
                    tqdm_args={'position': 1, 'leave': False},
                )
                break
            except Exception as exc:
                # The `as` target is unbound when the except clause ends, so
                # keep our own reference for the failure report below.
                last_error = exc
                time.sleep(1)
        else:
            # Reached only when no attempt hit `break`, i.e. all of them failed.
            error_messages[gene].append(f"Failed to retrieve sequences for {gene} after {max_trial} attempts.:{last_error}")

        pbar_total.update(1)  # Update the main progress bar after each gene

# Report every gene that could not be retrieved.
for gene, messages in error_messages.items():
    for message in messages:
        print(message)

with open(os.path.join(output, 'sequence_of_all.json'), 'w') as file:
    json.dump(sequences_of_all, file)

In [None]:
# Reload the sequences saved by the retrieval step.
with open(os.path.join(output, 'sequence_of_all.json'), 'r') as file:
    sequences_of_all = json.load(file)

# For each gene keep the isoform record whose 'seq' is longest (the first one
# on ties). Genes with no isoforms, or only empty sequences, are left out.
longest_isoforms = {}
for gene, isoforms in sequences_of_all.items():
    candidate = max(isoforms, key=lambda record: len(record['seq']), default=None)
    has_sequence = candidate is not None and len(candidate['seq']) > 0
    longest_isoform = candidate if has_sequence else None
    if longest_isoform:
        longest_isoforms[gene] = longest_isoform


In [None]:
# Save the selected isoform of every gene next to the other run outputs.
longest_isoforms_path = os.path.join(output, 'longest_isoforms.json')
with open(longest_isoforms_path, 'w') as file:
    json.dump(longest_isoforms, file)

In [None]:
# Bare expression: in a notebook cell this displays the value the selection loop
# left in `longest_isoform`, i.e. the result for the last gene it processed.
# NOTE(review): possibly meant to show `longest_isoforms` (the per-gene dict) - confirm.
longest_isoform

## Binding site Searcher


In [None]:
from lib.search_binding import step_by_step, find_max_min_difference_fixed_length_subsequence, seq_minus

# Initiation of array: one row per candidate binding site. The alignment
# columns are created empty here and filled in after BLAST.
binding_site_FOIs = [
    "accession",
    "gene_name",
    "mol_type",
    "organism",
    "pos_on_seq",
    "binding",
    "Tm_l",
    "Tm_r",
    "wanted",
]
align_FOIs = ["align_num", "align_accession", "align_descrip", "plus/minus"]
FOI = pd.DataFrame(columns=binding_site_FOIs + align_FOIs)

# Search binding sites on mRNA sequence
file_out_dir = pre_binding_dir
os.makedirs(file_out_dir, exist_ok=True)

# Combined FASTA of all genes. The path is built by plain concatenation, which
# places the file beside the pre_binding directory (<output>/pre_binding_total.fasta);
# that is the location the BLAST submission step reads, so it is kept as is.
# A single path is used for both resetting and writing the file.
total_pre_binding_path = file_out_dir + total_pre_binding_file_name

pre_binding_num = {}
foi_frames = [FOI]

# Opening with "w" resets the combined file; it then stays open while every
# gene appends its records.
with open(total_pre_binding_path, "w") as total_handle:
    for info in longest_isoforms.values():
        seq = info['seq']
        accession, gene_name, mol_type = info['id'], info['external_name'], info['biotype']
        # presumably the complementary strand of `seq` - confirm in lib.search_binding
        minus_seq = seq_minus(seq)

        Tm_l, Tm_r, selected_substrings, pos_on_seq = step_by_step(
            minus_seq, gene=gene_name,
            BDS_len=40, BDS_num=50, min_gap=1, better_gap=40,
            G_min=0.25, G_max=0.7, G_consecutive=5, Tm_low=50, Tm_high=65)

        site_count = len(selected_substrings)
        description = "|".join([accession, gene_name, organism, mol_type])
        record_list = [
            SeqRecord(
                Seq(pre_binding_tmp),
                id="pre_binding" + str(i),
                description=description,
            )
            for i, pre_binding_tmp in enumerate(selected_substrings)
        ]

        # add information about binding sites to FOI (concatenated once, after the loop)
        foi_frames.append(
            pd.DataFrame(
                {
                    "accession": [accession] * site_count,
                    "gene_name": [gene_name] * site_count,
                    "mol_type": [mol_type] * site_count,
                    "organism": [organism] * site_count,
                    "binding": selected_substrings,
                    "Tm_l": Tm_l,
                    "Tm_r": Tm_r,
                    "pos_on_seq": pos_on_seq,
                }
            )
        )

        # write pre_binding to the per-gene file and to the combined file
        file_out = os.path.join(file_out_dir, gene_name + pre_binding_file_suffix)
        with open(file_out, "w") as f:
            SeqIO.write(record_list, f, "fasta")
        SeqIO.write(record_list, total_handle, "fasta")

        # record the num of pre_binding for each gene
        pre_binding_num[f"{accession}_{gene_name}"] = site_count

FOI = pd.concat(foi_frames, ignore_index=True)

with open(os.path.join(output, pre_binding_num_file), "w") as f:
    json.dump(pre_binding_num, f)

## Blast and extract blast results

In [None]:
# with open(file_out_dir + total_pre_binding_file_name, "r") as f:
#     fasta_string = f.read()
# txid = [2697049]  # organism

# # Submit BLAST search and get handle object
# handle = NCBIWWW.qblast(
#     program="blastn",
#     megablast="yes",
#     database="refseq_rna",
#     sequence=fasta_string,
#     url_base="https://blast.ncbi.nlm.nih.gov/Blast.cgi",
#     format_object="Alignment",
#     format_type="Xml",
# )

# # read handle object and save to a file
# with open(os.path.join(os.path.join(output, blast_results_file)), "w") as f:
#     f.write(handle.read())

In [None]:
# Extract interested information from blast_results
from Bio.Blast import NCBIXML


# One entry per BLAST record, in file order. The n-th record is taken to
# describe the n-th row of FOI.
align_num = []
align_accession_col = []
align_descrip_col = []
plus_minus_col = []

# read the id/plus-minus part/align_num
with open(os.path.join(output, blast_results_file), "r") as blast_output:
    for blast_record in NCBIXML.parse(blast_output):
        # get align num of each binding site
        hit_count = len(blast_record.alignments)
        align_num.append(hit_count)

        # get accession and descrip of each align seq
        # NOTE(review): assumes every title has at least four "|"-separated
        # fields with the accession in the fourth - confirm against the
        # BLAST output actually used.
        title_fields = [
            description.title.split("|")
            for description in blast_record.descriptions[:hit_count]
        ]
        align_accession_col.append("|".join(str(fields[3]) for fields in title_fields))
        align_descrip_col.append("|".join(str(fields[-1]) for fields in title_fields))

        # get plus/minus of each align seq: second entry of the first HSP's
        # frame pair. A record without any hit is stored as missing rather
        # than as an empty string.
        strands = [alignment.hsps[0].frame[1] for alignment in blast_record.alignments]
        plus_minus_col.append(",".join(str(strand) for strand in strands) if strands else pd.NA)

# Refuse to attach results that do not line up one-to-one with FOI instead of
# silently adding or skipping rows.
if len(align_num) != len(FOI):
    raise ValueError(
        f"BLAST output holds {len(align_num)} records but FOI has {len(FOI)} binding sites"
    )

FOI["align_accession"] = align_accession_col
FOI["align_descrip"] = align_descrip_col
FOI["plus/minus"] = plus_minus_col
FOI["align_num"] = align_num

## Select wanted binding site


In [None]:
# Mark every candidate binding site as wanted to begin with; the sieve that
# follows only ever switches rows to False.
FOI["wanted"] = [True] * len(FOI)

In [None]:
# sieve for the suitable binding site
# Requested gene names, upper-cased so the membership test ignores case.
gene_name_list = [_.upper() for _ in gene_list]
requested_names = set(gene_name_list)
# Requested genes (upper-cased) for which no candidate row has been seen yet.
gene_name_list_out = list(gene_name_list)

for i in FOI.index:
    # check gene_name: only the part before the first '-' is compared
    gene_name = FOI.loc[i, "gene_name"]
    gene_symbol = gene_name.split('-')[0]
    gene_key = gene_symbol.upper()
    if gene_key not in requested_names:
        FOI.loc[i, "wanted"] = False
    elif gene_key in gene_name_list_out:
        # Compare in the same (upper) case the list was built with.
        gene_name_list_out.remove(gene_key)

    # Rows already rejected (or never initialised to True) skip the other checks.
    if FOI.loc[i, "wanted"] != True:
        continue

    # check DNA or mRNA type
    mol_type = FOI.loc[i, "mol_type"]
    if mol_type != "protein_coding":
        FOI.loc[i, "wanted"] = False
        print(mol_type)
        continue

    # check gene_organism name: reject the site when any aligned description
    # mentions the organism but not the gene.
    # NOTE(review): `organism` is a label such as 'mouse'; confirm that the
    # BLAST descriptions really contain that exact text, otherwise this test
    # can never reject anything.
    species = FOI.loc[i, "organism"]
    raw_descriptions = FOI.loc[i, "align_descrip"]
    # A row without BLAST information has no descriptions to test.
    descriptions = [] if pd.isnull(raw_descriptions) else raw_descriptions.split("|")
    if any(gene_symbol not in des and species in des for des in descriptions):
        FOI.loc[i, "wanted"] = False
        continue

    # check plus/minus: at least one alignment must be recorded as "-1"
    strands = FOI.loc[i, "plus/minus"]
    if pd.isnull(strands) or "-1" not in strands.split(","):
        FOI.loc[i, "wanted"] = False

# write the whole information of interest to an Excel file in the output dir
FOI.to_excel(os.path.join(output, "probes_sieve.xlsx"))

# For every gene that still has wanted sites, keep the positions chosen by
# find_max_min_difference_fixed_length_subsequence.
out_tmp = FOI[FOI["wanted"] == True]
selected_subsets = []
for gene in out_tmp.gene_name.unique():
    gene_rows = out_tmp[out_tmp.gene_name == gene]
    best_pos = find_max_min_difference_fixed_length_subsequence(
        list(gene_rows["pos_on_seq"]),
        length=3,
        min_gap=40,
        better_gap=80,
        gene=gene,
    )
    selected_subsets.append(gene_rows[gene_rows["pos_on_seq"].isin(best_pos)])

# pd.concat rejects an empty list, so fall back to an empty frame.
output_df = pd.concat(selected_subsets) if selected_subsets else pd.DataFrame()

# write the output to a xlsx file
output_df.to_excel(os.path.join(output, "probes_wanted.xlsx"))