# Probe Designer


## Environment


In [5]:
# basic env
import os
from pathlib import Path
import pandas as pd
import time
import json
from tqdm import tqdm

# data process of file from ncbi
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

# Run configuration: target organism, run identifier and dataset root.
organism = 'human'
RUNID = 'Example_dataset'
DATASET_DIR = Path('/mnt/f/spatial_data/probe')

# Per-run working directory; created together with any missing parents.
workdir = DATASET_DIR.joinpath(RUNID)
workdir.mkdir(parents=True, exist_ok=True)

In [6]:
# Names of the files written into the results directory by the later cells.
bds_candidate_file_suffix = "_bds_candidate.fasta"
combined_bds_candidates_file = "total_bds_candidate.fasta"
combined_bds_candidates_blast_file = 'total_bds_candidate_blast.fasta'
bds_candidate_num_file = "bds_candidate_num.json"
blast_results_file = "blast_results.xml"

# Each run gets its own timestamped results directory under <workdir>/results,
# with a sub-directory for the per-gene candidate FASTA files.
current_time = time.localtime()
formatted_time = time.strftime("%Y%m%d_%H%M%S", current_time)
output = os.path.join(workdir, 'results', f"{formatted_time}_ensembl")
bds_candidate_dir = os.path.join(output, "bds_candidate")
for results_dir in (output, bds_candidate_dir):
    os.makedirs(results_dir, exist_ok=True)

In [None]:
# Load the marker-gene table; gene symbols are read from its 'gene name' column.
gene_info = pd.read_excel(os.path.join(workdir, "marker_gene_list.xlsx"))
gene_info['gene'] = gene_info['gene name']

# Normalise symbol casing per organism: capitalised for mouse, upper-case for human.
if organism == 'mouse':
    gene_info['gene'] = gene_info['gene'].str.capitalize()
elif organism == 'human':
    gene_info['gene'] = gene_info['gene'].str.upper()

# Build the de-duplicated gene list.
# - Only non-blank strings are kept: blank cells arrive as NaN (and the `.str`
#   methods above turn any non-string cell into NaN), which has no `.strip()`.
# - Whitespace is stripped *before* de-duplicating so "ACTB" and "ACTB " collapse
#   into a single entry.
# - dict.fromkeys keeps first-seen order.
gene_list = list(dict.fromkeys(
    name.strip()
    for name in gene_info['gene']
    if isinstance(name, str) and name.strip()
))
print(len(gene_list))
gene_info.head()

## Get sequences from the Ensembl dataset

In [4]:
# from lib.database_interaction import ensembl_id_to_seqs
# import time

# skip = 0
# trial = 0

# tmp_isoform_list = isoform_list[skip:]
# tmp_id_list = id_list[len(sequences_of_all)+skip:]

# for i in range(len(tmp_isoform_list)):
#     isoform = tmp_isoform_list[i]
#     id = tmp_id_list[i]
#     sequences_of_all[f'{id}_{isoform}'] = dict()
#     sequences = ensembl_id_to_seqs(gene=isoform, gene_id=id.split('.')[0], seq_type='cds')
#     for desc, sequence in sequences.items():
#         sequences_of_all[f'{id}_{isoform}'][desc] = sequence

In [None]:
from lib.database_interaction import ensembl_name_to_seqs
import time


# Retrieve the CDS sequences of every gene in gene_list via ensembl_name_to_seqs,
# retrying each gene up to max_trial times, then persist the result as JSON.
max_trial = 3
sequences_of_all = dict()
# Per-gene list of failure messages; stays empty for genes that were retrieved.
error_messages = {gene: [] for gene in gene_list}

with tqdm(total=len(gene_list), desc="Retriving_sequences", position=0) as pbar_total:
    for gene in gene_list:
        # Default entry, kept as-is when every attempt fails.
        sequences_of_all[gene] = {}
        trial_success = False
        last_error = None

        for trial in range(1, max_trial+1):
            try:
                sequences_of_all[gene] = ensembl_name_to_seqs(
                    gene=gene, species=organism, seq_type='cds',
                    tqdm_args={'position': 1, 'leave': False})
            except Exception as e:
                # Keep the failure so it can be reported instead of being lost.
                last_error = e
                # Pause before the next attempt; waiting after the last one is pointless.
                if trial < max_trial:
                    time.sleep(1)
            else:
                trial_success = True
                break

        if not trial_success:
            error_messages[gene].append(f"Failed to retrieve sequences for {gene} after {max_trial} attempts.")
            error_messages[gene].append(f"  last error for {gene}: {last_error!r}")

        pbar_total.update(1)  # Update the main progress bar after each gene

# Report every failure once the whole list has been processed.
for gene, messages in error_messages.items():
    for message in messages:
        print(message)

with open(os.path.join(output, 'sequence_of_all.json'), 'w') as file:
    json.dump(sequences_of_all, file)

In [10]:
# Reload the retrieved sequences and keep, for each gene, the shortest isoform
# whose 'external_name' contains the gene symbol (case-insensitive).
with open(os.path.join(output, 'sequence_of_all.json'), 'r') as file:
    sequences_of_all = json.load(file)

isoforms = {}
for gene, isoforms_tmp in sequences_of_all.items():
    gene_key = gene.upper()
    matching = []
    for isoform in isoforms_tmp:
        # Entries without an 'external_name' (or that are not mappings at all)
        # cannot be attributed to the gene and are skipped.
        try:
            isoform_name = isoform['external_name']
        except (KeyError, TypeError):
            continue
        if isinstance(isoform_name, str) and gene_key in isoform_name.upper():
            matching.append(isoform)
    if matching:
        # min() returns the first of equally short isoforms, and needs no
        # artificial upper bound on the sequence length.
        isoforms[gene] = min(matching, key=lambda iso: len(iso['seq']))

with open(os.path.join(output, 'shortest_isoforms.json'), 'w') as file:
    json.dump(isoforms, file)

## Binding site Searcher


In [None]:
%reload_ext autoreload
%autoreload 2

# set lib auto reload in jupyter notebook
from lib.search_binding import position_search, optimize_subsequence, seq_minus

# Column layout of the candidate table: the per-site fields first, followed by
# the alignment fields that the BLAST-parsing cell fills in.
binding_site_entry = [
    "accession", "gene_name", "mol_type", "organism",
    "pos", "bds", "Tm", "Tm_l", "Tm_r", "mfe", "wanted"]
alignment_entry = ["align_num", "align_accession", "align_descrip", "plus/minus"]
BDS_INFO = pd.DataFrame(columns=binding_site_entry+alignment_entry)

# Number of candidate sites per isoform, keyed by "<accession>_<gene_name>".
pre_binding_num = {}

combined_path = os.path.join(output, combined_bds_candidates_file)
combined_blast_path = os.path.join(output, combined_bds_candidates_blast_file)
# Truncate the combined FASTA files so a re-run does not append to old content.
for combined in (combined_path, combined_blast_path):
    with open(combined, "w") as handle:
        handle.write("")

# The BLAST query is the central 2 * 16 = 32 nt of every candidate.
blast_core_half = 16

# Per-isoform tables, concatenated once after the loop instead of re-copying
# the growing frame on every iteration.
candidate_frames = []

for desc, info in tqdm(isoforms.items(), desc="Searching_binding_sites", position=0):
    seq = info['seq'].replace('N', '')
    # Fall back to the dictionary key when the isoform has no usable name.
    gene_name = info.get('external_name') or desc

    accession = info['id']
    mol_type = info['biotype']
    minus_seq = seq_minus(seq)

    # NOTE(review): consumed below as a list of dicts that each carry a 'bds'
    # sequence -- confirm against lib.search_binding.position_search.
    pos_info = position_search(
        minus_seq, gene=gene_name,
        BDS_len=40, BDS_num=50, min_gap=0, better_gap=40,
        G_min=0.25, G_max=0.7, G_consecutive=5, Tm_low=48, Tm_high=60,
        verbose_pos=1, leave=False, warn=False)

    description = "|".join([accession, gene_name, organism, mol_type])
    record_list = [
        SeqRecord(Seq(site['bds']), id="bds_candidate" + str(i), description=description)
        for i, site in enumerate(pos_info)]

    # Shortened copies used as BLAST queries. The start is clamped at 0 so a
    # site shorter than 32 nt is not sliced with a negative (wrapping) index.
    blast_record_list = []
    for record in record_list:
        full_site = str(record.seq)
        middle = len(full_site) // 2
        core = full_site[max(0, middle - blast_core_half):middle + blast_core_half]
        blast_record_list.append(
            SeqRecord(Seq(core), id=record.id, description=record.description))

    # add information about binding sites to FOI
    add = pd.DataFrame(pos_info)
    add['accession'] = accession
    add['gene_name'] = gene_name
    add['mol_type'] = mol_type
    add['organism'] = organism
    candidate_frames.append(add)

    file_out = os.path.join(bds_candidate_dir, gene_name + bds_candidate_file_suffix)

    # Write the candidates to the per-gene file and append them to the combined files.
    with open(file_out, "w") as f:
        SeqIO.write(record_list, f, "fasta")
    with open(combined_path, "a") as handle:
        SeqIO.write(record_list, handle, "fasta")
    with open(combined_blast_path, "a") as handle:
        SeqIO.write(blast_record_list, handle, "fasta")

    # record the num of pre_binding for each gene
    pre_binding_num[f"{accession}_{gene_name}"] = len(pos_info)

BDS_INFO = pd.concat([BDS_INFO, *candidate_frames], ignore_index=True)

with open(os.path.join(output, bds_candidate_num_file), "w") as f:
    json.dump(pre_binding_num, f)

## Blast and extract blast results
- NCBIXML: https://homolog.us/Biopython/Bio.Blast.NCBIXML.html#read/0
- BlastRecord: https://biopython.org/docs/1.75/api/Bio.Blast.Record.html
- XMLReader: https://codebeautify.org/xmlviewer#


In [None]:
# with open(file_out_dir + total_pre_binding_file_name, "r") as f: fasta_string = f.read()
# txid = [2697049]  # organism

# # Submit BLAST search and get handle object
# handle = NCBIWWW.qblast(
#     program="blastn",
#     megablast="yes",
#     database="refseq_rna",
#     sequence=fasta_string,
#     url_base="https://blast.ncbi.nlm.nih.gov/Blast.cgi",
#     format_object="Alignment",
#     format_type="Xml",
# )

# # read handle object and save to a file
# with open(os.path.join(os.path.join(output, blast_results_file)), "w") as f: f.write(handle.read())

In [None]:
# Extract interested information from blast_results
from Bio.Blast import NCBIXML


# Summarise the BLAST hits of every query. The i-th BLAST record is matched to
# the i-th row of BDS_INFO, so both must be in the same order.
align_num = []             # number of alignments per query
align_accession_col = []   # "|"-joined accessions of the aligned sequences
align_descrip_col = []     # "|"-joined descriptions of the aligned sequences
plus_minus_col = []        # ","-joined second frame entry of each hit's first HSP

with open(os.path.join(output, blast_results_file), "r") as blast_output:
    for blast_record in NCBIXML.parse(blast_output):
        accessions = []
        descriptions = []
        frames = []
        for alignment, description in zip(blast_record.alignments, blast_record.descriptions):
            fields = description.title.split("|")
            # The accession is taken from the 4th "|"-separated title field.
            # Titles with fewer fields fall back to the accession attribute
            # instead of raising IndexError.
            # NOTE(review): assumes the parser sets `alignment.accession` -- confirm.
            if len(fields) > 3:
                accessions.append(fields[3])
            else:
                accessions.append(getattr(alignment, "accession", ""))
            descriptions.append(fields[-1])
            frames.append(str(alignment.hsps[0].frame[1]))

        align_num.append(len(blast_record.alignments))
        align_accession_col.append("|".join(str(_) for _ in accessions))
        align_descrip_col.append("|".join(str(_) for _ in descriptions))
        plus_minus_col.append(",".join(frames))

# A count mismatch means the XML does not belong to this candidate table;
# writing it anyway would attach hits to the wrong rows or add empty rows.
if len(align_num) != len(BDS_INFO):
    raise ValueError(
        f"{len(align_num)} BLAST records found for {len(BDS_INFO)} binding site candidates")

BDS_INFO["align_accession"] = align_accession_col
BDS_INFO["align_descrip"] = align_descrip_col
BDS_INFO["plus/minus"] = plus_minus_col
BDS_INFO["align_num"] = align_num

## Select wanted binding site


In [13]:
import re

def adjust_gene_name(gene_name, gene_list):
    """Map a name carrying a trailing ``-<digits>`` suffix back to its listed gene.

    Matching against ``gene_list`` is case-insensitive.

    Args:
        gene_name: Name to adjust, e.g. ``"TP53-201"``.
        gene_list: Iterable of wanted gene names.

    Returns:
        ``gene_name`` unchanged when it is itself listed, so symbols that end in
        ``-<digits>`` (e.g. ``"NKX2-1"``) are not truncated. Otherwise the part
        before the final ``-<digits>`` when that part is listed. Otherwise
        ``gene_name`` unchanged. The returned text keeps its original casing.
    """
    known = {x.upper() for x in gene_list}
    # An exact hit wins over suffix stripping.
    if gene_name.upper() in known:
        return gene_name
    match = re.search(r'(.+)-(\d+)$', gene_name)
    if match and match.group(1).upper() in known:
        return match.group(1)
    return gene_name

In [None]:
# Flag every candidate in BDS_INFO as wanted / not wanted, save the annotated
# table, then keep an optimised subset of positions per gene.
verbose = True

# select by specificity
gene_name_list = [_.upper() for _ in gene_list]
gene_name_set = set(gene_name_list)
# Genes for which no candidate row has been seen yet (upper-case names).
gene_name_list_out = list(gene_name_list)

wanted_flags = []
candidate_rows = zip(
    BDS_INFO["gene_name"], BDS_INFO["organism"], BDS_INFO["mol_type"],
    BDS_INFO["align_descrip"], BDS_INFO["plus/minus"])
for raw_name, spe_ori, mol_type, descrip, plus_minus in candidate_rows:
    wanted = True

    # check gene_name
    gene_name = adjust_gene_name(raw_name, gene_name_list)
    gene_key = gene_name.upper()
    if gene_key not in gene_name_set:
        wanted = False
        if verbose: print(f"{gene_name} not in gene list.")
    elif gene_key in gene_name_list_out:
        # Compare in upper case, like the list itself, so mixed-case names
        # are ticked off as well.
        gene_name_list_out.remove(gene_key)

    # check DNA or mRNA type: reported only, the rejection below is disabled
    if wanted and mol_type != "protein_coding":
        # wanted = False
        if verbose: print(f"{gene_name} is {mol_type}.")

    # check gene_organism name
    if wanted:
        if pd.isnull(descrip):
            wanted = False
            if verbose: print(f"{gene_name} not found in BLAST.")
        else:
            # Reject the site when any hit mentions the organism but not the gene.
            # NOTE(review): spe_ori is the notebook's organism label (e.g. 'human');
            # this only triggers if the BLAST description contains that exact
            # text -- confirm against real descriptions.
            off_target = any(
                gene_name not in des and spe_ori in des
                for des in descrip.split("|"))
            if off_target:
                wanted = False
                if verbose: print(f"{gene_name} not specific.")

    # check plus/minus: at least one hit must have frame -1
    if wanted:
        pm_list = [] if pd.isnull(plus_minus) else str(plus_minus).split(",")
        if "-1" not in pm_list:
            wanted = False
            if verbose: print(f"{gene_name} not plus/minus.")

    wanted_flags.append(wanted)

BDS_INFO["wanted"] = wanted_flags

# write the whole information of interest to a excel file in tmp dir
BDS_INFO.to_excel(os.path.join(output, "probes_candidates.xlsx"))

# Element-wise comparison on purpose: it always yields a boolean mask.
out_tmp = BDS_INFO[BDS_INFO["wanted"] == True]  # noqa: E712
selected_frames = []
for gene in out_tmp["gene_name"].unique():
    gene_rows = out_tmp[out_tmp["gene_name"] == gene]
    pos_wanted = list(gene_rows["pos"])
    pos_best = optimize_subsequence(pos_wanted, length=3, min_gap=40, better_gap=80, gene=gene)
    selected_frames.append(gene_rows[gene_rows["pos"].isin(pos_best)])
# pd.concat rejects an empty list, so fall back to an empty table.
output_df = pd.concat(selected_frames) if selected_frames else pd.DataFrame()

# write the output to a xlsx file
output_df.to_excel(os.path.join(output, "probes_wanted.xlsx"))