In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
from pprint import pprint


# Folder that holds the input workbook; every file this notebook writes is
# also placed here via os.path.join(workdir, ...).
workdir = r"dataset/2024.3.16_TCR&mutation_3_Breast_cancer"
# Variant table read from the 'for mutation' sheet. Later cells read the columns
# Chr / Start / End / Ref / Alt / Strand / Gene.refGene from it.
info = pd.read_excel(os.path.join(workdir, "SPRINTSEQ-primer-BZ02TNBC-20240316.xlsx"), sheet_name='for mutation')

## get seq from ensembl

In [None]:
import requests
from Bio.SeqUtils import MeltingTemp as mt


def padlock_thre(Tm_left, Tm_right, left, right, Tm_dif_thre=10, Tm_sing_thre=45, bind_sing_thre=10):
    """Decide whether a pair of binding arms is acceptable, or how to resize it.

    Args:
        Tm_left: melting temperature of the current left arm.
        Tm_right: melting temperature of the current right arm.
        left: current length of the left arm.
        right: current length of the right arm.
        Tm_dif_thre: the two Tm values count as balanced when their absolute
            difference is strictly below this value.
        Tm_sing_thre: each arm's Tm must be strictly above this value.
        bind_sing_thre: when an arm is shortened to this length or less, the
            search stops even if the Tm values are still unbalanced.

    Returns:
        (left, right, flag): the arm lengths to use next, and True when the
        caller should stop iterating.
    """
    if abs(Tm_left - Tm_right) < Tm_dif_thre:
        # Balanced arms: accept when both are hot enough, otherwise lengthen
        # the first arm found too cold (left is checked before right).
        if not (Tm_left > Tm_sing_thre):
            return left + 1, right, False
        if not (Tm_right > Tm_sing_thre):
            return left, right + 1, False
        return left, right, True

    if Tm_left > Tm_right:
        # Left arm is hotter: move one base from the left arm to the right arm.
        left, right = left - 1, right + 1
        return left, right, bool(left <= bind_sing_thre)

    # Right arm is hotter (or equal): move one base the other way.
    left, right = left + 1, right - 1
    return left, right, bool(right <= bind_sing_thre)


def fetch_and_label_sequences(positions, coord_system_version = "GRCh38", gap=50, left_length=20, right_length=20, Tm_dif_thre=10,Tm_sing_thre=45, bind_sing_thre=10, timeout=60):
    """Fetch the reference sequence around each position and pick binding arms.

    For every position the Ensembl REST ``sequence/region`` endpoint is queried
    on strand 1, asking for ``gap`` extra bases on both sides of
    ``start..end``. Two binding arms are then cut from the returned sequence:
    the left arm ends at the last base of the requested region and the right
    arm starts at the first base of the 3' flank. Arm lengths are adjusted with
    ``padlock_thre`` until it reports that the pair is acceptable.

    Args:
        positions: iterable of dicts with keys "gene", "chr", "start", "end"
            and "strand". A leading "chr" is stripped from the chromosome name.
        coord_system_version: assembly name sent to Ensembl (default GRCh38).
        gap: number of flanking bases requested on each side of the region.
        left_length: initial length of the left arm.
        right_length: initial length of the right arm.
        Tm_dif_thre, Tm_sing_thre, bind_sing_thre: forwarded to ``padlock_thre``.
        timeout: seconds to wait for each HTTP request before ``requests``
            raises instead of blocking indefinitely.

    Returns:
        A list with one dict per position: the decoded JSON response from
        Ensembl plus the keys "gene", "binding_left", "Tm_left",
        "binding_right", "Tm_right" and "strand".

    Raises:
        requests.HTTPError: when Ensembl answers with an error status.
    """
    server = "https://rest.ensembl.org"
    sequences_info = []

    for position in tqdm(positions):
        left = left_length
        right = right_length
        chromosome = position["chr"].replace("chr", "")
        # The coordinates are sent unchanged; the flanks are added server-side
        # through expand_5prime / expand_3prime below.
        region_start = int(position["start"])
        region_end = int(position["end"])
        ext = f"/sequence/region/human/{chromosome}:{region_start}..{region_end}:1?"

        options = ";".join([
                # 'content-type=text/x-fasta',
                "content-type=application/json",
                f"coord_system_version={coord_system_version}",
                f"expand_3prime={gap}",
                f"expand_5prime={gap}",
            ])

        response = requests.get(server + ext + options, timeout=timeout)
        # Raises exactly when response.ok is False.
        response.raise_for_status()

        decoder = response.json()
        seq = decoder["seq"]

        # Index just past the last base of the requested region (end of left arm).
        left_stop = gap + 1 + region_end - region_start
        # Absolute index of the first base of the 3' flank. Using an absolute
        # index (instead of seq[-gap:-gap + right]) keeps the slice valid when
        # `right` grows to `gap` or beyond, where the negative-index form
        # silently returned an empty string.
        right_start = max(len(seq) - gap, 0)

        # NOTE: termination relies on padlock_thre eventually returning flag=True.
        while True:
            # Clamp at 0 so a long left arm cannot wrap around to the sequence end.
            binding_left = seq[max(left_stop - left, 0): left_stop]
            binding_right = seq[right_start: right_start + right]
            Tm_left = mt.Tm_NN(binding_left, nn_table=mt.R_DNA_NN1)
            Tm_right = mt.Tm_NN(binding_right, nn_table=mt.R_DNA_NN1)
            left, right, flag = padlock_thre(Tm_left, Tm_right, left, right,
                                             Tm_dif_thre=Tm_dif_thre, Tm_sing_thre=Tm_sing_thre,
                                             bind_sing_thre=bind_sing_thre)
            if flag:
                break

        decoder['gene'] = position['gene']
        decoder["binding_left"] = binding_left
        decoder["Tm_left"] = Tm_left
        decoder["binding_right"] = binding_right
        decoder["Tm_right"] = Tm_right
        decoder['strand'] = int(position["strand"])
        sequences_info.append(decoder)

    return sequences_info

In [123]:
# Preview the first rows of the variant table loaded from the workbook.
info.head()

Unnamed: 0,Note,Chr,Start,End,Ref,Alt,Strand,Func.refGene,Gene.refGene,GeneDetail.refGene,ExonicFunc.refGene,AAChange.refGene,cytoBand
0,Y (only T 20%),chr2,74461618,74461618,C,T,-1,exonic,MOGS,.,nonsynonymous SNV,"MOGS:NM_006302:exon4:c.G2171A:p.S724N,MOGS:NM_...",2p13.1
1,Y (同时发生，only T，设计探针时需注意),chr3,30861613,30861617,CTTTC,A,-1,exonic,GADL1,.,frameshift deletion,GADL1:NM_207359:exon2:c.187_190del:p.K63Lfs*17,3p23
2,Y (only T 22%)设计探针时注意同时有个相邻碱基突变,chr3,158820216,158820217,TG,GA,1,exonic,MFSD1,.,nonsynonymous SNV,"MFSD1:NM_001167903:exon8:c.G637A:p.E213K,MFSD1...",3q25.32
3,Y (同时发生&56315177 A to T，only T),chr4,56315178,56315178,-,CTTT,1,exonic,CRACD,.,frameshift insertion,CRACD:NM_020722:exon8:c.1676_1677insCTTT:p.P56...,4q12
4,Y (同时发生&56315177 A to T，only T),chr4,56315185,56315188,AAAG,-,1,exonic,CRACD,.,frameshift deletion,CRACD:NM_020722:exon8:c.1683_1686del:p.K562Tfs*27,4q12


In [None]:
# Build one query record per row of the variant table, then fetch the
# reference sequence and binding arms for every record.
positions = [
    {
        key: info.loc[row_label, column]
        for key, column in (
            ("gene", "Gene.refGene"),
            ("chr", "Chr"),
            ("start", "Start"),
            ("end", "End"),
            ("strand", "Strand"),
        )
    }
    for row_label in info.index
]

# Number of flanking bases requested on each side of every variant.
gap = 50
sequences = fetch_and_label_sequences(positions, gap=gap)

## perform mutation 

In [None]:
from Bio.SeqUtils import MeltingTemp as mt


def binding_mutation(mutation, seq, adjusted_start, adjusted_end, left, right):
    """Apply one variant to ``seq`` and cut the two binding arms around it.

    Args:
        mutation: dict with keys "ref" and "alt"; "-" in "ref" marks an
            insertion, "-" in "alt" marks a deletion, anything else is a
            substitution.
        seq: reference sequence containing the variant site.
        adjusted_start: index in ``seq`` where the reference allele starts.
        adjusted_end: index in ``seq`` just past the reference allele.
        left: length of the left arm.
        right: length of the right arm.

    Returns:
        (mut_seq, binding_left, binding_right). The right arm is always the
        first ``right`` bases after ``adjusted_end``. The left arm ends at the
        last alternate base (or, for a deletion, at the last base before
        ``adjusted_start``).
    """
    ref_allele = mutation["ref"]
    alt_allele = mutation["alt"]
    downstream = seq[adjusted_end:]
    binding_right = downstream[:right]

    if ref_allele != "-" and alt_allele == "-":
        # Deletion: drop the reference allele, the left arm is pure upstream.
        upstream = seq[:adjusted_start]
        return upstream + downstream, upstream[-left:], binding_right

    # An insertion keeps the bases up to adjusted_end; a substitution replaces
    # everything between adjusted_start and adjusted_end.
    upstream = seq[:adjusted_end] if ref_allele == "-" else seq[:adjusted_start]
    alt_len = len(alt_allele)
    if alt_len < left:
        # Pad the alternate allele on its 5' side with upstream bases.
        binding_left = upstream[alt_len - left:] + alt_allele
    else:
        # The alternate allele alone is long enough to fill the left arm.
        binding_left = alt_allele[-left:]
    return upstream + alt_allele + downstream, binding_left, binding_right


def perform_mutation(
    positions,
    mutations,
    sequences,
    gap=50,
    left_length=20,
    right_length=20,
    Tm_thre=45, 
    Tm_dif_thre=10,
    bind_single_thre=10,
):
    """Build the mutated sequence and binding arms for every variant.

    The three inputs are read in parallel by index: ``positions[i]``,
    ``mutations[i]`` and ``sequences[i]`` describe the same variant.

    Args:
        positions: records with "start" and "end" coordinates.
        mutations: records with "ref" and "alt" alleles.
        sequences: records with "seq", "gene", "id", "strand" and "molecule".
        gap: number of flanking bases in front of the variant inside "seq".
        left_length: initial length of the left arm.
        right_length: initial length of the right arm.
        Tm_thre: minimum Tm per arm, passed to ``padlock_thre`` as Tm_sing_thre.
        Tm_dif_thre: passed to ``padlock_thre``.
        bind_single_thre: passed to ``padlock_thre`` as bind_sing_thre.

    Returns:
        A list of dicts (one per variant) with keys "gene", "id", "strand",
        "molecule", "seq" (the mutated sequence), "binding_left", "Tm_left",
        "binding_right" and "Tm_right".
    """
    mutation_seqs = []
    for idx in range(len(mutations)):
        position = positions[idx]
        mutation = mutations[idx]
        reference = sequences[idx]

        # The reference allele occupies seq[site_start:site_stop].
        site_start = gap
        site_stop = int(position["end"] - position["start"] + gap + 1)
        seq = reference["seq"]

        arm_left, arm_right = left_length, right_length
        accepted = False
        while not accepted:
            mut_seq, binding_left, binding_right = binding_mutation(
                mutation, seq, site_start, site_stop, arm_left, arm_right
            )
            Tm_left = mt.Tm_NN(binding_left, nn_table=mt.R_DNA_NN1)
            Tm_right = mt.Tm_NN(binding_right, nn_table=mt.R_DNA_NN1)
            arm_left, arm_right, accepted = padlock_thre(
                Tm_left,
                Tm_right,
                arm_left,
                arm_right,
                Tm_dif_thre=Tm_dif_thre,
                Tm_sing_thre=Tm_thre,
                bind_sing_thre=bind_single_thre,
            )

        mutation_seqs.append(
            {
                "gene": reference["gene"],
                "id": reference["id"],
                "strand": reference["strand"],
                "molecule": reference["molecule"],
                "seq": mut_seq,
                "binding_left": binding_left,
                "Tm_left": Tm_left,
                "binding_right": binding_right,
                "Tm_right": Tm_right,
            }
        )

    return mutation_seqs

In [None]:
# Ref/Alt alleles for every row of the variant table, in the same order as `positions`.
mutations = [{"ref": info.loc[line, "Ref"], "alt": info.loc[line, "Alt"]} for line in info.index]
# Reuse the `gap` that was used to fetch `sequences`: perform_mutation locates
# the variant inside each sequence from it, so the two values must agree.
mut_sequences = perform_mutation(positions=positions, mutations=mutations, sequences=sequences, gap=gap)

## generate fasta file

### for manual

In [None]:
# from Bio.SeqUtils import MeltingTemp as mt


# mannual_info_df = pd.read_excel(os.path.join(workdir, 'LCM final RNA mutation list.xlsx'), sheet_name='mannual')
# sequences = []
# mut_sequences = []
# for row_num in range(len(mannual_info_df)):
#     sequence = dict()
#     sequence['id'] = mannual_info_df.loc[row_num, 'RefSeq_gid']
#     try: mut = mannual_info_df.loc[row_num, 'name'].split('-')[-1]
#     except: continue
#     sequence['gene'] = mannual_info_df.loc[row_num, 'name']
#     binding_site = mannual_info_df.loc[row_num, 'binding']
#     sequence['binding'] = binding_site
#     sequence["binding_left"] = binding_site[: 20]
#     sequence["binding_right"] = binding_site[20: ]
#     sequence['seq'] = binding_site
#     sequence['Tm_left'] = mt.Tm_NN(sequence["binding_left"], nn_table=mt.R_DNA_NN1)
#     sequence['Tm_right'] = mt.Tm_NN(sequence["binding_right"], nn_table=mt.R_DNA_NN1)
    
#     if 'norm' in mut:
#         sequences.append(sequence)
#     else:
#         mut_sequences.append(sequence)

In [None]:
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord


def generate_binding_fastq(sequences, outpath):
    """Write the full binding site of every record to ``outpath`` as FASTA.

    Despite the function name, the output format is FASTA. Each entry holds
    binding_left followed by binding_right, uses the record's "id" as the
    identifier and its "gene" as the description.

    Args:
        sequences: iterable of dicts with keys "binding_left",
            "binding_right", "id" and "gene".
        outpath: path of the file to create or overwrite.
    """
    def _records():
        # Built lazily so records are created while the file is being written.
        for entry in sequences:
            yield SeqRecord(
                Seq(entry["binding_left"] + entry["binding_right"]),
                id=entry["id"],
                description=entry['gene'],
            )

    with open(outpath, "w") as handle:
        SeqIO.write(_records(), handle, "fasta")

In [None]:
# Export the reference ("nor") and mutant ("mut") binding sites as FASTA.
# NOTE(review): the BLAST XML files read further down are not produced by any
# visible cell; presumably these FASTA files are BLASTed outside the notebook — confirm.
generate_binding_fastq(sequences=sequences, outpath=os.path.join(workdir, 'nor_binding_mannual.fasta'))
generate_binding_fastq(sequences=mut_sequences, outpath=os.path.join(workdir, 'mut_binding_mannual.fasta'))

## blast_test

In [None]:
from Bio.Blast import NCBIXML
from copy import deepcopy

def extract_blast(sequences, blast_results):
    """Attach BLAST hit summaries to a copy of ``sequences``.

    The n-th record parsed from the BLAST XML file is matched to the n-th
    entry of ``sequences``. The input list is deep-copied and left untouched.

    Args:
        sequences: list of sequence dicts, in the order they were BLASTed.
        blast_results: path to a BLAST XML output file.

    Returns:
        The copied list, where every entry that has a BLAST record gains
        "align_num" (number of alignments) and "descrip", a dict mapping the
        1-based hit rank to "<hit title>| pm=<value>". The value is
        ``frame[1]`` of the hit's first HSP.
    """
    annotated = deepcopy(sequences)
    with open(blast_results, "r") as handle:
        for record_idx, blast_record in enumerate(NCBIXML.parse(handle)):
            entry = annotated[record_idx]
            hits = blast_record.alignments
            entry["align_num"] = len(hits)
            entry['descrip'] = dict()
            for rank, alignment in enumerate(hits, start=1):
                pm = alignment.hsps[0].frame[1]
                title = blast_record.descriptions[rank - 1].title
                entry["descrip"][rank] = title + f"| pm={pm}"

    return annotated

In [None]:
# Annotate reference and mutant records with their BLAST hits. Records are
# matched to BLAST results by position, so each XML file must list its queries
# in the same order as the corresponding sequence list.
sequences_with_blast = extract_blast(sequences=sequences, blast_results=os.path.join(workdir, 'nor_blast_results.xml'))
mut_sequences_with_blast = extract_blast(sequences=mut_sequences, blast_results=os.path.join(workdir, 'mut_blast_results.xml'))

## reverse-complement the binding arms when the on-target BLAST hit has pm = 1

In [None]:
def trans(seq):
    """Return the reverse complement of a DNA string.

    Accepts A/C/G/T and the ambiguity code N, in upper or lower case; the case
    of every base is preserved. Any other character raises ``KeyError``.

    Args:
        seq: DNA sequence as a string (or any iterable of single characters).

    Returns:
        The reverse-complemented sequence as a string.
    """
    complement = {
        "A": "T", "T": "A", "C": "G", "G": "C", "N": "N",
        "a": "t", "t": "a", "c": "g", "g": "c", "n": "n",
    }
    return "".join(complement[base] for base in reversed(seq))


def specificity_judge(sequence_with_blast):
    """Judge one BLAST-annotated record for specificity and probe orientation.

    The record's "gene" field may list several genes separated by ';'. For each
    listed name only the part before the first '-' is used. A BLAST description
    counts as on-target when it contains any of these names
    (case-insensitive). Descriptions containing 'PREDICTED' are ignored.

    Args:
        sequence_with_blast: dict with "gene" (str) and "descrip", a dict whose
            values are strings ending in "| pm=<int>".

    Returns:
        (perform_minus, specify):
        perform_minus is True when at least one on-target description has
        pm == 1; specify is False when at least one non-ignored description
        mentions none of the gene names.
    """
    # Split on ';' first, then strip the '-' suffix of every name. The previous
    # if / if / else chain wrapped the ';'-split list in another list, so
    # multi-gene records crashed on `.upper()`.
    gene_names = [part.split('-')[0] for part in sequence_with_blast['gene'].split(';')]

    perform_minus = False
    specify = True
    for desc in sequence_with_blast['descrip'].values():
        if 'PREDICTED' in desc:
            continue

        desc_upper = desc.upper()
        on_target = False
        for name in gene_names:
            if name.upper() in desc_upper:
                on_target = True
                # The value after the last '=' of the last '|' field is pm.
                pm = int(desc.split('|')[-1].split('=')[-1])
                if pm == 1:
                    perform_minus = True
        if not on_target:
            specify = False
    return perform_minus, specify
    
    
def minus(sequences_with_blast, mut_sequences_with_blast):
    """Produce final reference/mutant records, flipping orientation if needed.

    The two lists are read in parallel by index. For each pair the orientation
    flag returned by ``specificity_judge`` for the reference record is applied
    to both records; the mutant's own flag is ignored. Only its specificity
    verdict is kept.

    When the flag is set, the arms are swapped and reverse-complemented with
    ``trans`` (new left = trans(old right), new right = trans(old left)) and
    the Tm values are swapped accordingly. "seq" is always copied unchanged.

    Returns:
        (sequences_final, mut_sequences_final): two lists of dicts with keys
        "gene", "specificity", "strand", "perform_minus", "binding_left",
        "binding_right", "Tm_left", "Tm_right" and "seq".
    """
    def _oriented(record, specificity, flip):
        # Build one output record, optionally reverse-complementing the arms.
        out = {
            'gene': record['gene'],
            'specificity': specificity,
            'strand': record['strand'],
            'perform_minus': flip,
        }
        if flip:
            out['binding_left'] = trans(record['binding_right'])
            out['binding_right'] = trans(record['binding_left'])
            out['Tm_left'] = record['Tm_right']
            out['Tm_right'] = record['Tm_left']
        else:
            out['binding_left'] = record['binding_left']
            out['binding_right'] = record['binding_right']
            out['Tm_left'] = record['Tm_left']
            out['Tm_right'] = record['Tm_right']
        out['seq'] = record['seq']
        return out

    sequences_final = []
    mut_sequences_final = []
    for idx in range(len(sequences_with_blast)):
        normal = sequences_with_blast[idx]
        mutant = mut_sequences_with_blast[idx]

        perform_minus, normal_specific = specificity_judge(normal)
        _, mutant_specific = specificity_judge(mutant)

        sequences_final.append(_oriented(normal, normal_specific, perform_minus))
        mut_sequences_final.append(_oriented(mutant, mutant_specific, perform_minus))

    return sequences_final, mut_sequences_final

In [None]:
# Final reference / mutant records, with the specificity verdict and the
# orientation chosen from the BLAST hits.
sequences_final, mut_sequences_final = minus(sequences_with_blast, mut_sequences_with_blast)

In [None]:
# Fields copied from every final record into the output table.
interest = ['gene', 'binding_left','binding_right','Tm_left','Tm_right','specificity','perform_minus', 'strand', 'seq']

# One list per column. Rows are appended in pairs: the reference record first,
# then its mutant counterpart with '_mut' appended to the gene name.
df_dict = {inte: [] for inte in interest}

# Iterate over the pairs directly instead of indexing with `_`, which at
# notebook top level would shadow IPython's last-output variable.
for sequence, mut_sequence in zip(sequences_final, mut_sequences_final):
    for inte in interest:
        if inte == 'gene':
            df_dict[inte] += [sequence[inte], mut_sequence[inte]+'_mut']
        else:
            df_dict[inte] += [sequence[inte], mut_sequence[inte]]
    

In [None]:
# Table with one row per record; df_dict was filled in reference/mutant pairs.
df = pd.DataFrame(df_dict)
# Full binding site = left arm immediately followed by the right arm.
df['binding'] = df['binding_left'] + df['binding_right']
# df = df[['gene', 'binding', 'binding_left','binding_right','Tm_left','Tm_right','specificity','perform_minus', 'seq']]
# Save the table next to the input workbook, then display it.
df.to_excel(os.path.join(workdir, 'binding_site.xlsx'))
df

## threshold by blast results

In [None]:
def thre_by_blast(sequences):
    """Return the records that have at least one off-target BLAST hit.

    The record's "gene" field may list several genes separated by ';'. For each
    name only the part before the first '-' is used. A description is
    off-target when it does not contain "PREDICT" and mentions none of these
    names (case-insensitive).

    Args:
        sequences: iterable of dicts with "gene" (str) and "descrip" (dict of
            hit rank -> description string). The input records are not modified.

    Returns:
        A list of shallow copies of the flagged records, each with "descrip"
        reduced to the entries that do not contain "PREDICT".
    """
    non_specific = []
    for sequence in sequences:
        genes = [part.split('-')[0].upper() for part in sequence['gene'].split(';')]
        descrip = sequence['descrip']
        has_off_target = any(
            "PREDICT" not in desc and not any(gene in desc.upper() for gene in genes)
            for desc in descrip.values()
        )
        if has_off_target:
            # Copy before filtering so the caller's record keeps its full hit list.
            flagged = dict(sequence)
            flagged['descrip'] = {key: value for key, value in descrip.items() if "PREDICT" not in value}
            non_specific.append(flagged)

    return non_specific

In [None]:
# Reference records with at least one BLAST hit that neither contains
# "PREDICT" nor mentions the record's gene.
non_specific = thre_by_blast(sequences=sequences_with_blast)
pprint(non_specific,sort_dicts=False, width=200)

In [None]:
# Same check for the mutant records. This rebinds `non_specific`, replacing
# the reference-record result from the previous cell.
non_specific = thre_by_blast(sequences=mut_sequences_with_blast)
pprint(non_specific,sort_dicts=False, width=200)