In [8]:
from seqfold import dg, fold, dot_bracket
import random

from Bio.Blast import NCBIXML


def random_seq_list(length=20, num=50):
    """Generate a batch of random DNA sequences.

    Args:
        length: Number of bases in each generated sequence.
        num: Number of sequences to generate.

    Returns:
        A list of `num` strings, each `length` characters long, whose
        characters are picked with `random.choice` from A, T, C and G.
    """
    alphabet = ['A', 'T', 'C', 'G']
    sequences = []
    for _seq_index in range(num):
        # Draw the bases of one sequence, then glue them into a string.
        bases = [random.choice(alphabet) for _base_index in range(length)]
        sequences.append(''.join(bases))
    return sequences


def hum_dis(seq1, seq2):
    """Return the Hamming distance between two equal-length sequences.

    Args:
        seq1: First sequence.
        seq2: Second sequence; must be the same length as `seq1`.

    Returns:
        The number of positions at which `seq1` and `seq2` differ.

    Raises:
        ValueError: If the two sequences have different lengths. Failing
            here is deliberate: returning nothing would only surface later
            as an unrelated error when the caller compares the result.
    """
    if len(seq1) != len(seq2):
        raise ValueError(
            f"seq_not_match: lengths differ ({len(seq1)} vs {len(seq2)})"
        )
    return sum(1 for base1, base2 in zip(seq1, seq2) if base1 != base2)


def dna_sec_struct(seq, temp=45):
    """Predict the minimum-free-energy secondary structure of a sequence.

    Args:
        seq: Nucleotide sequence handed to `seqfold.fold`.
        temp: Folding temperature handed to `seqfold.fold`
            (degrees Celsius according to the seqfold documentation).

    Returns:
        A `(mfe, structs)` tuple: `structs` is the list of `seqfold.Struct`
        returned by `fold`, and `mfe` is the sum of their energies rounded
        to two decimals.
    """
    # `fold` returns a list of `seqfold.Struct` from the minimum free energy structure
    structs = fold(seq, temp=temp)
    # `dg` would fold the sequence a second time just to sum these energies,
    # so compute the total from the structures we already have.
    mfe = round(sum(struct.e for struct in structs), 2)
    return mfe, structs


def thre_by_blast(file="./JXAKR9US016-Alignment.xml", thre=18):
    """Flag which BLAST queries have no hit scoring at or above a cutoff.

    Args:
        file: Path to a BLAST XML result file, parsed with `NCBIXML.parse`.
        thre: Score cutoff. A record is rejected when any HSP of any of its
            alignments has `hsp.score >= thre`.

    Returns:
        A list with exactly one boolean per BLAST record, in file order:
        `True` when every HSP scored below `thre` (including records with
        no alignments at all), `False` otherwise.
    """
    pos = []
    with open(file, "r") as blast_output:
        for blast_record in NCBIXML.parse(blast_output):
            # Collapse all alignments/HSPs of the record into one verdict so
            # the result stays index-aligned with the queries; appending per
            # offending alignment would add several entries for one record.
            has_strong_hit = any(
                hsp.score >= thre
                for alignment in blast_record.alignments
                for hsp in alignment.hsps
            )
            pos.append(not has_strong_hit)
    return pos

In [103]:
# Generate random candidates and keep those that pass three filters, in order:
#   1. no run of five G's,
#   2. Hamming distance of at least MIN_HAMMING_DISTANCE to every kept sequence,
#   3. predicted folding free energy that is not negative at FOLD_TEMP.
# NOTE: no random seed is set, so every run produces a different candidate list.
MIN_HAMMING_DISTANCE = 10
FOLD_TEMP = 45

seq_list = random_seq_list()

seq_list_export = []
for seq in seq_list:
    seq = seq.upper()

    # GGGGG
    if "GGGGG" in seq:
        print(f"{seq}: \tthre_by_G")
        continue

    # dif: reject on the first kept sequence that is too similar, so each
    # candidate is reported once rather than once per close match.
    if any(hum_dis(tmp_seq, seq) < MIN_HAMMING_DISTANCE for tmp_seq in seq_list_export):
        print(f"{seq}: \tthre_by_dif")
        continue

    # secondary structure
    mfe, structs = dna_sec_struct(seq, temp=FOLD_TEMP)
    if mfe < 0:
        print(f"{seq}: \tthre_by_stru\t", dot_bracket(seq, structs))
        continue

    seq_list_export.append(seq)

print(f'{len(seq_list_export)} seqs remained')

ATAGCACATTGTAGCCGTGC: 	thre_by_stru	 ...((((.........))))
ACTGATTCCCGTGCAAGCGG: 	thre_by_stru	 ........(((......)))
ATTCAAAGTGAATAACATCC: 	thre_by_stru	 (((((...))))).......
ACACGCTTAGCACCTCTGCT: 	thre_by_stru	 ........((((....))))
GGCTTGCCTACGCCCGCCTC: 	thre_by_stru	 (((..((....))..)))..
GAAATCGTGGGGCTCCCGAG: 	thre_by_dif
GGTAGCCCGCATAAGGGCAG: 	thre_by_stru	 ....((((......))))..
TGGGGTATACCCGTTTCGGT: 	thre_by_stru	 .(((.....)))........
GTTTGCAGCGTAGCGGCAGT: 	thre_by_stru	 ...(((.((...)).)))..
GTGTCCAACAGGGGGACAAC: 	thre_by_G
GAATTCAACAGCCACCAGCA: 	thre_by_dif
ACAATGCCCTTATTGTATGT: 	thre_by_stru	 (((((......)))))....
AGTGAGTTACCCTTAACTCT: 	thre_by_stru	 ...((((((....)))))).
ACATTAAACAAGGTAAGCAT: 	thre_by_dif
CTTGAGGCCCTAGGTCTCAG: 	thre_by_stru	 ..((((.((...)).)))).
CCGGAATGGACGTCCATGTC: 	thre_by_stru	 ((.....))...........
CGGGGGCGGGTCGCGGCGAC: 	thre_by_G
GGCAAACAGGCCATGGCGAT: 	thre_by_stru	 (((......)))........
32 seqs remained


In [56]:
# Dump the kept sequences as FASTA-style records named seq0, seq1, ...
with open("./random_seq_filtered.txt", "w") as f:
    for seq_index, kept_seq in enumerate(seq_list_export):
        f.write(f'>seq{seq_index}\n' + kept_seq + "\n")

In [106]:
# Run the BLAST score filter on the XML result; `pos` is used further down
# as a positional keep-mask over `seq_list_export`.
pos = thre_by_blast(
    file='./JXAKR9US016-Alignment.xml',
    thre=18,
)

In [90]:
# `pos` is applied as a positional keep-mask over `seq_list_export`, so the two
# must line up one-to-one. Check before opening the file: "w" mode truncates it,
# and a mismatched mask would otherwise either fail part-way through the write
# or silently filter the wrong sequences.
if len(pos) != len(seq_list_export):
    raise ValueError(
        f"BLAST mask has {len(pos)} entries but there are "
        f"{len(seq_list_export)} sequences"
    )

with open("./random_seq_filtered.txt", "w") as f:
    # `cont` renumbers the surviving sequences consecutively from 0.
    cont = 0
    for keep, kept_seq in zip(pos, seq_list_export):
        if keep:
            f.write(f">seq{cont}\n" + kept_seq + "\n")
            cont += 1

In [31]:
import pandas as pd


# NOTE(review): this re-defines `dna_sec_struct` from the first cell of the
# notebook; keep the two in sync or drop one of them.
def dna_sec_struct(seq, temp=45):
    """Predict the minimum-free-energy secondary structure of a sequence.

    Args:
        seq: Nucleotide sequence handed to `seqfold.fold`.
        temp: Folding temperature handed to `seqfold.fold`
            (degrees Celsius according to the seqfold documentation).

    Returns:
        A `(mfe, structs)` tuple: `structs` is the list of `seqfold.Struct`
        returned by `fold`, and `mfe` is the sum of their energies rounded
        to two decimals.
    """
    # `fold` returns a list of `seqfold.Struct` from the minimum free energy structure
    structs = fold(seq, temp=temp)
    # `dg` would fold the sequence a second time just to sum these energies,
    # so compute the total from the structures we already have.
    mfe = round(sum(struct.e for struct in structs), 2)
    return mfe, structs

def seq_minus(seq):
    """Return the reverse complement of a DNA sequence.

    Args:
        seq: String of A/T/C/G bases; lower-case a/t/c/g is accepted too
            and its case is preserved in the result.

    Returns:
        The complemented sequence read in reverse order.

    Raises:
        KeyError: If `seq` contains any character other than the eight
            bases listed above.
    """
    translib = {
        "A": "T", "T": "A", "C": "G", "G": "C",
        # Lower-case entries so un-normalised input does not fail the lookup.
        "a": "t", "t": "a", "c": "g", "g": "c",
    }
    return "".join(translib[base] for base in reversed(seq))


# Load the padlock table from the 'padlock' sheet of the workbook and show it.
# NOTE(review): absolute, machine-specific path — this cell only runs where
# that file exists.
padlock_xlsx = r'E:\TMC\probe_designer\dataset\2024.3.16_TCR&mutation_3_Breast_cancer\binding_site_revised.xlsx'
padlock = pd.read_excel(padlock_xlsx, sheet_name='padlock')
padlock


Unnamed: 0,Name,padlock
0,CZ_TNBC_MOGS,TCCAGAGATGGCGGCTGTCGTCCCTACACGACGCTCTTCCGATCTG...
1,CZ_TNBC_MOGS_mut,TCCAGAGATGGCGGCTGTCGTCCCTACACGACGCTCTTCCGATCTT...
2,CZ_TNBC_GADL1,AAAACCACCTCTTCCATTATTCCCTACACGACGCTCTTCCGATCTG...
3,CZ_TNBC_GADL1_mut,AAAACCACCTCTTCCATTATTCCCTACACGACGCTCTTCCGATCTT...
4,CZ_TNBC_MFSD1,CACCTTAGAGAAGAAAGGTCCCTACACGACGCTCTTCCGATCTGCG...
5,CZ_TNBC_MFSD1_mut,TCCCTTAGAGAAGAAAGGTCCCTACACGACGCTCTTCCGATCTGTG...
6,CZ_TNBC_CRACD,GTCACGGGGCTCAGGTTGACTCCCTACACGACGCTCTTCCGATCTC...
7,CZ_TNBC_CRACD_mut,GACACGGGGCTCAGGTTGACTCCCTACACGACGCTCTTCCGATCTG...
8,CZ_TNBC_CDH6,TCTGCATCCGTCGCAGTGATCCCTACACGACGCTCTTCCGATCTGG...
9,CZ_TNBC_CDH6_mut,CCTGCATCCGTCGCAGTGATCCCTACACGACGCTCTTCCGATCTCT...


In [35]:
# For every padlock row, fold three tandem copies of the reverse complement at
# temp=60 and print the free energy, the folded sequence and its dot-bracket
# structure. Rows are unpacked positionally as (name, sequence).
for name, seq in padlock.itertuples(index=False, name=None):
    # Normalise case before complementing so lower-case input cannot trip the
    # base lookup in seq_minus.
    seq = seq_minus(seq.upper()) * 3
    mfe, structs = dna_sec_struct(seq, temp=60)
    print(f"{name}:{mfe}\n{seq}\n{dot_bracket(seq, structs)}")

CZ_TNBC_MOGS:-3.6
GCCCCTTTGGTTTACGCTCCCAGAACGCCCTCGGAAGAGCGGTTCAGCAGGAATGCAGAACGCCCAGATCGGAAGAGCGTCGTGTAGGGACGACAGCCGCCATCTCTGGAGCCCCTTTGGTTTACGCTCCCAGAACGCCCTCGGAAGAGCGGTTCAGCAGGAATGCAGAACGCCCAGATCGGAAGAGCGTCGTGTAGGGACGACAGCCGCCATCTCTGGAGCCCCTTTGGTTTACGCTCCCAGAACGCCCTCGGAAGAGCGGTTCAGCAGGAATGCAGAACGCCCAGATCGGAAGAGCGTCGTGTAGGGACGACAGCCGCCATCTCTGGA
.......................................................................................................................................................................................................................................((.((.((.......((.((....)).))..(((.(((....))).)))......)).)).))....................................
CZ_TNBC_MOGS_mut:-1.6
ACCCCTTTGGTTTACGCTCCCGCCCACGGATCGGAAGAGCGGTTCAGCAGGAATGCGCCCACGGAAGATCGGAAGAGCGTCGTGTAGGGACGACAGCCGCCATCTCTGGAACCCCTTTGGTTTACGCTCCCGCCCACGGATCGGAAGAGCGGTTCAGCAGGAATGCGCCCACGGAAGATCGGAAGAGCGTCGTGTAGGGACGACAGCCGCCATCTCTGGAACCCCTTTGGTTTACGCTCCCGCCCACGGATCGGAAGAGCGGTTCAGCAGGAATGCGCCCACGGAAGATCGGAAGAGC