In [1]:
from Bio import SeqIO
import pandas as pd
import numpy as np
import altair
import matplotlib.pyplot as plt

In [8]:
# Forward translation table: DNA codon -> one-letter amino-acid code, with '*'
# marking a stop codon. Codons are enumerated with T, C, A, G cycling at each of
# the three positions (third position fastest); the residue string lists the
# encoded residue for every codon in exactly that order.
codon_table = {
    first + second + third: residue
    for (first, second, third), residue in zip(
        ((a, b, c) for a in 'TCAG' for b in 'TCAG' for c in 'TCAG'),
        'FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG',
    )
}

# Reverse translation table: one-letter residue code -> list of codons that may
# be used for it when enumerating candidate sequences.
# NOTE(review): 'S' does not list 'TCG' even though codon_table translates TCG
# to 'S'; instead 'TCG' is the only codon of the extra key 'U'.
# sequence_list_generate inserts 'U' as the middle-residue placeholder, so this
# looks deliberate (TCG reserved for the middle site) - confirm.
reverse_codon_table = {
    residue: codons.split()
    for residue, codons in (
        ('A', 'GCT GCC GCA GCG'),
        ('C', 'TGT TGC'),
        ('D', 'GAT GAC'),
        ('E', 'GAA GAG'),
        ('F', 'TTT TTC'),
        ('G', 'GGT GGC GGA GGG'),
        ('H', 'CAT CAC'),
        ('I', 'ATT ATC ATA'),
        ('K', 'AAA AAG'),
        ('L', 'TTA TTG CTT CTC CTA CTG'),
        ('M', 'ATG'),
        ('N', 'AAT AAC'),
        ('P', 'CCT CCC CCA CCG'),
        ('Q', 'CAA CAG'),
        ('R', 'CGT CGC CGA CGG AGA AGG'),
        ('S', 'TCT TCC TCA AGT AGC'),
        ('T', 'ACT ACC ACA ACG'),
        ('V', 'GTT GTC GTA GTG'),
        ('W', 'TGG'),
        ('Y', 'TAT TAC'),
        ('*', 'TAG TGA TAA'),
        ('U', 'TCG'),
    )
}

def nc_to_aa(codon_nc, codon_table=codon_table):
    """Translate a single codon into its one-letter residue code.

    Parameters
    ----------
    codon_nc : str
        Codon to translate; it is upper-cased before the lookup.
    codon_table : mapping, optional
        Codon -> residue table. Defaults to the module-level ``codon_table``.

    Returns
    -------
    The value stored in ``codon_table`` for the upper-cased codon.

    Raises
    ------
    KeyError
        If the upper-cased codon is not a key of ``codon_table`` (dict lookup).
    """
    return codon_table[codon_nc.upper()]

def ncseq_to_aaseq(nc_seq, codon_table = codon_table):
    """Translate a nucleotide sequence codon by codon.

    The sequence is cut into consecutive 3-character chunks starting at
    offset 0 and each chunk is translated with ``nc_to_aa``. A trailing chunk
    shorter than 3 characters is looked up as-is, so it fails the table lookup
    unless the table happens to contain it.

    Parameters
    ----------
    nc_seq : str
        Nucleotide sequence (any case; ``nc_to_aa`` upper-cases each chunk).
    codon_table : mapping, optional
        Codon -> residue table forwarded to ``nc_to_aa``.

    Returns
    -------
    str
        The translated residues joined into one string.
    """
    residues = []
    for start in range(0, len(nc_seq), 3):
        residues.append(nc_to_aa(nc_seq[start:start + 3], codon_table))
    return ''.join(residues)

def aa_to_nc(aa, reverse_codon_table=reverse_codon_table):
    """Return the codons available for one residue.

    Parameters
    ----------
    aa : str
        One-letter residue code; it is upper-cased before the lookup.
    reverse_codon_table : mapping, optional
        Residue -> list-of-codons table. Defaults to the module-level
        ``reverse_codon_table``.

    Returns
    -------
    The entry stored in ``reverse_codon_table`` for the upper-cased residue
    (the table's own list object, not a copy).

    Raises
    ------
    KeyError
        If the upper-cased residue is not a key of ``reverse_codon_table``.
    """
    # Look the residue up in the reverse table supplied by the caller. The
    # forward codon_table is keyed by codons, so indexing it with a residue
    # code can never succeed.
    return reverse_codon_table[aa.upper()]

In [14]:
# Default weight matrix for the scoring functions defined below; they bind this
# array as their `weights_matrix` default at definition time.
# NOTE(review): score_cal_adjust slices rows [30 - oneside_num, 30 + oneside_num)
# and multiplies them element-wise with an (oneside_num * 2, 4) one-hot matrix,
# so the file presumably holds at least 36 rows x 4 columns - confirm.
weights = np.loadtxt("weights/weights_kpLogo_combine.txt")

In [27]:
def sequence_list_generate(aa_seq,reverse_codon_table=reverse_codon_table,contain_mid=True):
    """Enumerate every nucleotide sequence that encodes a short residue string.

    Parameters
    ----------
    aa_seq : str
        Residue string. Must hold 5 residues when ``contain_mid`` is true
        (two flanking, one middle, two flanking) or 4 residues when it is
        false (flanks only).
    reverse_codon_table : mapping, optional
        Residue -> list-of-codons table used for every lookup.
    contain_mid : bool, optional
        Whether ``aa_seq`` already includes the middle residue. When false,
        the placeholder ``'U'`` is inserted between the two flanks, so the
        returned sequences always contain five codons.

    Returns
    -------
    list of str
        All codon combinations, concatenated, with the first site varying
        slowest and the last site varying fastest.

    Raises
    ------
    ValueError
        If ``aa_seq`` does not have the expected length.
    """
    expected_len = 5 if contain_mid else 4
    if len(aa_seq) != expected_len:
        raise ValueError('seq len is wrong!')

    if not contain_mid:
        # Insert the middle-site placeholder so five sites are always expanded.
        aa_seq = aa_seq[:2] + 'U' + aa_seq[2:4]

    # Grow the candidates one site at a time. Keeping the existing prefix in
    # the outer loop reproduces the order of five nested loops.
    seq_list = ['']
    for aa in aa_seq:
        # Forward the caller's table instead of relying on aa_to_nc's default.
        codons = aa_to_nc(aa, reverse_codon_table)
        seq_list = [prefix + codon for prefix in seq_list for codon in codons]

    return seq_list

def matrix_construct_adjust(seq, contain_mid=True, oneside_num=6):
    """One-hot encode the flanking nucleotides of a sequence window.

    Parameters
    ----------
    seq : str
        Nucleotide window. Its length must be ``oneside_num * 2 + 3`` when
        ``contain_mid`` is true (flank, 3-nt middle codon, flank) or
        ``oneside_num * 2`` when it is false (flanks only). Case-insensitive.
    contain_mid : bool, optional
        Whether ``seq`` includes the 3-nt middle codon; if so it is removed
        before encoding.
    oneside_num : int, optional
        Number of nucleotides on each side of the middle codon.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(oneside_num * 2, 4)`` holding a single 1 per row;
        columns are ordered A, T, C, G.

    Raises
    ------
    ValueError
        If ``seq`` has the wrong length or a flank contains a character other
        than A, T, C or G.
    """
    expected_len = oneside_num * 2 + (3 if contain_mid else 0)
    if len(seq) != expected_len:
        raise ValueError('seq len is wrong!')

    if contain_mid:
        # Only the flanks are encoded; cut the middle codon out.
        seq = seq[:oneside_num] + seq[oneside_num + 3:]

    column_of = {'A': 0, 'T': 1, 'C': 2, 'G': 3}
    seq_matrix = np.zeros((oneside_num * 2, 4))

    for row, nc in enumerate(seq):
        column = column_of.get(nc.upper())
        if column is None:
            # Fail with a clear message instead of passing the raw character
            # to numpy as an index.
            raise ValueError(
                f'unsupported nucleotide {nc!r} at flank position {row}'
            )
        seq_matrix[row, column] = 1

    return seq_matrix

def score_cal_adjust(seq_matrix, weights_matrix=weights,oneside_num=6):
    """Min-max normalised weight score of an encoded sequence window.

    Parameters
    ----------
    seq_matrix : numpy.ndarray
        Encoded window, multiplied element-wise with the selected weight rows
        (as produced by ``matrix_construct_adjust``).
    weights_matrix : numpy.ndarray, optional
        Full weight matrix; rows ``30 - oneside_num`` up to (not including)
        ``30 + oneside_num`` are used.
        NOTE(review): row 30 is presumably the centre of the matrix - confirm.
    oneside_num : int, optional
        Number of weight rows taken on each side of row 30.

    Returns
    -------
    ``(raw - lowest) / (highest - lowest)`` where ``raw`` is the sum of the
    element-wise product and ``lowest`` / ``highest`` are the sums of the
    per-row minima / maxima of the selected weight rows.
    """
    window = weights_matrix[30 - oneside_num : oneside_num + 30]
    raw_score = np.sum(window * seq_matrix)
    lowest = np.sum(np.min(window, axis=1))
    highest = np.sum(np.max(window, axis=1))
    return (raw_score - lowest) / (highest - lowest)

def sequence_list_score_calc(seq_list, weights_matrix=weights,contain_mid=True,oneside_num=6):
    """Score every sequence in a list.

    Each sequence is encoded with ``matrix_construct_adjust`` and scored with
    ``score_cal_adjust``; ``contain_mid`` and ``oneside_num`` are forwarded to
    the encoder, ``weights_matrix`` and ``oneside_num`` to the scorer.

    Returns
    -------
    list
        One score per input sequence, in input order.
    """
    return [
        score_cal_adjust(
            matrix_construct_adjust(seq, contain_mid, oneside_num),
            weights_matrix,
            oneside_num,
        )
        for seq in seq_list
    ]

def seq_score_matrix(aa_seq,weights_matrix=weights,reverse_codon_table=reverse_codon_table,contain_mid=True,oneside_num=6):
    """Enumerate and rank all codon choices for a short residue string.

    Parameters
    ----------
    aa_seq : str
        Residue string passed to ``sequence_list_generate`` (5 residues when
        ``contain_mid`` is true, 4 when it is false).
    weights_matrix : numpy.ndarray, optional
        Weight matrix forwarded to the scorer.
    reverse_codon_table : mapping, optional
        Residue -> codons table forwarded to ``sequence_list_generate``.
    contain_mid : bool, optional
        Whether ``aa_seq`` already contains the middle residue.
    oneside_num : int, optional
        Flank length in nucleotides forwarded to the scorer. The generated
        candidates carry two codons per flank, so only 6 passes the scorer's
        length check.

    Returns
    -------
    pandas.DataFrame
        Columns ``Seq`` and ``Score``, sorted by ``Score`` in descending order
        with a fresh 0..n-1 index.
    """
    seqs = sequence_list_generate(aa_seq, reverse_codon_table, contain_mid)
    # sequence_list_generate inserts the middle placeholder itself when
    # contain_mid is false, so every candidate includes the middle codon and
    # must be scored as such. Forwarding contain_mid=False here would always
    # fail the scorer's length check.
    scores = sequence_list_score_calc(seqs, weights_matrix, True, oneside_num)
    score_matrix = pd.DataFrame(list(zip(seqs, scores)), columns=['Seq', 'Score'])
    return score_matrix.sort_values('Score', axis=0, ascending=False, ignore_index=True)

In [70]:
def ncseq_score_matrix(ncseq,weights_matrix=weights,contain_mid=True,oneside_num=6):
    """Rank all synonymous flank recodings of one nucleotide window.

    The window is translated, its middle residue is replaced by the ``'U'``
    placeholder, every codon combination for the resulting residue string is
    scored, and each candidate is compared with the score of the original
    window.

    Parameters
    ----------
    ncseq : str
        Nucleotide window (any case). With ``contain_mid`` true it holds the
        middle codon between the two flanks; with ``contain_mid`` false it
        holds the flanks only.
    weights_matrix : numpy.ndarray, optional
        Weight matrix forwarded to the scorer.
    contain_mid : bool, optional
        Whether ``ncseq`` includes the middle codon.
    oneside_num : int, optional
        Flank length in nucleotides.

    Returns
    -------
    pandas.DataFrame
        The frame from ``seq_score_matrix`` (``Seq``, ``Score``) with the added
        columns ``delta_Score`` (candidate score minus original score), ``AA``
        (residue string used for enumeration), ``AA_original`` (translation of
        ``ncseq``) and ``Score_original``.
    """
    ncseq = ncseq.upper()
    aa_ori = ncseq_to_aaseq(ncseq, codon_table)

    if contain_mid:
        # Swap the middle residue for the placeholder and keep both flanks.
        aa_seq = aa_ori[0:2] + 'U' + aa_ori[3:5]
    else:
        # There is no middle residue to replace. sequence_list_generate inserts
        # the placeholder itself in this mode, so the flank residues must be
        # passed through untouched.
        aa_seq = aa_ori

    score_ori = score_cal_adjust(
        matrix_construct_adjust(ncseq, contain_mid, oneside_num),
        weights_matrix,
        oneside_num,
    )
    score_df = seq_score_matrix(aa_seq, weights_matrix, reverse_codon_table, contain_mid, oneside_num)
    return score_df.assign(
        delta_Score=score_df['Score'] - score_ori,
        AA=aa_seq,
        AA_original=aa_ori,
        Score_original=score_ori,
    )

In [56]:
# Nucleotide sequence scanned by whole_seq_calc in the export cell. The mixed
# case is harmless for scoring: whole_seq_calc upper-cases its input first.
# NOTE(review): presumably a GFP coding sequence, judging by the name of the
# output workbook - confirm.
sequence = 'atgGTGtcCAAGGGCGAGGAGCTGTTCACCGGGGTGGTGCCCATCCTGGTCGAGCTGGACGGCGACGTAAACGGCCACAAGTTCAGCGTGTCCGGCGAGGGCGAGGGCGATGCCACCTACGGCAAGCTGACCCTGAAGTTCATCTGCACCACCGGCAAGCTGCCCGTGCCCTGGCCCACCCTCGTGACCACCCTGACCTACGGCGTGCAGTGCTTCAGCCGCTACCCCGACCACATGAAGCAGCACGACTTCTTCAAGTCCGCCATGCCCGAAGGCTACGTCCAGGAGCGCACCATCTTCTTCAAGGACGACGGCAACTACAAGACCCGCGCCGAGGTGAAGTTCGAGGGCGACACCCTGGTGAACCGCATCGAGCTGAAGGGCATCGACTTCAAGGAGGACGGCAACATCCTGGGGCACAAGCTGGAGTACAACTACAACAGCCACAACGTCTcgATCATGGCCGACAAGCAGAAGAACGGCATCAAGGTGAACTTCAAGATCCGCCACAACATCGAGGACGGCAGCGTGCAGCTCGCCGACCACTACCAGCAGAACACCCCCATCGGCGACGGCCCCGTGCTGCTGCCCGACAACCACTACCTGAGCACCCAGTCCGCCCTGAGCAAAGACCCCAACGAGAAGCGCGATCACATGGTCCTGCTGGAGTTCGTGACCGCCGCCGGGATCACTCTCGGCATGGACGAGCTGTACAAG'

In [71]:
def _redesign_record(index, ncseq, candidate, seq_type):
    """Build one output row of whole_seq_calc from a scored candidate row."""
    candidate_nc = candidate['Seq']
    return {
        'AA_site': int(index),
        'AA_aa': nc_to_aa(ncseq[6:9]),
        'AA': candidate['AA'],
        'AA_original': candidate['AA_original'],
        # Report the candidate flanks around a fixed TCG middle codon.
        'NC': candidate_nc[0:6] + 'TCG' + candidate_nc[9:15],
        'NC_original': ncseq,
        'Seq_type': seq_type,
        'Score': candidate['Score'],
        'Score_original': candidate['Score_original'],
        'delta_score': candidate['delta_Score'],
    }


def whole_seq_calc(seq_full, weights_matrix = weights):
    """Scan a coding sequence and report the best and worst flank recoding per codon.

    For every codon that has two complete codons on each side, the 15-nt
    window centred on it is passed to ``ncseq_score_matrix``; the top-ranked
    and bottom-ranked candidates are recorded as two consecutive rows.

    Parameters
    ----------
    seq_full : str
        Nucleotide sequence (any case). Only ``len(seq_full) // 3`` whole
        codons are considered; the first two and last two of them are never
        used as a centre.
    weights_matrix : numpy.ndarray, optional
        Weight matrix forwarded to ``ncseq_score_matrix``.

    Returns
    -------
    pandas.DataFrame
        Two rows per scanned codon (``Seq_type`` ``'highest'`` then
        ``'lowest'``) with columns ``AA_site`` (0-based codon index),
        ``AA_aa``, ``AA``, ``AA_original``, ``NC``, ``NC_original``,
        ``Seq_type``, ``Score``, ``Score_original`` and ``delta_score``.
        The frame is empty (columns only) when the sequence is too short.
    """
    seq_upper = seq_full.upper()
    n_codons = len(seq_upper) // 3

    # Collect plain records and build the frame once, rather than enlarging a
    # DataFrame one cell at a time.
    records = []
    for index in range(2, n_codons - 2):
        # 6 nt upstream + the codon itself + 6 nt downstream.
        ncseq = seq_upper[index * 3 - 6 : index * 3 + 9]
        score_df = ncseq_score_matrix(ncseq, weights_matrix)

        # score_df is sorted by descending Score: first row is the highest,
        # last row the lowest.
        records.append(_redesign_record(index, ncseq, score_df.iloc[0], 'highest'))
        records.append(_redesign_record(index, ncseq, score_df.iloc[-1], 'lowest'))

    return pd.DataFrame(
        records,
        columns=[
            'AA_site', 'AA_aa', 'AA', 'AA_original', 'NC', 'NC_original',
            'Seq_type', 'Score', 'Score_original', 'delta_score',
        ],
    )


In [72]:
# The three weight matrices compared in the export cell, loaded in this order:
# combined, score-based, rank-based (sheet names used when exporting).
# weights1 reads the same file as the default `weights` loaded earlier.
weights1, weights2, weights3 = [
    np.loadtxt(weights_path)
    for weights_path in (
        "weights/weights_kpLogo_combine.txt",
        "weights/weights_kpLogo.txt",
        "weights/weights_kpLogo_rank.txt",
    )
]

In [73]:
# Score `sequence` with each weight matrix and write one worksheet per matrix.
# The context manager closes the workbook even if scoring raises part-way
# through. The loop variable is deliberately not called `weights`, so the
# module-level default weight matrix is not rebound by running this cell.
with pd.ExcelWriter('predict/sequence_redesign_gfp_dwl.xlsx', engine='xlsxwriter') as writer:
    for s_name, sheet_weights in zip(['combined', 'score', 'rank'], [weights1, weights2, weights3]):
        r_df = whole_seq_calc(sequence, sheet_weights)
        r_df.to_excel(writer, sheet_name=s_name, index=False)