In [5]:
import numpy as np
import pandas as pd
import scipy.stats as stats
from sklearn import preprocessing
import tensorflow as tf
from tensorflow.python.keras import backend as K
from tensorflow import keras

def one_hot_encode(df, col='seq', seq_len=44):
    # Dictionary returning one-hot encoding of nucleotides. 
    nuc_d = {'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 'n':[0,0,0,0]}
    
    # Creat empty matrix.
    vectors=np.empty([len(df),seq_len,4])
    
    # Iterate through UTRs and one-hot encode
    for i,seq in enumerate(df[col].str[:seq_len]): 
        seq = seq.lower()
        lst = [nuc_d[x] for x in seq]
        if seq_len>len(seq):
            lst += [nuc_d['n']]*(seq_len-len(seq))
        a = np.array(lst)
        vectors[i] = a
    return vectors

def r2(x,y):
    slope, intercept, r_value, p_value, std_err = stats.linregress(x,y)
    return r_value**2


datadir=f"/Users/john/data"
name,seq_len="pl3-1-2",118
original_seq = 'atcccgggtgaggcatcccaccatcctcagtcacagagagacccaatctaccatcagcatcagccagtaaagattaagaaaaacttagggtgaaagaaatttcacctaacacggcgca'
original_seq=original_seq.upper()
model = keras.models.load_model(f"{datadir}/models/pl3-1-2.keras")
prefix,suffix = original_seq[:72],original_seq[96:]
df = pd.read_csv(f"{datadir}/Promter/results/{name}/{name}_final.csv")
df["seq"]= df.seq.apply(lambda x:prefix+x+suffix)
scaler = preprocessing.StandardScaler()
scaler.fit_transform(df.loc[:,"score"].values.reshape(-1,1))
df["isCore"] = df["seq"].apply(lambda x: x.startswith(prefix) and x.endswith(suffix))
df["core"] =df.seq.apply(lambda x:x[72:96])

# Genetic algorithm

In [6]:
import random
import math

def vectorizeSequence(seq):
    # the order of the letters is not arbitrary.
    # Flip the matrix up-down and left-right for reverse compliment
    ltrdict = {'a':[1,0,0,0],'c':[0,1,0,0],'g':[0,0,1,0],'t':[0,0,0,1], 'n':[0,0,0,0]}
    return np.array([ltrdict[x] for x in seq])

def ret_rand_nuc(idx):
    lst = [0,1,2,3]
    lst.remove(idx)
    x = random.sample(lst,1)[0]
    if x == 0:
        return [1,0,0,0] # A
    if x == 1:
        return [0,1,0,0] # C
    if x == 2:
        return [0,0,1,0] # G
    if x == 3:
        return [0,0,0,1] # T
    
def vector_to_nuc(arr, seq_len=24):
    seq = ''
    for i in range(seq_len):
        if arr[i,0] == 1:
            seq = seq + 'A'
        if arr[i,1] == 1:
            seq = seq + 'C'
        if arr[i,2] == 1:
            seq = seq + 'G'
        if arr[i,3] == 1:
            seq = seq + 'T'
    return seq

def convert_and_save(sequences, predictions):
    # Convert the one-hot encoded sequences to A, C, T, G
    seqs = []
    for nbr in range(len(sequences)):
        seqs.append(vector_to_nuc(sequences[nbr]))
    df = pd.DataFrame(data=[seqs,predictions.tolist()]).transpose()
    df.columns = ['seq', 'prediction']
    df.sort_values('prediction', ascending=False, inplace=True)
    return df

def make_random_sequences(nbr_sequences, length, constant='', no_uaug=False, no_stop=False):
    # Make randomized sequences, allowing for the inclusion / exclusion of uATGs / stop codons
    seqs = []
    nucs = {0:'A', 1:'T', 2:'C', 3:'G'}
    i = 0
    while i < nbr_sequences:
        new_seq = ''
        for n in range(length - len(constant)):
            new_seq = new_seq + nucs[random.randint(0,3)]
        
        if no_uaug == False or (no_uaug==True and 'ATG' not in new_seq):
            if no_stop == False or (no_stop == True and ('TAG' not in new_seq and 'TGA' not in new_seq and 'TAA' not in new_seq)):
                new_seq = new_seq + constant
                seqs.append(new_seq)
                i+=1
    return seqs

def simple_mutate(seq, nbr_bases=2, prob=1,seq_len=24):
    if nbr_bases > 1 and prob > random.random():
        nbr_bases = nbr_bases
    else:
        nbr_bases = 1
    lst = list(range(seq_len))
    poss = random.sample(lst,nbr_bases)
    for pos in poss:
        idx = np.argmax(seq[pos])
        seq[pos] = ret_rand_nuc(idx)
    return seq

def check_for_uaug(seq,seq_len=24):
    seq = vector_to_nuc(seq,seq_len)
    return 'ATG' in seq[:seq_len]

def check_for_stops(seq,seq_len=24):
    seq = vector_to_nuc(seq)
    if 'TAG' in seq[:seq_len] or 'TGA' in seq[:seq_len] or 'TAA' in seq[:seq_len]:
        return True
    return False

def negative_selection(seq, model, scaler, target_val, no_uaug=False, no_stop=False, nbr_bases_to_mutate=1, multi_mutate_prob=1):
    seqs = np.empty([2,54,4])
    seqs[0] = seq.copy()
    seqs[1] = simple_mutate(seq.copy(), nbr_bases=nbr_bases_to_mutate, prob=multi_mutate_prob)
    
    if no_uaug == True and check_for_uaug(seqs[1]):
        return seqs[0]
    if no_stop == True and check_for_stops(seqs[1]):
        return seqs[0]
    
    scores = model.predict(seqs).reshape()
    scores = scaler.inverse_transform(scores)
    if scores[1] < scores[0]:
        if scores[1] >= target_val:
            return seqs[1]
        else:
            return seqs[0]
    else:
        return seqs[0]    

def selection(seq, model, scaler, target_val, no_uaug=False, no_stop=False, nbr_bases_to_mutate=1, multi_mutate_prob=1):
    seqs = np.empty([2,50,4])
    seqs[0] = seq
    seqs[1] = simple_mutate(seq.copy(), nbr_bases=nbr_bases_to_mutate, prob=multi_mutate_prob)
    
    if no_uaug == True and check_for_uaug(seqs[1]):
        return seqs[0]
    if no_stop == True and check_for_stops(seqs[1]):
        return seqs[0]
    
    scores = model.predict(seqs).reshape(-1)
    scores = scaler.inverse_transform(scores)
    if scores[1] > scores[0]:
        if scores[1] <= target_val:
            return seqs[1]
        else:
            return seqs[0]
    else:
        return seqs[0]  

def wrap_seq(seqs, seq_len):
    ret = np.empty([len(seqs),len(prefix)+seq_len+len(suffix),4])
    for i in range(len(seqs)):
        for j in range(len(prefix)):
            ret[i][j] = vectorizeSequence(prefix[j].lower())
        for j in range(seq_len):
            idx = len(prefix)+j
            ret[i][idx] = seqs[i][j]
        for j in range(len(suffix)):
            idx = len(prefix)+seq_len+j
            ret[i][idx] = vectorizeSequence(suffix[j].lower())
    return ret
    
def selection_to_target(seq, model, scaler, target_val, no_uaug=False, no_stop=False, nbr_bases_to_mutate=1, multi_mutate_prob=1, seq_len=50, accept_range=0.1):
    seqs = np.empty([2,seq_len,4])
    # Save the incoming sequence before mutating
    seqs[0] = seq.copy()
    # The mutated sequence
    seqs[1] = simple_mutate(seq.copy(), nbr_bases=nbr_bases_to_mutate, prob=multi_mutate_prob,seq_len=seq_len)
    
    # Decide whether to continue with the new sequence based on the uAUG / stop codon preference
    if no_uaug == True and check_for_uaug(seqs[1]):
        return seqs[0]
    if no_stop == True and check_for_stops(seqs[1]):
        return seqs[0]
    
    # Accept sequences that fall within this range. May provide more sequence diversity
    s0, s1 = vector_to_nuc(seqs[0]),vector_to_nuc(seqs[1])
    scores = model.predict(wrap_seq(seqs,seq_len),verbose=0).reshape(-1,1)
    scores = scaler.inverse_transform(scores).reshape(-1)
    if scores[0] >= target_val - accept_range and scores[0] <= target_val + accept_range:
        return seqs[0]
    else:
        if abs(target_val - scores[1]) <= abs(target_val - scores[0]):
            return seqs[1]
        else:
            return seqs[0] 

# Evolve new sequences to hit target MRLs

In [7]:
import tqdm
import numpy as np
# Dictionary where new sequences are saved
evolved_seqs = {}

# Number of evolution iterations
iterations = 400
# Number of bases to mutate if the probability to 'multi-mutate' is exceeded
nbr_bases_to_mutate = 2
# Probability to change multiple bases in an iteration
prob_of_multi_mutation = 0.5
# If using the original evolution model, set seq_len to 54. That model was
# trained on UTRs that included the first for basees of the CDS (ATGG).
seq_len = 24
accept_range=0.1
# Choose target MRLs and the number of sequences to create for each
targets = [-6, -4,-2,0]
seqs_per_target = [5, 5,5,5]
# Choose whether or not to allow uAUGs and / or stop codons
no_uaug = True
no_stop = False
seed=42
random.seed(seed)
np.random.seed(seed)

for target_rl, nbr_sequences in zip(targets, seqs_per_target):
    print('Working on target_rl {} with {} sequences:'.format(target_rl, nbr_sequences))
    df = df.sample(frac=1.0)
    # Randomly generate starting sequences for evolving
#     rand_seqs = make_random_sequences(nbr_sequences, seq_len, no_uaug=no_uaug, no_stop=no_stop)
    rand_seqs= df[:nbr_sequences]["core"]
    test_sequences = np.empty([len(rand_seqs), seq_len, 4])
    i = 0
    # One-hot encode sequences
    for seq in rand_seqs:
        test_sequences[i] = vectorizeSequence(seq.lower())
        i += 1
    # Evolve sequences
    for generation in tqdm.tqdm(range(iterations)):
        
        for i in range(len(test_sequences)):
            test_sequences[i] = selection_to_target(seq=test_sequences[i], model=model, scaler=scaler, target_val=target_rl,no_uaug=no_uaug,
                                        no_stop=no_stop, nbr_bases_to_mutate=nbr_bases_to_mutate, multi_mutate_prob=prob_of_multi_mutation, seq_len=seq_len)         
        if (generation + 1) %  100 == 0:
            # Final prediction then convert to text sequence
            predictions = model.predict(wrap_seq(test_sequences,seq_len),verbose=0).reshape(-1,1)
            predictions = scaler.inverse_transform(predictions).reshape(-1)
            mean = abs((predictions - target_rl).mean())
            print(f"after{generation} loss mean {mean}")
            if mean<accept_range:
                break
        
    converted_df = convert_and_save(test_sequences,predictions)
    
    evolved_seqs[target_rl] = converted_df

Working on target_rl -6 with 5 sequences:


 25%|████████████████████████████▉                                                                                        | 99/400 [00:30<01:33,  3.22it/s]


after99 loss mean 0.003731155302375555
Working on target_rl -4 with 5 sequences:


 25%|████████████████████████████▉                                                                                        | 99/400 [00:32<01:38,  3.05it/s]


after99 loss mean 0.09879522025585175
Working on target_rl -2 with 5 sequences:


 25%|█████████████████████████████                                                                                       | 100/400 [00:33<01:42,  2.92it/s]

after99 loss mean 0.5655797719955444


 50%|██████████████████████████████████████████████████████████                                                          | 200/400 [01:04<01:06,  3.00it/s]

after199 loss mean 0.38822731375694275


 75%|███████████████████████████████████████████████████████████████████████████████████████                             | 300/400 [01:42<00:40,  2.46it/s]

after299 loss mean 0.26839232444763184


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:17<00:00,  2.91it/s]


after399 loss mean 0.24443097412586212
Working on target_rl 0 with 5 sequences:


 25%|█████████████████████████████                                                                                       | 100/400 [00:31<01:34,  3.16it/s]

after99 loss mean 2.887516975402832


 50%|██████████████████████████████████████████████████████████                                                          | 200/400 [01:02<01:04,  3.09it/s]

after199 loss mean 2.2127747535705566


 75%|███████████████████████████████████████████████████████████████████████████████████████                             | 300/400 [01:32<00:31,  3.14it/s]

after299 loss mean 1.8687245845794678


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 400/400 [02:03<00:00,  3.24it/s]

after399 loss mean 1.5693457126617432





In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(9,6), dpi= 80)
for i in evolved_seqs:
    if i:
        sns.kdeplot(evolved_seqs[i]['prediction'], shade=True, legend=True, label=i)
# Decoration
plt.title('Design Sev', fontsize=22)
plt.legend()
plt.show()
# evolved_seqs[8].head()