Implements deterministic baselines (two versions of each: one version keeps X as list of one-hots while other version converts X to a list of strings of aa seqs)

In [1]:
import torch
from torch import distributions as dist
import itertools
import pickle

from scipy.stats import norm
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('white')
sns.set_context('paper')
# Plot adjustments:
plt.rcParams.update({'ytick.labelsize': 15})
plt.rcParams.update({'xtick.labelsize': 15})
plt.rcParams.update({'axes.labelsize': 35})
plt.rcParams.update({'legend.fontsize': 30})
plt.rcParams.update({'axes.titlesize': 16})

from gptorch import kernels, models

In [4]:
with open('../inputs/phoq.pkl', 'rb') as f:
    t = pickle.load(f)

X = t[0] # one-hot encoding of X
T = t[1] # tokenized encoding of X
y = t[2].values

In [2]:
def decode_X(X):
    """ Takes in one-hot encoding X and decodes it to
    return a string of four amino acids. """
    
    amino_acids = 'ARNDCQEGHILKMFPSTWYV'
    
    pos_X = [i for i, x in enumerate(X) if x == 1.0] # positions of amino acids
    pos_X = [(p - 20 * i) for i, p in enumerate(pos_X)] # make sure indexing is same as in str amino_acids
    aa_X = [amino_acids[p] for i, p in enumerate(pos_X)] # amino acid chars in X
    return ''.join(aa_X)

In [25]:
def baseline_fixed(wt, X, y): # deterministic
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) 
    at each position out of the four possible positions in the wildtype sequence 
    by fixing the three other positions, then continues onto the next position in
    the wildtype sequence by fixing the best amino acid in the previous position.
    So the fixed substring is not necessarily a fixed substring of the wildtype sequence.
    
    Note: wildtype sequence expected as one-hot encoding. X expected as an array or list of
    one-hot encodings.
    
    Returns optimal untested variant (as a one-hot). """  
    
    inds = [i for i, num in enumerate(wt) if num == 1.0] # store initial indices of four positions with amino acids
    baseline = np.zeros((80,)) # stores baseline untested variant to be returned

    for i in range(4):
        fixed = list(inds[0:i] + inds[i + 1:len(inds)]) # list of 3 fixed amino acids in each iteration through wt seq

        index = [] # index of xs in X with fixed substring
        for j, x in enumerate(X):
            in_lst = True
            for m in fixed:
                if x[m] != 1.0:
                    in_lst = False
            if in_lst == True:
                index.append(j)
                
        ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids

        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

        # store amino acid in position being varied in baseline
        baseline[(i * 20):(i * 20 + 20)] = X[index[max_ind]][(i * 20):(i * 20 + 20)]

        # update inds to include index of new amino acid stored in baseline
        for j, num in enumerate(X[index[max_ind]][(i * 20):(i * 20 + 20)]):
            if num == 1.0:
                inds[i] = j + i * 20
                break
    
    return baseline
                
seq1 = baseline_fixed(X[150614], X, y)
print("One hot: {}".format(seq1)) # TDST, X[41418]
print("aa: {}".format(decode_X(seq1)))

y_seq1 = y[41418]
print("y value: {}".format(y_seq1))
print("global max: {}".format(np.max(y)))

One hot: [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.]
aa: TDST
y value: 1.3861272508754858
global max: 1.788094549112096


In [23]:
def baseline_fixed(wt, X, y): # deterministic
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) 
    at each position out of the four possible positions in the wildtype sequence 
    by fixing the three other positions, then continues onto the next position in
    the wildtype sequence by fixing the best amino acid in the previous position.
    So the fixed substring is not necessarily a fixed substring of the wildtype sequence.
    
    Note: wildtype sequence expected as string. X expected as an array or list of
    one-hot encodings.
    
    Returns optimal untested variant (as a string). """  
    
    X_decode = [decode_X(x) for x in X]
    baseline = ""
    wt = list(wt)
    
    for i in range(4):
        fixed = ''.join(wt)
        
        # index of xs in X with fixed substring
        index = [j for j, x in enumerate(X_decode) if fixed[0:i] == x[0:i] and fixed[i + 1:len(fixed)] == x[i + 1:len(x)]] 
        ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids
        
        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

        # store amino acid in position being varied in baseline
        baseline += X_decode[index[max_ind]][i]
        
        wt[i] = X_decode[index[max_ind]][i]
    
    return baseline

wt = decode_X(X[150614])
seq1 = baseline_fixed(wt, X, y)
print("aa: {}".format(seq1))

# y_seq1 = y[41418]
# print("y value: {}".format(y_seq1))
# print("global max: {}".format(np.max(y)))

# X_decode = [decode_X(x) for x in X]

aa: TDST


In [50]:
print(np.delete(np.arange(len(wt)), [0,1]))

[2 3]


In [55]:
# returns all 24 possibilities

def baseline_fixed(wt, X, y): # deterministic
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) 
    at each position out of the four possible positions in the wildtype sequence 
    by fixing the three other positions, then continues onto the next position in
    the wildtype sequence by fixing the best amino acid in the previous position.
    So the fixed substring is not necessarily a fixed substring of the wildtype sequence.
    
    Note: wildtype sequence expected as string. X expected as an array or list of
    one-hot encodings.
    
    Returns list of all possible 24 optimal untested variants (as a string). """  
    
    X_decode = [decode_X(x) for x in X]
    baseline = []
    
    for i in range(4):
        seq = list(wt)
        base1 = []
        
        fixed = ''.join(seq)

        # index of xs in X with fixed substring
        index = [j for j, x in enumerate(X_decode) if fixed[0:i] == x[0:i] and fixed[i + 1:len(fixed)] == x[i + 1:len(x)]] 
        ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids

        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

        # store amino acid in position being varied in baseline
        base1.append(X_decode[index[max_ind]][i])

        seq[i] = X_decode[index[max_ind]][i]
        
        for k in np.delete(np.arange(len(wt)), i):
            base2 = list(base1)
            
            fixed = ''.join(seq)

            # index of xs in X with fixed substring
            index = [j for j, x in enumerate(X_decode) if fixed[0:k] == x[0:k] and fixed[k + 1:len(fixed)] == x[k + 1:len(x)]] 
            ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids

            max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

            # store amino acid in position being varied in baseline
            base2.append(X_decode[index[max_ind]][k])

            seq[k] = X_decode[index[max_ind]][k]
        
            for m in np.delete(np.arange(len(wt)), [i, k]):
                base3 = list(base2)
            
                fixed = ''.join(seq)

                # index of xs in X with fixed substring
                index = [j for j, x in enumerate(X_decode) if fixed[0:m] == x[0:m] and fixed[m + 1:len(fixed)] == x[m + 1:len(x)]] 
                ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids

                max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

                # store amino acid in position being varied in baseline
                base3.append(X_decode[index[max_ind]][m])

                seq[m] = X_decode[index[max_ind]][m]
                
                for n in np.delete(np.arange(len(wt)), [i, k, m]):
                    base4 = list(base3)
            
                    fixed = ''.join(seq)

                    # index of xs in X with fixed substring
                    index = [j for j, x in enumerate(X_decode) if fixed[0:n] == x[0:n] and fixed[n + 1:len(fixed)] == x[n + 1:len(x)]] 
                    ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids

                    max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

                    # store amino acid in position being varied in baseline
                    base4.append(X_decode[index[max_ind]][n])

                    seq[n] = X_decode[index[max_ind]][n]
                    
                    baseline.append(''.join(base4))
                
    
    return baseline

wt = decode_X(X[150614])
seqs = baseline_fixed(wt, X, y)
print("aa: {}".format(seqs))

# y_seq1 = y[41418]
# print("y value: {}".format(y_seq1))
# print("global max: {}".format(np.max(y)))

# X_decode = [decode_X(x) for x in X]

aa: ['TDST', 'TDTS', 'TSDT', 'TSTD', 'TTDS', 'TTSD', 'DTST', 'DTTS', 'DSTT', 'DSTT', 'DTTS', 'DTST', 'CADQ', 'CAQD', 'CDFQ', 'CDQF', 'CQFD', 'CQDF', 'WTDS', 'WTSD', 'WDTS', 'WDST', 'WSTD', 'WSDT']


In [57]:
# find y-values corresponding to 24 possible baselines from baseline_fixed()

X_decode = [decode_X(x) for x in X]
ys_baseline = [y[X_decode.index(x)] for x in seqs]
max_baseline = seqs[ys_baseline.index(max(ys_baseline))]
max_baseline

'TDST'

In [24]:
def baseline_vary(wt, X, y): # deterministic
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) at each 
    position out of the four possible positions in the wildtype sequence by fixing the three
    other positions, then takes the best amino acid at each position. The fixed substring
    in each iteration is a substring of the wildtype sequence.
    
    Note: wildtype sequence expected as one-hot encoding. X expected as an array or list of
    one-hot encodings.
    
    Returns optimal untested variant (as a one-hot). """
    
    inds = [i for i, num in enumerate(wt) if num == 1.0] # store indices of four positions with amino acids
    baseline = np.zeros((80,)) # stores baseline untested variant to be returned
    
    for i in range(4): # vary amino acid in each position
        fixed = list(inds[0:i] + inds[i + 1:len(inds)]) # list of 3 fixed amino acids in each iteration through wt seq
        
        xs_inds = [] # index of x's in X with those 3 fixed amino acids
        for j, x in enumerate(X):
            in_lst = True
            for m in fixed:
                if x[m] != 1.0:
                    in_lst = False
            if in_lst:
                xs_inds.append(j)
        
        ys = [y[j] for j in xs_inds] # stores y values of x's in X with those 3 fixed amino acids
        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value
        
        # store amino acid in position being varied in baseline
        baseline[(i * 20):(i * 20 + 20)] = X[xs_inds[max_ind]][(i * 20):(i * 20 + 20)] 
    
    return baseline

seq2 = baseline_vary(X[150614], X, y)
print("One-hot: {}".format(seq2)) # TDCW, X[44500]
print("aa: {}".format(decode_X(seq2)))

y_seq2 = y[44500]
print("y value: {}".format(y_seq2))
print("global max: {}".format(np.max(y)))

One-hot: [ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  0.]
aa: TDCW
y value: 1.0122630505442225
global max: 1.788094549112096


In [30]:
def baseline_vary(wt, X, y): # deterministic
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) at each 
    position out of the four possible positions in the wildtype sequence by fixing the three
    other positions, then takes the best amino acid at each position. The fixed substring
    in each iteration is a substring of the wildtype sequence.
    
    Note: wildtype sequence expected as a string. X expected as an array or list of
    one-hot encodings.
    
    Returns optimal untested variant (as a string). """
    
    X_decode = [decode_X(x) for x in X]
    baseline = "" # stores baseline untested variant to be returned
    wt = list(wt)
    
    for i in range(4): # vary amino acid in each position
        fixed = ''.join(wt) # list of 3 fixed amino acids in each iteration through wt seq
        
        # index of xs in X with fixed substring
        index = [j for j, x in enumerate(X_decode) if fixed[0:i] == x[0:i] and fixed[i + 1:len(fixed)] == x[i + 1:len(x)]] 
        ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids
        
        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

        # store amino acid in position being varied in baseline
        baseline += X_decode[index[max_ind]][i]
    
    return baseline

wt = decode_X(X[150614])
seq2 = baseline_vary(wt, X, y)
print("aa: {}".format(seq2))

# y_seq2 = y[44500]
# print("y value: {}".format(y_seq2))
# print("global max: {}".format(np.max(y)))

aa: TDCW
