First, makes predictions on a library of 160,000 PhoQ variants using GP and Matern Kernel then computes objective. Combines gp_ssm and objective_ssm notebooks. 

Includes functions that compute each of the two baselines:
1. Baseline that creates optimal sequence from X's given optimal amino acids (those with max y-values) at each position out of the four possible positions in the wildtype sequence by fixing the three other positions, then continues onto the next position by fixing the best amino acid in the previous position.
2. Baseline that creates optimal sequence from X's given optimal amino acids (those with max y-values) at each position out of the four possible positions in the wildtype sequence by fixing the three other positions, then takes the best amino acid at each position.

In [1]:
import torch
import itertools
import pickle

from scipy.stats import norm

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('white')
sns.set_context('paper')
# Plot adjustments:
plt.rcParams.update({'ytick.labelsize': 15})
plt.rcParams.update({'xtick.labelsize': 15})
plt.rcParams.update({'axes.labelsize': 35})
plt.rcParams.update({'legend.fontsize': 30})
plt.rcParams.update({'axes.titlesize': 16})

from gptorch import kernels, models

In [2]:
with open('../inputs/phoq.pkl', 'rb') as f:
    t = pickle.load(f)

X = t[0] # one-hot encoding of X
T = t[1] # tokenized encoding of X
y = t[2].values

In [3]:
# Use GP to make predictions

def GP_train(X, y, n):
    """ GP that uses Matern Kernel and trains on n number of samples--where n is the batch 
    size--of examples from X and makes predictions on the rest of X. """
    
    ke = kernels.MaternKernel()
    mo = models.GPRegressor(ke)

    np.random.seed(1)
    rand_inds = np.random.choice(len(X), n, replace=True) # generate random indices for 100 X's to sample from
    test_inds = [i for i in np.arange(len(X)) if i not in rand_inds]
    X_train = X[rand_inds]
    y_train = y[rand_inds]
    X_test = X[test_inds]
    y_true = y[test_inds]

    # make data into tensors
    X_train = torch.Tensor(X_train)
    X_test = torch.Tensor(np.array(X_test))
    y_train_scaled = (np.array(y_train) - np.mean(np.array(y_train))) / np.std(np.array(y_train)) # scale y_train
    y_train_scaled = torch.Tensor(y_train_scaled.reshape(len(y_train_scaled), 1)).double()

    his = mo.fit(X_train, y_train_scaled, its=500) # fit model with training set
    
    # make predictions
    means, stds = [], []
    ind = 0
    for i in range(1000, len(X) + 1000, 1000):
        mu_scaled, var = mo.forward(X_test[ind:i]) # make predictions
        mu = mu_scaled * np.std(np.array(y_train)) + np.mean(np.array(y_train)) # unscale predictions
        std = np.sqrt(np.diag(var.detach().numpy())) * np.std(np.array(y_train))

        for j, m in enumerate(mu):
            means.append(m)
            stds.append(std[j])

        ind = i
    
    # used for list slicing of means and stds - split into two lists to be dumped into two pickle files
    half = (len(X) - n) // 2
    full = len(X) - n
    
    # write results out to pickle files
    with open('GP_ssm_results1.pkl', 'wb') as f:
        pickle.dump((means[0:half], stds[0:half], y_true[0:half], max(y)), f) # pass in tau (best experimental value) as well

    with open('GP_ssm_results2.pkl', 'wb') as f:
        pickle.dump((means[half:full], stds[half:full], y_true[half:full]), f)
    

In [4]:
GP_train(X, y, 100)

Iteration 500 of 500	NLML: 40.0319	

KeyboardInterrupt: 

In [None]:
# Computing objective

def objective(items, means, stds, tau, n):
    """ Takes in items (library), means/predictions, standard deviations stds, 
    best experimental y value tau, and batch size n. 
    
    Expects library to be a list of lists.
    
    Returns objective to be maximized. """
    
    N = 1 # represents the product of sequence of # aas at each position
    for i in items:
        N *= len(i)
    
    obj = 0
    for i, mu in enumerate(means): # compute objective
        obj += (1 - scipy.stats.norm(mu.detach().numpy(), stds1[i]).cdf(tau)) * (1 - ((N - 1) / N) ** n) 
    
    return obj

In [7]:
def baseline_fixed(wt, X, y):
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) 
    at each position out of the four possible positions in the wildtype sequence 
    by fixing the three other positions, then continues onto the next position in
    the wildtype sequence by fixing the best amino acid in the previous position.
    So the fixed substring is not necessarily a fixed substring of the wildtype sequence.
    
    Note: wildtype sequence expected as one-hot encoding. X expected as an array or list of
    one-hot encodings.
    
    Returns optimal untested variant. """  
    
    inds = [i for i, num in enumerate(wt) if num == 1.0] # store initial indices of four positions with amino acids
    baseline = np.zeros((80,)) # stores baseline untested variant to be returned

    for i in range(4):
        fixed = list(inds[0:i] + inds[i + 1:len(inds)]) # list of 3 fixed amino acids in each iteration through wt seq

        index = [] # index of xs in X
        for j, x in enumerate(X):
            in_lst = True
            for m in fixed:
                if x[m] != 1.0:
                    in_lst = False
            if in_lst == True:
                index.append(j)
                
        ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids

        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

        # store amino acid in position being varied in baseline
        baseline[(i * 20):(i * 20 + 20)] = X[index[max_ind]][(i * 20):(i * 20 + 20)]

        # update inds to include index of new amino acid stored in baseline
        for j, num in enumerate(X[index[max_ind]][(i * 20):(i * 20 + 20)]):
            if num == 1.0:
                inds[i] = j + i * 20
                break
    
    return baseline
                
seq = baseline_fixed(X[150614], X, y)
print(seq)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  1.  0.  0.  0.]


In [12]:
def baseline_vary(wt, X, y):
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) at each 
    position out of the four possible positions in the wildtype sequence by fixing the three
    other positions, then takes the best amino acid at each position. The fixed substring
    in each iteration is a substring of the wildtype sequence.
    
    Note: wildtype sequence expected as one-hot encoding. X expected as an array or list of
    one-hot encodings.
    
    Returns optimal untested variant. """
    
    inds = [i for i, num in enumerate(wt) if num == 1.0] # store indices of four positions with amino acids
    baseline = np.zeros((80,)) # stores baseline untested variant to be returned
    
    for i in range(4): # vary amino acid in each position
        fixed = list(inds[0:i] + inds[i + 1:len(inds)]) # list of 3 fixed amino acids in each iteration through wt seq
        
        xs_inds = [] # index of x's in X with those 3 fixed amino acids
        for j, x in enumerate(X):
            in_lst = True
            for m in fixed:
                if x[m] != 1.0:
                    in_lst = False
            if in_lst:
                xs_inds.append(j)
        
        ys = [y[j] for j in xs_inds] # stores y values of x's in X with those 3 fixed amino acids
        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value
        
        # store amino acid in position being varied in baseline
        baseline[(i * 20):(i * 20 + 20)] = X[xs_inds[max_ind]][(i * 20):(i * 20 + 20)] 
    
    return baseline

seq = baseline_vary(X[150614], X, y)
print(seq)

[ 0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.
  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  1.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.  0.
  0.  0.  0.  0.  0.  1.  0.  0.]
