First, makes predictions on a library of 160,000 PhoQ variants using GP and Matern Kernel then computes objective. Combines gp_ssm and objective_ssm notebooks. 

Includes functions that compute each of the three baselines:
1. Baseline that creates optimal sequence from X's given optimal amino acids (those with max y-values) at each position out of the four possible positions in the wildtype sequence by fixing the three other positions, then continues onto the next position by fixing the best amino acid in the previous position.
2. Baseline that creates optimal sequence from X's given optimal amino acids (those with max y-values) at each position out of the four possible positions in the wildtype sequence by fixing the three other positions, then takes the best amino acid at each position.
3. Baseline that uses greedy algorithm to maximize objective. Starts out with best prediction then continues to add amino acids until objective stops increasing.

In [1]:
import torch
from torch import distributions as dist
import itertools
import pickle

from scipy.stats import norm
import operator

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('white')
sns.set_context('paper')
# Plot adjustments:
plt.rcParams.update({'ytick.labelsize': 15})
plt.rcParams.update({'xtick.labelsize': 15})
plt.rcParams.update({'axes.labelsize': 35})
plt.rcParams.update({'legend.fontsize': 30})
plt.rcParams.update({'axes.titlesize': 16})

from gptorch import kernels, models

In [2]:
with open('../inputs/phoq.pkl', 'rb') as f:
    t = pickle.load(f)

X = t[0] # one-hot encoding of X
T = t[1] # tokenized encoding of X
y = t[2].values

In [3]:
def decode_X(X):
    """ Takes in one-hot encoding X and decodes it to
    return a string of four amino acids. """
    
    amino_acids = 'ARNDCQEGHILKMFPSTWYV'
    
    pos_X = [i for i, x in enumerate(X) if x == 1.0] # positions of amino acids
    pos_X = [(p - 20 * i) for i, p in enumerate(pos_X)] # make sure indexing is same as in str amino_acids
    aa_X = [amino_acids[p] for i, p in enumerate(pos_X)] # amino acid chars in X
    return ''.join(aa_X)

# test on AWSS
# decode_X([ 1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
#         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
#         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,
#         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
#         0.,  0.,  0.,  1.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,
#         0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  0.,  1.,  0.,  0.,
#         0.,  0.])

In [4]:
## UPDATED VERSION OF GP_train() method

def get_predictions(X_train, y_train, X_test, its=500):
    """
    Train GP regressor on X_train and y_train. 
    Predict mean and std for X_test. 
    Return P(y > y_train_max) as dictionary eg 'AGHU': 0.78
    NB: for X_test in X_train, P ~= 0
    Be careful with normalization
    
    Expects X_train, y_train, and X_test as np.arrays
    """
    
    ke = kernels.MaternKernel()
    mo = models.GPRegressor(ke)
    
    # make data into tensors
    X_train = torch.Tensor(X_train)
    X_test = torch.Tensor(np.array(X_test))
    y_train_scaled = (np.array(y_train) - np.mean(np.array(y_train))) / np.std(np.array(y_train)) # scale y_train
    y_train_scaled = torch.Tensor(y_train_scaled.reshape(len(y_train_scaled), 1)).double()
    
    his = mo.fit(X_train, y_train_scaled, its=its) # fit model with training set
    
    # make predictions
    dic = {} # use dictionary to store probs
    ind = 0 # index for feeding in batches of X_test
    tau = y_train_scaled.max().float()
    
    for i in range(1000, len(X) + 1000, 1000):
        mu, var = mo.forward(X_test[ind:i]) # make predictions
        std = torch.sqrt(var.diag())
        mu = mu.squeeze()
        prob = 1 - dist.Normal(mu, std).cdf(tau) # compute probabilities for all means, stds

        for j, p in enumerate(prob):
            seq = decode_X(X_test[ind:i][j])
            dic[seq] = p # store prob for each seq

        ind = i
        
    return dic

In [5]:
np.random.seed(1)
rand_inds = np.random.choice(len(X), 100, replace=True) # generate random indices for 100 X's to sample from
X_train = X[rand_inds]
y_train = y[rand_inds]
X_test = X
y_true = y

dic = get_predictions(X_train, y_train, X_test, its=10)

Iteration 1 of 10	NLML: 45.3735	Iteration 2 of 10	NLML: 44.1491	Iteration 3 of 10	NLML: 43.1219	Iteration 4 of 10	NLML: 42.1918	Iteration 5 of 10	NLML: 41.3986	Iteration 6 of 10	NLML: 40.8094	Iteration 7 of 10	NLML: 40.4363	Iteration 8 of 10	NLML: 40.2570	Iteration 9 of 10	NLML: 40.2296	Iteration 10 of 10	NLML: 40.2686	

In [None]:
# Use GP to make predictions - returns dictionary of means and stds

def GP_train(X, y, n):
    """ GP that uses Matern Kernel and trains on n number of samples--where n is the batch 
    size--of examples from X and makes predictions on the rest of X. """
    
    ke = kernels.MaternKernel()
    mo = models.GPRegressor(ke)

    np.random.seed(1)
    rand_inds = np.random.choice(len(X), n, replace=True) # generate random indices for 100 X's to sample from
    test_inds = [i for i in np.arange(len(X)) if i not in rand_inds]
    X_train = X[rand_inds]
    y_train = y[rand_inds]
    X_test = X[test_inds]
    y_true = y[test_inds]

    # make data into tensors
    X_train = torch.Tensor(X_train)
    X_test = torch.Tensor(np.array(X_test))
    y_train_scaled = (np.array(y_train) - np.mean(np.array(y_train))) / np.std(np.array(y_train)) # scale y_train
    y_train_scaled = torch.Tensor(y_train_scaled.reshape(len(y_train_scaled), 1)).double()

    his = mo.fit(X_train, y_train_scaled, its=500) # fit model with training set
    
    # make predictions
    dic = {} # use dictionary to store means, stds
    ind = 0
    for i in range(1000, len(X) + 1000, 1000):
        mu_scaled, var = mo.forward(X_test[ind:i]) # make predictions
        mu = mu_scaled * np.std(np.array(y_train)) + np.mean(np.array(y_train)) # unscale predictions
        std = np.sqrt(np.diag(var.detach().numpy())) * np.std(np.array(y_train))
        
        for j, m in enumerate(mu):
            dic[X_test[ind:i][j]] = (m, std[j]) # store means, stds

        ind = i
        
    return dic
#     # used for list slicing of means and stds - split into two lists to be dumped into two pickle files
#     half = (len(X) - n) // 2
#     full = len(X) - n
    
#     # write results out to pickle files
#     with open('GP_ssm_results1.pkl', 'wb') as f:
#         pickle.dump((means[0:half], stds[0:half], y_true[0:half], max(y)), f) # pass in tau (best experimental value) as well

#     with open('GP_ssm_results2.pkl', 'wb') as f:
#         pickle.dump((means[half:full], stds[half:full], y_true[half:full]), f)
    
dic = GP_train(X, y, 100)

In [6]:
# Computing objective

def objective(X, probs, n):
    """ Takes in library X, probabilities, and batch size n.
    
    Expects X to be a list of tuples, and probs to be a dictionary.
    
    Returns objective to be maximized. """
    
    N = 1 # represents the product of sequence of # aas at each position
    for i in X:
        N *= len(i)
    
    # filter thru probs to find prob of x's in X
    X.sort(key=lambda tup: tup[1])

    X_str = [[tup[0] for i, tup in enumerate(X) if tup[1] == j] for j in range(4)] # generate list of lists of strings
    X_str = [''.join(s) for s in itertools.product(*X_str)] # generate list of strings of 4 aa seqs

    p = torch.Tensor([probs[key] for key in X_str])
    obj = torch.sum(p) * (1 - (1 - 1 / N) ** n)
    
    return obj

In [None]:
def baseline_fixed(wt, X, y):
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) 
    at each position out of the four possible positions in the wildtype sequence 
    by fixing the three other positions, then continues onto the next position in
    the wildtype sequence by fixing the best amino acid in the previous position.
    So the fixed substring is not necessarily a fixed substring of the wildtype sequence.
    
    Note: wildtype sequence expected as one-hot encoding. X expected as an array or list of
    one-hot encodings.
    
    Returns optimal untested variant. """  
    
    inds = [i for i, num in enumerate(wt) if num == 1.0] # store initial indices of four positions with amino acids
    baseline = np.zeros((80,)) # stores baseline untested variant to be returned

    for i in range(4):
        fixed = list(inds[0:i] + inds[i + 1:len(inds)]) # list of 3 fixed amino acids in each iteration through wt seq

        index = [] # index of xs in X
        for j, x in enumerate(X):
            in_lst = True
            for m in fixed:
                if x[m] != 1.0:
                    in_lst = False
            if in_lst == True:
                index.append(j)
                
        ys = [y[j] for j in index] # stores y values of x's in X with those 3 fixed amino acids

        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value

        # store amino acid in position being varied in baseline
        baseline[(i * 20):(i * 20 + 20)] = X[index[max_ind]][(i * 20):(i * 20 + 20)]

        # update inds to include index of new amino acid stored in baseline
        for j, num in enumerate(X[index[max_ind]][(i * 20):(i * 20 + 20)]):
            if num == 1.0:
                inds[i] = j + i * 20
                break
    
    return baseline
                
seq = baseline_fixed(X[150614], X, y)
print(seq)

In [None]:
def baseline_vary(wt, X, y):
    """ Takes in wildtype sequence, X, and y to compute baseline that creates 
    optimal sequence from X's given optimal amino acids (those with max y-values) at each 
    position out of the four possible positions in the wildtype sequence by fixing the three
    other positions, then takes the best amino acid at each position. The fixed substring
    in each iteration is a substring of the wildtype sequence.
    
    Note: wildtype sequence expected as one-hot encoding. X expected as an array or list of
    one-hot encodings.
    
    Returns optimal untested variant. """
    
    inds = [i for i, num in enumerate(wt) if num == 1.0] # store indices of four positions with amino acids
    baseline = np.zeros((80,)) # stores baseline untested variant to be returned
    
    for i in range(4): # vary amino acid in each position
        fixed = list(inds[0:i] + inds[i + 1:len(inds)]) # list of 3 fixed amino acids in each iteration through wt seq
        
        xs_inds = [] # index of x's in X with those 3 fixed amino acids
        for j, x in enumerate(X):
            in_lst = True
            for m in fixed:
                if x[m] != 1.0:
                    in_lst = False
            if in_lst:
                xs_inds.append(j)
        
        ys = [y[j] for j in xs_inds] # stores y values of x's in X with those 3 fixed amino acids
        max_ind = np.where(ys==max(ys))[0][0] # takes first occurrence of index with maximum y value
        
        # store amino acid in position being varied in baseline
        baseline[(i * 20):(i * 20 + 20)] = X[xs_inds[max_ind]][(i * 20):(i * 20 + 20)] 
    
    return baseline

seq = baseline_vary(X[150614], X, y)
print(seq)

In [7]:
def avail_aa(X):
    """ Takes in a library X and returns a list of tuples of the available amino acids 
    at each position that can be added to the library. """
    
    amino_acids = 'ARNDCQEGHILKMFPSTWYV'
    return [(aa, i) for i in range(4) for aa in amino_acids if (aa, i) not in X]

In [8]:
def baseline_greedy(probs, seed, n):
    """ Takes in probabilities, seed (the best prediction), and batch size
    to create baseline optimal library using the Greedy algorithm. The algo 
    starts out with the seed then continues to add amino acids until obj
    stops increasing.
    
    Note: probs expected as a dictionary, and seed expected as list of tuples.
    
    Returns optimal untested library. """
    
    X = seed # library X starts with seed
    
    obj = objective(X, probs, n)
    aa = avail_aa(X) # determine available/unincluded amino acids at each position of X
    
    while True:
        lst = [objective(X + [a], probs, n) for a in aa] # lst of obj's for library w each available aa added
        index, obj_next = max(enumerate(lst), key=operator.itemgetter(1)) # determine which aa maximizes obj
        
        if obj_next < obj: # if obj stops increasing, exit
            break
        else:
            X.append(aa[index]) # add aa that maximizes obj to X
            obj = obj_next
            aa.remove(aa[index])
            
    return X

In [9]:
def get_seed(probs):
    """ Takes in a dictionary of amino acids to probabilities as
    generated by the get_predictions() function, and returns the 
    seed (the four amino acid seq with the best prediction, aka the 
    highest probabilitiy). 
    
    Returns a list of tuples representing the seed.
    
    Currently, 'SSSG' is the seed. """
    
    seq = max(probs.items(), key=operator.itemgetter(1))[0]
    return [(aa, i) for aa, i in zip(seq, range(4))]

seed = get_seed(dic)

In [10]:
seed

[('S', 0), ('S', 1), ('S', 2), ('G', 3)]

In [11]:
baseline_greedy(dic, seed, 100)

[('S', 0), ('S', 1), ('S', 2), ('G', 3), ('L', 3), ('T', 1), ('C', 2)]