First, makes predictions on a library of 160,000 PhoQ variants using a Gaussian process (GP) with a Matérn kernel, then computes the objective. Combines the gp_ssm and objective_ssm notebooks.

Includes functions that compute each of the three baselines:
1. Baseline that builds an optimal sequence one position at a time: at each of the four variable positions in the wildtype sequence, it holds the other three positions fixed, picks the amino acid with the maximum y-value, and fixes that amino acid before moving on to the next position.
2. Baseline that builds an optimal sequence from independent per-position choices: at each of the four variable positions in the wildtype sequence, it holds the other three positions fixed and finds the amino acid with the maximum y-value, then combines the best amino acid from each position.
3. Baseline that uses a greedy algorithm to maximize the objective. Starts with the best prediction, then keeps adding amino acids until the objective stops increasing.

In [1]:
import torch
from torch import distributions as dist
import itertools

import operator
import pickle
import importlib

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('white')
sns.set_context('paper')
# Plot adjustments:
plt.rcParams.update({'ytick.labelsize': 15})
plt.rcParams.update({'xtick.labelsize': 15})
plt.rcParams.update({'axes.labelsize': 35})
plt.rcParams.update({'legend.fontsize': 30})
plt.rcParams.update({'axes.titlesize': 16})

from gptorch import kernels, models
import bases, helpers, opt

In [2]:
# Load the PhoQ dataset.
# NOTE: pickle.load can execute arbitrary code; only open pickles from a trusted source.
with open('../inputs/phoq.pkl', 'rb') as f:
    t = pickle.load(f)

# t[0]: one-hot encoding of X, t[1]: tokenized encoding of X,
# t[2]: object whose .values holds the measurements.
X, T, y = t[0], t[1], t[2].values

In [3]:
# Map each decoded amino-acid string to its row index in X.
# If two rows decode to the same string, the later index wins.
seq_to_x = {helpers.decode_X(x): i for i, x in enumerate(X)}

In [4]:
# DET BASELINE_FIXED

wt = helpers.decode_X(X[150614]) # wt as string
seqs = bases.det_fixed(wt, X, y)
print("aa: {}".format(seqs))

# find y-values corresponding to the candidate baselines returned by
# bases.det_fixed() --> take aa seq with max y

# Remove duplicates while keeping first-seen order. A plain set() would order
# the strings by hash, which varies between interpreter sessions and would
# make the tie-break for the best baseline irreproducible.
seqs = list(dict.fromkeys(seqs))

# Decoded strings for every row of X; kept because later cells look
# sequences up in this list.
X_decode = [helpers.decode_X(x) for x in X]

# seq_to_x (built in an earlier cell) maps sequence string -> row index in X,
# giving an O(1) lookup instead of a linear scan of X_decode per sequence.
# NOTE(review): assumes each sequence occurs once in X, so this index equals
# X_decode.index(seq) - confirm.
ys_baseline = [y[seq_to_x[seq]] for seq in seqs]

y_seq1 = max(ys_baseline)
max_baseline = seqs[ys_baseline.index(y_seq1)]  # first candidate reaching the max

print("best baseline: {}".format(max_baseline))
print("y value: {}".format(y_seq1))
print("global max: {}".format(np.max(y)))

aa: ['TDST', 'TDST', 'TDST', 'TRSQ', 'TRSQ', 'TRSQ', 'TDST', 'TDST', 'ADAR', 'ADAR', 'TDST', 'ADAT', 'ADCQ', 'ADCT', 'SDCT', 'FDCQ', 'ADCT', 'SDCT', 'TDSW', 'TSAW', 'TSLW', 'ASRW', 'THIW', 'ASIW']
best baseline: ADAR
y value: 1.7134569882257067
global max: 1.788094549112096


In [5]:
# DET BASELINE_VARY

wt = helpers.decode_X(X[150614])  # wt as string
seq2 = bases.det_vary(wt, X, y)
print("aa: {}".format(seq2))

# Look the sequence up through seq_to_x (sequence string -> row index in X,
# built in an earlier cell) rather than scanning a decoded copy of X.
# NOTE(review): assumes each sequence occurs once in X - confirm.
y_seq2 = y[seq_to_x[seq2]]
print("y value: {}".format(y_seq2))
print("global max: {}".format(np.max(y)))

aa: TDCW
y value: 1.0122630505442225
global max: 1.788094549112096


In [None]:
# BASELINE GREEDY ALGO

np.random.seed(1)  # fixed seed so the initial sample is reproducible

# generate random indices for 100 X's to sample from (drawn with replacement)
rand_inds = np.random.choice(len(X), 100, replace=True)
X_train, y_train = X[rand_inds], y[rand_inds]

# Predictions are requested for the full library.
X_test, y_true = X, y

dic, means = helpers.get_predictions(X_train, y_train, X_test, its=500)

In [None]:
# Run the greedy baseline once on the predictions from the previous cell.
# L is passed to bases.greedy as its last argument; presumably the number of
# variable positions (four per the notebook intro) - TODO confirm.
L = 4
seed = helpers.get_seed(dic) # should seed in greedy algo still be best prediction?
chosen, h = bases.greedy(dic, seed, 100, L)
# Bare expression so the notebook displays the chosen library.
chosen

In [None]:
# Pick up any edits made to the local modules without restarting the kernel.
helpers = importlib.reload(helpers)
opt = importlib.reload(opt)

seqs = helpers.seqs_from_set(chosen, 4)

# Running lists of sampled batches; the initial training set is the first batch.
X_sampled, y_sampled = [X_train], [y_train]

In [None]:
##it should eventually come to a place where adding
#items to the set doesn't improve the objective (or it's added everything to the set)

n_start = 35  # not read in this cell
max_its = 30  # not read in this cell
rounds = 4    # number of predict -> greedy -> sample rounds
L = 4         # passed through to bases.greedy / helpers.seqs_from_set
n = 100       # sequences drawn (with replacement) from the library each round

libraries = []  # library chosen in each round
histories = []  # h returned by bases.greedy in each round

# Best-so-far trackers. A round only replaces them when its h is strictly
# below best_loss, which starts at 0.0.
best_loss = 0.0
best_X = None
best_h = None

for rou in range(rounds):
    print('Round %d' % rou)
    # Predict over X_test from every sample gathered so far.
    dic, means = helpers.get_predictions(
        np.concatenate(X_sampled), np.concatenate(y_sampled), X_test, its=500
    )
    print()

    seed = helpers.get_seed(dic) # should seed in greedy algo still be best prediction?
    chosen, h = bases.greedy(dic, seed, 100, L)
    if h < best_loss:
        best_loss = h
        best_X = chosen
        best_h = h

    libraries.append(chosen)
    histories.append(h)

    # best_X is still None until some round scores below the initial
    # best_loss; fall back to this round's library so sampling never
    # receives None.
    # NOTE(review): the unrolled per-iteration cells sample from `chosen`
    # rather than `best_X` - confirm which is intended.
    library = chosen if best_X is None else best_X
    seqs = helpers.seqs_from_set(library, L)

    # Draw n sequences with replacement and add their rows to the sample lists.
    inds = np.random.choice(len(seqs), n, replace=True)
    sampled_seqs = [seqs[i] for i in inds]
    inds = [seq_to_x[s] for s in sampled_seqs]
    X_sampled.append(X[inds])
    y_sampled.append(y[inds])

In [None]:
# Display the library returned by bases.greedy in each round.
libraries

In [None]:
# Display the h value returned by bases.greedy in each round.
histories

In [None]:
## FIRST ITERATION
# Settings; n_start, max_its and rounds are not read anywhere in this cell.
n_start, max_its, rounds = 35, 30, 4
L = 4
n = 100

# Best-so-far trackers: only replaced when a run's h is strictly below best_loss.
best_loss = 0.0
best_X = best_h = None

lst_ys = []
libraries = []
histories = []
preds = []  # to keep track of predictions after each iteration through greedy algorithm

# Predict over X_test from every sample gathered so far and record the means.
dic, means = helpers.get_predictions(
    np.concatenate(X_sampled),
    np.concatenate(y_sampled),
    X_test,
    its=500,
)
preds.append(means)

seed = helpers.get_seed(dic)  # should seed in greedy algo still be best prediction?
chosen, h = bases.greedy(dic, seed, 100, L)
if h < best_loss:
    best_X = chosen
    best_loss = best_h = h

libraries.append(chosen)
histories.append(h)

# Draw n sequences (with replacement) from this iteration's library and add
# their rows of X / y to the sample lists.
seqs = helpers.seqs_from_set(chosen, L)
inds = np.random.choice(len(seqs), n, replace=True)
sampled_seqs = [seqs[i] for i in inds]
inds = [seq_to_x[s] for s in sampled_seqs]
X_sampled.append(X[inds])
y_sampled.append(y[inds])

In [None]:
## SECOND ITERATION
# Predict again using all samples collected so far (including the previous
# iteration's batch) and record the means.
dic, means = helpers.get_predictions(
    np.concatenate(X_sampled),
    np.concatenate(y_sampled),
    X_test,
    its=500,
)
preds.append(means)

seed = helpers.get_seed(dic)  # should seed in greedy algo still be best prediction?
chosen, h = bases.greedy(dic, seed, 100, L)
if h < best_loss:
    best_X = chosen
    best_loss = best_h = h

libraries.append(chosen)
histories.append(h)

# Sample n sequences with replacement from the new library and extend the
# sample lists with the matching rows.
seqs = helpers.seqs_from_set(chosen, L)
inds = np.random.choice(len(seqs), n, replace=True)
sampled_seqs = [seqs[i] for i in inds]
inds = [seq_to_x[s] for s in sampled_seqs]
X_sampled.append(X[inds])
y_sampled.append(y[inds])

In [None]:
## THIRD ITERATION
# Predict again using all samples collected so far and record the means.
dic, means = helpers.get_predictions(
    np.concatenate(X_sampled),
    np.concatenate(y_sampled),
    X_test,
    its=500,
)
preds.append(means)

seed = helpers.get_seed(dic)  # should seed in greedy algo still be best prediction?
chosen, h = bases.greedy(dic, seed, 100, L)
if h < best_loss:
    best_X = chosen
    best_loss = best_h = h

libraries.append(chosen)
histories.append(h)

# Sample n sequences with replacement from the new library and extend the
# sample lists with the matching rows.
seqs = helpers.seqs_from_set(chosen, L)
inds = np.random.choice(len(seqs), n, replace=True)
sampled_seqs = [seqs[i] for i in inds]
inds = [seq_to_x[s] for s in sampled_seqs]
X_sampled.append(X[inds])
y_sampled.append(y[inds])

In [None]:
## FOURTH ITERATION
# Predict again using all samples collected so far and record the means.
dic, means = helpers.get_predictions(
    np.concatenate(X_sampled),
    np.concatenate(y_sampled),
    X_test,
    its=500,
)
preds.append(means)

seed = helpers.get_seed(dic)  # should seed in greedy algo still be best prediction?
chosen, h = bases.greedy(dic, seed, 100, L)
if h < best_loss:
    best_X = chosen
    best_loss = best_h = h

libraries.append(chosen)
histories.append(h)

# Sample n sequences with replacement from the new library and extend the
# sample lists with the matching rows.
seqs = helpers.seqs_from_set(chosen, L)
inds = np.random.choice(len(seqs), n, replace=True)
sampled_seqs = [seqs[i] for i in inds]
inds = [seq_to_x[s] for s in sampled_seqs]
X_sampled.append(X[inds])
y_sampled.append(y[inds])

In [None]:
## FIFTH ITERATION
# Same as the earlier iterations except that jitter=1e-5 is passed to
# helpers.get_predictions (presumably for numerical stability - TODO confirm).
dic, means = helpers.get_predictions(
    np.concatenate(X_sampled),
    np.concatenate(y_sampled),
    X_test,
    its=500,
    jitter=1e-5,
)
preds.append(means)

seed = helpers.get_seed(dic)  # should seed in greedy algo still be best prediction?
chosen, h = bases.greedy(dic, seed, 100, L)
if h < best_loss:
    best_X = chosen
    best_loss = best_h = h

libraries.append(chosen)
histories.append(h)

# Sample n sequences with replacement from the new library and extend the
# sample lists with the matching rows.
seqs = helpers.seqs_from_set(chosen, L)
inds = np.random.choice(len(seqs), n, replace=True)
sampled_seqs = [seqs[i] for i in inds]
inds = [seq_to_x[s] for s in sampled_seqs]
X_sampled.append(X[inds])
y_sampled.append(y[inds])

In [None]:
# Display the h value recorded for each iteration.
histories

In [None]:
# Plot y vs mean error (for each iteration)

# Element [1] of helpers.get_mean_abs_err's result, one per (predictions, library) pair.
errs = [
    helpers.get_mean_abs_err(X, y, mu, lib)[1]
    for mu, lib in zip(preds, libraries)
]

# 1-based iteration numbers for the x axis.
iteration_numbers = np.arange(1, len(errs) + 1)

_ = plt.title("Mean absolute error for iterations of Greedy Algorithm")
_ = plt.plot(iteration_numbers, errs, marker='o', linestyle='none')

_ = plt.show()

In [None]:
 # Plot mean error for y's vs. y's sorted (for each iteration)

# Element [0] of helpers.get_mean_abs_err's result, one per (predictions, library) pair.
abs_errs = [
    helpers.get_mean_abs_err(X, y, mu, lib)[0]
    for mu, lib in zip(preds, libraries)
]
_ = plt.title("Absolute error vs y's tested on for iterations of Greedy Algorithm")

for err in abs_errs:  # each err is a tuple of y_test, abs errs
    y_test, abs_err = err[0], err[1]
    # Positions of y_test in ascending order of value (stable for ties).
    order = sorted(range(len(y_test)), key=lambda k: y_test[k])
    y_sort = np.sort(y_test)
    # Reorder the errors the same way so each error lines up with its y.
    err_sort = [abs_err[pos] for pos in order]
    _ = plt.plot(y_sort, err_sort, marker='.')

_ = plt.show()

In [9]:
def ecdf(data):
    """Return the empirical CDF of a one-dimensional set of measurements.

    Parameters
    ----------
    data : sequence or array
        One-dimensional collection of measurements.

    Returns
    -------
    x : numpy.ndarray
        The measurements sorted in ascending order.
    y : numpy.ndarray
        Cumulative fractions 1/n, 2/n, ..., 1 matching the entries of ``x``.
    """
    sorted_values = np.sort(data)
    count = len(data)
    # The k-th smallest value (1-based) sits at cumulative fraction k / count.
    cumulative = np.arange(1, count + 1) / count
    return sorted_values, cumulative

In [None]:
# Compute ECDF
x_val, y_val = ecdf(y)

plt.gcf().subplots_adjust(bottom=0.15)

# ECDF height of each highlighted point: first position in the sorted values
# that equals the point's y.
wt_y = y[150614]
wt_height = y_val[np.argwhere(x_val == wt_y)[0][0]]
vary_height = y_val[np.argwhere(x_val == y_seq2)[0][0]]

# Generate plot
#_ = plt.title("ECDF of log10 of fitness performances (y's)")
_ = plt.plot(x_val, y_val, label="orig y's", alpha=0.3)
_ = plt.plot(wt_y, wt_height, marker='o', color='green', markersize=7, label='wildtype')
#_ = plt.plot(y_seq1, y_val[np.argwhere(x_val == y_seq1)[0][0]], marker='o', color='green', markersize=7, label='baseline_fixed')
_ = plt.plot(y_seq2, vary_height, marker='o', color='orange', markersize=7, label='baseline_vary')
#_ = plt.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), shadow=True, ncol=1)

# Label the axes
_ = plt.ylabel('Percentile', fontsize=20)
_ = plt.xlabel('log(y)', fontsize=20)

# Save the figure, then display the plot
plt.savefig('ECDF WT BASE.png', dpi=1000)
_ = plt.show()


In [None]:
# Compute ECDF
x_val, y_val = ecdf(y)

# ECDF height of each highlighted point: first position in the sorted values
# that equals the point's y.
wt_y = y[150614]
wt_height = y_val[np.argwhere(x_val == wt_y)[0][0]]
fixed_height = y_val[np.argwhere(x_val == y_seq1)[0][0]]
vary_height = y_val[np.argwhere(x_val == y_seq2)[0][0]]

# Generate plot
_ = plt.title("ECDF of log10 of fitness performances (y's)")
_ = plt.plot(x_val, y_val, label="orig y's", alpha=0.3)
_ = plt.plot(wt_y, wt_height, marker='o', color='green', markersize=7, label='wildtype')
# baseline_fixed used to share green with the wildtype marker, making the two
# indistinguishable in the plot and legend; purple matches the colour used
# for baseline_fixed in the swarm plot.
_ = plt.plot(y_seq1, fixed_height, marker='o', color='purple', markersize=7, label='baseline_fixed')
_ = plt.plot(y_seq2, vary_height, marker='o', color='orange', markersize=7, label='baseline_vary')
_ = plt.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), shadow=True, ncol=1)

# Label the axes
_ = plt.ylabel('Percentile')
_ = plt.xlabel('log(y)')

# Display the plot
#plt.savefig('ECDF WT BASE GREEDY.png', dpi=1000)
_ = plt.show()

In [None]:
# Compute ECDF
x_val, y_val = ecdf(y)

# Generate plot: one translucent dot per measurement, no connecting line.
_ = plt.title("ECDF of y values")
_ = plt.plot(
    x_val,
    y_val,
    marker='.',
    linestyle='none',
    label="y values",
    alpha=0.1,
)
_ = plt.legend(
    loc='upper center',
    bbox_to_anchor=(1.45, 0.8),
    shadow=True,
    ncol=1,
)

# Label the axes
_ = plt.ylabel('ECDF')
_ = plt.xlabel('y')

# Display the plot
_ = plt.show()

In [None]:
# Plot Greedy algorithm baseline with deterministic baselines too

# Long-form table: one row per sampled y, tagged with the position of its
# batch in y_sampled.
d = {
    'Iterations': [batch_no for batch_no, batch in enumerate(y_sampled) for _val in batch],
    'Sampled ys': [val for batch in y_sampled for val in batch],
}

df = pd.DataFrame(data=d)  # make dataframe of sampled ys to plot on swarmplot

sns.set(style="whitegrid")
_ = plt.title('Baselines: deterministic and Greedy algorithm')
# swarmplot allows for jitter in displaying cluster of ys
ax = sns.swarmplot(x="Iterations", y="Sampled ys", data=df)
# Horizontal reference lines at the two deterministic baselines' y values.
_ = ax.axhline(y_seq1, color='purple', label='baseline_fixed')
_ = ax.axhline(y_seq2, color='black', label='baseline_vary')
_ = plt.legend(loc='upper center', bbox_to_anchor=(1.45, 0.8), shadow=True, ncol=1)