Dataset of full library of 160,000 variants of GB1 pulled from https://elifesciences.org/articles/16965#digest

Preprocessed like the phoq dataset.

In [1]:
import itertools
import pickle

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
sns.set_style('white')
sns.set_context('paper')
# Plot adjustments:
plt.rcParams.update({'ytick.labelsize': 15})
plt.rcParams.update({'xtick.labelsize': 15})
plt.rcParams.update({'axes.labelsize': 35})
plt.rcParams.update({'legend.fontsize': 30})
plt.rcParams.update({'axes.titlesize': 16})

from gpmodel import gpmodel, gpkernel, gptools, chimera_tools

In [2]:
df = pd.read_excel('../inputs/GB1_FITNESS1.xlsx')
variants = df['Variants'].values
fitness = df['Fitness'].values
log_fitness = np.log10(fitness)



In [3]:
df = pd.read_excel('../inputs/GB1_FITNESS2.xlsx')
variants_imp = df['Variants'].values
fitness_imp = df['Imputed Fitness'].values
log_fitness_imp = np.log10(fitness_imp)

In [4]:
d = {'Variants': [], 'Log Fitness': [], 'Imputed': []}

for var, fit in zip(variants, log_fitness):
    d['Variants'].append(var)
    d['Log Fitness'].append(fit)
    d['Imputed'].append(False)

for var, fit in zip(variants_imp, log_fitness_imp):
    d['Variants'].append(var)
    d['Log Fitness'].append(fit)
    d['Imputed'].append(True)

df = pd.DataFrame(data=d) # make dataframe of sampled ys to plot on swarmplot

In [5]:
wt = 'VDGV'

np.random.seed(20)
traintest = np.random.choice(df.index, size=2000, replace=False)
train_inds = traintest[:1000]
test_inds = traintest[1000:]

inds = {'train': train_inds, 'test': test_inds}

for i, j in itertools.combinations(range(4), 2):
    ind = [s[i] == wt[i] and s[j] == wt[j] for s in df['Variants'].values]
    inds[(i, j)] = ind
    
for i, j, k in itertools.combinations(range(4), 3):
    ind = [s[i] == wt[i] and s[j] == wt[j] and s[k] == wt[k] for s in df['Variants'].values]
    inds[(i, j, k)] = ind

In [10]:
def make_Xs(df):
    """ Make sure df has the index you want for the Xs"""
    contacts = list(itertools.combinations((0, 1, 2, 3), 2))

    amino_acids = 'ARNDCQEGHILKMFPSTWYV'

    sample_space = [amino_acids for _ in range(len(df.iloc[0]['Variants']))]
    seq_terms = chimera_tools.make_sequence_terms(sample_space)
    struct_terms = chimera_tools.contacting_terms(sample_space, contacts)
    all_terms = seq_terms + struct_terms
 
    seqs = df['Variants'].values
    struct_X, _ = chimera_tools.make_contact_X(seqs, None, contacts,
                                               contact_terms=struct_terms)
    seq_X, _ = chimera_tools.make_sequence_X(seqs, sample_space=sample_space,
                                             sequence_terms=seq_terms)
    #all_X = np.concatenate([seq_X, struct_X], axis=1)
    X_dumb = np.zeros((len(df), len(df.iloc[0]['Variants'])))
    aa = np.array(sorted(amino_acids))
    for i, seq in enumerate(seqs):
        for j, s in enumerate(seq):
            X_dumb[i, j] = np.argwhere(aa == s).flatten()

    return seq_X, X_dumb

X, T = make_Xs(df)
y = df.loc[:, 'Log Fitness']
y[y == -np.inf] = -4.0
imputed = df.loc[:, 'Imputed']

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [7]:
df_aa = pd.read_csv('../inputs/aaindex_cosine_64.txt', index_col=0)
mus = np.expand_dims(np.mean(df_aa, axis=1).values, 1)
var = np.expand_dims(np.std(df_aa, axis=1).values, 1)
aa = pd.DataFrame((df_aa.values - mus) / var, index=df_aa.index, columns=df_aa.columns)
X_aa = np.array([np.array([aa[s].values for s in seq]).flatten() for seq in df.loc[:, 'Variants'].values])
X_aa = pd.DataFrame(X_aa, index=df.index)
X_aa.shape

(160000, 256)

In [12]:
with open('../inputs/gb1.pkl', 'wb') as f:
    pickle.dump((X, T, X_aa.values, y, imputed, inds), f)