Generate synthetic libraries using the NK model

In [1]:
import torch
from torch import distributions as dist
import itertools

import operator
import pickle
import importlib
import random

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
# Seaborn look-and-feel: plain white background, 'paper' scaling context.
sns.set_style('white')
sns.set_context('paper')
# Plot adjustments: font sizes for ticks, axis labels, legend and titles,
# applied in a single rcParams update.
plt.rcParams.update({
    'ytick.labelsize': 15,
    'xtick.labelsize': 15,
    'axes.labelsize': 35,
    'legend.fontsize': 30,
    'axes.titlesize': 16,
})

from gptorch import kernels, models
import bases, helpers, opt

In [2]:
class NK_Model(object):
    """A Kauffman NK model with tuneable ruggedness K.

    N parts are the amino-acid positions in the primary sequence.
    A states are the possible amino acids at each position (with the
    default A=20, states take values from 0 to 19).
    K measures the number of other sites in the primary chain whose
    amino acids bear on the functional contribution of the amino acid at a
    given site to overall function.

    Neighbourhoods are circular: site ``i`` is grouped with the ``K`` sites
    immediately before it, wrapping around the start of the sequence.

    NOTE(review): a single contribution table (``self.graph``) is shared by
    every site, so a sequence's fitness depends only on which neighbourhoods
    occur in it, not on where they occur. The classic NK model draws an
    independent table per site -- confirm that sharing is intended.
    """

    def __init__(self, N=4, K=1, A=20, seed=None):
        """Initialize an NK model and draw its random contribution table.

        Args:
            N: sequence length (number of sites); must be >= 1.
            K: number of neighbouring sites that influence each site;
                must satisfy 0 <= K <= N - 1.
            A: number of states per site; must be >= 1.
            seed: optional seed for a private ``numpy.random.RandomState``.
                When ``None`` (default) the global ``numpy.random`` state is
                used, exactly as before.

        Raises:
            ValueError: if N, K or A is outside the ranges above.
        """
        if N < 1:
            raise ValueError("N must be at least 1.")
        if A < 1:
            raise ValueError("A must be at least 1.")
        if not 0 <= K < N:
            # K >= N would make a site its own neighbour (or index past the
            # sequence); K < 0 yields empty neighbourhoods.
            raise ValueError("K must satisfy 0 <= K <= N - 1.")
        self.N = N
        self.K = K
        self.A = A
        self.seed = seed
        self.graph = self.create_graph()

    def _check_length(self, seq):
        """Raise ValueError unless ``seq`` has exactly N entries."""
        if len(seq) != self.N:
            raise ValueError("Wrong sequence length.")

    def create_graph(self):
        """Build the table of fitness contributions.

        Returns:
            dict mapping every possible neighbourhood -- a tuple of K + 1
            states, each in ``range(A)`` -- to a contribution drawn from a
            standard Gaussian distribution (mean 0, standard deviation 1).
            The table has ``A ** (K + 1)`` entries.
        """
        # Use the global NumPy generator unless a seed was requested, so
        # callers relying on np.random.seed() keep getting the same table.
        rng = np.random if self.seed is None else np.random.RandomState(self.seed)

        # All possible neighbourhoods, in lexicographic order.
        nbds = list(itertools.product(range(self.A), repeat=self.K + 1))

        # One vectorised draw; the legacy generator produces the same stream
        # as repeated single draws.
        draws = rng.normal(0, 1, len(nbds))
        return dict(zip(nbds, draws))

    def seq_nbds(self, seq):
        """Return the neighbourhood of every site in ``seq``.

        The neighbourhood of site ``i`` is the tuple
        ``(seq[i], seq[i-1], ..., seq[i-K])``: the site itself followed by
        the K sites before it, wrapping around to the end of the sequence
        for the first sites.

        Args:
            seq: indexable sequence of N states.

        Returns:
            list of N tuples, each of length K + 1.

        Raises:
            ValueError: if ``len(seq) != N``.
        """
        self._check_length(seq)

        neighbourhoods = []
        for site in range(self.N):
            # Modulo makes the circular wrap-around explicit.
            members = tuple(
                seq[(site - offset) % self.N] for offset in range(self.K + 1)
            )
            neighbourhoods.append(members)
        return neighbourhoods

    def eval_fitness(self, seq):
        """Return the fitness of a whole sequence.

        The fitness is the mean of the contributions looked up in
        ``self.graph`` for each of the sequence's N neighbourhoods.

        Raises:
            ValueError: if ``len(seq) != N`` (checked by ``seq_nbds``).
        """
        contributions = [self.graph[nbd] for nbd in self.seq_nbds(seq)]
        return sum(contributions) / len(contributions)  # take average

    def all_seqs(self):
        """Return a list of all ``A ** N`` possible sequences.

        Each sequence is a tuple of length N with entries in ``range(A)``;
        the list is in lexicographic order.
        """
        return list(itertools.product(range(self.A), repeat=self.N))

    def enumerate_space(self):
        """Exhaustively evaluate the fitness of every point in the space.

        Returns:
            dict mapping each sequence tuple to its fitness.
        """
        return {seq: self.eval_fitness(seq) for seq in self.all_seqs()}

In [3]:
# Build a model with the class defaults (N=4, K=1, A=20); the contribution
# table is drawn at random on construction.
nk = NK_Model()

In [4]:
# Map every possible sequence to its fitness. With the defaults this is
# A**N = 20**4 = 160,000 entries.
space = nk.enumerate_space()

In [21]:
# Echo the sequence -> fitness dict in the notebook (output below is truncated).
space

{(0, 0, 0, 0): -0.13038930516241962,
 (0, 0, 0, 1): -0.080798545717410658,
 (0, 0, 0, 2): -0.32145241586046602,
 (0, 0, 0, 3): -0.72750461008600786,
 (0, 0, 0, 4): 0.10413381431637253,
 (0, 0, 0, 5): 0.31901803279061536,
 (0, 0, 0, 6): -0.27533707891203807,
 (0, 0, 0, 7): -0.45757353254628097,
 (0, 0, 0, 8): -0.19290274408468061,
 (0, 0, 0, 9): 0.093565239891913815,
 (0, 0, 0, 10): 0.29273291477019098,
 (0, 0, 0, 11): -0.054015535702909179,
 (0, 0, 0, 12): -0.28492484685347708,
 (0, 0, 0, 13): -0.18190672248938958,
 (0, 0, 0, 14): -0.006759598928847442,
 (0, 0, 0, 15): 0.16506138454704208,
 (0, 0, 0, 16): -0.02582538916307376,
 (0, 0, 0, 17): -0.62855788561131687,
 (0, 0, 0, 18): -0.73532729344466186,
 (0, 0, 0, 19): -0.073373856903813858,
 (0, 0, 1, 0): -0.08079854571741063,
 (0, 0, 1, 1): -0.002428076005739091,
 (0, 0, 1, 2): -0.076737273471827627,
 (0, 0, 1, 3): -0.33101215249344185,
 (0, 0, 1, 4): 0.019111820754567323,
 (0, 0, 1, 5): 0.25836555261888589,
 (0, 0, 1, 6): 0.1757102722

In [17]:
# Persist the landscape as a pickled 2-tuple: (list of sequences, list of
# fitnesses), both in the dict's iteration order so they stay aligned.
with open('../inputs/syn_lib_NOT_TIDY.pkl', mode='wb') as f:
    pickle.dump(
        (list(space), list(space.values())),
        f,
    )