In [1]:
# General imports
import glob
import os
import re
import pickle
import datetime

# Data manipulation
#import growth_analysis as ga
import pandas as pd
#from multiprocesspandas import applyparallel
import numpy as np
from sklearn.impute import KNNImputer

# Basic plotting
import holoviews as hv
import bokeh
from bokeh.io import export_svg
from bokeh.models import NumeralTickFormatter

from bokeh.themes.theme import Theme
# theme = Theme(
#     json=ga.PLOT_STYLE
# )
#hv.renderer('bokeh').theme = theme

import panel as pn
pn.config.comms = "vscode"

# Large data plotting
#import datashader as ds
#from holoviews.operation.datashader import datashade, rasterize

# Making graphs
import networkx as nx
import matplotlib.pyplot as plt
import itertools
import tqdm
from multiprocessing import Pool
from operator import itemgetter

hv.extension('bokeh')

In [2]:
# Load the measured + imputed TrpB landscape, ordered alphabetically by variant
TrpB_data = (
    pd.read_csv('Data/TrpB/10092023TrpB_MeasuredAndImputed_ZS.csv', index_col=0)
    .sort_values('AAs')
    .reset_index(drop=True)
)

# Rank 1 is the fittest variant (ties share the average rank)
TrpB_data['rank'] = TrpB_data['fitness'].rank(ascending=False)

# Natural log of the raw fitness
TrpB_data['scaled_fitness'] = TrpB_data['fitness'].map(np.log)

# Lowest fitness among the variants flagged as active
TrpB_fit_min = TrpB_data.loc[TrpB_data['active'], 'fitness'].min()

In [3]:
# Import the synthetic TrpB landscape, ordered alphabetically by variant
syn_TrpB_data = (
    pd.read_csv('Data/TrpB/112223_tuned_0.90.92_mut_pred_synthetic_landscape_TrpB.csv')
    .sort_values('AAs')
    .reset_index(drop=True)
)

# Rank 1 is the fittest variant. exp() below is monotonic, so ranking the
# stored values first gives the same ranks as ranking the exponentiated ones.
syn_TrpB_data['rank'] = syn_TrpB_data['fitness'].rank(ascending=False)

# The file values are exponentiated before use
syn_TrpB_data['fitness'] = syn_TrpB_data['fitness'].map(np.exp)

# Keep the 9783 fittest variants.
# NOTE(review): 9783 matches the number of active TrpB variants reported by the
# tqdm counter of the TrpB simulations — presumably chosen for that reason; confirm.
# `.head()` is used instead of rebuilding the frame from `.values`, which would
# coerce every column to object dtype.
syn_TrpB_9783 = (
    syn_TrpB_data
    .sort_values('fitness', ascending=False)
    .head(9783)
    .reset_index(drop=True)
)

# Min-max scale the fitness, then squeeze it into [0.1, 1]
max_fitness = syn_TrpB_9783['fitness'].max()
min_fitness = syn_TrpB_9783['fitness'].min()
syn_TrpB_9783['scaled_fitness'] = (syn_TrpB_9783['fitness'] - min_fitness) / (max_fitness - min_fitness)
syn_TrpB_9783['scaled_fitness'] = 0.1 + syn_TrpB_9783['scaled_fitness'] * (1 - 0.1)

# With the scaling above the minimum maps to 0.1, so every retained variant is active
syn_TrpB_9783['active'] = syn_TrpB_9783['scaled_fitness'] >= 0.1

syn_TrpB_fit_min = syn_TrpB_9783.loc[syn_TrpB_9783['active'], 'fitness'].min()

syn_TrpB_9783

Unnamed: 0,AAs,sequence,AA1,AA2,AA3,AA4,num_mutations,fitness,active,rank,scaled_fitness
0,LVVS,"[10, 19, 19, 15]",L,V,V,S,2,57817123351699917822230528.0,True,1.0,1.000000
1,MIVG,"[12, 9, 19, 7]",M,I,V,G,3,14114642197611126487777280.0,True,2.0,0.319713
2,LLVS,"[10, 10, 19, 15]",L,L,V,S,2,5355034449528910856060928.0,True,3.0,0.183358
3,CVIS,"[4, 19, 9, 15]",C,V,I,S,3,134604847342992591159296.0,True,4.0,0.102095
4,LLTS,"[10, 10, 16, 15]",L,L,T,S,3,88550991399576305401856.0,True,5.0,0.101378
...,...,...,...,...,...,...,...,...,...,...,...
9778,MWYS,"[12, 17, 18, 15]",M,W,Y,S,3,0.000033,True,9779.5,0.100000
9779,MWPS,"[12, 17, 14, 15]",M,W,P,S,3,0.000033,True,9779.5,0.100000
9780,TWVP,"[16, 17, 19, 14]",T,W,V,P,3,0.000033,True,9781.0,0.100000
9781,CSVG,"[4, 15, 19, 7]",C,S,V,G,3,0.000033,True,9782.0,0.100000


In [4]:
# VDGV is parent

# Measured GB1 fitness values
GB1_measured_data = pd.read_csv('Data/GB1/GB1_Fitness.csv').rename(columns={'Variants': 'AAs'})
GB1_measured_data['imputed'] = False

# Imputed GB1 fitness values, renamed so both tables share the same columns
GB1_imputed_data = pd.read_csv('Data/GB1/GB1_Fitness_Imputed.csv').rename(
    columns={'Variants': 'AAs', 'Imputed fitness': 'Fitness'}
)
GB1_imputed_data['imputed'] = True

# Stack measured + imputed rows and order them alphabetically by variant
GB1_data = (
    pd.concat([GB1_measured_data, GB1_imputed_data], ignore_index=True)
    .sort_values('AAs')
    .reset_index(drop=True)
)

# One column per mutated position (AA1 ... AA4), inserted right after the leading column
for i in range(4):
    GB1_data.insert(i + 1, f'AA{i + 1}', GB1_data['AAs'].map(lambda variant: variant[i]))

# Fitness relative to the best variant, to scale the data the same way as the TrpB data
GB1_data['Fitness/max'] = GB1_data['Fitness'] / GB1_data['Fitness'].max()
GB1_fit_min = 0.01 / GB1_data['Fitness'].max()
# GB1_min_top9783 = GB1_data[GB1_data['imputed'] == False]['Fitness'].values[9782] # wouldn't we want to sort by fitness first

# A variant is active only if it was measured (not imputed) AND sits above the minimum.
# Inactive variants are never used as starting points but still appear in the graphs.
GB1_data['active'] = (GB1_data['Fitness/max'] > GB1_fit_min) & (GB1_data['imputed'] == False)

# Natural log of the raw fitness
GB1_data['scaled_fitness'] = GB1_data['Fitness'].map(np.log)

GB1_data

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,HD,Count input,Count selected,Fitness,imputed,Fitness/max,active,scaled_fitness
0,AAAA,A,A,A,A,4.0,25.0,147.0,1.611610,False,0.162574,True,0.477234
1,AAAC,A,A,A,C,,,,0.049726,True,0.005016,False,-3.001227
2,AAAD,A,A,A,D,,,,0.011857,True,0.001196,False,-4.434878
3,AAAE,A,A,A,E,,,,0.011416,True,0.001152,False,-4.472753
4,AAAF,A,A,A,F,,,,0.029688,True,0.002995,False,-3.517002
...,...,...,...,...,...,...,...,...,...,...,...,...,...
159995,YYYS,Y,Y,Y,S,4.0,186.0,3.0,0.004421,False,0.000446,False,-5.421457
159996,YYYT,Y,Y,Y,T,4.0,181.0,14.0,0.021200,False,0.002139,True,-3.853763
159997,YYYV,Y,Y,Y,V,3.0,98.0,15.0,0.041952,False,0.004232,True,-3.171240
159998,YYYW,Y,Y,Y,W,4.0,30.0,1.0,0.009136,False,0.000922,False,-4.695520


In [5]:
# Keep only the 9783 fittest *active* GB1 variants.
# `.head()` keeps the column dtypes; rebuilding the frame from `.values`
# would turn every column (fitness, flags, ...) into object dtype.
GB1_9783 = (
    GB1_data[GB1_data['active']]
    .sort_values('Fitness', ascending=False)
    .head(9783)
    .reset_index(drop=True)
)

# Rank 1 is the fittest variant (ties share the average rank)
GB1_9783['rank'] = GB1_9783['Fitness'].rank(ascending=False)

GB1_9783

Unnamed: 0,AAs,AA1,AA2,AA3,AA4,HD,Count input,Count selected,Fitness,imputed,Fitness/max,active,scaled_fitness,rank
0,FWAA,F,W,A,A,4.0,63.0,2014.0,8.761966,False,0.883879,True,2.17042,1.0
1,FYAA,F,Y,A,A,4.0,221.0,6487.0,8.045152,False,0.811569,True,2.08507,2.0
2,ANCA,A,N,C,A,4.0,14.0,386.0,7.556869,False,0.762312,True,2.022457,3.0
3,FWCA,F,W,C,A,4.0,71.0,1957.0,7.554663,False,0.76209,True,2.022165,4.0
4,FWLG,F,W,L,G,4.0,97.0,2588.0,7.312656,False,0.737677,True,1.989607,5.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9778,ACFV,A,C,F,V,3.0,17.0,10.0,0.161225,False,0.016264,True,-1.824951,9779.5
9779,ILGI,I,L,G,I,3.0,1581.0,930.0,0.161225,False,0.016264,True,-1.824951,9779.5
9780,WKFI,W,K,F,I,4.0,670.0,394.0,0.161177,False,0.016259,True,-1.82525,9781.0
9781,MQFC,M,Q,F,C,4.0,182.0,107.0,0.161137,False,0.016255,True,-1.825501,9782.0


In [6]:
# # Import the synthetic GB1 data
# syn_GB1_data = pd.read_csv('Data/112223_tuned_0.90.92_mut_pred_synthetic_landscape_GB1.csv')

# syn_GB1_data = syn_GB1_data.sort_values('AAs').reset_index(drop=True)

# # add rank column
# syn_GB1_data['rank'] = syn_GB1_data['fitness'].rank(ascending=False)

# syn_GB1_data['fitness'] = syn_GB1_data['fitness'].apply(lambda x: np.exp(x))

# syn_GB1_9783 = pd.DataFrame(syn_GB1_data.sort_values('fitness', ascending= False).values[0:9783], columns = syn_GB1_data.columns)

# # scale fitness to get GB1 max and min
# max_fitness = syn_GB1_9783['fitness'].max()
# min_fitness = syn_GB1_9783['fitness'].min()
# syn_GB1_9783['scaled_fitness'] = syn_GB1_9783['fitness'].apply(lambda x: (x - min_fitness)/(max_fitness-min_fitness))
# syn_GB1_9783['scaled_fitness'] = syn_GB1_9783['scaled_fitness'].apply(lambda x: 0.1 + x*(1-0.1))

# syn_GB1_9783['active'] = syn_GB1_9783['scaled_fitness'] >= 0.1

# syn_GB1_fit_min = syn_GB1_9783[syn_GB1_9783['active']]['fitness'].min()

# syn_GB1_9783

In [6]:
# Copied from  Kadina's DE_simulations.py
def make_new_sequence(input_seq, new_AA, position):
    """Return a copy of ``input_seq`` with the entry at ``position`` replaced.

    Parameters
    ----------
    input_seq : str
        Sequence to mutate (left untouched).
    new_AA : str
        Replacement inserted at ``position``.
    position : int
        Index to overwrite; follows normal list indexing, so an
        out-of-range index raises ``IndexError``.

    Returns
    -------
    str
        The mutated sequence.
    """
    residues = [*input_seq]
    residues[position] = new_AA
    return ''.join(residues)

def ecdf_transform(data):
    """Map each value of a Series to its empirical CDF position.

    Values are ranked with ``method="first"`` (ties are broken by order of
    appearance) and the ranks are divided by the number of entries.
    """
    n_obs = len(data)
    ordinal_ranks = data.rank(method="first")
    return ordinal_ranks / n_obs

def simulate_single_step_DE(data, seq_col, fitness_col, n_sites=4): # single-step SSM
    """Simulate greedy single-site saturation mutagenesis (SSM) walks.

    For every active starting variant and every permutation of the
    ``n_sites`` positions, the positions are visited in that order. At each
    position all 20 amino acids are tried and the walk moves to a variant
    whenever it is strictly fitter than the current best. Variants that are
    absent from ``data`` are scored as 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a ``'rank'`` column and an
        ``'active'`` mask column; only active rows are used as starts.
    seq_col : str
        Column holding the variant sequences; ``'_'`` characters are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of positions that are mutated.

    Returns
    -------
    tuple
        ``(fitness_array, output_df)``. ``fitness_array`` holds the final
        fitness of every (start, order) walk. ``output_df`` has one row per
        walk with the columns ``start_seq``, ``order``, ``start_fitness``,
        ``final_seq``, ``final_fitness``, ``final_fitness ECDF`` and ``rank``
        (rank of ``final_seq`` in ``data``; NaN if that sequence is not in
        ``data``).
    """
    data = data.copy()
    data[seq_col] = data[seq_col].map(lambda seq: seq.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    active_AAs = data.loc[data['active'], seq_col].values

    position_orders = list(itertools.permutations(range(n_sites)))
    n_orders = len(position_orders)
    AA_list = list('ACDEFGHIKLMNPQRSTVWY')
    fitness_array = np.empty(len(active_AAs) * n_orders)
    records = []

    # tqdm wraps the array (not the enumerate object) so it knows the total
    for i, start_seq in enumerate(tqdm.tqdm(active_AAs)):

        start_fitness = data_dict[start_seq]

        for j, temp_order in enumerate(position_orders):

            best_seq, best_fitness = start_seq, start_fitness

            for pos in temp_order:
                for AA in AA_list:
                    temp_seq = make_new_sequence(best_seq, AA, pos)

                    # Variants missing from the landscape count as fitness 0
                    temp_fitness = data_dict.get(temp_seq, 0)

                    # Only a strict improvement moves the walk
                    if temp_fitness > best_fitness:
                        best_seq, best_fitness = temp_seq, temp_fitness

            fitness_array[n_orders * i + j] = best_fitness
            records.append({
                'start_seq': start_seq,
                'order': temp_order,
                'start_fitness': start_fitness,
                'final_seq': best_seq,
                'final_fitness': best_fitness,
            })

    # Building from records keeps numeric dtypes (a transposed dict-of-lists
    # frame would make every column object dtype)
    output_df = pd.DataFrame(
        records,
        columns=['start_seq', 'order', 'start_fitness', 'final_seq', 'final_fitness'],
    )

    output_df["final_fitness ECDF"] = ecdf_transform(output_df['final_fitness'])

    # assign original rank value to each variant
    output_df['rank'] = output_df['final_seq'].map(rank_dict)

    return (fitness_array, output_df)

def simulate_simple_SSM_recomb_DE(data, seq_col, fitness_col, n_sites=4): # SSM recomb
    """Simulate one round of SSM at every site followed by simple recombination.

    For each active starting variant the best single mutant is found
    independently at every position (always mutating the starting sequence).
    The per-position winners are recombined into one sequence, and the
    fittest of {start, recombinant, per-position winners} is reported.
    Variants that are absent from ``data`` are scored as 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a ``'rank'`` column and an
        ``'active'`` mask column; only active rows are used as starts.
    seq_col : str
        Column holding the variant sequences; ``'_'`` characters are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of positions that are mutated.

    Returns
    -------
    pandas.DataFrame
        One row per start with the columns ``start_seq``, ``start_fitness``,
        ``top_SSM_variants`` (tuple of the per-position winners),
        ``final_seq``, ``final_fitness``, ``final_fitness ECDF`` and ``rank``
        (rank of ``final_seq`` in ``data``; NaN if it is not in ``data``).
    """
    data = data.copy()
    data[seq_col] = data[seq_col].map(lambda seq: seq.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    active_AAs = data.loc[data['active'], seq_col].values

    AA_list = list('ACDEFGHIKLMNPQRSTVWY')
    records = []

    for start_seq in tqdm.tqdm(active_AAs):

        start_fitness = data_dict[start_seq]

        # Best single mutant per position, each found from the start sequence
        top_SSM_variants = {}
        for pos in range(n_sites):

            best_seq, best_fitness = start_seq, start_fitness

            for AA in AA_list:
                temp_seq = make_new_sequence(start_seq, AA, pos)

                # Variants missing from the landscape count as fitness 0
                temp_fitness = data_dict.get(temp_seq, 0)

                if temp_fitness > best_fitness:
                    best_seq, best_fitness = temp_seq, temp_fitness

            top_SSM_variants[pos] = best_seq

        # simple recombination: take residue `pos` from the winner at `pos`
        recomb_seq = ''.join(top_SSM_variants[pos][pos] for pos in range(n_sites))
        recomb_fitness = data_dict.get(recomb_seq, 0)

        # Report the best of: start, recombinant, and every per-position winner
        best_seq, best_fitness = start_seq, start_fitness

        if recomb_fitness > best_fitness:
            best_seq, best_fitness = recomb_seq, recomb_fitness

        for SSM_seq in top_SSM_variants.values():
            SSM_fit = data_dict.get(SSM_seq, 0)
            if SSM_fit > best_fitness:
                best_seq, best_fitness = SSM_seq, SSM_fit

        records.append({
            'start_seq': start_seq,
            'start_fitness': start_fitness,
            'top_SSM_variants': tuple(top_SSM_variants.values()),
            'final_seq': best_seq,
            'final_fitness': best_fitness,
        })

    # Building from records keeps numeric dtypes (a transposed dict-of-lists
    # frame would make every column object dtype)
    output_df = pd.DataFrame(
        records,
        columns=['start_seq', 'start_fitness', 'top_SSM_variants', 'final_seq', 'final_fitness'],
    )

    output_df["final_fitness ECDF"] = ecdf_transform(output_df['final_fitness'])

    # assign original rank value to each variant
    output_df['rank'] = output_df['final_seq'].map(rank_dict)

    return output_df

def try_start_seq(start_seq, data_dict, AA_list, n_sites, N):
    """Run one "SSM, then test the top-N predicted combinations" round.

    Single-site data are collected for every position of ``start_seq``. Each
    full combination of residues is scored as the product of its
    single-mutant fitnesses relative to the start, the ``N`` highest-scoring
    combinations are looked up, and the fittest variant seen (start, tested
    combinations or any single mutant) is returned. Variants missing from
    ``data_dict`` are scored as 0.

    Parameters
    ----------
    start_seq : str
        Starting variant; must be a key of ``data_dict``.
    data_dict : dict
        Maps sequence -> fitness.
    AA_list : list of str
        Residues tried at every position.
    n_sites : int
        Number of positions that are mutated.
    N : int
        Number of top predicted combinations that are tested.

    Returns
    -------
    tuple
        ``(start_fitness, best_seq, best_fitness)``.

    Notes
    -----
    The score divides by ``start_fitness``, so the start is assumed to have a
    non-zero fitness — TODO confirm this holds for every active variant.
    """
    start_fitness = data_dict[start_seq]

    SSM_data = {}        # pos -> {residue: fitness}, used for the predictions
    SSM_to_compare = {}  # pos -> {full sequence: fitness}, used for the final sweep

    # Collect the single-site saturation data at every position
    for pos in range(n_sites):

        SSM_data[pos] = {}
        SSM_to_compare[pos] = {}

        for AA in AA_list:
            temp_seq = make_new_sequence(start_seq, AA, pos)

            # Variants missing from the landscape count as fitness 0
            temp_fitness = data_dict.get(temp_seq, 0)

            SSM_data[pos][AA] = temp_fitness
            SSM_to_compare[pos][temp_seq] = temp_fitness

    # Predicted improvement of a combination = product of the relative
    # fitnesses of its single mutants. (np.prod: np.product was removed in NumPy 2.0.)
    calculated_improvement = {}
    for combo in itertools.product(AA_list, repeat=n_sites):
        calculated_improvement[''.join(combo)] = np.prod(
            [SSM_data[i][combo[i]] / start_fitness for i in range(n_sites)]
        )

    top_predicted = (
        pd.DataFrame(list(calculated_improvement.items()), columns=['AAs', 'calculated improvement'])
        .sort_values('calculated improvement', ascending=False)
        .head(N)['AAs']
        .values
    )

    best_seq = start_seq
    best_fitness = start_fitness

    # "Measure" the top-N predicted combinations
    for variant_seq in top_predicted:
        variant_fit = data_dict.get(variant_seq, 0)

        if variant_fit > best_fitness:
            best_seq = variant_seq
            best_fitness = variant_fit

    # Also consider every single mutant that was screened. This must walk the
    # sequence-keyed dict: SSM_data is keyed by residue letter, and using it
    # here would report a one-letter "sequence" as the winner.
    for seq_fit_dict in SSM_to_compare.values():
        for SSM_seq, SSM_fit in seq_fit_dict.items():

            if SSM_fit > best_fitness:
                best_seq = SSM_seq
                best_fitness = SSM_fit

    return (start_fitness, best_seq, best_fitness)

def sample_SSM_test_top_N(data, seq_col, fitness_col, n_sites=4, N=96, max_samples=None, n_jobs=1, random_state=None): # SSM predict top 96
    """Run ``try_start_seq`` from a shuffled set of active starting variants.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a ``'rank'`` column and an
        ``'active'`` mask column; only active rows are used as starts.
    seq_col : str
        Column holding the variant sequences; ``'_'`` characters are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of positions that are mutated.
    N : int, default 96
        Number of top predicted combinations tested per start.
    max_samples : int or None, default None
        If an integer, only the first ``max_samples`` shuffled starts are run.
    n_jobs : int, default 1
        ``1`` runs serially in this process; any other value is passed to
        ``multiprocessing.Pool``.
    random_state : optional
        Forwarded to ``Series.sample`` so the shuffle can be made
        reproducible. ``None`` keeps the unseeded behaviour.

    Returns
    -------
    pandas.DataFrame
        One row per start with the columns ``start_seq``, ``start_fitness``,
        ``final_seq``, ``final_fitness``, ``final_fitness ECDF`` and ``rank``
        (rank of ``final_seq`` in ``data``; NaN if it is not in ``data``).
    """
    data = data.copy()
    data[seq_col] = data[seq_col].map(lambda seq: seq.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    # Shuffle the active starts; read them from `seq_col` (not a hard-coded
    # 'AAs' column) so they match the keys of data_dict
    active_AAs = (
        data.loc[data['active'], seq_col]
        .sample(frac=1, random_state=random_state)
        .values
    )

    if isinstance(max_samples, (int, np.integer)) and not isinstance(max_samples, bool):
        active_AAs = active_AAs[:max_samples]

    AA_list = list('ACDEFGHIKLMNPQRSTVWY')

    # One argument tuple per starting variant
    pool_args = [(start_seq, data_dict, AA_list, n_sites, N) for start_seq in active_AAs]

    if n_jobs == 1:
        # No worker processes needed; the progress bar tracks real progress here
        results = [try_start_seq(*args) for args in tqdm.tqdm(pool_args)]
    else:
        with Pool(n_jobs) as pool:
            results = pool.starmap(try_start_seq, tqdm.tqdm(pool_args))

    records = [
        {
            'start_seq': start_seq,
            'start_fitness': result[0],
            'final_seq': result[1],
            'final_fitness': result[2],
        }
        for start_seq, result in zip(active_AAs, results)
    ]

    # Building from records keeps numeric dtypes
    output_df = pd.DataFrame(
        records,
        columns=['start_seq', 'start_fitness', 'final_seq', 'final_fitness'],
    )

    output_df["final_fitness ECDF"] = ecdf_transform(output_df['final_fitness'])

    # assign original rank value to each variant
    output_df['rank'] = output_df['final_seq'].map(rank_dict)

    return output_df

def simulate_iterative_SM(data, seq_col, fitness_col, n_sites=4):
    """Simulate iterative saturation mutagenesis with a greedy choice of site.

    In every round all 20 amino acids are tried at each position that has not
    been fixed yet (always mutating the best sequence of the previous round).
    The single best strict improvement is kept and its position is fixed. The
    walk stops after ``n_sites`` rounds or as soon as a round finds no
    improvement. Variants that are absent from ``data`` are scored as 0.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a ``'rank'`` column and an
        ``'active'`` mask column; only active rows are used as starts.
    seq_col : str
        Column holding the variant sequences; ``'_'`` characters are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of positions that are mutated.

    Returns
    -------
    tuple
        ``(fitness_array, output_df)``. ``fitness_array`` holds the final
        fitness per start. ``output_df`` has one row per start with the
        columns ``start_seq``, ``order`` (tuple of positions in the order
        they were fixed), ``start_fitness``, ``final_seq``,
        ``final_fitness``, ``final_fitness ECDF`` and ``rank`` (rank of
        ``final_seq`` in ``data``; NaN if it is not in ``data``).
    """
    data = data.copy()
    data[seq_col] = data[seq_col].map(lambda seq: seq.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    # Read the starts from `seq_col` (not a hard-coded 'AAs' column) so they
    # match the keys of data_dict
    active_AAs = data.loc[data['active'], seq_col].values

    AA_list = list('ACDEFGHIKLMNPQRSTVWY')
    fitness_array = np.empty(len(active_AAs))
    records = []

    # tqdm wraps the array (not the enumerate object) so it knows the total
    for i, start_seq in enumerate(tqdm.tqdm(active_AAs)):

        start_fitness = data_dict[start_seq]
        best_seq, best_fitness = start_seq, start_fitness

        remaining_positions = list(range(n_sites))
        temp_order = []

        for _ in range(n_sites):
            # Every mutation of this round is made on last round's winner
            previous_best_seq = best_seq
            best_site = None  # stays None if the round finds no improvement

            for pos in remaining_positions:
                for AA in AA_list:
                    temp_seq = make_new_sequence(previous_best_seq, AA, pos)

                    # Variants missing from the landscape count as fitness 0
                    temp_fitness = data_dict.get(temp_seq, 0)

                    if temp_fitness > best_fitness:
                        best_seq, best_fitness = temp_seq, temp_fitness
                        best_site = pos

            if best_site is None:
                # finish if there are no more beneficial mutations
                break

            remaining_positions.remove(best_site)
            temp_order.append(best_site)

        fitness_array[i] = best_fitness
        records.append({
            'start_seq': start_seq,
            'order': tuple(temp_order),
            'start_fitness': start_fitness,
            'final_seq': best_seq,
            'final_fitness': best_fitness,
        })

    # Building from records keeps numeric dtypes (a transposed dict-of-lists
    # frame would make every column object dtype)
    output_df = pd.DataFrame(
        records,
        columns=['start_seq', 'order', 'start_fitness', 'final_seq', 'final_fitness'],
    )

    output_df["final_fitness ECDF"] = ecdf_transform(output_df['final_fitness'])

    # assign original rank value to each variant
    output_df['rank'] = output_df['final_seq'].map(rank_dict)

    return (fitness_array, output_df)

In [7]:
def simulate_double_step_DE(data, seq_col, fitness_col, n_sites=4): # double-step SSM
    """Simulate greedy double-site saturation mutagenesis walks.

    Positions are mutated two at a time: for a pair of sites all 20 x 20
    residue combinations are tried and the walk moves to a variant whenever
    it is strictly fitter than the current best. Each walk visits two site
    pairs. Variants that are absent from ``data`` are scored as 0.

    The walk orders are built by zipping the list of all site pairs with its
    reverse. For the default ``n_sites=4`` this pairs every site pair with
    its complement, e.g. ``((0, 1), (2, 3))``.
    NOTE(review): for other values of ``n_sites`` the two pairs of an order
    can overlap — confirm before using a different ``n_sites``.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``seq_col``, ``fitness_col``, a ``'rank'`` column and an
        ``'active'`` mask column; only active rows are used as starts.
    seq_col : str
        Column holding the variant sequences; ``'_'`` characters are removed.
    fitness_col : str
        Column holding the fitness values.
    n_sites : int, default 4
        Number of positions that are mutated.

    Returns
    -------
    tuple
        ``(fitness_array, output_df)``. ``fitness_array`` holds the final
        fitness of every (start, order) walk. ``output_df`` has one row per
        walk with the columns ``start_seq``, ``order``, ``start_fitness``,
        ``final_seq``, ``final_fitness``, ``final_fitness ECDF`` and ``rank``
        (rank of ``final_seq`` in ``data``; NaN if it is not in ``data``).
    """
    data = data.copy()
    data[seq_col] = data[seq_col].map(lambda seq: seq.replace('_', ''))

    data_dict = dict(zip(data[seq_col].values, data[fitness_col].values))
    rank_dict = dict(zip(data[seq_col].values, data['rank'].values))

    active_AAs = data.loc[data['active'], seq_col].values

    site_pairs = list(itertools.combinations(range(n_sites), 2))
    position_orders = list(zip(site_pairs, reversed(site_pairs)))
    n_orders = len(position_orders)

    AA_list = list('ACDEFGHIKLMNPQRSTVWY')
    fitness_array = np.empty(len(active_AAs) * n_orders)
    records = []

    # tqdm wraps the array (not the enumerate object) so it knows the total
    for i, start_seq in enumerate(tqdm.tqdm(active_AAs)):

        start_fitness = data_dict[start_seq]

        for j, temp_order in enumerate(position_orders):

            best_seq, best_fitness = start_seq, start_fitness

            for first_site, second_site in temp_order:

                # Try every residue pair at the two sites. Both sites are
                # overwritten, so it does not matter that best_seq may change
                # while this pair is being scanned.
                for first_AA, second_AA in itertools.product(AA_list, repeat=2):
                    temp_seq = make_new_sequence(best_seq, first_AA, first_site)
                    temp_seq = make_new_sequence(temp_seq, second_AA, second_site)

                    # Variants missing from the landscape count as fitness 0
                    temp_fitness = data_dict.get(temp_seq, 0)

                    # Only a strict improvement moves the walk
                    if temp_fitness > best_fitness:
                        best_seq, best_fitness = temp_seq, temp_fitness

            fitness_array[n_orders * i + j] = best_fitness
            records.append({
                'start_seq': start_seq,
                'order': temp_order,
                'start_fitness': start_fitness,
                'final_seq': best_seq,
                'final_fitness': best_fitness,
            })

    # Building from records keeps numeric dtypes (a transposed dict-of-lists
    # frame would make every column object dtype)
    output_df = pd.DataFrame(
        records,
        columns=['start_seq', 'order', 'start_fitness', 'final_seq', 'final_fitness'],
    )

    output_df["final_fitness ECDF"] = ecdf_transform(output_df['final_fitness'])

    # assign original rank value to each variant
    output_df['rank'] = output_df['final_seq'].map(rank_dict)

    return (fitness_array, output_df)

In [8]:
# Simulate directed evolution on the top-9783 GB1 landscape
# (the top-N prediction and SSM-recombination runs are currently disabled)
# GB1_SSMN_output_df = sample_SSM_test_top_N(GB1_9783, 'AAs', 'Fitness', n_sites=4) #stops at 7337?
# GB1_SSMr_output_df = simulate_simple_SSM_recomb_DE(GB1_9783, 'AAs', 'Fitness', n_sites=4)
GB1_ssSSM_fitness_array, GB1_ssSSM_output_df = simulate_single_step_DE(
    GB1_9783, 'AAs', 'Fitness', n_sites=4
)
GB1_dsSSM_fitness_array, GB1_dsSSM_output_df = simulate_double_step_DE(
    GB1_9783, 'AAs', 'Fitness', n_sites=4
)

# Express every final fitness as a fraction of the best final fitness of its method
# GB1_SSMr_output_df['final_fitness'] = GB1_SSMr_output_df['final_fitness']/GB1_SSMr_output_df['final_fitness'].max()
GB1_ssSSM_output_df['final_fitness'] = GB1_ssSSM_output_df['final_fitness'].div(
    GB1_ssSSM_output_df['final_fitness'].max()
)
GB1_dsSSM_output_df['final_fitness'] = GB1_dsSSM_output_df['final_fitness'].div(
    GB1_dsSSM_output_df['final_fitness'].max()
)

0it [00:00, ?it/s]

9783it [00:25, 388.14it/s]
9783it [01:49, 89.16it/s] 


In [None]:
# simulate DE on syn_GB1
# `syn_GB1_9783` is only built by the synthetic-GB1 loading code, which is
# currently commented out. Guard the run so "Restart & Run All" does not stop
# here with a NameError.
if 'syn_GB1_9783' in globals():
    syn_GB1_SSMr_output_df = simulate_simple_SSM_recomb_DE(syn_GB1_9783, 'AAs', 'fitness', n_sites=4)
    syn_GB1_ssSSM_fitness_array, syn_GB1_ssSSM_output_df = simulate_single_step_DE(syn_GB1_9783, 'AAs', 'fitness', n_sites=4)
    syn_GB1_dsSSM_fitness_array, syn_GB1_dsSSM_output_df = simulate_double_step_DE(syn_GB1_9783, 'AAs', 'fitness', n_sites=4)

    # Express every final fitness as a fraction of the best final fitness of its method
    syn_GB1_SSMr_output_df['final_fitness'] = syn_GB1_SSMr_output_df['final_fitness']/syn_GB1_SSMr_output_df['final_fitness'].max()
    syn_GB1_ssSSM_output_df['final_fitness'] = syn_GB1_ssSSM_output_df['final_fitness']/syn_GB1_ssSSM_output_df['final_fitness'].max()
    syn_GB1_dsSSM_output_df['final_fitness'] = syn_GB1_dsSSM_output_df['final_fitness']/syn_GB1_dsSSM_output_df['final_fitness'].max()
else:
    print('Skipping syn_GB1 simulations: syn_GB1_9783 is not defined')

In [9]:
# Simulate directed evolution on the measured + imputed TrpB landscape
# (the top-N prediction and SSM-recombination runs are currently disabled)
#TrpB_SSMN_output_df = sample_SSM_test_top_N(TrpB_data, 'AAs', 'fitness', n_sites=4) # stops at 7337?
# TrpB_SSMr_output_df = simulate_simple_SSM_recomb_DE(TrpB_data, 'AAs', 'fitness', n_sites=4)
TrpB_ssSSM_fitness_array, TrpB_ssSSM_output_df = simulate_single_step_DE(
    TrpB_data, 'AAs', 'fitness', n_sites=4
)
TrpB_dsSSM_fitness_array, TrpB_dsSSM_output_df = simulate_double_step_DE(
    TrpB_data, 'AAs', 'fitness', n_sites=4
)

# Express every final fitness as a fraction of the best final fitness of its method
# TrpB_SSMr_output_df['final_fitness'] = TrpB_SSMr_output_df['final_fitness']/TrpB_SSMr_output_df['final_fitness'].max()
TrpB_ssSSM_output_df['final_fitness'] = TrpB_ssSSM_output_df['final_fitness'].div(
    TrpB_ssSSM_output_df['final_fitness'].max()
)
TrpB_dsSSM_output_df['final_fitness'] = TrpB_dsSSM_output_df['final_fitness'].div(
    TrpB_dsSSM_output_df['final_fitness'].max()
)

9783it [00:18, 532.14it/s]
9783it [01:20, 121.57it/s]


In [33]:
# Simulate directed evolution on the top-9783 synthetic TrpB landscape
# (the top-N prediction and SSM-recombination runs are currently disabled)
#TrpB_SSMN_output_df = sample_SSM_test_top_N(TrpB_data, 'AAs', 'fitness', n_sites=4) # stops at 7337?
# syn_TrpB_SSMr_output_df = simulate_simple_SSM_recomb_DE(syn_TrpB_9783, 'AAs', 'fitness', n_sites=4)
syn_TrpB_ssSSM_fitness_array, syn_TrpB_ssSSM_output_df = simulate_single_step_DE(
    syn_TrpB_9783, 'AAs', 'fitness', n_sites=4
)
syn_TrpB_dsSSM_fitness_array, syn_TrpB_dsSSM_output_df = simulate_double_step_DE(
    syn_TrpB_9783, 'AAs', 'fitness', n_sites=4
)

# Express every final fitness as a fraction of the best final fitness of its method
# syn_TrpB_SSMr_output_df['final_fitness'] = syn_TrpB_SSMr_output_df['final_fitness']/syn_TrpB_SSMr_output_df['final_fitness'].max()
syn_TrpB_ssSSM_output_df['final_fitness'] = syn_TrpB_ssSSM_output_df['final_fitness'].div(
    syn_TrpB_ssSSM_output_df['final_fitness'].max()
)
syn_TrpB_dsSSM_output_df['final_fitness'] = syn_TrpB_dsSSM_output_df['final_fitness'].div(
    syn_TrpB_dsSSM_output_df['final_fitness'].max()
)

0it [00:00, ?it/s]

9783it [00:20, 471.91it/s]


In [None]:
# TrpB single-step SSM: [median, mean] of the normalised final fitness
print([
    TrpB_ssSSM_output_df['final_fitness'].median(),
    TrpB_ssSSM_output_df['final_fitness'].mean(),
])

[0.6725398247189857, 0.630656813365602]


In [None]:
# syn_TrpB single-step SSM: [median, mean] of the normalised final fitness
print([
    syn_TrpB_ssSSM_output_df['final_fitness'].median(),
    syn_TrpB_ssSSM_output_df['final_fitness'].mean(),
])

[0.09262021593420323, 0.26185319659339057]


In [None]:
# GB1 single-step SSM: [median, mean] of the normalised final fitness
print([
    GB1_ssSSM_output_df['final_fitness'].median(),
    GB1_ssSSM_output_df['final_fitness'].mean(),
])

[0.742180293379264, 0.7216938451389487]


In [34]:
# Combine the single-step SSM results of all landscapes into one long table.
# NOTE: a later cell assigns `combined_fitness_df` again (protein x method table).
temp_GB1 = GB1_ssSSM_output_df.reset_index().assign(protein='GB1')

# temp_syn_GB1 = syn_GB1_ssSSM_output_df.reset_index()
# temp_syn_GB1['protein'] = 'syn_GB1'

temp_TrpB = TrpB_ssSSM_output_df.reset_index().assign(protein='TrpB')

temp_syn_TrpB = syn_TrpB_ssSSM_output_df.reset_index().assign(protein='syn_TrpB')

#combined_fitness_df = pd.concat([temp_TrpB, temp_GB1]).set_index(['protein', 'start_seq', 'final_seq', 'final_fitness']).sort_index()
combined_fitness_df = (
    pd.concat([temp_GB1, temp_TrpB, temp_syn_TrpB])
    .set_index(['protein', 'start_seq', 'final_seq', 'final_fitness'])
    .sort_index()
)

In [10]:
# Combine the single-step (ssSSM) and double-step (dsSSM) results of TrpB and GB1
# into one long table labelled by protein and method
# temp_TrpB_1 = TrpB_SSMr_output_df.reset_index()
# temp_TrpB_1['protein'] = 'TrpB'
# temp_TrpB_1['method'] = 'SSMr'

temp_TrpB_2 = TrpB_ssSSM_output_df.reset_index().assign(protein='TrpB', method='ssSSM')

temp_TrpB_3 = TrpB_dsSSM_output_df.reset_index().assign(protein='TrpB', method='dsSSM')

# temp_syn_TrpB_1 = syn_TrpB_SSMr_output_df.reset_index()
# temp_syn_TrpB_1['protein'] = 'syn_TrpB'
# temp_syn_TrpB_1['method'] = 'SSMr'

temp_GB1_2 = GB1_ssSSM_output_df.reset_index().assign(protein='GB1', method='ssSSM')

temp_GB1_3 = GB1_dsSSM_output_df.reset_index().assign(protein='GB1', method='dsSSM')

combined_fitness_df = (
    pd.concat([temp_TrpB_2, temp_TrpB_3, temp_GB1_2, temp_GB1_3])
    .set_index(['protein', 'method', 'start_seq', 'final_seq', 'final_fitness'])
    .sort_index()
)

In [11]:
# Plot Hooks
def one_decimal_x(plot, element):
    """Plot hook: give the first x axis a ``"0.0"`` numeral tick formatter.

    ``element`` is accepted because hooks are called with it, but it is unused.
    """
    x_axis = plot.handles['plot'].xaxis[0]
    x_axis.formatter = NumeralTickFormatter(format="0.0")

def one_decimal_y(plot, element):
    """Plot hook: give the first y axis a ``"0.0"`` numeral tick formatter.

    ``element`` is accepted because hooks are called with it, but it is unused.
    """
    y_axis = plot.handles['plot'].yaxis[0]
    y_axis.formatter = NumeralTickFormatter(format="0.0")

def fixmargins(plot, element):
    """Plot hook: set fixed minimum borders, a solid black outline and an auto-hiding toolbar.

    ``element`` is accepted because hooks are called with it, but it is unused.
    """
    fig = plot.handles['plot']

    figure_settings = (
        # minimum border (in the units the figure uses) on each side
        ('min_border_right', 30),
        ('min_border_left', 65),
        ('min_border_top', 20),
        ('min_border_bottom', 65),
        # 1-wide, fully opaque black frame
        ('outline_line_color', 'black'),
        ('outline_line_alpha', 1),
        ('outline_line_width', 1),
    )
    for attribute, value in figure_settings:
        setattr(fig, attribute, value)

    fig.toolbar.autohide = True

In [13]:
# Violin plot of the normalised final fitness, one violin per (protein, method)
figure_2b = hv.Violin(
    combined_fitness_df.sort_index(ascending=False),
    kdims=['protein', 'method'],
    vdims=['final_fitness'],
)

figure_2b = figure_2b.opts(
    # geometry
    frame_height=300,
    frame_width=300,
    violin_width=0.8,
    fontscale=1.3,
    # split='protein',
    # show_legend=True,
    # inner=None,
    # legend_position='top',
    cut=0,
    ylim=(0, 1),
    ylabel='Max Fitness Achieved',
    # colour each violin by its protein
    violin_color=hv.dim('protein').str(),
    cmap='Category10',
    hooks=[fixmargins],
)


figure_2b

In [24]:
# Write the violin figure to a PNG file in the current working directory.
# NOTE(review): PNG export through the bokeh backend presumably needs selenium
# plus a browser webdriver installed — confirm in the target environment.
hv.save(figure_2b, 'dsSSM_ssSSM_GB1_TrpB.png', fmt='png')



In [17]:
# # plot ssSSM as hist
# from bokeh.plotting import figure, show

# p = figure(width=670, height=400, toolbar_location=None,
#            title="Rank")

# # Histogram
# bins = np.linspace(-3, 3, 40)
# hist, edges = np.histogram(x, density=True, bins=bins)
# p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:],
#          fill_color="skyblue", line_color="white",
#          legend_label="1000 random samples")

# show(p)


# Histogram (numpy default binning) of the landscape rank reached by each
# synthetic-TrpB single-step SSM walk
frequencies, edges = np.histogram(syn_TrpB_ssSSM_output_df['rank'])
print('Values: %s, Edges: %s' % (len(frequencies), len(edges)))
hv.Histogram((edges, frequencies))

Values: 10, Edges: 11
