In [None]:
import os
import pathlib
import pickle
import sys

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
from scipy import stats

In [None]:
import importlib
import evotsc_lib
import evotsc_plot
importlib.reload(evotsc_lib)
importlib.reload(evotsc_plot)

In [None]:
# Shared figure styling, read as module-level globals by the plotting functions below.
label_fontsize, tick_fontsize, legend_fontsize = 20, 15, 15  # font sizes for axis labels, tick labels, legends
dpi = 300  # passed to both plt.figure(...) and plt.savefig(...)

In [None]:
# Root folder holding both experiments. The historical location stays the
# default; set the EVOTSC_DATA_ROOT environment variable to run the notebook
# against a copy of the data stored elsewhere.
data_root = pathlib.Path(os.environ.get(
    'EVOTSC_DATA_ROOT',
    '/Users/theotime/Desktop/evotsc/phd/continuous-epistasis/'))
sc_path = data_root / 'with-sc'       # runs with supercoiling
control_path = data_root / 'control'  # control runs
gen = 250_000  # generation passed to compute_rel_fitness_at further down
gene_types = ['AB', 'A', 'B'] # Name of each gene type
gene_type_color = ['tab:blue', 'tab:red', 'tab:green'] #AB, A, B
sc_color = 'tab:green'        # colour of supercoiling runs in comparison plots
control_color = 'tab:red'     # colour of control runs in comparison plots

In [None]:
# Wild-type folders of the supercoiling experiment: the sub-directories of
# sc_path whose name begins with sc_path's own folder name, in sorted order.
sc_wt_dirs = sorted(
    entry for entry in sc_path.iterdir()
    if entry.is_dir() and entry.name.startswith(sc_path.name)
)
nb_sc_wt = len(sc_wt_dirs)

In [None]:
# Wild-type folders of the control experiment: the sub-directories of
# control_path whose name begins with control_path's own folder name, sorted.
control_wt_dirs = sorted(
    entry for entry in control_path.iterdir()
    if entry.is_dir() and entry.name.startswith(control_path.name)
)
nb_control_wt = len(control_wt_dirs)

In [None]:
# One sorted list of replicate folders ("rep*") per supercoiling wild type.
sc_rep_dirs = [
    sorted(entry for entry in wt_dir.iterdir()
           if entry.is_dir() and entry.name.startswith("rep"))
    for wt_dir in sc_wt_dirs
]
# Parameters are read from the first replicate of the first wild type only;
# per the original note, the runs differ only by their seed.
sc_params = evotsc_lib.read_params(sc_rep_dirs[0][0])

In [None]:
# One sorted list of replicate folders ("rep*") per control wild type.
control_rep_dirs = [
    sorted(entry for entry in wt_dir.iterdir()
           if entry.is_dir() and entry.name.startswith("rep"))
    for wt_dir in control_wt_dirs
]
# Parameters are read from the first replicate of the first control wild type.
control_params = evotsc_lib.read_params(control_rep_dirs[0][0])

In [None]:
# Number of genes of each type: total gene count from the supercoiling run
# parameters divided by the number of gene types.
# NOTE(review): `/` is true division, so this is a float even when the split
# is exact -- confirm that downstream users do not expect an int.
genes_per_type = sc_params["nb_genes"] / len(gene_types) # Doesn't change

In [None]:
def get_stats(exp_name, params):
    """Load the per-generation statistics of every replicate of an experiment.

    The experiment folder is expected to contain one sub-directory per wild
    type (its name starts with the experiment folder's own name), each holding
    replicate sub-directories whose name starts with 'rep' and which contain a
    'stats.csv' file.

    Parameters
    ----------
    exp_name : pathlib.Path
        Experiment folder (despite the name, a path and not a string).
    params : mapping
        Run parameters. 'Genome size' is only loaded when
        params['intergene_poisson_lam'] is non-zero, and 'Basal SC' only when
        params['basal_sc_mutation_prob'] is non-zero.

    Returns
    -------
    pandas.DataFrame
        All replicates stacked vertically, with 'WT' and 'Replicate' columns
        (positions in the sorted directory listings) inserted in front and a
        'Log Fitness' column (natural log of 'Fitness') appended. The row
        index of each CSV is kept as-is, so index values repeat.

    Raises
    ------
    FileNotFoundError
        If no replicate directory is found under `exp_name`.
    """
    wt_dirs = sorted(d for d in exp_name.iterdir()
                     if d.is_dir() and d.name.startswith(exp_name.name))

    data_cols = ['Gen', 'Fitness']
    if params['intergene_poisson_lam'] != 0.0:
        data_cols.append('Genome size')
    if params['basal_sc_mutation_prob'] != 0.0:
        data_cols.append('Basal SC')

    # Collect the per-replicate frames and concatenate once at the end:
    # concatenating inside the loop re-copies all previous rows each time.
    frames = []
    for i_wt, wt_dir in enumerate(wt_dirs):
        rep_dirs = sorted(d for d in wt_dir.iterdir()
                          if d.is_dir() and d.name.startswith('rep'))

        for i_rep, rep_dir in enumerate(rep_dirs):
            rep_stats = pd.read_csv(rep_dir.joinpath('stats.csv'), usecols=data_cols)
            rep_stats.insert(0, 'WT', i_wt)
            rep_stats.insert(1, 'Replicate', i_rep)
            frames.append(rep_stats)

    if not frames:
        raise FileNotFoundError(f"no 'rep*' directories with stats found under {exp_name}")

    res = pd.concat(frames)
    res['Log Fitness'] = np.log(res['Fitness'])

    return res

In [None]:
def get_orig_indivs(exp_path):
    """Load the pickled 'best_rep' individual of every wild-type folder.

    Parameters
    ----------
    exp_path : pathlib.Path
        Experiment folder; its wild-type sub-directories are those whose name
        starts with the experiment folder's own name.

    Returns
    -------
    list
        One unpickled object per wild-type folder, in sorted folder order.

    Raises
    ------
    IndexError
        If a wild-type folder contains no entry with 'best_rep' in its name.
    """
    wt_dirs = sorted(d for d in exp_path.iterdir()
                     if d.is_dir() and d.name.startswith(exp_path.name))

    indivs = []

    for wt_dir in wt_dirs:
        # iterdir() yields entries in arbitrary order; sort so that the same
        # file is picked on every machine if several candidates exist.
        candidates = sorted(p for p in wt_dir.iterdir() if 'best_rep' in p.name)
        if not candidates:
            raise IndexError(f"no 'best_rep' file found in {wt_dir}")

        # NOTE: pickle.load can execute arbitrary code; only point this at
        # experiment folders you produced yourself.
        with open(candidates[0], 'rb') as indiv_file:
            indivs.append(pickle.load(indiv_file))

    return indivs

In [None]:
# One unpickled 'best_rep' individual per supercoiling wild-type folder.
sc_orig_indivs = get_orig_indivs(sc_path)

In [None]:
# One unpickled 'best_rep' individual per control wild-type folder.
control_orig_indivs = get_orig_indivs(control_path)

# Load the per-generation statistics (fitness, genome size, basal supercoiling) of every run

In [None]:
# Stacked stats.csv contents of every supercoiling replicate, tagged with
# 'WT' and 'Replicate' and extended with a 'Log Fitness' column.
sc_stats = get_stats(sc_path, sc_params)

In [None]:
# Same table as sc_stats, built from the control experiment folder.
control_stats = get_stats(control_path, control_params)

# Plot fitness, genome size, and basal supercoiling over evolutionary time

In [None]:
def plot_fitness_per_wt(exp_path, full_stats):
    """Plot fitness over generations, one colour per wild type, and save it.

    For each wild type the thick line is the geometric mean of fitness over
    its replicates (exp of the mean 'Log Fitness'); the two faint lines are
    the per-generation minimum and maximum 'Fitness'. Rows with Gen <= 0 are
    left out (the x axis is logarithmic).

    Parameters
    ----------
    exp_path : path-like
        Directory in which 'fitness_per_wt.pdf' is written.
    full_stats : pandas.DataFrame
        Table with 'WT', 'Gen', 'Fitness' and 'Log Fitness' columns, as
        returned by get_stats. Wild types are looked up as 0..nb_wt-1.
    """
    nb_wt = full_stats['WT'].nunique()

    # plt.get_cmap replaces mpl.cm.get_cmap, which Matplotlib 3.9 removed.
    colors = plt.get_cmap('viridis', nb_wt)(range(nb_wt))

    plt.figure(figsize=(9, 4), dpi=dpi)

    for i_wt in range(nb_wt):
        wt_stats = full_stats[(full_stats["Gen"] > 0) & (full_stats['WT'] == i_wt)]
        by_gen = wt_stats.groupby('Gen')

        mean_data = by_gen.mean().reset_index()
        plt.plot(mean_data['Gen'],
                 np.exp(mean_data['Log Fitness']),
                 color=colors[i_wt],
                 label=f'WT {i_wt}',
                 linewidth=2)

        # Min and max (5 is not enough for quantiles)
        min_data = by_gen.min()
        plt.plot(mean_data['Gen'],
                 min_data['Fitness'],
                 color=colors[i_wt],
                 alpha=0.3)

        max_data = by_gen.max()
        plt.plot(mean_data['Gen'],
                 max_data['Fitness'],
                 color=colors[i_wt],
                 alpha=0.3)

    plt.xscale('log')
    plt.yscale('log')
    plt.grid(linestyle=':')
    plt.grid(visible=True, which="minor", axis='x', linestyle=':')

    plt.xlabel('Generation', fontsize=label_fontsize)
    plt.ylabel('Fitness', fontsize=label_fontsize)

    plt.ylim(1e-10, 1e0)

    plt.legend()

    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    plt.savefig(f'{exp_path}/fitness_per_wt.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
# Per-wild-type fitness curves of the supercoiling runs; note that this
# function takes the output folder as its first argument.
plot_fitness_per_wt(sc_path, sc_stats)

In [None]:
# Per-wild-type fitness curves of the control runs, saved under control_path.
plot_fitness_per_wt(control_path, control_stats)

In [None]:
def plot_fitness_grouped(sc_stats, control_stats, exp_path):
    """Plot fitness over generations for control vs supercoiling runs, pooled.

    All wild types and replicates of an experiment are pooled. The thick line
    is the geometric mean of fitness (exp of the mean 'Log Fitness') and the
    faint lines are the first and last deciles of 'Fitness' per generation.
    Rows with Gen <= 0 are left out (the x axis is logarithmic).

    Parameters
    ----------
    sc_stats, control_stats : pandas.DataFrame
        Tables with 'Gen', 'Fitness' and 'Log Fitness' columns, as returned
        by get_stats.
    exp_path : path-like
        Directory in which 'fitness_grouped.pdf' is written.
    """
    stats = [control_stats[control_stats["Gen"] > 0].copy(),
             sc_stats[sc_stats["Gen"] > 0].copy(),]

    name = ['Control runs', 'Supercoiling runs']

    # plt.get_cmap replaces mpl.cm.get_cmap, which Matplotlib 3.9 removed.
    colors = plt.get_cmap('viridis', 2)(range(2))

    plt.figure(figsize=(9, 4), dpi=dpi)

    for i_exp in range(2):
        by_gen = stats[i_exp].groupby('Gen')

        mean_data = by_gen.mean().reset_index()
        plt.plot(mean_data['Gen'],
                 np.exp(mean_data['Log Fitness']),
                 color=colors[i_exp],
                 linewidth=2,
                 label=name[i_exp])

        # 5*5 = 25 replicates, enough to use quantiles instead of min/max
        first_dec = by_gen.quantile(0.1)
        plt.plot(mean_data['Gen'],
                 first_dec['Fitness'],
                 color=colors[i_exp],
                 alpha=0.3)

        last_dec = by_gen.quantile(0.9)
        plt.plot(mean_data['Gen'],
                 last_dec['Fitness'],
                 color=colors[i_exp],
                 alpha=0.3)

    plt.xscale('log')
    plt.yscale('log')
    plt.grid(linestyle=':')
    plt.grid(visible=True, which="minor", axis='x', linestyle=':')

    plt.xlabel('Generation', fontsize=label_fontsize)
    plt.ylabel('Fitness', fontsize=label_fontsize)
    plt.ylim(1e-10, 1e0)

    plt.legend(fontsize=legend_fontsize)

    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    plt.savefig(f'{exp_path}/fitness_grouped.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
# Pooled control-vs-supercoiling comparison; the figure is saved under sc_path.
plot_fitness_grouped(sc_stats, control_stats, sc_path)

In [None]:
def plot_relative_fitness_per_wt(full_stats, orig_indivs, exp_path):
    """Plot fitness relative to each wild type's ancestor, and save the figure.

    For each wild type, the curve is the geometric mean of fitness over its
    replicates (exp of the mean 'Log Fitness') divided by the `fitness`
    attribute of the matching entry of `orig_indivs`. Rows with Gen <= 0 are
    left out (the x axis is logarithmic).

    Parameters
    ----------
    full_stats : pandas.DataFrame
        Table with 'WT', 'Gen' and 'Log Fitness' columns, as returned by
        get_stats. Wild types are looked up as 0..nb_wt-1.
    orig_indivs : sequence
        One object with a `fitness` attribute per wild type, indexed by WT.
    exp_path : path-like
        Directory in which 'all_relative_fitness_per_wt.pdf' is written.
    """
    nb_wt = full_stats['WT'].nunique()

    # plt.get_cmap replaces mpl.cm.get_cmap, which Matplotlib 3.9 removed.
    colors = plt.get_cmap('viridis', nb_wt)(range(nb_wt))

    plt.figure(figsize=(9, 4), dpi=dpi)

    for i_wt in range(nb_wt):
        wt_stats = full_stats[(full_stats["Gen"] > 0) & (full_stats['WT'] == i_wt)]

        orig_fitness = orig_indivs[i_wt].fitness

        mean_data = wt_stats.groupby('Gen').mean().reset_index()
        plt.plot(mean_data['Gen'],
                 np.exp(mean_data['Log Fitness']) / orig_fitness,
                 color=colors[i_wt],
                 label=f'WT {i_wt}',
                 linewidth=2)

    plt.xscale('log')
    plt.yscale('log')
    plt.grid(linestyle=':')
    plt.grid(visible=True, which="minor", axis='x', linestyle=':')

    plt.xlabel('Generation', fontsize=label_fontsize)
    plt.ylabel('Relative fitness', fontsize=label_fontsize)
    plt.ylim(1e-7, 0.5e1)

    plt.legend()

    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    plt.savefig(f'{exp_path}/all_relative_fitness_per_wt.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
# Relative-fitness curves of the supercoiling runs, saved under sc_path.
plot_relative_fitness_per_wt(sc_stats, sc_orig_indivs, sc_path)

In [None]:
# Relative-fitness curves of the control runs, saved under control_path.
plot_relative_fitness_per_wt(control_stats, control_orig_indivs, control_path)

In [None]:
def plot_basal_sc_per_wt(full_stats, exp_path):
    """Plot the 'Basal SC' column over generations, one colour per wild type.

    For each wild type the thick line is the mean over replicates and the two
    faint lines are the per-generation minimum and maximum. Rows with
    Gen <= 0 are left out (the x axis is logarithmic).

    Parameters
    ----------
    full_stats : pandas.DataFrame
        Table with 'WT', 'Gen', 'Replicate' and 'Basal SC' columns, as
        returned by get_stats when 'Basal SC' was loaded. Wild types are
        looked up as 0..nb_wt-1.
    exp_path : path-like
        Directory in which 'sc_per_wt.pdf' is written.
    """
    nb_wt = full_stats['WT'].nunique()

    # plt.get_cmap replaces mpl.cm.get_cmap, which Matplotlib 3.9 removed.
    colors = plt.get_cmap('viridis', nb_wt)(range(nb_wt))

    plt.figure(figsize=(8, 5), dpi=dpi)

    for i_wt in range(nb_wt):
        wt_stats = full_stats[(full_stats["Gen"] > 0) & (full_stats['WT'] == i_wt)][['Gen', 'Replicate', 'Basal SC']]
        by_gen = wt_stats.groupby('Gen')

        mean_data = by_gen.mean().reset_index()
        plt.plot(mean_data['Gen'],
                 mean_data['Basal SC'],
                 color=colors[i_wt],
                 label=f'WT {i_wt}',
                 linewidth=2)

        # Min and max (5 is not enough for quantiles)
        min_data = by_gen.min()
        plt.plot(mean_data['Gen'],
                 min_data['Basal SC'],
                 color=colors[i_wt],
                 alpha=0.3)

        max_data = by_gen.max()
        plt.plot(mean_data['Gen'],
                 max_data['Basal SC'],
                 color=colors[i_wt],
                 alpha=0.3)

    plt.xscale('log')
    plt.grid(linestyle=':')
    plt.grid(visible=True, which="minor", axis='x', linestyle=':')

    plt.xlabel('Generation', fontsize=label_fontsize)
    plt.ylabel('Basal SC', fontsize=label_fontsize)
    plt.ylim(-0.0675, -0.053)

    plt.legend(loc='lower left')

    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    plt.savefig(f'{exp_path}/sc_per_wt.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
# Basal supercoiling per wild type for the supercoiling runs only.
plot_basal_sc_per_wt(sc_stats, sc_path)

In [None]:
def plot_basal_sc_all(full_stats, exp_path):
    """Plot the 'Basal SC' column over generations, pooling all runs.

    The thick line is the per-generation mean over every row of `full_stats`;
    the two faint lines are the first and last deciles.

    Parameters
    ----------
    full_stats : pandas.DataFrame
        Table with 'Gen' and 'Basal SC' columns, as returned by get_stats
        when 'Basal SC' was loaded.
    exp_path : path-like
        Directory in which 'sc_all.pdf' is written.
    """
    # plt.get_cmap replaces mpl.cm.get_cmap, which Matplotlib 3.9 removed.
    color = plt.get_cmap('viridis', 1)(range(1))[0]

    plt.figure(figsize=(8, 5), dpi=dpi)

    by_gen = full_stats.groupby('Gen')
    mean_stats = by_gen.mean().reset_index()

    plt.plot(mean_stats['Gen'],
             mean_stats['Basal SC'],
             color=color,
             linewidth=2)

    # 5*5 = 25 replicates, enough to use quantiles instead of min/max
    first_dec = by_gen.quantile(0.1)
    plt.plot(mean_stats['Gen'],
             first_dec['Basal SC'],
             color=color,
             alpha=0.3)

    last_dec = by_gen.quantile(0.9)
    plt.plot(mean_stats['Gen'],
             last_dec['Basal SC'],
             color=color,
             alpha=0.3)

    plt.xscale('log')
    plt.grid(linestyle=':')
    plt.grid(visible=True, which="minor", axis='x', linestyle=':')

    plt.xlabel('Generation', fontsize=label_fontsize)
    plt.ylabel('Basal SC', fontsize=label_fontsize)
    plt.ylim(-0.0675, -0.053)

    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    plt.savefig(f'{exp_path}/sc_all.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
# Basal supercoiling pooled over every supercoiling run.
plot_basal_sc_all(sc_stats, sc_path)

## Interesting stats: fitness relative to the ancestor at the last generation

In [None]:
def compute_rel_fitness_at(full_stats, orig_indivs, gen):
    """Mean fitness of each wild type at one generation, relative to its ancestor.

    Parameters
    ----------
    full_stats : pandas.DataFrame
        Table with 'WT', 'Replicate', 'Gen' and 'Fitness' columns, as
        returned by get_stats.
    orig_indivs : sequence
        One object with a `fitness` attribute per wild type, indexed by the
        values found in the 'WT' column.
    gen : int
        Generation at which the statistics are taken (rows with Gen == gen).

    Returns
    -------
    pandas.DataFrame
        Indexed by 'WT', with the per-wild-type means of 'Replicate', 'Gen'
        and 'Fitness' (arithmetic mean over replicates), the ancestor's
        fitness in 'Orig Fitness', and their ratio in 'Rel Fitness'.
    """
    at_gen = full_stats[full_stats['Gen'] == gen]
    full_df = at_gen.groupby('WT').mean()[['Replicate', 'Gen', 'Fitness']].copy()

    # Look ancestors up by wild-type id. The wild-type count used to be taken
    # from the 'Replicate' column, which only worked when there happened to
    # be exactly as many replicates as wild types.
    full_df['Orig Fitness'] = [orig_indivs[i_wt].fitness for i_wt in full_df.index]

    full_df['Rel Fitness'] = full_df['Fitness'] / full_df['Orig Fitness']

    return full_df

In [None]:
# Per-wild-type relative fitness of the supercoiling runs at generation `gen`.
sc_rel_data = compute_rel_fitness_at(sc_stats, sc_orig_indivs, gen)

In [None]:
# Column-wise mean over the supercoiling wild types.
sc_rel_data.mean()

In [None]:
# Per-wild-type relative fitness of the control runs at generation `gen`.
control_rel_data = compute_rel_fitness_at(control_stats, control_orig_indivs, gen)

In [None]:
# Column-wise mean over the control wild types.
control_rel_data.mean()

In [None]:
# Independent two-sample t-test on the per-wild-type relative fitness of the
# two experiments, with scipy's default settings.
# NOTE(review): the defaults assume equal variances in both groups -- confirm
# that this is intended (equal_var=False gives Welch's test).
stats.ttest_ind(sc_rel_data['Rel Fitness'], control_rel_data['Rel Fitness'])

In [None]:
def _plot_mean_rel_fitness(ax, full_stats, orig_indivs, label, color):
    """Draw one mean relative-fitness curve per wild type on `ax`.

    Returns the list of each wild type's mean relative fitness at its last
    recorded generation, in the order the wild types appear in `full_stats`.
    """
    last_fitness = []
    for i_wt in full_stats['WT'].unique():
        wt_rep = full_stats[(full_stats['WT'] == i_wt) & (full_stats['Gen'] > 0)].copy()
        wt_rep['Rel Fitness'] = wt_rep['Fitness'] / orig_indivs[i_wt].fitness
        # groupby sorts by 'Gen', so the last row is the last generation.
        mean_data = wt_rep.groupby(['Gen']).mean()
        last_fitness.append(mean_data.iloc[-1]['Rel Fitness'])
        # Only wild type 0 carries the legend label, so that each experiment
        # gets a single legend entry.
        ax.plot(mean_data['Rel Fitness'],
                label=label if i_wt == 0 else None,
                lw=2, color=color)
    return last_fitness


def plot_rel_fitness_agrege(sc_stats, sc_orig_indivs, control_stats, control_orig_indivs, exp_path):
    """Compare relative fitness of supercoiling and control runs, and save it.

    Left panel: for every wild type of both experiments, the arithmetic mean
    over replicates of 'Fitness' divided by the ancestor's `fitness`, against
    generation (log-log axes; rows with Gen <= 0 are left out).
    Right panel: the last point of each of those curves, one marker per wild
    type, supercoiling at x=0 and control at x=1.

    Parameters
    ----------
    sc_stats, control_stats : pandas.DataFrame
        Tables with 'WT', 'Gen' and 'Fitness' columns, as returned by
        get_stats.
    sc_orig_indivs, control_orig_indivs : sequence
        One object with a `fitness` attribute per wild type, indexed by the
        values found in the 'WT' column.
    exp_path : pathlib.Path
        Directory in which 'rel_fitness_sc_control.pdf' is written.
    """
    fig, (ax1, ax2) = plt.subplots(1, 2, gridspec_kw={'width_ratios': [5, 1]},
                                   figsize=(8,5), dpi=dpi)

    # Left figure: over time
    ax1.set_yscale('log')
    ax1.set_xscale('log')
    ax1.grid(linestyle=':', which='both')
    ax1.set_xlabel('Generation', fontsize='large')
    ax1.set_ylabel('Relative fitness', fontsize='large')

    sc_last_fitness = _plot_mean_rel_fitness(
        ax1, sc_stats, sc_orig_indivs, "Supercoiling runs", sc_color)
    control_last_fitness = _plot_mean_rel_fitness(
        ax1, control_stats, control_orig_indivs, "Control runs", control_color)

    ax1.legend()

    # Right figure: last time point, one marker per wild type. The x positions
    # are sized from the data instead of assuming exactly 5 wild types.
    ax2.plot(np.zeros(len(sc_last_fitness)), sc_last_fitness,
             linestyle=' ', marker='o', color=sc_color)
    ax2.plot(np.ones(len(control_last_fitness)), control_last_fitness,
             linestyle=' ', marker='o', color=control_color)

    ax2.yaxis.tick_right()
    ax2.yaxis.set_label_position("right")
    ax2.set_ylabel('Final relative fitness', fontsize='large')
    ax2.set_xlim(-0.5, 1.5)
    ax2.set_xticks(ticks=[0, 1])
    ax2.set_xticklabels(labels=['SC', 'Control'])
    ax2.grid(linestyle=':', axis='y', which='both')

    # Save and show
    plt.tight_layout()

    plt.savefig(exp_path.joinpath('rel_fitness_sc_control.pdf'), dpi=dpi, bbox_inches='tight')

    plt.show()

In [None]:
# Two-panel supercoiling-vs-control comparison; the figure is saved under sc_path.
plot_rel_fitness_agrege(sc_stats, sc_orig_indivs, control_stats, control_orig_indivs, sc_path)