In [None]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import pickle

In [None]:
from scipy import stats

In [None]:
import autoreload
import evotsc_plot
# Reload the local plotting module -- presumably so that edits made to
# evotsc_plot are picked up without restarting the kernel (TODO confirm which
# `autoreload` module this is).
autoreload.reload(evotsc_plot)

In [None]:
# Shared figure styling, read as globals by the plotting functions below.
label_fontsize=20  # axis labels and figure titles
tick_fontsize=15  # tick labels
legend_fontsize=15  # legend entries
dpi=300  # resolution used when creating and saving figures

In [None]:
# Experiment to analyse; also the directory where most figures are written.
# NOTE(review): absolute, machine-specific path -- edit before running elsewhere.
exp_path = pathlib.Path('/Users/theotime/Desktop/evotsc/euler/substep_2_delta_1.0/')
gen=200_000  # generation whose saved population is loaded by the "best individual" analyses
nb_genes_type = 20  # divisor applied to the stats counts; presumably the number of genes of each type
gene_types = ['AB', 'A', 'B'] # Name of each gene type
gene_type_color = ['tab:blue', 'tab:red', 'tab:green'] #AB, A, B
orient_name = ['leading', 'lagging'] # Name of each gene orientation
rel_orient = ['conv', 'div', 'upstr', 'downstr']  # relative-orientation labels; not referenced in the cells shown here

In [None]:
def get_params(exp_path):
    """Read the run parameters of an experiment from its first replicate.

    The parameters are parsed from the `params.txt` file of the first
    (alphabetically sorted) sub-directory of `exp_path` whose name starts with
    "rep". Each non-blank line is expected to look like `name: value`.

    Args:
        exp_path: `pathlib.Path` of the experiment directory.

    Returns:
        A dict mapping each parameter name to its value. The `commit`
        parameter is kept as a (stripped) string; every other value is
        converted to `float`.

    Raises:
        IndexError: if `exp_path` contains no "rep*" directory.
        ValueError: if a non-blank line has no `:` separator or a
            non-`commit` value is not a valid float.
    """
    rep_dirs = sorted(d for d in exp_path.iterdir()
                      if d.is_dir() and d.name.startswith("rep"))

    params = {}
    with open(rep_dirs[0].joinpath('params.txt'), 'r') as params_file:
        for line in params_file:
            # Blank lines (e.g. a trailing newline) carry no parameter
            if not line.strip():
                continue

            # Split on the first ':' only, so values that themselves contain
            # a ':' are not truncated
            param_name, separator, raw_val = line.partition(':')
            if not separator:
                raise ValueError(f'malformed parameter line: {line!r}')

            param_name = param_name.strip()
            if param_name == 'commit':
                params[param_name] = raw_val.strip()
            else:
                params[param_name] = float(raw_val)

    return params

In [None]:
# Run parameters, parsed from the first replicate's params.txt.
params = get_params(exp_path)

In [None]:
# Display the parsed parameters as the cell output.
params

In [None]:
def get_stats(exp_name):
    """Load the `stats.csv` of every replicate of an experiment into one DataFrame.

    Replicates are the sub-directories of `exp_name` whose name starts with
    "rep", taken in sorted order.

    Args:
        exp_name: path (str or `pathlib.Path`) of the experiment directory.

    Returns:
        The concatenation of all replicate tables, with a leading 'Replicate'
        column holding the 0-based position of the replicate in the sorted
        directory list. The row index of each table is kept as-is (it is not
        reset). An empty DataFrame is returned when there is no replicate.
    """
    exp_name = pathlib.Path(exp_name)

    rep_dirs = sorted(d for d in exp_name.iterdir()
                      if d.is_dir() and d.name.startswith("rep"))

    # Collect the per-replicate tables and concatenate once, instead of
    # re-copying the accumulated frame at every iteration
    rep_frames = []
    for i_rep, rep_dir in enumerate(rep_dirs):
        rep_stats = pd.read_csv(rep_dir.joinpath('stats.csv'))
        rep_stats.insert(0, 'Replicate', i_rep)
        rep_frames.append(rep_stats)

    if not rep_frames:
        # pd.concat refuses an empty list
        return pd.DataFrame()

    return pd.concat(rep_frames)

# Plot the number of active genes of each type over evolutionary time

In [None]:
def plot_gene_activity(base_path, plot_sc=False, plot_genome_size=False):
    """Plot, per replicate and environment, the proportion of activated genes over time.

    For every "rep*" sub-directory of `base_path` whose `stats.csv` is not
    empty, one figure is produced for environment A and one for environment B.
    Each figure shows the proportion of AB, A and B genes that are on
    (`<type>on_<env>` / (`<type>on_<env>` + `<type>off_<env>`)) against the
    generation on a log x-axis, plus the 'Fitness' column on a second, log
    y-axis. Generation 0 is dropped.

    Args:
        base_path: path (str or `pathlib.Path`) of the experiment directory;
            the figures are saved there as `rep_<num>_env_<env>.pdf`.
        plot_sc: unused; kept so existing callers keep working.
        plot_genome_size: when true, add a third y-axis showing the
            'Genome size' column.
    """
    base_path = pathlib.Path(base_path)

    # Replicates are looked up under `base_path` (not the notebook-level
    # `exp_path`), so the function works for any experiment directory
    rep_dirs = sorted(d for d in base_path.iterdir()
                      if d.is_dir() and d.name.startswith("rep"))

    # (gene type prefix of the stats columns, line color)
    gene_series = (("AB", "tab:blue"), ("A", "tab:red"), ("B", "tab:green"))

    for rep_dir in rep_dirs:
        stats_path = rep_dir.joinpath('stats.csv')
        if stats_path.stat().st_size == 0: # empty file: nothing to plot
            continue

        data = pd.read_csv(stats_path)
        data = data[data['Gen'] > 0]

        rep_num = rep_dir.name[3:] # add a `_` between the `rep` and the rep number

        for env in ["A", "B"]:

            ## 1st axis: proportion of activated genes of each type
            fig, ax1 = plt.subplots(figsize=(9, 4), dpi=dpi)
            ax1.set_ylim(-0.05, 1.05)
            ax1.set_ylabel('Proportion of activated genes', fontsize=label_fontsize)
            ax1.set_xlabel('Generation', fontsize=label_fontsize)
            ax1.set_xscale('log')
            ax1.grid()

            for gene_type, color in gene_series:
                nb_on = data[f"{gene_type}on_{env}"]
                nb_off = data[f"{gene_type}off_{env}"]
                ax1.plot(data['Gen'], nb_on / (nb_on + nb_off),
                         color=color,
                         linewidth=2,
                         label=f"{gene_type} genes on")
            ax1.tick_params(axis='both', which='major', labelsize=tick_fontsize)

            ## 2nd axis: fitness
            ax2 = ax1.twinx()
            ax2.set_yscale('log')
            ax2.set_ylim(1e-25, 1e0)
            ax2.set_ylabel('Fitness', fontsize=label_fontsize, color='tab:cyan')
            ax2.plot(data['Gen'],
                     data["Fitness"],
                     color="tab:cyan",
                     linewidth=2)
            ax2.tick_params(axis='both', which='major', labelsize=tick_fontsize)

            ## 3rd axis: genome size
            if plot_genome_size:
                ax3 = ax1.twinx()

                # Shift the extra spine outwards so it does not overlap the fitness axis
                ax3.spines['right'].set_position(('outward', 80))
                ax3.set_ylim(0.7e4, 6.3e4)
                ax3.set_ylabel('Genome Size', fontsize=label_fontsize, color='tab:olive')
                ax3.plot(data['Gen'],
                         data["Genome size"],
                         color="tab:olive",
                         linewidth=2)
                ax3.tick_params(axis='both', which='major', labelsize=tick_fontsize)

            #plt.title(f"Environment {env}")
            fig.legend(bbox_to_anchor=(0,0),
                       bbox_transform=ax1.transAxes,
                       loc="lower left",
                       fontsize=legend_fontsize)

            plt.savefig(f'{base_path}/rep_{rep_num}_env_{env}.pdf', dpi=dpi, bbox_inches='tight')

            plt.close('all')


In [None]:
# Save one gene-activity figure per replicate and environment.
plot_gene_activity(exp_path)

# Plot the mean and standard deviation of activated genes of each type in each environment over all replicates

In [None]:
# Statistics of all replicates, tagged with a 'Replicate' column.
full_stats = get_stats(exp_path)

In [None]:
def plot_mean_std(full_stats):
    """Bar plot of the mean (with std error bars) of activated genes per type and environment.

    The statistics are taken at the last generation reached by every
    replicate (the smallest of the per-replicate maximum 'Gen' values) and
    divided by the notebook-level `nb_genes_type`. The figure is saved as
    `<exp_path>/mean_activation.pdf`.

    Args:
        full_stats: DataFrame with 'Replicate', 'Gen' and the
            `<type>on_<env>` columns for types AB, A, B and environments A, B.
    """
    last_gen = np.min(full_stats.groupby('Replicate').max()['Gen'])
    at_last_gen = full_stats['Gen'] == last_gen
    last_gen_stats = full_stats[at_last_gen] / nb_genes_type

    mean_stats = last_gen_stats.mean()
    std_stats = last_gen_stats.std()

    fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)

    gene_prefixes = ("AB", "A", "B")
    x = np.arange(3)  # one group of bars per gene type
    width = 0.35  # the width of the bars

    def per_type(summary, env):
        # Values of `summary` for the three gene types in environment `env`
        return np.array([summary[f"{prefix}on_{env}"] for prefix in gene_prefixes])

    env_means = {env: per_type(mean_stats, env) for env in ("A", "B")}
    env_stds = {env: per_type(std_stats, env) for env in ("A", "B")}

    # Environment A bars sit left of each tick, environment B bars right of it
    for env, offset in (("A", -width/2), ("B", width/2)):
        ax.bar(x + offset,
               env_means[env],
               width=width,
               label=f'Environment {env}',
               yerr=env_stds[env],
               capsize=5)

    ax.set_ylabel('Activated genes', fontsize=label_fontsize)
    ax.set_xticks(x)
    ax.set_xticklabels(["AB genes on", "A genes on", "B genes on"])

    plt.grid(linestyle=':', axis='y')
    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)
    plt.legend(fontsize=legend_fontsize, loc='upper center')

    plt.savefig(f'{exp_path}/mean_activation.pdf', bbox_inches='tight')


In [None]:
# Draw and save the mean-activation bar plot.
plot_mean_std(full_stats)

## Statistical significance tests for the above figure

In [None]:
def stats_tests(full_stats):
    """Print paired t-tests comparing gene activation between environments A and B.

    For each gene type (AB, A, B), the `<type>on_A` and `<type>on_B` columns
    at the last generation reached by every replicate (divided by the
    notebook-level `nb_genes_type`) are compared with `scipy.stats.ttest_rel`.

    Args:
        full_stats: DataFrame with 'Replicate', 'Gen' and the
            `<type>on_<env>` columns.
    """
    final_gen = np.min(full_stats.groupby('Replicate').max()['Gen'])
    final_stats = full_stats[full_stats["Gen"] == final_gen] / nb_genes_type

    # (printed label, column prefix) -- all tests are run before anything is printed
    labelled_prefixes = (('AB genes', 'AB'), ('A genes on', 'A'), ('B genes on', 'B'))
    outcomes = [
        (label, stats.ttest_rel(final_stats[f"{prefix}on_A"], final_stats[f"{prefix}on_B"]))
        for label, prefix in labelled_prefixes
    ]

    for label, outcome in outcomes:
        print(f'{label}: {outcome}')

In [None]:
# Print the paired t-test results for the three gene types.
stats_tests(full_stats)

# Plot fitness, genome size, and basal supercoiling over evolutionary time

In [None]:
def plot_fitness(full_stats):
    """Plot the fitness of every replicate over generations and save it as a PDF.

    Both axes are logarithmic, so generation 0 is dropped. One line is drawn
    per replicate, colored along the viridis colormap. The figure is saved as
    `<exp_path>/all_fitness.pdf` (`exp_path` being the notebook-level
    experiment path).

    Args:
        full_stats: DataFrame with at least the 'Replicate', 'Gen' and
            'Fitness' columns.
    """
    # Iterate over the replicate ids actually present, rather than assuming
    # they are exactly 0..nb_rep-1
    rep_ids = sorted(full_stats["Replicate"].dropna().unique())
    nb_rep = len(rep_ids)

    # `plt.get_cmap` is used because `mpl.cm.get_cmap` was removed in Matplotlib 3.9
    colors = plt.get_cmap('viridis', nb_rep)(range(nb_rep))

    plt.figure(figsize=(9,4), dpi=dpi)

    plt.xscale('log')
    plt.yscale('log')
    plt.grid(linestyle=':')

    plt.xlabel('Generation', fontsize=label_fontsize)
    plt.ylabel('Fitness', fontsize=label_fontsize)

    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    for i_rep, rep in enumerate(rep_ids):
        stats_rep = full_stats[full_stats["Replicate"] == rep]
        stats_rep = stats_rep[stats_rep["Gen"] > 0]
        plt.plot(stats_rep['Gen'],
                 stats_rep["Fitness"],
                 linewidth=2,
                 color=colors[i_rep])

    plt.savefig(f'{exp_path}/all_fitness.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
# Draw and save the fitness curves of all replicates.
plot_fitness(full_stats)

In [None]:
def plot_sigma(full_stats):
    """Plot the 'basal_sc' column of every replicate over generations and save it as a PDF.

    Does nothing when `full_stats` has no 'basal_sc' column. Generation 0 is
    dropped. One line is drawn per replicate, colored along the viridis
    colormap. The figure is saved as `<exp_path>/all_basal_sc.pdf`
    (`exp_path` being the notebook-level experiment path).

    Args:
        full_stats: DataFrame with at least the 'Replicate' and 'Gen'
            columns, and optionally 'basal_sc'.
    """
    if 'basal_sc' not in full_stats.columns:
        return

    # Iterate over the replicate ids actually present, rather than assuming
    # they are exactly 0..nb_rep-1
    rep_ids = sorted(full_stats["Replicate"].dropna().unique())
    nb_rep = len(rep_ids)

    # `plt.get_cmap` is used because `mpl.cm.get_cmap` was removed in Matplotlib 3.9
    colors = plt.get_cmap('viridis', nb_rep)(range(nb_rep))

    fig, ax1 = plt.subplots(figsize=(9,4), dpi=dpi)

    #plt.xscale('log')
    #plt.yscale('log')
    plt.grid(linestyle=':')

    ax1.set_xlabel('Generation', fontsize=label_fontsize)
    ax1.set_ylabel('Basal SC', fontsize=label_fontsize)

    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    for i_rep, rep in enumerate(rep_ids):
        stats_rep = full_stats[full_stats["Replicate"] == rep]
        stats_rep = stats_rep[stats_rep["Gen"] > 0]
        ax1.plot(stats_rep["Gen"],
                 stats_rep["basal_sc"],
                 linewidth=2,
                 color=colors[i_rep])

    plt.savefig(f'{exp_path}/all_basal_sc.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
#plot_sigma(full_stats)

In [None]:
def plot_genome_size(full_stats):
    """Plot the genome size of every replicate over generations and save it as a PDF.

    Does nothing when `full_stats` has no 'Genome size' column. Generation 0
    is dropped. One line is drawn per replicate, colored along the viridis
    colormap. The figure is saved as `<exp_path>/all_genome_size.pdf`
    (`exp_path` being the notebook-level experiment path).

    Args:
        full_stats: DataFrame with at least the 'Replicate' and 'Gen'
            columns, and optionally 'Genome size'.
    """
    if 'Genome size' not in full_stats.columns:
        return

    # Iterate over the replicate ids actually present, rather than assuming
    # they are exactly 0..nb_rep-1
    rep_ids = sorted(full_stats["Replicate"].dropna().unique())
    nb_rep = len(rep_ids)

    # `plt.get_cmap` is used because `mpl.cm.get_cmap` was removed in Matplotlib 3.9
    colors = plt.get_cmap('viridis', nb_rep)(range(nb_rep))

    fig, ax1 = plt.subplots(figsize=(9,4), dpi=dpi)

    #plt.xscale('log')
    #plt.yscale('log')
    plt.grid(linestyle=':')

    ax1.set_xlabel('Generation', fontsize=label_fontsize)
    ax1.set_ylabel('Genome Size', fontsize=label_fontsize)

    #ax1.set_ylim(0, 189000)

    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    for i_rep, rep in enumerate(rep_ids):
        stats_rep = full_stats[full_stats["Replicate"] == rep]
        stats_rep = stats_rep[stats_rep["Gen"] > 0]
        ax1.plot(stats_rep["Gen"],
                 stats_rep["Genome size"],
                 linewidth=2,
                 color=colors[i_rep])

    plt.savefig(f'{exp_path}/all_genome_size.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
# Draw and save the genome-size curves of all replicates.
plot_genome_size(full_stats)

# Plot gene activities over the lifecycle of the best individual in each replica

In [None]:
def plot_expr_AB(indiv, sigma_A, sigma_B, plot_title, plot_name):
    """Plot the expression of every gene of `indiv` over time in environments A and B.

    The figure has two stacked panels (environment A on top, B below). Each
    gene is one line, colored by its type (AB: blue, A: red, B: green) and
    drawn solid when `gene.orientation == 0`, dashed otherwise. The figure is
    saved as `<plot_name>.pdf`, shown, then closed.

    Args:
        indiv: individual exposing `evaluate(sigma_A, sigma_B)` and `genes`.
        sigma_A: environmental supercoiling passed to `indiv.evaluate` for A.
        sigma_B: environmental supercoiling passed to `indiv.evaluate` for B.
        plot_title: currently unused (the title calls are disabled).
        plot_name: output path, without the `.pdf` extension.
    """
    (expr_env_A, expr_env_B), _ = indiv.evaluate(sigma_A, sigma_B)

    type_colors = ['tab:blue', 'tab:red', 'tab:green'] # AB: blue, A: red, B: green

    plt.figure(figsize=(9, 8), dpi=dpi)

    # (subplot position, expression matrix, whether the panel gets an x label)
    panels = ((1, expr_env_A, False),
              (2, expr_env_B, True))

    for position, expr, labelled_x in panels:
        plt.subplot(2, 1, position)
        plt.ylim(-0.05, 1.05)

        for row, gene in enumerate(indiv.genes):
            if gene.orientation == 0:
                gene_linestyle = 'solid'
            else:
                gene_linestyle = 'dashed'
            plt.plot(expr[row, :],
                     linestyle=gene_linestyle,
                     linewidth=2,
                     color=type_colors[gene.gene_type],
                     label=f'Gene {gene.id}')

        plt.grid(linestyle=':')
        if labelled_x:
            # Only the bottom panel carries the shared time axis label
            plt.xlabel('Time', fontsize=label_fontsize)
        plt.ylabel('Expression level', fontsize=label_fontsize)

        plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    ## Save, display and release the figure
    plt.tight_layout()
    plt.savefig(plot_name + '.pdf', dpi=dpi, bbox_inches='tight')
    plt.show()
    plt.close()


In [None]:
def get_best_indiv(rep_path, gen):
    """Load the population saved at generation `gen` and return its fittest individual.

    The population is unpickled from `pop_gen_<gen, zero-padded to 6 digits>.evotsc`
    in `rep_path` and evaluated. The first individual with the highest
    strictly positive fitness is returned; when no individual has a positive
    fitness, or individuals carry no `fitness` attribute, the first
    individual is returned.

    Args:
        rep_path: `pathlib.Path` of the replicate directory.
        gen: generation number of the saved population.
    """
    save_path = rep_path.joinpath(f'pop_gen_{gen:06}.evotsc')

    # NOTE(review): pickle.load can execute arbitrary code -- only open trusted save files
    with open(save_path, 'rb') as save_file:
        population = pickle.load(save_file)

    population.evaluate()

    candidates = population.individuals
    champion = candidates[0]
    top_fitness = 0

    try:
        for candidate in candidates:
            if candidate.fitness > top_fitness:
                top_fitness, champion = candidate.fitness, candidate
    except AttributeError:
        # In the neutral control, individuals are not evaluated so there is no fitness field
        pass

    return champion

In [None]:
def plot_best(gen):
    """Plot the gene expression of the best individual of every replicate at generation `gen`.

    Replicates are the "rep*" sub-directories of the notebook-level
    `exp_path`, in sorted order; the figure of the i-th replicate is saved as
    `<exp_path>/best_rep<i>.pdf`.

    Args:
        gen: generation number of the saved populations to load.
    """
    replicate_dirs = [entry for entry in exp_path.iterdir()
                      if entry.is_dir() and entry.name.startswith("rep")]
    replicate_dirs.sort()

    for rep_idx, replicate_dir in enumerate(replicate_dirs):
        champion = get_best_indiv(replicate_dir, gen)
        plot_expr_AB(champion,
                     sigma_A=params['sigma_A'],
                     sigma_B=params['sigma_B'],
                     plot_title='',
                     plot_name=f'{exp_path}/best_rep{rep_idx}')
        # Genome plot, currently disabled:
        #evotsc_plot.plot_genome(champion, name=f'{exp_path}/genome_rep{rep_idx}')

In [None]:
# Plot the best individual of every replicate at generation `gen`.
plot_best(gen)

# Plot average gene expression levels

In [None]:
def plot_avg_expr(indiv, sigma_env, plot_title, plot_name):
    """Plot the average expression level of each gene type of `indiv` over time.

    The per-gene expression returned by `indiv.run_system(sigma_env)` is
    summed per gene type and divided by `nb_genes // len(gene_types)` (i.e.
    each type is assumed to hold the same number of genes). A dashed line at
    0.5 marks the half-activation threshold. The figure is saved as
    `<plot_name>.pdf` in the notebook-level `exp_path`, shown, then closed.

    Args:
        indiv: individual exposing `run_system(sigma_env)` and `genes`.
        sigma_env: environmental supercoiling passed to `indiv.run_system`.
        plot_title: title of the figure.
        plot_name: output file name, without the `.pdf` extension.
    """
    temporal_expr = indiv.run_system(sigma_env)

    nb_genes, nb_steps = temporal_expr.shape
    nb_types = len(gene_types)

    plt.figure(figsize=(9, 4), dpi=dpi)

    plt.ylim(-0.05, 1.05)

    # Sum the expression of the genes of each type; every gene belongs to
    # exactly one type, so a single pass over the genes is enough
    type_expr = np.zeros((nb_types, nb_steps))
    for i_gene, gene in enumerate(indiv.genes):
        type_expr[gene.gene_type] += temporal_expr[i_gene, :]

    type_expr /= (nb_genes // nb_types)

    for i_gene_type, gene_type in enumerate(gene_types):
        plt.plot(type_expr[i_gene_type, :],
                 color=gene_type_color[i_gene_type],
                 label=f'{gene_type} genes')

    plt.grid(linestyle=':')
    plt.xlabel('Time', fontsize='large')
    plt.ylabel('Expression level', fontsize='large')

    # Draw the threshold across the current x range, then restore the limits
    # so the line does not widen the axis
    x_min, x_max = plt.xlim()
    plt.hlines(0.5, x_min, x_max, linestyle='--', linewidth=1,
           color='tab:red', label='Half activation threshold')
    plt.xlim(x_min, x_max)

    plt.legend(loc='lower left')
    plt.title(plot_title)

    plt.savefig(exp_path.joinpath(plot_name + '.pdf'), dpi=dpi, bbox_inches='tight')

    plt.show()

    plt.close()


In [None]:
def plot_avg_expr_all(exp_path, gen):
    """Plot the average per-type expression of the best individual of every replicate.

    Only environment A is plotted (the environment B call is disabled). The
    figure of the i-th replicate is named `avg_expr_rep<i, 2 digits>_env_A`.

    Args:
        exp_path: `pathlib.Path` of the experiment directory holding the
            "rep*" sub-directories.
        gen: generation number of the saved populations to load.
    """
    replicate_dirs = [entry for entry in exp_path.iterdir()
                      if entry.is_dir() and entry.name.startswith("rep")]
    replicate_dirs.sort()

    for rep_idx, replicate_dir in enumerate(replicate_dirs):
        champion = get_best_indiv(replicate_dir, gen)

        plot_avg_expr(champion,
                      params['sigma_A'],
                      'Environment A',
                      f'avg_expr_rep{rep_idx:02}_env_A')
        #plot_avg_expr(champion, params['sigma_B'], 'Environment B', f'avg_expr_rep{rep_idx:02}_env_B')

In [None]:
#plot_avg_expr_all(exp_path, gen)

# Plot average supercoiling for each type of gene over replicates (_TODO: replace the means with medians_)

In [None]:
def get_sigma_tsc(indiv, sigma_env: float):
    """Return the gene-interaction supercoiling felt by each gene at each time step.

    The expression levels start at each gene's `basal_expression` and are
    iterated for `indiv.nb_eval_steps` steps. At every step t >= 1 the local
    supercoiling is `indiv.inter_matrix @ expression[t-1]`; it is recorded,
    then added to `indiv.sigma_basal` and `sigma_env` to compute the new
    expression levels through a sigmoid centred on `indiv.sigma_opt` with
    width `indiv.epsilon`.

    Args:
        indiv: individual exposing `nb_genes`, `nb_eval_steps`, `genes`,
            `inter_matrix`, `sigma_basal`, `sigma_opt` and `epsilon`.
        sigma_env: supercoiling contribution of the environment.

    Returns:
        Array of shape `(nb_genes, nb_eval_steps)`; column 0 is left at zero.
    """
    shape = (indiv.nb_genes, indiv.nb_eval_steps)
    expression = np.zeros(shape)
    sigma_tsc = np.zeros(shape)

    # Expression levels at t = 0
    expression[:, 0] = np.array([gene.basal_expression for gene in indiv.genes])

    for step in range(1, indiv.nb_eval_steps):
        previous = expression[:, step - 1]
        local_sc = indiv.inter_matrix @ previous
        sigma_tsc[:, step] = local_sc

        total_sc = indiv.sigma_basal + local_sc + sigma_env
        exponent = (total_sc - indiv.sigma_opt)/indiv.epsilon
        expression[:, step] = 1.0 / (1.0 + np.exp(exponent))

    return sigma_tsc

In [None]:
def get_avg_sigma(exp_path, gen):
    """Compute, per replicate, the average supercoiling of each gene type in each environment.

    For the best individual of every "rep*" sub-directory of `exp_path`, the
    interaction supercoiling returned by `get_sigma_tsc` is averaged over the
    last 5 evaluation steps, then averaged per gene type (dividing by
    `nb_genes // len(gene_types)`, i.e. assuming as many genes of each type),
    and finally shifted by the individual's `sigma_basal` and by the
    environmental supercoiling (`params['sigma_A']` or `params['sigma_B']`).

    Args:
        exp_path: `pathlib.Path` of the experiment directory.
        gen: generation number of the saved populations to load.

    Returns:
        DataFrame indexed by 'Replicate' (0-based position of the replicate
        in the sorted directory list) with one `<type>sc_<env>` column per
        gene type and environment, environment A columns first.
    """
    rep_dirs = sorted(d for d in exp_path.iterdir()
                      if d.is_dir() and d.name.startswith("rep"))

    time_steps = 5  # number of final evaluation steps averaged over
    nb_types = len(gene_types)
    env_sigmas = (('A', params['sigma_A']), ('B', params['sigma_B']))

    col_names = [f'{gene_type}sc_{env}'
                 for env, _ in env_sigmas
                 for gene_type in gene_types]

    # One row per replicate; the frame is built once at the end instead of
    # being re-concatenated at every iteration
    rows = []
    for rep, rep_dir in enumerate(rep_dirs):
        best_indiv = get_best_indiv(rep_dir, gen)

        row = [rep]
        for _, sigma_env in env_sigmas:
            sigma_tsc = get_sigma_tsc(best_indiv, sigma_env=sigma_env)

            # Take temporal averages
            mean_sigma_gene = np.sum(sigma_tsc[:, best_indiv.nb_eval_steps-time_steps:], axis=1) / time_steps

            # Sort by gene type
            mean_sigma = np.zeros(nb_types)
            for i_gene in range(best_indiv.nb_genes):
                mean_sigma[best_indiv.genes[i_gene].gene_type] += mean_sigma_gene[i_gene]

            # Divide by number of genes in each class
            mean_sigma /= (best_indiv.nb_genes // nb_types)

            # Add other supercoiling sources
            mean_sigma += best_indiv.sigma_basal + sigma_env

            row.extend(mean_sigma)

        rows.append(row)

    full_data = pd.DataFrame(rows, columns=['Replicate', *col_names])

    return full_data.set_index('Replicate')

In [None]:
def plot_mean_sigma(exp_path, gen, title=None):
    """Plot the mean, std and median over replicates of the supercoiling of each gene type.

    The per-replicate values come from `get_avg_sigma(exp_path, gen)`. For
    each gene type, environment A is drawn slightly left of the tick and
    environment B slightly right: a dot with error bars for mean and std, a
    cross for the median. A dashed line marks `params['sigma_opt']`. The
    y-axis is inverted (from 0.25 down to -0.85). The figure is saved as
    `<exp_path>/mean_supercoiling.pdf`.

    Args:
        exp_path: `pathlib.Path` of the experiment directory.
        gen: generation number of the saved populations to load.
        title: optional figure title.
    """
    full_data = get_avg_sigma(exp_path, gen)

    ## Summary statistics over replicates
    mean_stats = full_data.mean()
    std_stats = full_data.std()
    med_stats = full_data.median()

    fig, ax = plt.subplots(figsize=(9, 4), dpi=300)

    gene_prefixes = ("AB", "A", "B")
    x = np.arange(3)  # one tick per gene type
    width = 0.1  # horizontal gap between the two environments

    def per_type(summary, env):
        # Values of `summary` for the three gene types in environment `env`
        return np.array([summary[f"{prefix}sc_{env}"] for prefix in gene_prefixes])

    envs = ("A", "B")
    env_means = {env: per_type(mean_stats, env) for env in envs}
    env_stds = {env: per_type(std_stats, env) for env in envs}
    env_meds = {env: per_type(med_stats, env) for env in envs}

    # (environment, x offset, color)
    layout = (("A", -width/2, 'tab:blue'),
              ("B", width/2, 'tab:orange'))

    # Plot mean and std
    for env, offset, color in layout:
        ax.errorbar(x + offset,
                    env_means[env],
                    yerr=env_stds[env],
                    label=f'Environment {env}',
                    marker='o',
                    linestyle='',
                    color=color,
                    capsize=5)

    # Add median values -- note: this is the median of the per-replicate
    # *means*, whereas the *medians* are what is actually needed
    for env, offset, color in layout:
        ax.plot(x + offset,
                env_meds[env],
                marker='x',
                linestyle='',
                color=color)

    # Add half-activation threshold (sigma_opt) without changing the x range
    x_min, x_max = ax.get_xlim()
    ax.hlines(params['sigma_opt'], x_min, x_max, linestyle='--', linewidth=1,
           color='tab:red', label='Half activation threshold')
    ax.set_xlim(x_min, x_max)

    ax.set_ylabel('SC level', fontsize=label_fontsize)
    ax.set_ylim(0.25, -0.85)
    ax.set_xticks(x)
    ax.set_xticklabels(["AB genes", "A genes", "B genes"])

    plt.grid(linestyle=':', axis='y')
    plt.tick_params(axis='both', which='major', labelsize=tick_fontsize)
    plt.legend(fontsize=legend_fontsize, loc='upper center')

    if title:
        plt.title(title, fontsize=label_fontsize)

    plt.savefig(f'{exp_path}/mean_supercoiling.pdf', bbox_inches='tight')
    

In [None]:
# Draw and save the mean supercoiling plot, titled with the experiment directory name.
plot_mean_sigma(exp_path, gen, title=exp_path.name)