In [None]:
import sys
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import pickle

In [None]:
from scipy import stats

In [None]:
import importlib
import evotsc_lib
import evotsc_plot
# Re-import the project modules so that edits made to them on disk are picked
# up without restarting the kernel.
importlib.reload(evotsc_lib)
importlib.reload(evotsc_plot)

In [None]:
# Shared figure styling used by the plotting functions of this notebook.
label_fontsize=20   # axis labels and titles
tick_fontsize=15    # tick labels
legend_fontsize=15  # legend entries
dpi=300             # resolution passed to figure creation and `savefig`

In [None]:
# Experiment to analyse: the directory is scanned for `rep*` sub-directories,
# and the figures produced below are saved into it.
# NOTE(review): this is a machine-specific absolute path; it must be edited
# before the notebook can run anywhere else.
exp_path = pathlib.Path('/Users/theotime/Desktop/evotsc/pci/main/')
# Generation at which the best individual of each replicate is loaded.
gen = 1_000_000
# The gene type names are also used to build the `<type>on_<env>` column names
# read from `stats.csv`.
gene_types = ['AB', 'A', 'B'] # Name of each gene type
gene_type_color = ['tab:blue', 'tab:red', 'tab:green'] #AB, A, B
# NOTE(review): `orient_name` and `rel_orient` are not used by any cell visible
# here; presumably they are used further down or are leftovers.
orient_name = ['leading', 'lagging'] # Name of each gene orientation
rel_orient = ['conv', 'div', 'upstr', 'downstr']

In [None]:
# Replicate directories of the experiment (sub-directories named `rep*`),
# sorted by path.
rep_dirs = sorted([d for d in exp_path.iterdir() if (d.is_dir() and d.name.startswith("rep"))])
# Parameters are read from the first replicate only.
# NOTE(review): presumably every replicate shares the same parameters -- confirm.
params = evotsc_lib.read_params(rep_dirs[0])

In [None]:
# Display the parameters that were read for this experiment.
params

In [None]:
# Total gene count divided by the number of gene types; used below as the
# upper y-limit of the bar plot and to normalise counts in the t-tests.
# NOTE(review): assumes every type has the same number of genes -- confirm.
genes_per_type = params["nb_genes"] / len(gene_types)

In [None]:
def get_stats(exp_name):
    """Load the per-generation statistics of every replicate of an experiment.

    Parameters
    ----------
    exp_name : str or pathlib.Path
        Experiment directory. Every sub-directory whose name starts with
        ``rep`` must contain a ``stats.csv`` file.

    Returns
    -------
    pandas.DataFrame
        The ``Gen``, ``Fitness`` and ``<type>on_<env>`` columns of every
        ``stats.csv``, stacked one below the other (the row labels of each
        file are kept as they are), preceded by a ``Replicate`` column holding
        the position of the replicate in the sorted directory list. An empty
        frame is returned when there is no replicate directory.
    """
    exp_name = pathlib.Path(exp_name)

    rep_dirs = sorted(d for d in exp_name.iterdir()
                      if d.is_dir() and d.name.startswith("rep"))

    stat_cols = ['Gen', 'Fitness',
                 'ABon_A', 'ABon_B', 'Aon_A', 'Aon_B', 'Bon_A', 'Bon_B']

    # Collect the per-replicate frames and concatenate them once: growing a
    # frame with `pd.concat` inside the loop copies all previous rows at every
    # iteration.
    rep_frames = []
    for i_rep, rep_dir in enumerate(rep_dirs):
        rep_stats = pd.read_csv(rep_dir.joinpath('stats.csv'), usecols=stat_cols)
        rep_stats.insert(0, 'Replicate', i_rep)
        rep_frames.append(rep_stats)

    if not rep_frames:
        # `pd.concat` refuses an empty list; keep returning an empty frame.
        return pd.DataFrame()

    return pd.concat(rep_frames)

# Plot the number of active genes of each type over evolutionary time

In [None]:
# Statistics of every replicate of the experiment, stacked in a single frame.
full_stats = get_stats(exp_path)

In [None]:
def plot_gene_activity_all(exp_path, full_stats, var_type='quantile'):
    """Plot, for each environment, the mean number of activated genes per type.

    One figure is drawn per environment (A then B). For each gene type the
    mean over replicates is drawn as a thick line, surrounded by two thin
    lines showing the spread selected by `var_type`. Each figure is saved as
    ``gene_activity_env_<env>.pdf`` in `exp_path` and then shown.

    Parameters
    ----------
    exp_path : str or pathlib.Path
        Directory in which the figures are saved.
    full_stats : pandas.DataFrame
        Stacked per-replicate statistics with a ``Gen`` column and the
        ``<type>on_<env>`` columns.
    var_type : str, optional
        ``'sigma'`` for mean +/- 2 standard deviations, ``'quantile'`` for the
        first and last deciles (default), ``'minmax'`` for the extreme values.
        Any other value draws the mean lines only.
    """
    per_gen = full_stats.groupby('Gen')

    # Every aggregate below stays indexed by generation, so that arithmetic
    # between them (mean +/- 2 sigma) is aligned generation by generation.
    mean_data = per_gen.mean()
    generations = mean_data.index

    if var_type == 'sigma':
        # 2-sigma (95%) confidence interval
        spread = 2 * per_gen.std()
        lower, upper = mean_data - spread, mean_data + spread
    elif var_type == 'quantile':
        # First and last deciles
        lower, upper = per_gen.quantile(0.1), per_gen.quantile(0.9)
    elif var_type == 'minmax':
        # Min and max values
        lower, upper = per_gen.min(), per_gen.max()
    else:
        lower = upper = None

    max_genes = params["nb_genes"] / len(gene_types)
    delta_y = max_genes * 0.05  # small margin below 0 and above the maximum

    for env in ["A", "B"]:

        fig, ax1 = plt.subplots(figsize=(9, 4), dpi=dpi)
        ax1.set_ylim(-delta_y, max_genes + delta_y)
        ax1.set_ylabel('Activated genes', fontsize=label_fontsize)
        ax1.set_xlabel('Generation', fontsize=label_fontsize)
        ax1.set_xscale('log')
        ax1.grid(linestyle=':')
        ax1.grid(visible=True, which="minor", axis='x', linestyle=':')

        for color, gene_type in zip(gene_type_color, gene_types):
            column = f"{gene_type}on_{env}"

            ax1.plot(generations, mean_data[column],
                     color=color,
                     linewidth=2,
                     label=gene_type)

            if lower is not None:
                for bound in (lower, upper):
                    ax1.plot(generations, bound[column],
                             color=color,
                             alpha=0.3)

        ax1.tick_params(axis='both', which='major', labelsize=tick_fontsize)

        #plt.title(f"Environment {env}")
        fig.legend(bbox_to_anchor=(0, 1),
                   bbox_transform=ax1.transAxes,
                   loc="upper left",
                   fontsize=legend_fontsize)

        fig.savefig(f'{exp_path}/gene_activity_env_{env}.pdf', dpi=dpi, bbox_inches='tight')

        plt.show()

        # Release the figure once it has been saved and displayed.
        plt.close(fig)

In [None]:
# Uses the default `var_type='quantile'`: first and last deciles around the mean.
plot_gene_activity_all(exp_path, full_stats)

In [None]:
def plot_gene_activity(base_path):
    """Plot gene activity and fitness over time, one figure per replicate and environment.

    For every ``rep*`` sub-directory of `base_path` whose ``stats.csv`` is not
    empty, and for each environment (A then B), the number of activated genes
    of each type is drawn against the generation (log scale), with the fitness
    on a second, logarithmic y-axis. Each figure is saved as
    ``rep_<number>_env_<env>.pdf`` in `base_path` and closed.

    Parameters
    ----------
    base_path : str or pathlib.Path
        Experiment directory containing the replicate sub-directories.
    """
    base_path = pathlib.Path(base_path)

    # Scan the directory that was passed in, not the notebook-level `exp_path`.
    rep_dirs = sorted(d for d in base_path.iterdir()
                      if d.is_dir() and d.name.startswith("rep"))

    max_genes = params["nb_genes"] / len(gene_types)
    delta_y = max_genes * 0.05  # small margin below 0 and above the maximum

    for rep_dir in rep_dirs:
        stats_path = rep_dir.joinpath('stats.csv')
        if stats_path.stat().st_size == 0:  # empty file: nothing to plot
            continue

        data = pd.read_csv(stats_path)
        data = data[data['Gen'] > 0]  # generation 0 cannot be placed on a log axis

        # Strip the `rep` prefix so that a `_` can be inserted before the number
        rep_num = rep_dir.name[3:]

        for env in ["A", "B"]:

            fig, ax1 = plt.subplots(figsize=(9, 4), dpi=dpi)
            ax1.set_ylim(-delta_y, max_genes + delta_y)
            ax1.set_ylabel('Activated genes', fontsize=label_fontsize)
            ax1.set_xlabel('Generation', fontsize=label_fontsize)
            ax1.set_xscale('log')
            ax1.grid(linestyle=':')
            ax1.grid(visible=True, which="minor", axis='x', linestyle=':')

            for color, gene_type in zip(gene_type_color, gene_types):
                ax1.plot(data['Gen'], data[f"{gene_type}on_{env}"],
                         color=color,
                         linewidth=2,
                         label=gene_type)

            ax1.tick_params(axis='both', which='major', labelsize=tick_fontsize)

            ## 2nd axis: fitness
            # (the population-average fitness, column 'Avg Fit', is
            # deliberately not drawn)
            ax2 = ax1.twinx()
            ax2.set_yscale('log')
            ax2.set_ylabel('Fitness', fontsize=label_fontsize, color='tab:cyan')
            ax2.plot(data['Gen'],
                     data['Fitness'],
                     color="tab:cyan",
                     linewidth=2)
            ax2.tick_params(axis='both', which='major', labelsize=tick_fontsize, colors='tab:cyan')

            #plt.title(f"Environment {env}")
            fig.legend(bbox_to_anchor=(0, 1),
                       bbox_transform=ax1.transAxes,
                       loc="upper left",
                       fontsize=legend_fontsize)

            fig.savefig(f'{base_path}/rep_{rep_num}_env_{env}.pdf', dpi=dpi, bbox_inches='tight')

            # Close only the figure created here, leaving other open figures alone.
            plt.close(fig)


In [None]:
#plot_gene_activity(exp_path)

# Plot the mean and stddev of activated genes of each type in each environment over all replicas

In [None]:
def plot_mean_std(full_stats, gen):
    """Bar plot of the mean number of activated genes per type and environment.

    The bars show, for generation `gen`, the mean over replicates of the
    number of activated AB, A and B genes in environments A and B. The spread
    over replicates is overlaid as box plots when there are more than 10
    replicates, and as one marker per replicate otherwise. The figure is saved
    as ``mean_activation.pdf`` in the notebook-level `exp_path`.

    Parameters
    ----------
    full_stats : pandas.DataFrame
        Stacked per-replicate statistics with ``Replicate``, ``Gen`` and the
        ``<type>on_<env>`` columns.
    gen : int
        Generation at which the statistics are taken.

    Raises
    ------
    ValueError
        If `full_stats` has no row for generation `gen`.
    """
    last_gen_stats = full_stats[full_stats["Gen"] == gen]
    if last_gen_stats.empty:
        # Without this check an empty figure would silently be saved.
        raise ValueError(f'No statistics recorded at generation {gen}')

    mean_stats = last_gen_stats.mean()
    nb_reps = last_gen_stats["Replicate"].nunique()

    cols_A = ["ABon_A", "Aon_A", "Bon_A"]
    cols_B = ["ABon_B", "Aon_B", "Bon_B"]

    fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)

    x = np.arange(3)  # 3 types of genes
    width = 0.35  # the width of the bars

    ax.bar(x - width/2,
           np.array([mean_stats[col] for col in cols_A]),
           width=width,
           label='Environment A',
           color="#008fd5")

    ax.bar(x + width/2,
           np.array([mean_stats[col] for col in cols_B]),
           width=width,
           label='Environment B',
           color="#fc4f30")

    values_A = [last_gen_stats[col] for col in cols_A]
    values_B = [last_gen_stats[col] for col in cols_B]

    if nb_reps > 10: # If we have many replicates, draw a boxplot over the means
        ax.boxplot(values_A, positions=x - width/2,
                   manage_ticks=False, medianprops={'color':'black'})
        ax.boxplot(values_B, positions=x + width/2,
                   manage_ticks=False, medianprops={'color':'black'})

    else: # Else, just draw every replicate.
        # Spread the replicates evenly across the width of a bar, leaving out
        # both edges; the result has one row per gene type.
        offsets = np.linspace(-width/2, width/2, nb_reps+2)[1:-1]
        x_plot = x[:, np.newaxis] + offsets[np.newaxis, :]

        ax.plot(x_plot - width/2, values_A,
                marker='o', linestyle='', markeredgecolor='black', markerfacecolor='none')
        ax.plot(x_plot + width/2, values_B,
                marker='o', linestyle='', markeredgecolor='black', markerfacecolor='none')

    ax.set_ylabel('Activated genes', fontsize=label_fontsize)
    delta_y = genes_per_type * 0.05
    ax.set_ylim(0, genes_per_type + delta_y)

    ax.set_xticks(x)
    ax.set_xticklabels(["AB genes on", "A genes on", "B genes on"])

    ax.grid(linestyle=':', axis='y')

    ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    ax.legend(fontsize=12, loc='upper left', bbox_to_anchor=(0.46, 1))

    fig.savefig(f'{exp_path}/mean_activation.pdf', bbox_inches='tight')

In [None]:
# Latest generation reached by every replicate: the smallest, over replicates,
# of the largest generation recorded for each replicate.
last_gen = full_stats.groupby('Replicate').max()['Gen'].min()

In [None]:
# Gene activation at the last generation shared by all replicates.
plot_mean_std(full_stats, last_gen)

## Statistical significance tests for the figure above

In [None]:
def stats_tests(full_stats):
    """Print paired t-tests comparing gene activation between environments.

    The statistics are taken at the latest generation reached by every
    replicate and divided by `genes_per_type`. For each gene type, the values
    in environment A are compared to the values in environment B with
    `scipy.stats.ttest_rel`, and the result is printed.

    Parameters
    ----------
    full_stats : pandas.DataFrame
        Stacked per-replicate statistics with ``Replicate``, ``Gen`` and the
        ``<type>on_<env>`` columns.
    """
    final_gen = full_stats.groupby('Replicate').max()['Gen'].min()
    at_final_gen = full_stats["Gen"] == final_gen
    normalized = full_stats[at_final_gen] / genes_per_type

    # (printed label, column prefix) for each gene type
    labelled_prefixes = (('AB genes', 'AB'),
                         ('A genes on', 'A'),
                         ('B genes on', 'B'))

    # Run every test before printing anything
    results = [(label, stats.ttest_rel(normalized[f"{prefix}on_A"],
                                       normalized[f"{prefix}on_B"]))
               for label, prefix in labelled_prefixes]

    for label, result in results:
        print(f'{label}: {result}')

In [None]:
# Paired t-tests between environments A and B, one per gene type.
stats_tests(full_stats)

# Plot fitness, genome size, and basal supercoiling over evolutionary time

In [None]:
def plot_fitness(full_stats):
    """Plot the fitness over evolutionary time, aggregated over replicates.

    The thick line is the exponential of the mean log-fitness (i.e. the
    geometric mean over replicates); the two thin lines are the first and
    last deciles of the fitness. Both axes are logarithmic, so generation 0 is
    left out. The figure is saved as ``all_fitness.pdf`` in the notebook-level
    `exp_path`.

    Parameters
    ----------
    full_stats : pandas.DataFrame
        Stacked per-replicate statistics with ``Gen`` and ``Fitness`` columns.
    """
    # Named `fitness_data` rather than `stats`, so that the `scipy.stats`
    # module imported by the notebook is not shadowed inside this function.
    fitness_data = full_stats.loc[full_stats["Gen"] > 0, ['Gen', 'Fitness']].copy()
    fitness_data['log fitness'] = np.log(fitness_data['Fitness'])

    per_gen = fitness_data.groupby('Gen')

    # The mean is taken on the log-fitness.
    # Original author's note: for the fitness, the mean can be above the quantile.
    geometric_mean = np.exp(per_gen['log fitness'].mean())
    first_dec = per_gen['Fitness'].quantile(0.1)
    last_dec = per_gen['Fitness'].quantile(0.9)
    generations = geometric_mean.index

    color = 'tab:cyan'

    fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)

    ax.set_xscale('log')
    ax.set_yscale('log')
    ax.grid(linestyle=':')
    ax.grid(visible=True, which="minor", axis='x', linestyle=':')

    ax.set_xlabel('Generation', fontsize=label_fontsize)
    ax.set_ylabel('Fitness', fontsize=label_fontsize)

    ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    # Average fitness
    ax.plot(generations, geometric_mean, color=color, linewidth=2)

    # Quantiles
    ax.plot(generations, first_dec, color=color, alpha=0.3)
    ax.plot(generations, last_dec, color=color, alpha=0.3)

    #plt.title(exp_path.name)

    fig.savefig(f'{exp_path}/all_fitness.pdf', dpi=dpi, bbox_inches='tight')

In [None]:
# Fitness over time for the whole experiment (all replicates aggregated).
plot_fitness(full_stats)

# Plot gene activities over the lifecycle of the best individual in each replica

In [None]:
def plot_best(gen):
    """Plot the gene expression of the best individual of each replicate.

    For every ``rep*`` sub-directory of the notebook-level `exp_path`, the best
    individual at generation `gen` is loaded with `evotsc_lib.get_best_indiv`
    and handed to `evotsc_plot.plot_expr_AB` together with the two
    environmental supercoiling values. Replicates for which a file is missing
    are reported on stderr and skipped.

    Parameters
    ----------
    gen : int
        Generation at which the best individuals are loaded.
    """
    replicate_dirs = sorted(
        entry for entry in exp_path.iterdir()
        if entry.is_dir() and entry.name.startswith("rep")
    )

    for i_rep, replicate_dir in enumerate(replicate_dirs):
        output_name = f'{exp_path}/best_rep{i_rep}'
        try:
            best_indiv = evotsc_lib.get_best_indiv(replicate_dir, gen)
            evotsc_plot.plot_expr_AB(best_indiv,
                                     sigma_A=params['sigma_A'],
                                     sigma_B=params['sigma_B'],
                                     plot_title='',
                                     plot_name=output_name)
        except FileNotFoundError:
            # Skip missing data, but say so
            print(f'Missing data for replicate {i_rep} at generation {gen}', file=sys.stderr)

In [None]:
# Best individual of each replicate at the generation set in the configuration cell.
plot_best(gen)

# Plot average gene expression levels

In [None]:
def plot_avg_expr(indiv, sigma_env, plot_title, plot_name):
    """Plot the expression level of an individual, averaged per gene type.

    The individual is run with `indiv.run_system(sigma_env)`; the resulting
    per-gene expression levels are summed per gene type and divided by the
    number of genes per type, then drawn against time together with a dashed
    line at 0.5. The figure is saved as ``<plot_name>.pdf`` in the
    notebook-level `exp_path`, shown and closed.

    Parameters
    ----------
    indiv : evotsc individual
        Must provide `run_system` and `genes`; each gene carries an integer
        `gene_type` indexing `gene_types`.
    sigma_env : float
        Supercoiling value passed to `run_system`.
    plot_title : str
        Title of the figure.
    plot_name : str
        Name of the output file, without extension.
    """
    # NOTE(review): rows of `temporal_expr` are taken to be genes and columns
    # time steps, as the unpacking below implies.
    temporal_expr = indiv.run_system(sigma_env)

    nb_genes, nb_steps = temporal_expr.shape
    nb_types = len(gene_types)

    # Sum the expression of the genes of each type. A single pass over the
    # genes is enough, since `gene.gene_type` is already the row index.
    type_expr = np.zeros((nb_types, nb_steps))
    for i_gene, gene in enumerate(indiv.genes):
        type_expr[gene.gene_type] += temporal_expr[i_gene, :]

    # NOTE(review): assumes every type has the same number of genes -- confirm.
    type_expr /= (nb_genes // nb_types)

    fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)

    ax.set_ylim(-0.05, 1.05)

    for i_gene_type, gene_type in enumerate(gene_types):
        ax.plot(type_expr[i_gene_type, :],
                color=gene_type_color[i_gene_type],
                label=f'{gene_type} genes')

    ax.grid(linestyle=':')
    ax.set_xlabel('Time', fontsize='large')
    ax.set_ylabel('Expression level', fontsize='large')

    # Draw the threshold across the current x-range, then restore the limits
    # so that the line does not widen the axis.
    x_min, x_max = ax.get_xlim()
    ax.hlines(0.5, x_min, x_max, linestyle='--', linewidth=1,
              color='tab:red', label='Half activation threshold')
    ax.set_xlim(x_min, x_max)

    ax.legend(loc='lower left')
    ax.set_title(plot_title)

    fig.savefig(exp_path.joinpath(plot_name + '.pdf'), dpi=dpi, bbox_inches='tight')

    plt.show()

    plt.close(fig)


In [None]:
def plot_avg_expr_all(exp_path, gen):
    """Plot the per-type average expression of the best individual of each replicate.

    Only environment A is plotted; the environment B call is disabled.

    Parameters
    ----------
    exp_path : pathlib.Path
        Experiment directory containing the ``rep*`` sub-directories.
    gen : int
        Generation at which the best individuals are loaded.
    """
    replicate_dirs = sorted(
        entry for entry in exp_path.iterdir()
        if entry.is_dir() and entry.name.startswith("rep")
    )

    for i_rep, replicate_dir in enumerate(replicate_dirs):
        best_indiv = evotsc_lib.get_best_indiv(replicate_dir, gen)

        plot_avg_expr(best_indiv,
                      params['sigma_A'],
                      'Environment A',
                      f'avg_expr_rep{i_rep:02}_env_A')
        #plot_avg_expr(best_indiv, params['sigma_B'], 'Environment B', f'avg_expr_rep{i_rep:02}_env_B')

In [None]:
#plot_avg_expr_all(exp_path, gen)

# Plot average supercoiling for each type of gene over replicas (_TODO: the means should be replaced by medians_)

In [None]:
def get_sigma_tsc(indiv, sigma_env: float):
    """Return the interaction-driven supercoiling of each gene at each time step.

    Expression levels start at each gene's `basal_expression`. At every later
    step, the local supercoiling is the product of `indiv.inter_matrix` with
    the previous expression levels; it is recorded, and the new expression
    levels are obtained from a sigmoid of the total supercoiling (basal +
    local + environmental) centred on `indiv.sigma_opt` with scale
    `indiv.epsilon`.

    Parameters
    ----------
    indiv : evotsc individual
        Must provide `nb_genes`, `nb_eval_steps`, `genes`, `inter_matrix`,
        `sigma_basal`, `sigma_opt` and `epsilon`.
    sigma_env : float
        Supercoiling contribution of the environment.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(nb_genes, nb_eval_steps)`` holding the local
        supercoiling; the first column (t = 0) is left at zero.
    """
    n_genes, n_steps = indiv.nb_genes, indiv.nb_eval_steps

    expr_history = np.zeros((n_genes, n_steps))
    tsc_history = np.zeros((n_genes, n_steps))

    # t = 0: every gene starts at its basal expression level
    expr_history[:, 0] = np.array([g.basal_expression for g in indiv.genes])

    for step in range(1, n_steps):
        previous_expr = expr_history[:, step - 1]
        tsc_now = indiv.inter_matrix @ previous_expr
        tsc_history[:, step] = tsc_now

        deviation = (indiv.sigma_basal + tsc_now + sigma_env) - indiv.sigma_opt
        expr_history[:, step] = 1.0 / (1.0 + np.exp(deviation / indiv.epsilon))

    return tsc_history

In [None]:
def get_avg_sigma(exp_path, gen, time_steps=5):
    """Compute the average supercoiling per gene type for each replicate.

    For the best individual of every ``rep*`` sub-directory of `exp_path` and
    for each environment, the local supercoiling returned by `get_sigma_tsc`
    is averaged over the last `time_steps` time steps, summed per gene type
    and divided by the number of genes per type; the basal and environmental
    supercoiling are then added.

    Parameters
    ----------
    exp_path : pathlib.Path
        Experiment directory containing the replicate sub-directories.
    gen : int
        Generation at which the best individuals are loaded.
    time_steps : int, optional
        Number of final time steps over which the supercoiling is averaged
        (default 5). When the individual is evaluated on fewer steps, all of
        them are used.

    Returns
    -------
    pandas.DataFrame
        One row per replicate, indexed by ``Replicate`` (position in the
        sorted directory list), with columns ``<type>sc_A`` for every gene
        type followed by ``<type>sc_B``. The frame has no rows when there is
        no replicate directory.

    Raises
    ------
    ValueError
        If `time_steps` is not positive.
    """
    if time_steps <= 0:
        raise ValueError(f'time_steps must be positive, got {time_steps}')

    rep_dirs = sorted(d for d in exp_path.iterdir()
                      if d.is_dir() and d.name.startswith("rep"))

    envs = ('A', 'B')
    nb_types = len(gene_types)
    col_names = [f'{gene_type}sc_{env}' for env in envs for gene_type in gene_types]

    # One row of values per replicate; the frame is built once at the end
    # instead of being grown by repeated concatenation.
    rows = []
    for rep_dir in rep_dirs:
        best_indiv = evotsc_lib.get_best_indiv(rep_dir, gen)

        row = []
        for env in envs:
            sigma_env = params[f'sigma_{env}']
            sigma_tsc = get_sigma_tsc(best_indiv, sigma_env=sigma_env)

            # Temporal average over the last `time_steps` steps
            mean_sigma_gene = sigma_tsc[:, -time_steps:].mean(axis=1)

            # Sum by gene type
            mean_sigma = np.zeros(nb_types)
            for gene, gene_sigma in zip(best_indiv.genes, mean_sigma_gene):
                mean_sigma[gene.gene_type] += gene_sigma

            # Divide by number of genes in each class
            # NOTE(review): assumes every type has the same number of genes -- confirm.
            mean_sigma /= (best_indiv.nb_genes // nb_types)

            # Add other supercoiling sources
            mean_sigma += best_indiv.sigma_basal + sigma_env

            row.append(mean_sigma)

        rows.append(np.concatenate(row))

    values = np.array(rows, dtype=float).reshape(len(rows), len(col_names))

    return pd.DataFrame(values,
                        columns=col_names,
                        index=pd.Index(range(len(rows)), name='Replicate'))

In [None]:
def plot_mean_sigma(exp_path, gen, title=None):
    """Plot the supercoiling level of each gene type in both environments.

    The per-replicate averages returned by `get_avg_sigma` are summarised over
    replicates: the mean is drawn as a dot with an error bar of one standard
    deviation, the median as a cross. A dashed line marks
    ``params['sigma_opt']``. The y-axis is inverted (0.25 at the bottom, -0.85
    at the top). The figure is saved as ``mean_supercoiling.pdf`` in
    `exp_path`.

    Parameters
    ----------
    exp_path : pathlib.Path
        Experiment directory; also where the figure is saved.
    gen : int
        Generation at which the best individuals are loaded.
    title : str, optional
        Title of the figure; no title is set when empty or None.
    """
    sigma_per_rep = get_avg_sigma(exp_path, gen)

    ## Summaries over replicates
    summaries = {
        'mean': sigma_per_rep.mean(),
        'std': sigma_per_rep.std(),
        'median': sigma_per_rep.median(),
    }

    type_prefixes = ("AB", "A", "B")
    # per_type[(summary, env)] holds one value per gene type
    per_type = {
        (name, env): np.array([summary[f"{prefix}sc_{env}"] for prefix in type_prefixes])
        for name, summary in summaries.items()
        for env in ("A", "B")
    }

    ## Actual plotting
    fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)

    x = np.arange(3)  # 3 types of genes
    width = 0.1  # horizontal distance between the two environments

    # (environment, horizontal shift, color, legend label)
    env_styles = (
        ("A", -width / 2, 'tab:blue', 'Environment A'),
        ("B", width / 2, 'tab:orange', 'Environment B'),
    )

    # Plot mean and std
    for env, shift, color, label in env_styles:
        ax.errorbar(x + shift,
                    per_type[('mean', env)],
                    yerr=per_type[('std', env)],
                    label=label,
                    marker='o',
                    linestyle='',
                    color=color,
                    capsize=5)

    # Add median values -------- note: this is the median of the *means*,
    # whereas it is the *medians* that are needed
    for env, shift, color, _ in env_styles:
        ax.plot(x + shift,
                per_type[('median', env)],
                marker='x',
                linestyle='',
                color=color)

    # Add half-activation threshold (sigma_opt), keeping the current x-range
    x_min, x_max = ax.get_xlim()
    ax.hlines(params['sigma_opt'], x_min, x_max, linestyle='--', linewidth=1,
              color='tab:red', label='Half activation threshold')
    ax.set_xlim(x_min, x_max)

    ax.set_ylabel('SC level', fontsize=label_fontsize)
    ax.set_ylim(0.25, -0.85)
    ax.set_xticks(x)
    ax.set_xticklabels(["AB genes", "A genes", "B genes"])

    ax.grid(linestyle=':', axis='y')

    ax.tick_params(axis='both', which='major', labelsize=tick_fontsize)

    ax.legend(fontsize=legend_fontsize, loc='upper center')

    if title:
        ax.set_title(title, fontsize=label_fontsize)

    fig.savefig(f'{exp_path}/mean_supercoiling.pdf', bbox_inches='tight')
    

In [None]:
#plot_mean_sigma(exp_path, gen, title=exp_path.name)