In [None]:
import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pathlib
import itertools

In [None]:
import importlib
import evotsc
import evotsc_lib
import evotsc_plot
# Reload the project modules so that this cell can be re-run to pick up their
# current source without restarting the kernel.
importlib.reload(evotsc)
importlib.reload(evotsc_lib)
importlib.reload(evotsc_plot)

In [None]:
# Experiment directory: replicate sub-directories are read from it and figures are
# written into it.
# NOTE(review): machine-specific absolute path -- adjust before running elsewhere.
exp_path = pathlib.Path('/Users/theotime/Desktop/evotsc/pci/main/')
# Generation passed to evotsc_lib.get_best_indiv when loading individuals
gen=1_000_000
gene_types = ['AB', 'A', 'B'] # Name of each gene type, indexed by gene.gene_type
gene_type_color = ['tab:blue', 'tab:red', 'tab:green'] # Plot color of each gene type, in the order AB, A, B
orient_name = ['leading', 'lagging'] # Name of each gene orientation, indexed by gene.orientation
# Resolution (dots per inch) used for the figures
dpi=300

In [None]:
# Replicate directories: the sub-directories of `exp_path` whose name starts with "rep"
rep_dirs = sorted([d for d in exp_path.iterdir() if (d.is_dir() and d.name.startswith("rep"))])
nb_rep = len(rep_dirs)
# Parameters are read from the first replicate only
# (presumably identical for every replicate -- TODO confirm).
params = evotsc_lib.read_params(rep_dirs[0])
params['m'] = 2.5 # Temporary fix because the parameter wasn't saved

In [None]:
# Seeded NumPy random generator.
# NOTE(review): not referenced by any other cell visible in this notebook.
rng = np.random.default_rng(seed=123456)

## Generate sub-networks of _k_ consecutive genes

In [None]:
def extract_subnetwork(indiv, i_start, size, keep_ids=False):
    """Build a new individual made of `size` consecutive genes of `indiv`.

    Genes are cloned from `indiv.genes`, starting at index `i_start` and
    wrapping around the end of the gene list. The intergenic distance of the
    last kept gene is then increased by the difference between the lengths
    reported by `compute_gene_positions` for the original individual and for
    the extracted one (presumably so that both genomes end up with the same
    total length -- TODO confirm against evotsc).

    Args:
        indiv: individual to extract the genes from.
        i_start: index in `indiv.genes` of the first gene to keep.
        size: number of consecutive genes to keep; must be at least 1.
        keep_ids: if true, every cloned gene keeps the `id` it already has;
            otherwise its `id` is overwritten with its index in `indiv.genes`.

    Returns:
        A new `evotsc.Individual` holding the cloned genes and sharing the
        scalar parameters and the `rng` of `indiv`.

    Raises:
        ValueError: if `size` is smaller than 1 (there would be no last gene
            whose intergenic distance could be adjusted).
    """
    if size < 1:
        raise ValueError(f'size must be at least 1, got {size}')

    # Only the second returned value is needed here
    _, indiv_len = indiv.compute_gene_positions(include_coding=True)

    new_genes = []
    for i_gene in range(i_start, i_start + size):
        i_orig = i_gene % indiv.nb_genes  # Wrap around the end of the gene list
        new_gene = indiv.genes[i_orig].clone()
        if not keep_ids:
            new_gene.id = i_orig
        new_genes.append(new_gene)

    clone = evotsc.Individual(genes=new_genes,
                              interaction_dist=indiv.interaction_dist,
                              interaction_coef=indiv.interaction_coef,
                              sigma_basal=indiv.sigma_basal,
                              sigma_opt=indiv.sigma_opt,
                              epsilon=indiv.epsilon,
                              m=indiv.m,
                              selection_coef=indiv.selection_coef,
                              rng=indiv.rng)

    _, clone_len = clone.compute_gene_positions(include_coding=True)

    # Add the length lost by dropping genes to the intergene of the last kept gene
    new_genes[-1].intergene += indiv_len - clone_len

    return clone

In [None]:
def plot_subnetworks(rep, gen, network_sizes):
    """Plot every sub-network of the requested sizes for one replicate.

    The best individual of replicate `rep` at generation `gen` is loaded, and
    for each size in `network_sizes` and each possible starting gene, the
    corresponding sub-network is extracted and plotted once per environment
    (A and B). The PDF files are written to `exp_path/sub_repXX/`, which is
    created if needed.
    """
    best_indiv = evotsc_lib.get_best_indiv(exp_path.joinpath(f'rep{rep:02}'), gen)
    gene_positions, _ = best_indiv.compute_gene_positions(include_coding=True)

    out_dir = exp_path.joinpath(f'sub_rep{rep:02}')
    out_dir.mkdir(exist_ok=True)

    def plot_offset(i_gene):
        # Offset applied so that the sub-network is drawn at the same position
        # as in the original individual
        first_gene = best_indiv.genes[i_gene]
        offset = gene_positions[i_gene]
        if first_gene.orientation == 1:  # Lagging
            offset = offset - (first_gene.length - 1)
        return offset

    for network_size in network_sizes:
        for i_start in range(best_indiv.nb_genes):
            sub_indiv = extract_subnetwork(best_indiv, i_start, network_size)
            offset = plot_offset(i_start)

            for env in ['A', 'B']:
                pdf_path = out_dir.joinpath(
                    f'sub_{network_size}_genes_{i_start:02}_env_{env}.pdf')

                evotsc_plot.plot_genome_and_tsc(sub_indiv,
                                                sigma=params[f'sigma_{env}'],
                                                coloring_type='on-off',
                                                show_bar=(env == 'A'),
                                                id_interval=1,
                                                print_ids=True,
                                                naming_type='id',
                                                shift=-offset,
                                                plot_name=pdf_path,
                                                show_plot=False)

In [None]:
# Plot all the sub-networks of 3 and of 7 consecutive genes for replicate 21
plot_subnetworks(rep=21, gen=gen, network_sizes=[3, 7])

## Statistics of the subnetworks: final expression of each gene and fitness

In [None]:
def compute_subnetwork_fitness(indiv):
    """Fitness of a (sub)network that may lack some of the gene types.

    The system is run in environments A and B. In each environment, the final
    expression levels are averaged per gene type (AB, A, B) and compared to
    the target of that type: 1 when the type should be active, exp(-m) (the
    minimal expression level) otherwise. Gene types without any gene in
    `indiv` are left out of the comparison. The fitness is
    exp(-selection_coef * (gap_A + gap_B)), each gap being the sum of the
    squared per-type differences in one environment.
    """
    levels_A = indiv.run_system(params['sigma_A'])
    levels_B = indiv.run_system(params['sigma_B'])

    # Per-type targets; gene types are ordered AB, A, B
    min_expr = np.exp(-indiv.m)
    target_A = np.array([1.0, 1.0, min_expr])
    target_B = np.array([1.0, min_expr, 1.0])

    type_counts = np.zeros(3, dtype=int)
    for gene in indiv.genes:
        type_counts[gene.gene_type] += 1

    for i_type in range(len(gene_types)):
        if type_counts[i_type] != 0:
            continue
        # Absent type: its summed expression is 0, so dividing by 1 and
        # targeting 0 makes it contribute nothing to either gap
        type_counts[i_type] = 1
        target_A[i_type] = target_B[i_type] = 0

    def squared_gap(levels, target):
        # Sum the final expression levels per gene type, then compare the
        # per-type averages to the targets
        summed = np.zeros(3)
        for i_gene, gene in enumerate(indiv.genes):
            summed[gene.gene_type] += levels[-1, i_gene]
        return np.square(summed / type_counts - target).sum()

    gap_A = squared_gap(levels_A, target_A)
    gap_B = squared_gap(levels_B, target_B)

    return np.exp(- indiv.selection_coef * (gap_A + gap_B))

In [None]:
def compute_subnetwork_stats(exp_path, gen, network_sizes):
    """Evaluate every sub-network of the best individual of each replicate.

    For each replicate directory (sub-directories of `exp_path` whose name
    starts with "rep"), the best individual at generation `gen` is loaded.
    For each size in `network_sizes` and each starting gene, the sub-network
    is extracted (keeping the original gene ids), evaluated in environments A
    and B, and its fitness is computed with `compute_subnetwork_fitness`.

    Args:
        exp_path: directory containing the replicate directories.
        gen: generation at which the best individuals are loaded.
        network_sizes: iterable of sub-network sizes to evaluate.

    Returns:
        A DataFrame with one row per gene of each sub-network and the columns
        Replicate, network_size, start_pos, subnetwork_pos, gene_id,
        gene_type, gene_orient, final_expr_A, final_expr_B and fitness.
        `Replicate` is the rank of the replicate directory in sorted order,
        not a number parsed from its name. The row index restarts at 0 for
        each replicate. An empty DataFrame is returned when there is no
        replicate directory.
    """
    rep_dirs = sorted(d for d in exp_path.iterdir()
                      if d.is_dir() and d.name.startswith("rep"))

    columns = ['Replicate', 'network_size', 'start_pos', 'subnetwork_pos',
               'gene_id', 'gene_type', 'gene_orient',
               'final_expr_A', 'final_expr_B', 'fitness']

    # One frame per replicate, concatenated once at the end: growing a frame
    # with pd.concat inside the loop copies all previous rows at every step,
    # and concatenating onto an empty frame is deprecated in recent pandas.
    rep_frames = []

    for rep, rep_dir in enumerate(rep_dirs):

        indiv = evotsc_lib.get_best_indiv(rep_dir, gen)

        result_dict = {column: [] for column in columns}

        for size in network_sizes:

            for i_start in range(indiv.nb_genes):
                sub_indiv = extract_subnetwork(indiv, i_start, size, keep_ids=True)

                (expr_A, expr_B), _ = sub_indiv.evaluate(params['sigma_A'], params['sigma_B'])
                fitness = compute_subnetwork_fitness(sub_indiv)

                for i_gene, gene in enumerate(sub_indiv.genes):
                    result_dict['Replicate'].append(rep)
                    result_dict['network_size'].append(size)
                    result_dict['start_pos'].append(i_start)
                    result_dict['subnetwork_pos'].append(i_gene)
                    result_dict['gene_id'].append(gene.id)
                    result_dict['gene_type'].append(gene_types[gene.gene_type])
                    result_dict['gene_orient'].append(orient_name[gene.orientation])
                    # Only the last row of the expression arrays is kept
                    result_dict['final_expr_A'].append(expr_A[-1, i_gene])
                    result_dict['final_expr_B'].append(expr_B[-1, i_gene])
                    # The fitness of the sub-network is repeated on each of its genes
                    result_dict['fitness'].append(fitness)

        rep_frames.append(pd.DataFrame.from_dict(result_dict))

    if not rep_frames:
        return pd.DataFrame()

    return pd.concat(rep_frames)

In [None]:
# Full stats: every sub-network size from 1 to the number of genes, for every replicate
stats = compute_subnetwork_stats(exp_path, gen, network_sizes=range(1, int(params['nb_genes'])+1))

In [None]:
# Statistics for the genes at the middle position of a subnetwork (of odd sizes only).
# The odd sizes are derived from the number of genes, like the sizes used to
# compute `stats`, instead of being hard-coded.
central_gene_stats = pd.concat(
    stats[(stats["network_size"] == network_size) &
          (stats["subnetwork_pos"] == network_size // 2)]
    for network_size in range(1, int(params['nb_genes']) + 1, 2)
).copy()

## Plot the mean and median expression level by gene type for each network size

In [None]:
def plot_expr_by_network_size(stats, plot_name=None):
    """Plot the median and mean final expression level by sub-network size.

    One figure is drawn per environment (A and B), with one color per gene
    type; medians are drawn as solid lines and means as dotted lines.

    Args:
        stats: DataFrame with the columns gene_type, network_size,
            final_expr_A and final_expr_B.
        plot_name: optional output path. Since one figure is saved per
            environment, `_env_A` / `_env_B` is inserted before the file
            extension. When omitted, the figures are saved as
            `med_expr_by_network_size_all_env_{env}.pdf` in `exp_path`, so
            successive calls without `plot_name` overwrite each other.
    """
    # Aggregate the expression columns only: aggregating every column fails
    # on the non-numeric ones (e.g. gene_orient) with pandas >= 2.0
    grouped = stats.groupby(['gene_type', 'network_size'])[['final_expr_A', 'final_expr_B']]
    mean_stats = grouped.mean()
    med_stats = grouped.median()

    # Legend handles showing the line styles independently of the gene type colors
    custom_lines = [mpl.lines.Line2D([0], [0], color='k', linewidth=2),
                    mpl.lines.Line2D([0], [0], color='k', linestyle=':', linewidth=2)]

    for env in ['A', 'B']:
        expr_col = f'final_expr_{env}'

        fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)
        ax.grid(linestyle=':')
        ax.set_ylim(-0.05, 1.05)

        for i_gene_type, gene_type in enumerate(gene_types):
            ax.plot(med_stats.loc[gene_type][expr_col],
                    color=gene_type_color[i_gene_type], linewidth=2)
            ax.plot(mean_stats.loc[gene_type][expr_col],
                    color=gene_type_color[i_gene_type], linewidth=2, linestyle=':')

        ax.set_ylabel('Expression level')
        ax.set_xlabel('Subnetwork size')
        ax.legend(custom_lines, ['Medians', 'Means'])

        if plot_name is None:
            out_path = exp_path.joinpath(f'med_expr_by_network_size_all_env_{env}.pdf')
        else:
            base_path = pathlib.Path(plot_name)
            out_path = base_path.with_name(f'{base_path.stem}_env_{env}{base_path.suffix}')

        fig.savefig(out_path, bbox_inches='tight')

        plt.show()

In [None]:
# Expression levels computed over every gene of every sub-network
plot_expr_by_network_size(stats)

In [None]:
# Expression levels restricted to the central gene of the odd-sized sub-networks.
# NOTE(review): called this way, the figures are saved under the same file names
# as those of the call on the full `stats`, which they overwrite.
plot_expr_by_network_size(central_gene_stats)

In [None]:
#for i_rep in range(nb_rep):
#    plot_expr_by_network_size(stats[stats['Replicate'] == i_rep], 
#                               plot_name=exp_path.joinpath(f'med_expr_by_network_size_rep{i_rep}.pdf'))

## Plot the distribution of gene expression levels in each env. for each network size

In [None]:
def plot_distrib_by_network_size(stats, network_size):
    """Plot the distribution of final expression levels for one sub-network size.

    One figure is drawn per environment (A and B), with one violin per gene
    type, on which the median (circle) and mean (cross) are overlaid. Gene
    types without any row for `network_size` are skipped. The figures are
    saved as `distrib_expr_size_env_{env}_{network_size}.pdf`.

    NOTE(review): unlike the other plotting functions of this notebook, the
    files are written relative to the current working directory rather than
    to `exp_path`.

    Args:
        stats: DataFrame with the columns network_size, gene_type,
            final_expr_A and final_expr_B.
        network_size: sub-network size whose rows are plotted.
    """
    # Rows of the requested size; this does not depend on the environment
    size_stats = stats[stats['network_size'] == network_size]

    # `plt.cm.get_cmap` was removed in Matplotlib 3.9: use the colormap registry
    colors = mpl.colormaps['tab20'].colors
    light_type_color = [colors[1], colors[7], colors[5]]
    dark_type_color = [colors[0], colors[6], colors[4]]

    x = range(1, len(gene_types) + 1)

    for env in ['A', 'B']:
        expr_col = f'final_expr_{env}'

        # Aggregate the plotted column only: aggregating every column fails on
        # the non-numeric ones (e.g. gene_orient) with pandas >= 2.0
        expr_by_type = size_stats.groupby('gene_type')[expr_col]
        median_stats = expr_by_type.median()
        mean_stats = expr_by_type.mean()

        fig, ax = plt.subplots(dpi=dpi)

        legend_done = False
        for i_gene_type, gene_type in enumerate(gene_types):
            type_stats = size_stats.loc[size_stats['gene_type'] == gene_type, expr_col]
            if type_stats.empty:
                # A violin cannot be drawn from an empty sample
                continue

            light = light_type_color[i_gene_type]
            dark = dark_type_color[i_gene_type]

            violin = ax.violinplot(type_stats.to_numpy(), positions=[x[i_gene_type]],
                                   showmeans=True, showmedians=True)
            for body in violin['bodies']:
                body.set_facecolor(light)
            for partname in ['cbars', 'cmins', 'cmaxes', 'cmeans', 'cmedians']:
                violin[partname].set_edgecolor(dark)

            # Only the first plotted gene type contributes legend entries
            median_label = {} if legend_done else {'label': 'Medians'}
            mean_label = {} if legend_done else {'label': 'Means'}
            ax.scatter([x[i_gene_type]], [median_stats.loc[gene_type]], marker='o',
                       color=dark, **median_label)
            ax.scatter([x[i_gene_type]], [mean_stats.loc[gene_type]], marker='x',
                       color=dark, **mean_label)
            legend_done = True

        ax.set_ylim(-0.05, 1.05)
        ax.set_ylabel('Expression level')
        ax.set_xticks(x)
        ax.set_xticklabels(gene_types)
        ax.set_xlabel('Gene type')
        ax.grid(axis='y', linestyle=':')

        ax.legend(loc='lower left')

        fig.savefig(f'distrib_expr_size_env_{env}_{network_size}.pdf', bbox_inches='tight')

        plt.show()
        plt.close(fig)

In [None]:
#plot_distrib_by_network_size(stats, network_size=59)

## Plot proportion of activated genes for each network size

In [None]:
def plot_activ_by_network_size(stats, plot_name=None):
    """Plot the proportion of activated genes by sub-network size.

    A gene counts as activated in an environment when its final expression
    level is above the midpoint between the minimal level exp(-m) and 1. One
    figure is drawn per environment (A and B), with one line per gene type.

    Args:
        stats: DataFrame with the columns gene_type, network_size,
            final_expr_A and final_expr_B.
        plot_name: optional output path. Since one figure is saved per
            environment, `_env_A` / `_env_B` is inserted before the file
            extension. When omitted, the figures are saved as
            `med_activ_by_network_size_all_env_{env}.pdf` in `exp_path`, so
            successive calls without `plot_name` overwrite each other.
    """
    envs = ['A', 'B']

    # Midpoint between the minimal expression level exp(-m) and 1
    half_expr = (1 + np.exp(- params['m'])) / 2

    # Keep only the grouping keys and the activation flags: averaging every
    # column fails on the non-numeric ones (e.g. gene_orient) with pandas >= 2.0
    activ = stats[['gene_type', 'network_size']].copy()
    for env in envs:
        activ[f'activ_{env}'] = (stats[f'final_expr_{env}'] > half_expr).to_numpy()

    # The mean of a boolean column is the proportion of True values
    mean_stats = activ.groupby(['gene_type', 'network_size']).mean()

    for env in envs:
        fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)
        ax.grid(linestyle=':')
        ax.set_ylim(-0.05, 1.05)

        for i_gene_type, gene_type in enumerate(gene_types):
            ax.plot(mean_stats.loc[gene_type][f'activ_{env}'],
                    color=gene_type_color[i_gene_type], linewidth=2)

        ax.set_ylabel('Proportion of activated genes')
        ax.set_xlabel('Subnetwork size')

        if plot_name is None:
            out_path = exp_path.joinpath(f'med_activ_by_network_size_all_env_{env}.pdf')
        else:
            base_path = pathlib.Path(plot_name)
            out_path = base_path.with_name(f'{base_path.stem}_env_{env}{base_path.suffix}')

        fig.savefig(out_path, bbox_inches='tight')
        plt.show()

In [None]:
# Proportion of activated genes computed over every gene of every sub-network
plot_activ_by_network_size(stats)

In [None]:
# Proportion of activated genes restricted to the central gene of the odd-sized sub-networks.
# NOTE(review): called this way, the figures are saved under the same file names
# as those of the call on the full `stats`, which they overwrite.
plot_activ_by_network_size(central_gene_stats)

## Plot fitness (not averaged by subnetwork) for each network size
For each subnetwork, compute the difference between the mean expression of the genes in the subnetwork and the target, neutralizing gene types that are not present in the subnetwork.

In [None]:
def plot_fitness_by_network_size(stats, plot_name=None):
    """Plot the mean sub-network fitness by sub-network size, on a log scale.

    Args:
        stats: DataFrame with the columns network_size and fitness.
        plot_name: optional path where the figure is saved; the figure is
            only displayed when omitted.
    """
    # Average the fitness column only: averaging every column fails on the
    # non-numeric ones (e.g. gene_type) with pandas >= 2.0
    mean_fitness = stats.groupby('network_size')['fitness'].mean()

    fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)

    ax.grid(linestyle=':')
    ax.set_yscale('log')
    # Fixed range; the lower bound must be strictly positive on a log scale
    ax.set_ylim(1e-8, 1)

    ax.plot(mean_fitness, color='tab:cyan', linewidth=2)

    ax.set_ylabel('Fitness')
    ax.set_xlabel('Subnetwork size')

    if plot_name:
        fig.savefig(plot_name, bbox_inches='tight')
    plt.show()

In [None]:
# No plot_name is given, so the figure is displayed but not saved
plot_fitness_by_network_size(stats)

## "Fitness" by size of the subnetwork
For a given subnetwork size, compute the average expression of each gene type over all subnetworks of that size, and then compute a fitness from these averages. Pooling all subnetworks of a given size removes the problem of gene types that are missing from an individual subnetwork.

In [None]:
def plot_fitness_by_network_size_avg(stats, plot_name=None):
    """Plot a fitness computed from the expression averaged over all sub-networks.

    For each sub-network size, the final expression levels are averaged per
    gene type over all the rows of that size; the fitness is then
    exp(-selection_coef * (gap_A + gap_B)), each gap being the sum over gene
    types of the squared difference between that average and the target of
    the type in one environment.

    Args:
        stats: DataFrame with the columns network_size, gene_type,
            final_expr_A and final_expr_B.
        plot_name: optional path where the figure is saved; the figure is
            only displayed when omitted.
    """
    # Average the expression columns only: averaging every column fails on
    # the non-numeric ones (e.g. gene_orient) with pandas >= 2.0
    grouped_stats = (stats.groupby(['network_size', 'gene_type'])
                     [['final_expr_A', 'final_expr_B']].mean())

    # Per-type targets; gene types are ordered AB, A, B
    min_expr = np.exp(-params['m'])
    target_A = np.array([1.0, 1.0, min_expr])
    target_B = np.array([1.0, min_expr, 1.0])

    fitnesses = {}
    for size in sorted(stats['network_size'].unique()):
        # Rows of this size, ordered like the targets
        size_expr = grouped_stats.loc[size].loc[gene_types]

        gap_A = np.square(size_expr['final_expr_A'].to_numpy() - target_A).sum()
        gap_B = np.square(size_expr['final_expr_B'].to_numpy() - target_B).sum()

        # Keyed by the actual size, so that the x axis matches its label
        fitnesses[size] = np.exp(- params['selection_coef'] * (gap_A + gap_B))

    fig, ax = plt.subplots(figsize=(9, 4), dpi=dpi)

    ax.plot(list(fitnesses.keys()), list(fitnesses.values()), color='tab:cyan')
    ax.set_yscale('log')
    ax.grid(linestyle=':')
    ax.set_ylabel('Fitness')
    ax.set_xlabel('Subnetwork size')

    if plot_name:
        fig.savefig(plot_name, bbox_inches='tight')
    plt.show()

In [None]:
# No plot_name is given, so the figure is displayed but not saved
plot_fitness_by_network_size_avg(stats)

## Minimum network size for the central gene to regain its function

In [None]:
def compute_min_subnetwork_size(stats, central_gene_stats):
    """Find the smallest sub-network in which each gene reaches its targets.

    A gene is "on target" in an environment when its final expression level
    is closer than half the expression range, (1 - exp(-m)) / 2, to the
    target of its type in that environment. Only the genes that are on
    target in both environments in the whole network (the sub-networks whose
    size is the number of genes) are considered. For each of them, the odd
    sub-network sizes are scanned in increasing order, using the rows of
    `central_gene_stats` in which the gene is the central gene, and the first
    size at which the gene is on target is recorded for each environment. If
    no size qualifies, the largest odd size is recorded.

    Args:
        stats: full sub-network statistics, with the columns Replicate,
            network_size, gene_id, gene_type, final_expr_A and final_expr_B.
        central_gene_stats: rows of `stats` for the central gene of every
            odd-sized sub-network; the start_pos and subnetwork_pos columns
            are also read.

    Returns:
        A DataFrame with one row per considered gene and the columns
        Replicate, gene_id, gene_pos (position in the gene list), gene_type,
        min_size_A and min_size_B.
    """
    min_expr = np.exp(- params['m'])
    half_dist = (1 - min_expr) / 2 # Distance between min_expr and half_expr or half_expr and 1
    nb_genes = stats['gene_id'].nunique()

    # Target expression level of each gene type in each environment
    targets = {'A': {'A': 1, 'B': min_expr, 'AB': 1},
               'B': {'A': min_expr, 'B': 1, 'AB': 1}}

    def is_on_target(row, env):
        # True when the gene of `row` is within half_dist of its target in `env`
        gap = np.abs(row[f'final_expr_{env}'] - targets[env][row['gene_type']])
        return gap < half_dist

    def min_size_on_target(gene_central, env):
        # First odd size at which the gene is on target in `env`; the largest
        # odd size when none qualifies
        min_size = None
        for min_size in range(1, nb_genes + 1, 2):
            size_row = gene_central[gene_central['network_size'] == min_size].iloc[0]
            if is_on_target(size_row, env):
                break
        return min_size

    # The whole network is the sub-network that contains every gene
    whole_stats = stats[stats['network_size'] == nb_genes]

    columns = ['Replicate', 'gene_id', 'gene_pos', 'gene_type', 'min_size_A', 'min_size_B']
    records = []

    for rep in sorted(stats['Replicate'].unique()):
        rep_whole = whole_stats[whole_stats['Replicate'] == rep]
        rep_central = central_gene_stats[central_gene_stats['Replicate'] == rep]

        for gene_id in range(nb_genes):
            whole_row = rep_whole[rep_whole['gene_id'] == gene_id].iloc[0]

            # Skip the genes that miss a target even in the whole network
            if not (is_on_target(whole_row, 'A') and is_on_target(whole_row, 'B')):
                continue

            gene_central = rep_central[rep_central['gene_id'] == gene_id]
            first_row = gene_central.iloc[0]

            records.append({
                'Replicate': rep,
                'gene_id': gene_id,
                # Position in the gene list
                'gene_pos': (first_row['start_pos'] + first_row['subnetwork_pos']) % nb_genes,
                'gene_type': whole_row['gene_type'],
                'min_size_A': min_size_on_target(gene_central, 'A'),
                'min_size_B': min_size_on_target(gene_central, 'B'),
            })

    return pd.DataFrame(records, columns=columns)

In [None]:
# One row per gene that reaches its targets in the whole network, with the minimum
# sub-network size at which it does so in each environment
data = compute_min_subnetwork_size(stats, central_gene_stats)

In [None]:
# Number of rows of `data` for each gene type
data.groupby('gene_type').count()

In [None]:
def plot_min_needed_size(data):
    """Plot the distribution of the minimum sub-network sizes by gene type.

    For each gene type, two violins overlaid with box plots are drawn side by
    side: one for `min_size_A` and one for `min_size_B`. The figure is saved
    as `min_network_size.pdf` in `exp_path`.

    NOTE(review): the y axis is limited to [-1, 31], so larger minimum sizes
    fall outside the visible range.
    """
    fig, ax = plt.subplots(figsize=(8, 5), dpi=300)
    ax.set_ylim(0, 30)
    ax.grid(linestyle=':', axis='y')

    delta = 2.5  # Horizontal distance between the groups of two successive gene types
    size_columns = ['min_size_A', 'min_size_B']

    tick_positions = [0, 1, delta, delta + 1, 2*delta, 2*delta + 1]
    tick_labels = [f'{gene_type} env. {env}'
                   for gene_type in gene_types
                   for env in ['A', 'B']]
    ax.set_xticks(tick_positions, labels=tick_labels)

    def sizes_of(gene_type):
        # Minimum sizes in both environments for the genes of one type
        return data.loc[data['gene_type'] == gene_type, size_columns]

    def positions_of(i_gene_type):
        # x positions of the environment A and B plots of one gene type
        left = delta*i_gene_type
        return [left, left+1]

    # Violins, colored by gene type
    for i_gene_type, gene_type in enumerate(gene_types):
        type_color = gene_type_color[i_gene_type]
        violin = ax.violinplot(sizes_of(gene_type), positions=positions_of(i_gene_type))

        for body in violin['bodies']:
            body.set_facecolor(type_color)

        for partname in ('cbars','cmins','cmaxes'):
            violin[partname].set_edgecolor(type_color)

    # Box plots drawn over the violins, with black median and mean markers
    black_mean = {'marker':'_', 'markerfacecolor':'black', 'markeredgecolor':'black'}
    for i_gene_type, gene_type in enumerate(gene_types):
        ax.boxplot(sizes_of(gene_type),
                   positions=positions_of(i_gene_type),
                   showmeans=True,
                   manage_ticks=False,
                   medianprops={'color':'black'},
                   meanprops=black_mean)

    ax.set_yticks(np.arange(1, 31, step=4))
    ax.set_ylim(-1, 31)

    ax.set_ylabel('Minimum network size to reach activation target')

    plt.savefig(exp_path.joinpath('min_network_size.pdf'), dpi=dpi, bbox_inches='tight')

In [None]:
# Saved as min_network_size.pdf in `exp_path`
plot_min_needed_size(data)

In [None]:
# Median minimum sub-network size in each environment, by gene type
data.groupby(['gene_type']).median()[['min_size_A', 'min_size_B']]

In [None]:
# Mean minimum sub-network size in each environment, by gene type
data.groupby(['gene_type']).mean()[['min_size_A', 'min_size_B']]

In [None]:
# Genes of type A in replicate 21 whose minimum sub-network size in environment B is 9
data[(data['gene_type'] == 'A') & (data['min_size_B'] == 9) & (data['Replicate'] == 21)]

In [None]:
# Genes of type B in replicate 21 whose minimum sub-network size in environment A is 5
data[(data['gene_type'] == 'B') & (data['min_size_A'] == 5) & (data['Replicate'] == 21)]