In [89]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
import matplotlib.gridspec as gridspec
from matplotlib.transforms import Affine2D
from matplotlib.transforms import ScaledTranslation
import seaborn as sns
import scienceplots
import numpy as np
import re

In [90]:
TEXTWIDTH_CM = 18.3
CM_TO_INCH = 1/2.54  # centimeters in inches
CONDITION_ORDER = ['CTRL', 'ILE', 'LEU', 'VAL', '(LEU, ILE)', '(LEU, ILE, VAL)']

In [129]:
df_condition_pcc = (pd.read_csv('Performance.csv')
 .melt(id_vars='Condition', var_name='Model', value_name='PCC')
 .assign(
     Seed=lambda df: [[el for el in re.findall(r"S\d+", x)] for x in df.PCC],
     PCC=lambda df: [[float(el) for el in re.findall(r"\d+\.\d+", x)] for x in df.PCC])
 .explode(['Seed','PCC'])
 .replace({
     'Model': {
      ' XL-Net 1 (64+6) DH Seed Best': 'XLNet DH',
      'XL-Net 1 DH (PLabel)': 'XLNet DH+PL',
      'XL-Net 1 SH (L: MAE+PCC)': 'XLNet SH', 
      'LSTM DH (L: MAE+PCC)': 'LSTM DH',
      'LSTM SH (L: MAE+PCC)': 'LSTM SH'}, 
    'Condition': {
        'CTRL + Liver': 'CTRL', 
        'LEU_ILE': '(LEU, ILE)', 
        'LEU_ILE_VAL': '(LEU, ILE, VAL)'
    }}
))
df_condition_pcc

Unnamed: 0,Condition,Model,PCC,Seed
0,CTRL,XLNet DH,0.5842,S1
0,CTRL,XLNet DH,0.5953,S2
0,CTRL,XLNet DH,0.5641,S3
0,CTRL,XLNet DH,0.5927,S4
0,CTRL,XLNet DH,0.5979,S42
...,...,...,...,...
35,VAL,LSTM SH,0.6592,S1
35,VAL,LSTM SH,0.6563,S2
35,VAL,LSTM SH,0.6584,S3
35,VAL,LSTM SH,0.6584,S4


In [130]:
BRIGHT_PALETTE = ['#4477AA', '#EE6677', '#228833', '#CCBB44', '#66CCEE', '#AA3377', '#BBBBBB']
PALETTE_TO_MODEL = {
    "RiboMIMO": BRIGHT_PALETTE[0],
    "LSTM SH": BRIGHT_PALETTE[1],
    "LSTM DH": BRIGHT_PALETTE[2],
    "XLNet SH": BRIGHT_PALETTE[3],
    "XLNet DH": BRIGHT_PALETTE[4],
    "XLNet DH+PL": BRIGHT_PALETTE[5],    
}

In [71]:
palette_df = pd.DataFrame.from_dict(dict(
    Model = ['Riboclette DH+IM', 'Riboclette DH', 'LSTM DH', 'RiboMIMO'],
    Palette = np.array(['#4477AA', '#EE6677', '#228833', '#CCBB44', '#66CCEE', '#AA3377', '#BBBBBB'])[[0,1,4,5]]
))
palette_dict = dict(zip(palette_df['Model'], palette_df['Palette']))

In [83]:
df_condition_pcc = (
    pd.DataFrame.from_dict({
        'Order': [1, 3, 2, 5, 6, 4],
        'Condition': ['CTRL', 'LEU', 'ILE', '(LEU, ILE)', '(LEU, ILE, VAL)', 'VAL'],
        'Riboclette DH': [0.5979, 0.6893, 0.6382, 0.6798, 0.689, 0.6997],
        'LSTM DH': [0.525, 0.6092, 0.5163, 0.6005, 0.656, 0.6602],
        'RiboMIMO': [0.3898, 0.5958, 0.5421, 0.5899, 0.5888, 0.6129]
    })
    .melt(id_vars=['Condition', 'Order'], var_name='Model', value_name='PCC')
    .assign(Model=lambda df: pd.Categorical(df.Model, ['RiboMIMO', 'LSTM DH', 'Riboclette DH']))
    .sort_values('Model'))
df_condition_pcc

Unnamed: 0,Condition,Order,Model,PCC
17,VAL,4,RiboMIMO,0.6129
15,"(LEU, ILE)",5,RiboMIMO,0.5899
14,ILE,2,RiboMIMO,0.5421
13,LEU,3,RiboMIMO,0.5958
12,CTRL,1,RiboMIMO,0.3898
16,"(LEU, ILE, VAL)",6,RiboMIMO,0.5888


In [224]:
PCCs =[0.6816, 0.6759, 0.68, 0.6788, 0.6789] + [0.659, 0.6438, 0.6557, 0.6209, 0.6531] + [0.5831, 0.58, 0.5808, 0.5806, 0.5811] + [0.5532]
MAEs = [.218, .2213, .2176, .2168, .2198] + [0.2231, 0.231, 0.2242, 0.2328, 0.2232] + [np.nan] * 6
Models = ['Riboclette DH+IM'] * 5 + ['Riboclette DH'] * 5 + ['LSTM DH'] * 5 + ['RiboMIMO']

df_overall_pcc = (
    pd.DataFrame(np.array([PCCs, MAEs]).T, columns=['PCC', 'MAE']).assign(Model=Models)
    .assign(Model=lambda df: pd.Categorical(df.Model, ['RiboMIMO', 'LSTM DH', 'Riboclette DH', 'Riboclette DH+IM']))
    .sort_values('Model'))
df_overall_pcc

Unnamed: 0,PCC,MAE,Model
15,0.5532,,RiboMIMO
10,0.5831,,LSTM DH
11,0.58,,LSTM DH
12,0.5808,,LSTM DH
13,0.5806,,LSTM DH
14,0.5811,,LSTM DH
5,0.659,0.2231,Riboclette DH
6,0.6438,0.231,Riboclette DH
7,0.6557,0.2242,Riboclette DH
8,0.6209,0.2328,Riboclette DH


In [45]:
df_imputation = (
    pd.DataFrame.from_dict({
        'Trainset Size': [17897] * 5 + [17897] * 5 + [92700] * 5 + [128808] * 5,
        'PCC': [0.659, 0.6438, 0.6557, 0.6209, 0.6531] + [0.6644, 0.6301, 0.6605, 0.6303, 0.6338] + [0.6793, 0.6768, 0.6795, 0.6795, 0.6817] + [0.6757, 0.6754, 0.6802, 0.6732, 0.6797],
        'MAE': [0.2231, 0.231, 0.2242, 0.2328, 0.2232] + [0.2198, 0.2273, 0.2226, 0.2198, 0.2198] + [0.2236, 0.2243, 0.2209, 0.2253, 0.2217] + [0.2198, 0.2271, 0.2197, 0.2269, 0.2264],
        'Imputed': ['None'] * 5 + ['T'] * 5 + ['(T, D)'] * 5 + ['(T, D, M)'] * 5,
        'Color': [PALETTE_TO_MODEL["Riboclette DH"]] * 5 + [BRIGHT_PALETTE[6]] * 5 + [PALETTE_TO_MODEL["Riboclette DH+IM"]] * 5 + [BRIGHT_PALETTE[5]] * 5
    }))
df_imputation

Unnamed: 0,Trainset Size,PCC,MAE,Imputed,Color
0,17897,0.659,0.2231,,#EE6677
1,17897,0.6438,0.231,,#EE6677
2,17897,0.6557,0.2242,,#EE6677
3,17897,0.6209,0.2328,,#EE6677
4,17897,0.6531,0.2232,,#EE6677
5,17897,0.6644,0.2198,T,#BBBBBB
6,17897,0.6301,0.2273,T,#BBBBBB
7,17897,0.6605,0.2226,T,#BBBBBB
8,17897,0.6303,0.2198,T,#BBBBBB
9,17897,0.6338,0.2198,T,#BBBBBB


In [235]:
df_condition_pcc.groupby('Condition').Order.first().sort_values().index

Index(['CTRL', 'ILE', 'LEU', 'VAL', '(LEU, ILE)', '(LEU, ILE, VAL)'], dtype='object', name='Condition')

In [139]:
(df_condition_pcc
     .groupby(['Model', 'Seed'])
     .PCC
     .agg('mean')
     .reset_index()
     .groupby('Model')
     .PCC
     .agg(['mean', 'std'])
     .reset_index())

Unnamed: 0,Model,mean,std
0,LSTM DH,0.592573,0.001227
1,LSTM SH,0.619703,0.000735
2,RiboMIMO,0.520643,0.002517
3,XLNet DH,0.653,0.015579
4,XLNet DH+PL,0.686507,0.001203
5,XLNet SH,0.639,0.006384


In [151]:
import os
# https://github.com/Kozea/CairoSVG/issues/392
os.environ['DYLD_LIBRARY_PATH']="/opt/homebrew/opt/cairo/lib"

In [163]:
import matplotlib.patches as patches
import cairosvg
import skunk
with plt.style.context(['science','nature','grid','bright']):
    fig = plt.figure(constrained_layout=True, figsize=(TEXTWIDTH_CM*CM_TO_INCH, 10*CM_TO_INCH))
    gs = fig.add_gridspec(nrows=2, ncols=4, wspace=0.01, hspace=.01)
    ax0 = fig.add_subplot(gs[0,:-1])
    ax1 = fig.add_subplot(gs[0,-1])
    ax2 = fig.add_subplot(gs[1,:-1])
    ax3 = fig.add_subplot(gs[1,-1])
    
    ax0.grid(False)
    ax0.spines['top'].set_visible(False)
    ax0.spines['right'].set_visible(False)
    ax0.spines['bottom'].set_visible(False)
    ax0.spines['left'].set_visible(False)
    ax0.get_xaxis().set_ticks([])
    ax0.get_yaxis().set_ticks([])
    ax0.tick_params(bottom=False, top=False, left=False, right=False, labelbottom=False, labeltop=False, labelleft=False, labelright=False)
    ax0.text(x=-0.05, y=1.1, s="A.", fontweight='bold', fontsize=12, ha='right', va='center', transform=ax0.transAxes)
    skunk.connect(ax0, 'sk') 

    data = (
        df_imputation.groupby('Imputed').agg({'PCC': ['mean', 'std'], 'Trainset Size': 'mean', 'Color': 'first'})
        .reset_index())
    data.columns = ['Imputed', 'PCC_mean', 'PCC_std', 'Trainset Size', 'Color']
    for idx, row in data.sort_values('Trainset Size').iterrows():
        if 'GC' in row.Imputed:
            print(row.Imputed)
            sign = 1 if 'IM' in row.Imputed else -1
            trans = Affine2D().translate(sign*1e3, 0.0) + ax3.transData
            ax1.errorbar(x=row['Trainset Size'], y=row['PCC_mean'], yerr=row['PCC_std'], fmt="o", markersize=2, capsize=2, label=row.Imputed, color=row.Color)
        else:
            ax1.errorbar(x=row['Trainset Size'], y=row['PCC_mean'], yerr=row['PCC_std'], fmt="o", markersize=2, capsize=2, label=row.Imputed, color=row.Color)
    ax1.legend(title='Imputed')
    #sns.pointplot(x="Trainset Size", y="PCC", hue='Experiment', capsize=.1, err_kws={'linewidth': 1.5}, markersize=3, errorbar="sd", alpha=.8, dodge=True, data=df_imputation, ax=ax3)
    ax1.set_xticks([v*1000 for v in [0, 25, 50, 75, 100, 125, 150]])
    #ax1.text(x=0.05, y=.45, s="c.", fontweight='bold', fontsize=12, ha='center', va='center', transform=fig.transFigure)
    ax1.set_title("Pseudo-Labeling Performance")
    ax1.set_xlabel("Trainset Size")
    ax1.set_ylabel("PCC")
    ax1.set_ylim((.6, .7))
    ax1.ticklabel_format(style='sci',scilimits=(3,3),axis='x')
    ax1.text(x=-0.05, y=1.1, s="B.", fontweight='bold', fontsize=12, ha='right', va='center', transform=ax1.transAxes)

    width = .15
    multiplier = -2.5
    x_ticks = np.arange(df_condition_pcc.Condition.nunique())
    for curr_model, color in PALETTE_TO_MODEL.items():
        group = (df_condition_pcc
                 .query('Model == @curr_model')
                 .groupby('Condition')
                 .PCC
                 .agg(['mean', 'std'])
                 .reset_index()
                 .set_index('Condition')
                 .loc[CONDITION_ORDER]
        )
        #group=group.sort_values('Order')
        offset = width * multiplier
        rects = ax2.bar(height=group['mean'], yerr=group['std'],  x=x_ticks+offset, width=width, color=color, label=curr_model)
        multiplier += 1

    #f1 = sns.barplot(x='Condition', y='PCC', hue='Model', palette=palette_dict, data=df_condition_pcc, ax=ax1)
    ax2.set_xticks(x_ticks, CONDITION_ORDER)
    ax2.set_ylim(0.3,.8)
    ax2.xaxis.grid(False)
    ax2.xaxis.set_ticks_position('none')

    ax2.set_ylabel('PCC')
    ax2.set_xlabel('Condition')
    ax2.set_title("Condition-wise Model Performance")
    ax2.text(x=-0.05, y=1.1, s="C.", fontweight='bold', fontsize=12, ha='right', va='center', transform=ax2.transAxes)

    width = .6
    df_overall_pcc = (df_condition_pcc
     .groupby(['Model', 'Seed'])
     .PCC
     .agg('mean')
     .reset_index()
     .groupby('Model')
     .PCC
     .agg(['mean', 'std'])
     .reset_index()
     .set_index('Model')
     .loc[PALETTE_TO_MODEL.keys()])
    x_ticks = np.arange(df_overall_pcc.shape[0])
    ax3.bar(height=df_overall_pcc['mean'], yerr=df_overall_pcc['std'], x=x_ticks, color=[PALETTE_TO_MODEL[n] for n in df_overall_pcc.index], width=width)
    #ax3.set_xticks(x_ticks, [n for n, _ in df_overall_pcc.groupby('Model')], ha='center')
    ax3.set_xticks([])
    ax3.xaxis.set_ticks_position('none')
    ax3.xaxis.grid(False)
    ax3.set_ylim(.3,.8)
    #ax2.set_xticklabels(ax2.get_xticklabels(), rotation=30)
    ax3.set_title("Model-wise Macro-Average")
    ax3.text(x=-0.03, y=1.1, s="D.", fontweight='bold', fontsize=12, ha='right', va='center', transform=ax3.transAxes)

    fig.legend(*ax2.get_legend_handles_labels(), bbox_transform=fig.transFigure, loc='center', bbox_to_anchor=(0.5, -0.05), borderaxespad=0., frameon=False, ncols=3)
    
    #sns.scatterplot(x="Trainset Size", y="PCC", hue='Experiment', data=df_imputation.groupby('Experiment').mean().reset_index(), ax=ax3)

    #data = df_imputation.groupby('Experiment').mean()
    #sns.scatterplot(x='MAE', y='PCC', hue='Experiment', data=data, ax=ax4)
    #err_df = (
    #    df_overall_pcc
    #    .groupby('Model')
    #    .agg({'PCC': ['mean', 'std'], 'MAE': ['mean', 'std']})
    #    .reset_index())
    #err_df.columns= err_df.columns.map('_'.join)
    #err_df = (
    #    err_df
    #    .rename(columns={'Model_': 'Model'})
    #    .dropna()
    #)

    #for _, row in err_df.iterrows():
    #    ax4.errorbar(x=row['MAE_mean'], y=row['PCC_mean'], xerr=row['MAE_std'], yerr=row['PCC_std'], color=palette_dict[row['Model']],fmt="o")
    #ax4.legend(bbox_to_anchor=(-.25, 1.15), loc=2, borderaxespad=0., frameon=False, ncols=2)
    

    svg = skunk.insert(
    {
        'sk': 'output/pseudolabel_diagram.svg'
    })

    cairosvg.svg2pdf(bytestring=svg, write_to='output/performance_panel.pdf')
    #plt.savefig('output/performance_panel.png', dpi=600, bbox_inches='tight')

In [168]:
data = (
        df_imputation.groupby('Experiment').agg({'PCC': ['mean', 'std'], 'Trainset Size': 'mean'})
        .reset_index())
data.columns = ['Experiment', 'PCC_mean', 'PCC_std', 'Trainset Size']
data

Unnamed: 0,Experiment,PCC_mean,PCC_std,Trainset Size
0,ALL[IM],0.67936,0.001737,92700.0
1,GC,0.6465,0.015387,17897.0
2,GC[IM],0.64382,0.017126,17897.0
3,GEN[IM],0.67684,0.003004,128808.0


In [None]:
with plt.style.context(['science','nature','grid','bright']):
    ax = plt.figure(constrained_layout=True, figsize=(TEXTWIDTH_CM*CM_TO_INCH, 6*CM_TO_INCH))
    
    data = (
        df_imputation.groupby('Experiment').agg({'PCC': ['mean', 'std'], 'Trainset Size': 'mean'})
        .reset_index())
    data.columns = data.columns.map(lambda x: '_'.join([str(i) for i in x]) if x[0] == 'PCC' else x[0])
    for idx, row in data.sort_values('Trainset Size').iterrows():
        if 'GC' in row.Experiment:
            sign = 1 if 'IM' in row.Experiment else -1
            trans = Affine2D().translate(sign*1e3, 0.0) + ax3.transData
            ax1.errorbar(x=row['Trainset Size'], y=row['PCC_mean'], yerr=row['PCC_std'], fmt="o", markersize=4, transform=trans, capsize=3, label=row.Experiment)
        else:
            ax1.errorbar(x=row['Trainset Size'], y=row['PCC_mean'], yerr=row['PCC_std'], fmt="o", markersize=4, capsize=3, label=row.Experiment)
    ax1.legend()
    #sns.pointplot(x="Trainset Size", y="PCC", hue='Experiment', capsize=.1, err_kws={'linewidth': 1.5}, markersize=3, errorbar="sd", alpha=.8, dodge=True, data=df_imputation, ax=ax3)
    ax1.set_xticks([0, 50e3, 100e3, 150e3])
    ax1.text(x=0.05, y=.45, s="c.", fontweight='bold', fontsize=12, ha='center', va='center', transform=fig.transFigure)
    ax1.set_title("Imputation")
    ax1.set_xlabel("Trainset Size")
    ax1.set_ylabel("PCC")

In [66]:
data.sort_values('Trainset Size_mean')

Unnamed: 0,Experiment,PCC_mean,PCC_std,Trainset Size_mean
1,GC,0.6465,0.015387,17897.0
2,GC[IM],0.64382,0.017126,17897.0
0,ALL[IM],0.67936,0.001737,92700.0
3,GEN[IM],0.67684,0.003004,128808.0


In [42]:
data

Index(['Experiment', 'PCC_mean', 'PCC_std', 'Trainset Size_mean'], dtype='object')

In [40]:
data

Index(['Experiment', 'PCC_mean', 'PCC_std', 'Trainset Size_mean'], dtype='object')

Index(['Experiment', 'PCC_mean', 'PCC_std', 'Trainset Size_mean'], dtype='object')

In [20]:
df_imputation.groupby('Experiment').PCC.agg(['Mean', 'Std']).reset_index()

AttributeError: 'SeriesGroupBy' object has no attribute 'Mean'

In [10]:
df_imputation.groupby('Experiment').std()

Unnamed: 0_level_0,Trainset Size,PCC,MAE
Experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALL[IM],0.0,0.001737,0.001824
GC,0.0,0.015387,0.004665
GC[IM],0.0,0.017126,0.003274
GEN[IM],0.0,0.003004,0.00387


In [25]:
df_imputation.groupby('Experiment').mean()

Unnamed: 0_level_0,Trainset Size,PCC,MAE
Experiment,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ALL[IM],92700.0,0.67936,0.22316
GC,17897.0,0.6465,0.22686
GC[IM],17897.0,0.64382,0.22186
GEN[IM],128808.0,0.67684,0.22398


In [15]:
df_imputation['Trainset Size'] = df_imputation['Trainset Size'].astype('float64')

In [48]:
df_imputation

Unnamed: 0,Trainset Size,PCC,MAE,Experiment
0,17897,0.659,0.2231,GC
1,17897,0.6438,0.231,GC
2,17897,0.6557,0.2242,GC
3,17897,0.6209,0.2328,GC
4,17897,0.6531,0.2232,GC
5,17897,0.6644,0.2198,GC[IM]
6,17897,0.6301,0.2273,GC[IM]
7,17897,0.6605,0.2226,GC[IM]
8,17897,0.6303,0.2198,GC[IM]
9,17897,0.6338,0.2198,GC[IM]


In [103]:
df_overall_pcc.groupby('Model').mean().dropna()

  df_overall_pcc.groupby('Model').mean().dropna()


Unnamed: 0_level_0,PCC,MAE
Model,Unnamed: 1_level_1,Unnamed: 2_level_1
Riboclette DH,0.6465,0.22686
Riboclette DH+IM,0.67904,0.2187


In [77]:
mpl.rcParams.keys()

KeysView(RcParams({'_internal.classic_mode': False,
          'agg.path.chunksize': 0,
          'animation.bitrate': -1,
          'animation.codec': 'h264',
          'animation.convert_args': ['-layers', 'OptimizePlus'],
          'animation.convert_path': 'convert',
          'animation.embed_limit': 20.0,
          'animation.ffmpeg_args': [],
          'animation.ffmpeg_path': 'ffmpeg',
          'animation.frame_format': 'png',
          'animation.html': 'none',
          'animation.writer': 'ffmpeg',
          'axes.autolimit_mode': 'data',
          'axes.axisbelow': 'line',
          'axes.edgecolor': 'black',
          'axes.facecolor': 'white',
          'axes.formatter.limits': [-5, 6],
          'axes.formatter.min_exponent': 0,
          'axes.formatter.offset_threshold': 4,
          'axes.formatter.use_locale': False,
          'axes.formatter.use_mathtext': False,
          'axes.formatter.useoffset': True,
          'axes.grid': False,
          'axes.grid.axis': 'b

<Axes: xlabel='Condition', ylabel='PCC'>