In [None]:
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
import pandas as pd
import seaborn as sns
import copy

In [None]:
%config Completer.use_jedi = False
%matplotlib inline

In [None]:
def to_set_fold(model_file):
    """Split a valmix model file name into ``(valset, fold)``.

    The tokens ``'best_model_v_'``, ``'.h5'`` and ``'_drop'`` are removed (in
    that order, wherever they occur). The final character of what is left is
    parsed as the integer fold; everything before it is returned as the
    validation-set name.

    Raises ``IndexError`` if nothing is left after stripping and ``ValueError``
    if the final character is not a digit.
    """
    tag = model_file
    for token in ('best_model_v_', '.h5', '_drop'):
        tag = tag.replace(token, '')
    valset = tag[:-1]
    fold = int(tag[-1])
    return valset, fold

In [None]:
# Read the headerless per-fold F1 tables for valmix folds 0..3, one frame per fold.
dats = [
    pd.read_csv(f'data_splits/valmix/f1_scores_valmix_fold{fold}.csv', header=None)
    for fold in range(4)
]

In [None]:
# Stack the per-fold tables into a single frame.
# ignore_index=True gives every row a unique label; without it each fold's own
# 0..n-1 labels repeat in the result, and duplicate row labels can break
# label-aligned operations and plotting calls further down.
dat = pd.concat(dats, ignore_index=True)

In [None]:

# The CSVs were read without a header row, so name the four columns here.
dat.columns = ['model', 'species', 'genic_f1', 'sub_genic_f1']

# Parse each model file name into (valset, fold) and store both as new columns.
valset_fold_pairs = list(map(to_set_fold, dat["model"]))
dat.loc[:, ["valset", "fold"]] = valset_fold_pairs

In [None]:
#dat.sort_values(by=['fold', 'species'], ascending=True)

In [None]:
# Genic F1 per validation set: box plot with the individual points
# (coloured by species) drawn on top of it on the same axes.
fig, ax = plt.subplots(figsize=(10, 10))
sns.boxplot(data=dat, x='valset', y='genic_f1', ax=ax)
sns.swarmplot(data=dat, x='valset', y='genic_f1', hue="species", ax=ax)
# Drop the legend that the species hue adds.
ax.get_legend().remove()

In [None]:
# Genic F1 per fold, with one box per validation set inside each fold.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(data=dat, x='fold', y='genic_f1', hue='valset', ax=ax)
 
#ax = sns.swarmplot(x='valset', y='genic_f1', data=dat, hue="species")
#ax.get_legend().remove()

In [None]:
# Wide table: one row per (species, fold), one genic-F1 column per validation set.
pivotted = dat.pivot(index=['species', 'fold'], columns='valset', values='genic_f1')

In [None]:
# Sorted per-row difference in genic F1 between the 'eval' and 'all' columns.
f1_gap = pivotted['eval'] - pivotted['all']
plt.bar(x=range(len(pivotted)), height=np.sort(f1_gap))
plt.xticks([])
plt.ylabel("genic F1(eval - all)")
plt.xlabel("species, sorted")

In [None]:
# One colour per fold, looked up by fold number; fold 1 gets a different
# colour (#D3212D) from the other three so its bars stand out.
fold_colours = ['#5D8AA8', '#D3212D', '#5D8AA8', '#5D8AA8']
mycol = np.array([fold_colours[fold] for _species, fold in pivotted.index])

# Per-(species, fold) difference in genic F1 between the 'eval' and 'train' columns.
differences = pivotted['eval'] - pivotted['train']

# Compute the sort order once and reuse it, so bar heights and bar colours are
# guaranteed to be in the same order.
order = np.argsort(differences.to_numpy())
plt.bar(height=differences.to_numpy()[order], x=range(pivotted.shape[0]),
        color=mycol[order])
# The label names the quantity actually plotted (eval - train).
plt.ylabel("genic F1(eval - train)")
plt.xticks([])
plt.xlabel("species, sorted")

In [None]:
# The 20 smallest differences with their row labels.
# sort_values() orders by value and keeps the labels attached; this avoids
# passing an integer position array to Series[...], which on a non-integer
# index relies on a deprecated positional fallback.
differences.sort_values().head(20)


In [None]:
# Same comparison restricted to folds 0, 2 and 3.
subdat = dat.loc[dat['fold'].isin([0, 2, 3]), :]
# NOTE: this rebinds `pivotted`, now indexed by species only.
pivotted = subdat.pivot(columns='valset', index='species', values='genic_f1')
plt.bar(height=np.sort(pivotted['eval'] - pivotted['train']), x=range(pivotted.shape[0]))
# The label names the quantity actually plotted (eval - train).
plt.ylabel("genic F1(eval - train)")
plt.xticks([])
plt.xlabel("species, sorted")

In [None]:
# Same comparison restricted to fold 1.
subdat = dat.loc[dat['fold'].isin([1]), :]
# NOTE: this rebinds `pivotted`, now indexed by species only.
pivotted = subdat.pivot(columns='valset', index='species', values='genic_f1')
plt.bar(height=np.sort(pivotted['eval'] - pivotted['train']), x=range(pivotted.shape[0]))
# The label names the quantity actually plotted (eval - train).
plt.ylabel("genic F1(eval - train)")
plt.xticks([])
plt.xlabel("species, sorted")

In [None]:
# Load the epochs-to-best table; the first row of the file is used as the header.
epochs = pd.read_csv(filepath_or_buffer='data_splits/valmix/epochs_to_best.csv')

In [None]:
# Display the whole epochs table as the cell output.
epochs

In [None]:
################ more vs quality
# uses valmix eval as the 'quality'
# 'more' swaps the train/val species that were used

In [None]:
def to_moqsmall_fold(model_file):
    """Split a more-vs-quality model file name into ``(name, fold)``.

    The tokens ``'best_model_'``, ``'.h5'`` and ``'_drop'`` are removed (in
    that order, wherever they occur). The last remaining character is parsed
    as the integer fold; the rest is the name, minus one trailing underscore
    if it has one.

    Raises ``IndexError`` if nothing is left before or after taking the fold
    character, and ``ValueError`` if the fold character is not a digit.
    """
    tag = model_file
    for token in ('best_model_', '.h5', '_drop'):
        tag = tag.replace(token, '')
    fold = int(tag[-1])
    name = tag[:-1]
    # Names may be written as '<name>_<fold>' or '<name><fold>'; drop the separator.
    return (name[:-1] if name[-1] == "_" else name), fold

In [None]:
# Load the headerless F1 table for the "moq smalls" models.
moqsmalls = pd.read_csv(filepath_or_buffer="data_splits/f1_moq_smalls.csv", header=None)

In [None]:
# Display the whole moqsmalls frame as the cell output.
moqsmalls

In [None]:
# NOTE(review): scratch list; the only later uses of a module-level `x` visible
# here are the x.pop(-1) and bare `x` cells -- looks like a leftover experiment.
x = [3,5,76]

In [None]:
# Removes and returns the last element of x. This mutates x in place, so
# re-running the cell removes one more element each time.
x.pop(-1)

In [None]:
# Display the current contents of x.
x

In [None]:
# The CSV was read without a header row, so name its four columns here.
f1_columns = ['model', 'species', 'genic_f1', 'sub_genic_f1']
moqsmalls.columns = f1_columns

In [None]:
# Parse each model file name into (valset, fold) and store both as new columns.
moq_pairs = list(map(to_moqsmall_fold, moqsmalls["model"]))
moqsmalls.loc[:, ['valset', 'fold']] = moq_pairs

In [None]:
# Load the metaok F1 table too and give it the same layout as moqsmalls:
# four named columns plus the (valset, fold) pair parsed from the model file name.
metaok = pd.read_csv(filepath_or_buffer="data_splits/f1_metaok.csv", header=None)
metaok.columns = ['model', 'species', 'genic_f1', 'sub_genic_f1']
metaok_pairs = list(map(to_moqsmall_fold, metaok["model"]))
metaok.loc[:, ['valset', 'fold']] = metaok_pairs

In [None]:
# Display the whole metaok frame as the cell output.
metaok

In [None]:
# Put together just the pair to compare whether training with more species or
# with the quality-selected species generalizes/performs better on left-out
# test species (plus the metaok results).
# ignore_index=True gives the combined frame unique row labels; otherwise the
# labels of the three source frames overlap.
moq = pd.concat(
    [
        dat.loc[dat["valset"] == "eval", :],
        moqsmalls.loc[moqsmalls["valset"] == "drop", :],
        metaok,
    ],
    ignore_index=True,
)

In [None]:
# Rename the 'valset' column to 'trainers'. Renaming by name instead of by
# position 4 cannot silently relabel a different column if the column order
# ever changes.
moq = moq.rename(columns={"valset": "trainers"})

In [None]:
# Translate the raw set names into display labels; a name missing from the
# mapping raises KeyError.
trainer_labels = {"drop": "more", "eval": "qual", "metaok": "metaok"}
moq["trainers"] = [trainer_labels[raw] for raw in moq["trainers"]]

In [None]:
# Genic F1 per fold, with one box per 'trainers' group.
fig, ax = plt.subplots(figsize=(10, 10))
ax = sns.boxplot(data=moq, x='fold', y='genic_f1', hue='trainers', ax=ax)
# OK, then, inconclusiveness FTW -_-

In [None]:
# Display only the rows whose validation set is "train".
is_train = dat["valset"] == "train"
dat.loc[is_train, :]
#dat

In [None]:
####### more only, dataset size
# Work on an independent copy so that edits to `smalls` never touch `moqsmalls`.
smalls = moqsmalls.copy()
# Rename 'valset' to 'train_size' by name rather than by column position, so a
# change in column order cannot relabel the wrong column.
smalls = smalls.rename(columns={"valset": "train_size"})

In [None]:
# Relabel the "drop" rows as "large" in the train_size column.
is_drop = smalls["train_size"] == "drop"
smalls.loc[is_drop, "train_size"] = "large"


In [None]:
# Display smalls ordered by train_size; sort_values returns a new frame, so
# `smalls` itself is left unsorted.
smalls.sort_values(by="train_size")

In [None]:
# Genic F1 per fold, with one box per training-set size; rows are sorted by
# train_size before plotting.
fig, ax = plt.subplots(figsize=(10, 10))
smalls_sorted = smalls.sort_values(by="train_size")
ax = sns.boxplot(data=smalls_sorted, x='fold', y='genic_f1', hue='train_size', ax=ax)