# maxsmi
## Analysis of results

This notebook serves to analyse the results of the simulations run on the Curta cluster.

## Prerequisites
This notebook requires that some simulations have already been run, e.g.
```
(maxsmi) $ python maxsmi/full_workflow.py --task ESOL --aug-strategy-train augmentation_with_duplication --aug-nb-train 10 --aug-nb-test 10

```

Have a look at the [README](https://github.com/t-kimber/maxsmi/blob/main/README.md) page for more details.

In [1]:
# One-off setup: uncomment the next line to install the linting tools
#  !pip install flake8 pycodestyle_magic
# Load the pycodestyle_magic extension and turn its checks on
# (presumably applied to every cell executed afterwards -- TODO confirm)
%load_ext pycodestyle_magic
%pycodestyle_on

In [2]:
from maxsmi.utils_analysis import load_results, retrieve_metric
import pandas as pd
from IPython.display import display
import matplotlib.pyplot as plt
import numpy as np

In [3]:
# Lift the pandas display limits on rows and columns so that the data
# frame holding the full grid is shown without truncation
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

In [4]:
# Name of the data set to analyse; passed as `task` to array_by_strategy
# and shown in the caption of the results table
TASK = "ESOL"

In [5]:
# Augmentation numbers: every integer from 0 to 20 (fine) and every
# multiple of ten from 0 to 100 (coarse)
fine_grid = list(range(21))
coarse_grid = list(range(0, 110, 10))

# The full grid is the fine grid followed by the tens from 30 to 100
temp_grid = list(range(30, 110, 10))
full_grid = [*fine_grid, *temp_grid]

In [6]:
def array_by_strategy(augmentation_strategy,
                      task="ESOL",
                      set_="test",
                      metric="rmse",
                      grid=full_grid):
    """
    Collect one metric per augmentation number and model for a strategy.

    Parameters
    ----------
    augmentation_strategy : str
        Name of the augmentation strategy. It is passed to
        `retrieve_metric` twice (presumably once for the train set and
        once for the test set -- TODO confirm against `retrieve_metric`).
    task : str, default "ESOL"
        Task whose results are read. For the
        "augmentation_maximum_estimation" strategy, "ESOL" is looked up
        under the name "ESOL_SMALL".
    set_ : str, default "test"
        Forwarded unchanged to `retrieve_metric`.
    metric : str, default "rmse"
        Forwarded unchanged to `retrieve_metric`.
    grid : sequence, default `full_grid`
        Augmentation numbers to look up, one row each. Ignored for
        "augmentation_maximum_estimation", which is only looked up for
        the augmentation number 10.

    Returns
    -------
    numpy.ndarray
        Array with one row per augmentation number and one column per
        model, in the order CONV1D, CONV2D, RNN. Entries for which
        `retrieve_metric` raises `FileNotFoundError` are NaN.
    """

    models = ["CONV1D", "CONV2D", "RNN"]

    if augmentation_strategy == "augmentation_maximum_estimation":
        # This strategy is read for a single augmentation number only,
        # and its ESOL results are looked up as "ESOL_SMALL"
        if task == "ESOL":
            task = "ESOL_SMALL"
        augmentation_numbers = [10]
    else:
        augmentation_numbers = grid

    # Start from NaN so that combinations without a result stay marked
    # as missing
    result_array = np.full(
        (len(augmentation_numbers), len(models)), np.nan
    )

    for col, model in enumerate(models):
        for row, augmentation_num in enumerate(augmentation_numbers):
            try:
                # Strategy and number are given twice, as in the
                # original call signature of retrieve_metric
                result_array[row, col] = retrieve_metric(
                    metric,
                    set_,
                    task,
                    augmentation_strategy,
                    augmentation_num,
                    augmentation_strategy,
                    augmentation_num,
                    model,
                )
            except FileNotFoundError:
                # No result for this combination: keep the NaN
                continue
    return result_array

In [7]:
# One result array per augmentation strategy, using the default set,
# metric and grid of array_by_strategy (rows: augmentation numbers,
# columns: models)
res_without_dupl = array_by_strategy(
    augmentation_strategy="augmentation_without_duplication",
    task=TASK,
)
res_with_dupl = array_by_strategy(
    augmentation_strategy="augmentation_with_duplication",
    task=TASK,
)
res_with_red_dupl = array_by_strategy(
    augmentation_strategy="augmentation_with_reduced_duplication",
    task=TASK,
)
res_max_est = array_by_strategy(
    augmentation_strategy="augmentation_maximum_estimation",
    task=TASK,
)

In [8]:
# Stack the per-strategy arrays row-wise into a single array; the order
# used here is the order the row labels of the data frame must follow
full_res = np.concatenate(
    (res_without_dupl, res_with_dupl, res_with_red_dupl, res_max_est),
    axis=0,
)

In [9]:
# Augmentation numbers used to label the rows of the results table; this
# has to be the grid the result arrays were computed with (full_grid is
# the default `grid` of array_by_strategy)
grid = full_grid

In [10]:
# First index level: the strategy name, repeated once per grid point;
# the maximum-estimation strategy contributes a single row
index_list = (
    ["augmentation_without_duplication"] * len(grid)
    + ["augmentation_with_duplication"] * len(grid)
    + ["augmentation_with_reduced_duplication"] * len(grid)
    + ["augmentation_maximum_estimation"]
)

# Second index level: the augmentation number of each row
index_nb = [*grid, *grid, *grid, 10]

# Results table with a two-level row index and one column per model
df = pd.DataFrame(
    data=full_res,
    index=[index_list, index_nb],
    columns=["CONV1D", "CONV2D", "RNN"],
)

In [11]:
# Styled view of the results table: three decimals, one colour gradient
# per model column, and the minimum values highlighted in yellow
df2 = (
    df.style
    .set_caption(f"Data: {TASK}")
    .format("{:.3f}")
    .background_gradient(cmap="Purples", subset=["CONV1D"])
    .background_gradient(cmap="Greens", subset=["CONV2D"])
    .background_gradient(cmap="Blues", subset=["RNN"])
    .highlight_min(color="yellow")
)
df2

Unnamed: 0,Unnamed: 1,CONV1D,CONV2D,RNN
augmentation_without_duplication,0,0.839,0.895,0.93
augmentation_without_duplication,1,0.964,1.009,1.016
augmentation_without_duplication,2,0.785,0.787,0.964
augmentation_without_duplication,3,0.785,0.726,0.896
augmentation_without_duplication,4,0.732,0.761,0.881
augmentation_without_duplication,5,0.716,0.748,0.791
augmentation_without_duplication,6,0.666,0.743,0.788
augmentation_without_duplication,7,0.66,0.676,0.773
augmentation_without_duplication,8,0.712,0.692,0.743
augmentation_without_duplication,9,0.642,0.761,0.727


In [12]:
def best_strategy_per_model(model, dataframe):
    """
    Find the row with the smallest value in one model's column.

    Parameters
    ----------
    model : str
        Name of the column of `dataframe` to search.
    dataframe : pandas.DataFrame
        Table of values with one column per model.

    Returns
    -------
    tuple
        The index label of the row holding the column minimum, followed
        by that minimum rounded to three decimals.
    """
    scores = dataframe[model]
    # argmin gives a position, which is translated into the row label
    best_label = dataframe.index[scores.argmin()]
    return best_label, np.round(scores.min(), 3)

In [13]:
# Best row label and value of every model, printed on a single line
print(*(best_strategy_per_model(model_name, df)
        for model_name in ("CONV1D", "CONV2D", "RNN")))

(('augmentation_with_reduced_duplication', 70), 0.569) (('augmentation_with_reduced_duplication', 14), 0.631) (('augmentation_with_duplication', 70), 0.589)
