## Analyze the Collected Results

Get the following metrics for each experiment:
  * total running time
  * deployment time (first run's start time - job submission time)
  * clean-up time (job finish time - last run's finish time)
  * best metric on validation set
  * metric on test set

In [None]:
import os
import pandas as pd
import papermill as pm

In [None]:
# Folder whose immediate sub-folders are scanned for per-experiment results.
RESULT_DIR = './log_old'
# Summary columns averaged over the repeated experiments.
COLUMN_NAMES = [
    "best_metric",
    "test_metric",
    "total_time",
    "deployment_time",
    "clean_up_time",
]

### Repeat experiments

In [None]:
# Execute the experiment notebook five times in a row. Every run writes its
# executed copy to the same output.ipynb, so only the last copy remains.
notebook_dir = os.getcwd()
original_notebook_path = os.path.join(notebook_dir, 'azureml_hyperdrive_surprise_svd_experiment.ipynb')
output_notebook_path = os.path.join(notebook_dir, 'output.ipynb')

for i in range(5):
    print('Experiment', i)
    pm.execute_notebook(original_notebook_path, output_notebook_path)

### Get average metrics

In [None]:
def combine_results(result_dir, column_names, only_first_row=True):
    """Combine the ``results.csv`` files of all the repeated experiments.

    Every immediate sub-folder of ``result_dir`` is treated as one experiment.
    Sub-folders are visited in sorted order so that the ``experiment_id``
    assigned to each one is reproducible between calls. Sub-folders whose
    ``results.csv`` is missing, unreadable or has no rows are skipped; their
    position still counts towards the numbering.

    Args:
        result_dir (str): Directory containing one sub-folder per experiment.
        column_names (list of str): Columns to keep from each ``results.csv``.
        only_first_row (bool): If True, keep only the first row of each file;
            otherwise keep every row.

    Returns:
        pandas.DataFrame: Columns ``["experiment_id"] + column_names`` with a
        fresh 0..n-1 index. Empty when nothing could be loaded.

    Raises:
        KeyError: If a non-empty ``results.csv`` lacks one of ``column_names``.
    """
    selected_columns = ["experiment_id"] + list(column_names)

    # os.walk yields nothing for a missing directory; the default keeps
    # next() from raising StopIteration in that case.
    _, sub_folders, _ = next(os.walk(result_dir), (result_dir, [], []))

    frames = []
    for idx, sub_folder in enumerate(sorted(sub_folders)):
        result_file_path = os.path.join(result_dir, sub_folder, "results.csv")
        try:
            results = pd.read_csv(result_file_path)
        except (OSError, ValueError):
            # OSError: file missing or unreadable. ValueError: covers pandas'
            # EmptyDataError / ParserError for empty or malformed files.
            continue
        if results.empty:
            # A header-only file contributes no rows (and has no first row).
            continue
        results["experiment_id"] = idx
        results = results[selected_columns]
        frames.append(results.head(1) if only_first_row else results)

    if not frames:
        return pd.DataFrame(columns=selected_columns)
    # Concatenate once at the end instead of growing a frame inside the loop.
    return pd.concat(frames, ignore_index=True)

In [None]:
# Average every summary metric over the first row of each experiment.
results_all = combine_results(RESULT_DIR, COLUMN_NAMES)
metric_means = results_all.loc[:, COLUMN_NAMES].mean(axis=0)
results_mean = metric_means.to_dict()
results_mean

### Plot accuracy vs. time

In [None]:
# Load every logged row (not just the first) of the two time-series columns.
results_all = combine_results(
    RESULT_DIR,
    ["time_since_start", "best_metric_so_far"],
    only_first_row=False,
)
results_all

In [None]:
# Smallest logged time within each experiment, averaged across experiments.
per_experiment = results_all.groupby(["experiment_id"])
results_groupby = per_experiment.min()
min_time_since_start = results_groupby["time_since_start"].mean()
min_time_since_start

In [None]:
# Largest logged time within each experiment, averaged across experiments.
per_experiment = results_all.groupby(["experiment_id"])
results_groupby = per_experiment.max()
max_time_since_start = results_groupby["time_since_start"].mean()
max_time_since_start

In [None]:
# Distinct logged times strictly between the averaged first and last time,
# in ascending order.
unique_times = sorted(
    t
    for t in set(results_all["time_since_start"])
    if min_time_since_start < t < max_time_since_start
)

In [None]:
def get_best_metric_so_far(x, time_since_start):
    """Return the largest ``best_metric_so_far`` logged at or before a time.

    Args:
        x (pandas.DataFrame): Rows with ``time_since_start`` and
            ``best_metric_so_far`` columns.
        time_since_start: Inclusive upper bound on ``time_since_start``.

    Returns:
        The maximum of ``best_metric_so_far`` over the selected rows; NaN
        when no row satisfies the bound.
    """
    logged_by_then = x["time_since_start"] <= time_since_start
    return x.loc[logged_by_then, "best_metric_so_far"].max()

# For each sampled time, take the best metric every experiment had logged by
# then and average those values across experiments.
experiments = results_all.groupby(["experiment_id"])
avg_best_metric = [
    experiments.apply(lambda x: get_best_metric_so_far(x, t)).mean()
    for t in unique_times
]

In [None]:
# Plot accuracy vs. time
%matplotlib inline
import matplotlib.pyplot as plt

plt.xlabel("time (s)")
plt.ylabel("precision_at_k")
plt.xlim((0, max(unique_times)))
plt.ylim((0, 0.1))
plt.title("HyperDrive")
plt.step(unique_times, avg_best_metric)
plt.show()