In [1]:
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [7]:
def get_df(path):
    df = pd.read_csv(path, index_col='Unnamed: 0')
    return df

def get_best_epoch(df, window_size):
    val_nll = df.val_nll.values
    windowed_nll = [sum(val_nll[index-window_size:index])/window_size \
                      for index in range(window_size, len(val_nll))]
    return df.iloc[window_size+np.argmin(windowed_nll)]

In [8]:
repo_path = '/cluster/tufts/hugheslab/eharve06/bdl-transfer-learning'
experiments_path = os.path.join(repo_path, 'experiments/HAM10000/8:1:1')

In [9]:
random_states = [1001, 2001, 3001]
#random_states = [4001, 5001, 6001]
prior_scales = np.logspace(0, 9, num=10)
lr_0s = np.logspace(-1, -4, num=4)
weight_decays = np.append(np.logspace(-2, -6, num=5), 0)
window_size = 5

In [1]:
columns = ['random_state', 'method', 'test_auroc', 'test_loss', 'test_nll', 
           'test_prior', 'train_auroc', 'train_loss', 'train_nll', 'train_prior', 
           'val_auroc', 'val_loss', 'val_nll', 'val_prior']
df = pd.DataFrame(columns=columns)

for (random_state,) in itertools.product(random_states):
    # Get best model
    best_row = None
    for lr_0, weight_decay in itertools.product(lr_0s, weight_decays):
        df_path =  '{}/nonlearned_lr_0={}_n=1000_random_state={}_weight_decay={}.csv'\
        .format(experiments_path, lr_0, random_state, weight_decay)
        row = get_best_epoch(get_df(df_path), window_size=window_size)
        if best_row is None: best_row = row
        if row['val_nll'] < best_row['val_nll']: best_row = row
    # Append best_model to df
    row = [random_state, 'nonlearned', best_row.test_auroc, best_row.test_loss, 
           best_row.test_nll, best_row.test_prior, best_row.train_auroc, 
           best_row.train_loss, best_row.train_nll, best_row.train_prior, 
           best_row.val_auroc, best_row.val_loss, best_row.val_nll, 
           best_row.val_prior]
    df.loc[df.shape[0]] = row
    # Get best model
    best_row = None
    for lr_0, prior_scale, weight_decay in itertools.product(lr_0s, prior_scales, weight_decays):
        df_path =  '{}/learned_lr_0={}_n=1000_prior_scale={}_random_state={}_weight_decay={}.csv'\
        .format(experiments_path, lr_0, prior_scale, random_state, weight_decay)
        row = get_best_epoch(get_df(df_path), window_size=window_size)
        if best_row is None: best_row = row
        if row['val_nll'] < best_row['val_nll']: best_row = row
    # Append best_model to df
    row = [random_state, 'learned', best_row.test_auroc, best_row.test_loss, 
           best_row.test_nll, best_row.test_prior, best_row.train_auroc, 
           best_row.train_loss, best_row.train_nll, best_row.train_prior, 
           best_row.val_auroc, best_row.val_loss, best_row.val_nll, 
           best_row.val_prior]
    df.loc[df.shape[0]] = row
# TODO: If more seeds are added average over seeds
df = df.groupby('method').agg(lambda x: list(x))
columns = ['test_auroc', 'train_auroc', 'val_auroc']
for column in columns:
    df['{}_std'.format(column)] = df[column].apply(lambda item: np.std(item))
    df[column] = df[column].apply(lambda item: np.mean(item))
df = df.reset_index()

NameError: name 'pd' is not defined

In [11]:
df[['random_state', 'method', 'train_auroc', 'train_auroc_std', 'val_auroc', 
    'val_auroc_std', 'test_auroc', 'test_auroc_std']]

Unnamed: 0,random_state,method,train_auroc,train_auroc_std,val_auroc,val_auroc_std,test_auroc,test_auroc_std
0,"[1001, 2001, 3001]",learned,1.0,0.0,0.885,0.038258,0.846525,0.022369
1,"[1001, 2001, 3001]",nonlearned,0.980668,0.026712,0.856886,0.03602,0.84951,0.003192
