In [1]:
import os
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
repo_path = '/cluster/tufts/hugheslab/eharve06/bdl-transfer-learning'
experiments_path = os.path.join(repo_path, 'experiments/CIFAR-10')

In [3]:
def get_df(path):
    df = pd.read_csv(path, index_col='Unnamed: 0')
    return df

def get_best_epoch(df, window_size):
    val_acc = df.val_acc.values
    windowed_acc = [sum(val_acc[index-window_size:index])/window_size \
                      for index in range(window_size, len(val_acc))]
    return df.iloc[window_size+np.argmax(windowed_acc)]

In [4]:
lr_0s = np.logspace(-1, -2, num=2)
ns = [10, 100, 1000, 10000, 50000]
random_states = [1001]
weight_decays = np.append(np.logspace(-2, -6, num=5), 0)
window_size = 5

In [5]:
columns = ['n', 'random_state', 'method', 'test_acc', 'test_loss', 'test_nll', 
           'test_prior', 'train_acc', 'train_loss', 'train_nll', 'train_prior', 
           'val_acc', 'val_loss', 'val_nll', 'val_prior']
df = pd.DataFrame(columns=columns)

for n, random_state in itertools.product(ns, random_states):
    # Get best model
    best_row = None
    for lr_0, weight_decay in itertools.product(lr_0s, weight_decays):
        df_path = '{}/nonlearned_lr_0={}_n={}_random_state={}_weight_decay={}.csv'\
        .format(experiments_path, lr_0, n, random_state, weight_decay)
        row = get_best_epoch(get_df(df_path), window_size=window_size)
        if best_row is None: best_row = row
        if row['val_acc'] > best_row['val_acc']: best_row = row
    # Append best_model to df
    row = [n, random_state, 'nonlearned', best_row.test_acc, best_row.test_loss, 
           best_row.test_nll, best_row.test_prior, best_row.train_acc, 
           best_row.train_loss, best_row.train_nll, best_row.train_prior, 
           best_row.val_acc, best_row.val_loss, best_row.val_nll, 
           best_row.val_prior]
    df.loc[df.shape[0]] = row
# TODO: If more seeds are added average over seeds
#df = df.groupby(['n', 'method']).agg(lambda x: list(x.values))
#columns = ['test_acc', 'train_acc', 'val_acc']
#for column in columns:
#    df['{}_std'.format(column)] = df[column].apply(lambda item: np.std(item))
#    df[column] = df[column].apply(lambda item: np.mean(item))
#df = df.reset_index()

In [6]:
df[['n', 'random_state', 'method', 'train_acc', 'val_acc', 'test_acc']]

Unnamed: 0,n,random_state,method,train_acc,val_acc,test_acc
0,10,1001,nonlearned,0.166667,0.5,0.09984
1,100,1001,nonlearned,1.0,0.555556,0.549444
2,1000,1001,nonlearned,1.0,0.798122,0.80427
3,10000,1001,nonlearned,1.0,0.937957,0.930213
4,50000,1001,nonlearned,1.0,0.96712,0.964901


In [40]:
df[['n', 'random_state', 'method', 'train_acc', 'train_acc_std', 'val_acc', 
    'val_acc_std', 'test_acc', 'test_acc_std']]

KeyError: "['train_acc_std' 'val_acc_std' 'test_acc_std'] not in index"