In [209]:
import glob
import pandas as pd
import re

In [210]:
import os
import json

In [220]:
dirlist = glob.glob('output_*')

In [221]:
adam_regex = re.compile('adam_epsilon=[0-9e\-]+')
checkpoint_regex = re.compile('checkpoint=[\'\.\/a-zA-Z0-9_]+')
config_regex = re.compile('config[\=\'\.\/a-zA-Z0-9_]+')
doc_stride_regex = re.compile('doc_stride=[0-9]+')
learning_rate_regex = re.compile('learning_rate=[0-9e\-]+')
max_answer_length_regex = re.compile('max_answer_length=[0-9]+')
max_grad_norm_regex = re.compile('max_grad_norm=[0-9\.]+')
max_query_length_regex = re.compile('max_query_length=[0-9]+')
max_seq_length_regex = re.compile('max_seq_length=[0-9]+')
n_best_size_regex = re.compile('n_best_size=[0-9]+')
num_train_epochs_regex = re.compile('num_train_epochs=[0-9\.]+')
per_gpu_train_batch_size_regex = re.compile('train_batch_size=[0-9]+')
seed_regex = re.compile('seed=[0-9]+')
warmup_proportion_regex = re.compile('warmup_proportion=[0-9\.]+')
weight_decay_regex = re.compile('weight_decay')

def get_train_hyperparams(key, path):

    with open(path) as param:
        parameters = param.read()       

        adam_epsilon = adam_regex.findall(parameters)[0]        
        checkpoint = checkpoint_regex.findall(parameters)[0]        
        config_file = config_regex.findall(parameters)[0]        
        doc_stride = doc_stride_regex.findall(parameters)[0]        
        gradient_acc_steps = gradient_acc_steps_regex.findall(parameters)[0]        
        learning_rate = learning_rate_regex.findall(parameters)[0]        
        max_answer_length = max_answer_length_regex.findall(parameters)[0]        
        max_grad_norm = max_grad_norm_regex.findall(parameters)[0]        
        max_query_length = max_query_length_regex.findall(parameters)[0]        
        max_seq_length = max_seq_length_regex.findall(parameters)[0]        
        n_best_size = n_best_size_regex.findall(parameters)[0]        
        num_train_epochs = num_train_epochs_regex.findall(parameters)[0]        
        per_gpu_train_batch_size = per_gpu_train_batch_size_regex.findall(parameters)[0]        
        seed = seed_regex.findall(parameters)[0]        
        warmup_proportion = warmup_proportion_regex.findall(parameters)[0]
        
    hyperparams = {
        'key': key,
        'adam_epsilon':adam_epsilon.split('=')[1],
        'checkpoint':checkpoint.split('=')[1],
        'config_file':config_file.split('=')[1],
        'doc_stride':doc_stride.split('=')[1],
        'learning_rate':learning_rate.split('=')[1],
        'max_answer_length': max_answer_length.split('=')[1],
        'max_grad_norm': max_grad_norm.split('=')[1],
        'max_query_length': max_query_length.split('=')[1],
        'max_seq_length': max_seq_length.split('=')[1],
        'n_best_size': n_best_size.split('=')[1],
        'num_train_epochs': num_train_epochs.split('=')[1],
        'train_batch_size': per_gpu_train_batch_size.split('=')[1],
        'seed': seed.split('=')[1],
        'warmup_proportion': warmup_proportion.split('=')[1]        
    }
    
    return hyperparams

def get_eval_hyperparams(key, path):
    
    with open(path) as param:
        param_json = json.load(param)
        parameters = param_json['parameters']
        
        checkpoint = checkpoint_regex.findall(parameters)[0]
        doc_stride = doc_stride_regex.findall(parameters)[0]
        max_answer_length = max_answer_length_regex.findall(parameters)[0]
        max_seq_length = max_seq_length_regex.findall(parameters)[0]
        n_best_size = n_best_size_regex.findall(parameters)[0]
        seed = seed_regex.findall(parameters)[0]
        
        result = param_json['result']        
                
    return {
        'key': key,
        'eval_checkpoint': checkpoint.split('=')[1],
        'eval_doc_stride': doc_stride.split('=')[1],
        'eval_max_answer_length': max_answer_length.split('=')[1],
        'eval_max_seq_length': max_seq_length.split('=')[1],
        'eval_n_best_size': n_best_size.split('=')[1],
        'eval_seed': seed.split('=')[1],
        'em': result['exact_match'],
        'f1': result['f1']
    }

        
def get_eval_result(path):
    
    with open(path) as param:
        param_json = json.loads(param)
        
        result = param_json['result']
        
    return result
        

In [222]:
def build_experiment_info(path):
    checkpoints = glob.glob(os.path.join(dirpath, 'korquad*.bin'))
    num_checkpoints = len(checkpoints)
    
    #read/parsing hyperparameter file
    train_param = get_train_hyperparams(path, os.path.join(path, 'hyperparameters.txt'))
#     print(train_param)
    #read/parsing evaluation file
    eval_param = get_eval_hyperparams(path, os.path.join(path, 'evaluation.txt'))
#     print(eval_param)
    
    train_df = pd.DataFrame.from_dict([train_param])
    
    eval_df = pd.DataFrame.from_dict([eval_param])
    
    _df = pd.merge(train_df, eval_df)    
    
    return _df        

In [223]:
df = build_experiment_info(dirlist[0])
for dirpath in dirlist[1:]:
    print(dirpath)
    _df = build_experiment_info(dirpath)    
    
    df = pd.concat([df, _df])  


output_20200611_02
output_20200609_03
output_20200609_04
output_20200604_03
output_20200609_02
output_20200610_01
output_20200604_04
output_20200605_01
output_20200611_01
output_20200608_01
output_20200608_02
output_20200604_02
output_20200603_01
output_20200604_01


In [224]:
key_list = df.keys()

In [225]:
pd.options.display.max_columns = None

In [226]:
df.drop(columns=['checkpoint', 'config_file'], inplace=True)

In [227]:
df.sort_values(by=['key'], ascending=True, inplace=True)

In [228]:
df

Unnamed: 0,key,adam_epsilon,doc_stride,learning_rate,max_answer_length,max_grad_norm,max_query_length,max_seq_length,n_best_size,num_train_epochs,train_batch_size,seed,warmup_proportion,eval_checkpoint,eval_doc_stride,eval_max_answer_length,eval_max_seq_length,eval_n_best_size,eval_seed,em,f1
0,output_20200603_01,1e-06,128,5e-05,30,1.0,64,512,20,16.0,16,42,0.1,'output/korquad_15.bin',64,30,512,20,42,77.225494,87.399047
0,output_20200604_01,1e-06,128,5e-05,30,1.0,96,512,20,4.0,16,42,0.1,'output/korquad_3.bin',86,30,512,128,42,78.662972,88.399031
0,output_20200604_02,1e-06,128,5e-05,30,1.0,96,512,20,4.0,16,42,0.1,'output/korquad_7.bin',64,30,512,20,42,78.108763,88.003316
0,output_20200604_03,1e-06,128,5e-05,30,1.0,96,512,20,4.0,32,42,0.1,'output/korquad_3.bin',64,30,512,20,42,77.485279,87.386134
0,output_20200604_04,1e-06,64,5e-05,30,1.0,96,512,20,4.0,32,42,0.1,'output/korquad_3.bin',64,30,512,20,42,77.260132,87.459613
0,output_20200605_01,1e-06,128,5e-05,17,1.0,96,512,20,4.0,16,42,0.1,'output/korquad_3.bin',64,30,512,20,42,78.628334,88.393956
0,output_20200608_01,1e-06,128,5e-05,30,1.0,96,512,40,4.0,16,42,0.1,'output/korquad_3.bin',64,30,512,40,42,78.628334,88.393956
0,output_20200608_02,1e-06,128,2e-05,30,1.0,96,512,20,8.0,16,42,0.1,'output/korquad_7.bin',64,30,512,20,42,77.918254,87.876529
0,output_20200609_01,1e-06,128,5e-05,30,1.0,96,512,20,4.0,32,42,0.1,'output/korquad_3.bin',128,30,512,128,42,77.46796,87.401976
0,output_20200609_02,1e-06,128,5e-05,30,1.0,96,512,20,4.0,32,42,0.1,'output/korquad_3.bin',128,30,512,128,42,77.46796,87.401976
