In [1]:
import itertools
import logging
import os
import glob
import re

from collections import defaultdict
from functools import reduce
from pprint import pprint, pformat
from os.path import basename

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [3]:
input_path = 'output/'
output_path = 'output/images/'
to_process = {
    'FLIPFLOP': {
        'path': 'FLIPFLOP_OLD',
        'nn_curve': False,
        'multiple_trials': True
    },
    'TSP': {
        'path': 'TSP_OLD',
        'nn_curve': False,
        'multiple_trials': True
    },
    'CONTPEAKS': {
        'path': 'CONTPEAKS_OLD',
        'nn_curve': False,
        'multiple_trials': True
    },
    'NN': {
        'path': 'NN_OUTPUT',
        'nn_curve': True,
        'multiple_trials': False
    }
}


In [26]:

def find_best_results(base_dir, problem_name, nn_curve=False, multiple_trials=False):
    output_file_name_regex = re.compile('{}_([A-Za-z]+)(.*)_LOG\.csv'.format(problem_name))
    output_files = glob.glob('{}/{}_*_LOG.csv'.format(base_dir, problem_name))
    files = {}
    for output_file in output_files:
        base_file_name = basename(output_file)
        algo, params = output_file_name_regex.search(base_file_name).groups()
        params = list(filter(None, params.split('_')))

        if algo not in files:
            files[algo] = {'files': [], 'best': 0, 'worst': 100000000000000}

        df = read_data_file(output_file, nn_curve=nn_curve)
        if nn_curve:
            # best_value = np.max(np.max(df[['f1_tst']]))
            best_value = np.max(np.max(df[['acc_tst']]))
        else:
            best_value = np.max(df['fitness'])

        if best_value > files[algo]['best']:
            if nn_curve:
                files[algo]['best'] = best_value
                files[algo]['files'] = [output_file]
            else:
                # Read all the trials for this algo and param set
                params = '_'.join(params[0:-1])
                similar_files = glob.glob('{}/{}_{}{}*_LOG.csv'.format(base_dir, problem_name, algo, params))

                # TODO: This is super inefficient ... but... maybe fine? ¯\_(ツ)_/¯
                # Double-check it's still the best
                fitness_data = df['fitness']
                curr_max = 0
                curr_min = 100000000000000
                for i, f in enumerate(list(similar_files)):
                    file_fitness_data = read_data_file(f, nn_curve=nn_curve)[['fitness']]
#                     fitness_data = pd.concat([fitness_data, file_fitness_data], axis=1)
                    max_v = np.max(file_fitness_data)[0]
#                     print(max_v[0])
                    if curr_max < max_v:
                        curr_max = max_v
                    if curr_min > max_v:
                        curr_min = max_v

#                 curr_max = np.max(fitness_data, axis=1))
                
                if curr_max > files[algo]['best']:
                    files[algo]['best'] = curr_max
                    files[algo]['files'] = list(similar_files)
                if curr_min < files[algo]['worst']:
                    files[algo]['worst'] = curr_min

#     logger.info(pformat(files))
#     with open(input_path + '/best_results_raw.txt', 'a+') as f:
#         f.write("---------- {} ----------\n".format(problem_name))
#         f.write(pformat(files) + '\n')

#     return dict(list(map(lambda k: (k, files[k]['files']), files)))
    return files



In [27]:
find_best_results('./output/CONTPEAKS_OLD/', 'CONTPEAKS', False, True)

{'RHC': {'files': ['./output/CONTPEAKS_OLD/CONTPEAKS_RHC_3_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_RHC_2_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_RHC_1_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_RHC_4_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_RHC_5_LOG.csv'],
  'best': 99.0,
  'worst': 64.0},
 'MIMIC': {'files': ['./output/CONTPEAKS_OLD/CONTPEAKS_MIMIC100_50_0.5_5_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_MIMIC100_50_0.5_4_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_MIMIC100_50_0.5_2_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_MIMIC100_50_0.5_3_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_MIMIC100_50_0.5_1_LOG.csv'],
  'best': 99.0,
  'worst': 38.0},
 'SA': {'files': ['./output/CONTPEAKS_OLD/CONTPEAKS_SA0.95_3_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_SA0.95_2_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_SA0.95_1_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_SA0.95_4_LOG.csv',
   './output/CONTPEAKS_OLD/CONTPEAKS_SA0.95_5_LOG.csv'],
  'best': 

In [9]:
def read_data_file(file, nn_curve=False):
    #     logger.info("    - Processing {}".format(file))
    df = pd.read_csv(file)
    if 'iterations' not in df.columns:
        df = df.rename(columns={'iteration': 'iterations'})

    df = df.set_index('iterations')
    # Trim the nn graphs to the first 1k iterations, as after that the graphs flatten out
    # if nn_curve:
    #     df = df[df.index <= 2000]

    return df