In [1]:
# io
import os, sys, time
import itertools, functools, collections

from natsort import natsorted

import csv

sys.path.append('convert_full_data')
sys.path.append('../regression-prior-networks')

# science utils
import numpy as np
import scipy
import pandas as pd

import sklearn
import sklearn.model_selection
import sklearn.preprocessing

# plot
import matplotlib.pyplot as plt
import seaborn as sns

#####################
# custom
import tss
from parallization_metacentrum import ArrayMetacentrum, IsMetacentrum

#####################
# pytorch & settings
# Expose no CUDA devices to this process. The variable is assigned before
# torch is imported, exactly as in the original cell ordering.
os.environ["CUDA_VISIBLE_DEVICES"] = ""
import torch

# Keep torch single-threaded, both inside one op and across independent ops.
torch.set_num_threads(1)
torch.set_num_interop_threads(1)

print("Using cuda" if torch.cuda.is_available() else "Using cpu")

Using cpu


In [2]:
class RecordedRun:
    """A recorded run loaded from an ``.npz`` archive and split into generations.

    After construction the instance exposes the raw arrays read from the
    archive, plus:

    * ``n_gen`` / ``n_points`` -- lengths of ``gen_split`` and ``points``;
    * ``all_gens`` -- one tuple ``(x_train, y_train, x_test, y_test, tss_mask,
      stats)`` per generation index ``1 .. n_gen - 1`` (see ``get_gen``);
    * ``stats_table`` -- a ``pandas.DataFrame`` built from the ``stats`` dicts.

    Parameters
    ----------
    npz_path : str or path-like
        Location of the ``.npz`` archive. It must contain the keys read in
        ``__init__`` (``dimensions``, ``function_id``, ``surrogate_data_*``,
        ``iruns``, ``evals``, ``points``, ``fvalues``, ``orig_evaled``,
        ``fvalues_orig``, ``gen_split``).
    info : dict
        Extra key/value pairs merged into every generation's ``stats`` dict.
    """

    def __init__(self, npz_path, info):
        super().__init__()

        # ``np.load`` on an .npz returns a lazy NpzFile that owns an open file
        # handle. Read every array inside the ``with`` block so the handle is
        # released right away instead of leaking once per processed run.
        with np.load(npz_path) as npz:
            self.dim = npz['dimensions']
            self.fun = npz['function_id']

            self.xmeans = npz['surrogate_data_means'].T
            self.sigmas = npz['surrogate_data_sigmas']
            self.bds = npz['surrogate_data_bds']
            self.iruns = npz['iruns']
            self.evals = npz['evals']
            self.points = npz['points']
            self.fvalues = npz['fvalues']
            self.orig = npz['orig_evaled']
            self.coco = npz['fvalues_orig']
            self.gen_split = npz['gen_split']

        self.n_gen = len(self.gen_split)
        self.n_points = len(self.points)

        # tuple: x_train, y_train, x_test, y_test, tss_mask, stats
        self.all_gens = [self.get_gen(gen_i, info) for gen_i in range(1, self.n_gen)]
        self.stats_table = pd.DataFrame([gen[-1] for gen in self.all_gens])

        # for key, val in info.items():
        #     self.stats_table[key] = val
        # self.stats_table['ins'] = ins

    def get_gen(self, gen_i, info):
        """Build the train/test data and statistics for generation ``gen_i``.

        Parameters
        ----------
        gen_i : int
            Index into ``gen_split`` (and ``xmeans`` / ``sigmas`` / ``bds`` /
            ``iruns``).
        info : dict
            Extra entries merged into the returned ``stats`` dict; they
            overwrite computed entries that share a key.

        Returns
        -------
        tuple
            ``(x_train, y_train, x_test, y_test, tss_mask, stats)`` where the
            test set is the slice of points belonging to this generation, the
            train set is every earlier point flagged in ``orig_evaled``, and
            ``tss_mask`` is the mask over the train set returned by
            ``tss.TSS2``.
        """
        # first point is initial guess
        # gen_split[0] = 0, gen_split[1] = 1, gen_split[2] = 1, ...
        low = self.gen_split[gen_i] + 1
        # The last generation has no successor entry; it runs to the end.
        high = self.gen_split[gen_i + 1] + 1 if gen_i + 1 < self.n_gen else self.n_points

        x_test = self.points[low:high]
        y_test = self.coco[low:high]

        # Archive: points before this generation that are flagged in ``orig``.
        o = self.orig[:low]
        x_train = self.points[:low][o]
        y_train = self.coco[:low][o]

        pop = x_test

        mean = self.xmeans[gen_i]
        sigma = self.sigmas[gen_i]
        bd = self.bds[gen_i]
        mahalanobis_transf = np.linalg.inv(bd * sigma)

        maximum_distance = 4  # trainRange
        maximum_number = int(20 * self.dim)

        # NOTE(review): presumably selects archive points close to the
        # population, capped at ``maximum_number`` -- confirm in ``tss.TSS2``.
        tss2 = tss.TSS2(pop, mahalanobis_transf, maximum_distance, maximum_number)
        tss_mask, _ = tss2(x_train, y_train)

        x_tss = x_train[tss_mask]
        y_tss = y_train[tss_mask]

        stats = {
            "gen_num": gen_i,
            "restarts": self.iruns[gen_i] - 1,
            "arch_len": len(x_train),
            "tss_len": np.count_nonzero(tss_mask),
            "var_y_tss": np.var(y_tss),

            "sigma": sigma,
            "bd": bd,
            "mean": mean,
            #"cond_num": condition_num(bd.T),
            #"fst_scnd_ratio": fst_scnd_ratio(bd.T),
            #"max_eigenvec": max(np.linalg.norm(bd.T, axis=1))
        }
        stats.update(info)

        return x_train, y_train, x_test, y_test, tss_mask, stats

In [3]:
# Local copy of the recorded runs; report how many files it contains.
run_folder = 'data_full/'
run_files = natsorted(os.listdir(run_folder))
print(len(run_files))

5280


### Helpers used for ensembles and distillation

In [4]:
from probabilistic_models import extend_info, \
    get_information_basic_model, get_information_ens, \
    get_information_distillation, get_information_double_distillation, \
    train_NLL_ensemble, train_ensemble, train_distillation, train_double_distillation

In [5]:
def train_all_models_on_specific_state(state):
    """Train every model variant on one generation and collect test metrics.

    Parameters
    ----------
    state : tuple
        ``(x_train, y_train, x_test, y_test, tss_mask, stats)``. Only the rows
        of the training arrays selected by ``tss_mask`` are used; ``stats`` is
        not read.

    Returns
    -------
    dict
        The sizes of the train/validation/test parts (``size_train``,
        ``size_validation``, ``size_test``) followed by whatever
        ``extend_info`` yields for the prefixes ``basic``, ``ensemble``,
        ``distillation`` and ``doubledistillation``.
    """
    archive_x, archive_y, population_x, population_y, tss_mask, _stats = state

    # Selected archive rows and the population as float32; targets as columns.
    selected_x = archive_x[tss_mask].astype(np.float32)
    selected_y = archive_y[tss_mask].astype(np.float32).reshape(-1, 1)
    population_x = population_x.astype(np.float32)
    population_y = population_y.astype(np.float32).reshape(-1, 1)

    # Hold part of the selected data out for validation.
    fit_x, val_x, fit_y, val_y = sklearn.model_selection.train_test_split(
        selected_x, selected_y)

    # Standardise with statistics of the fitting part only.
    scaler_x = sklearn.preprocessing.StandardScaler().fit(fit_x)
    scaler_y = sklearn.preprocessing.StandardScaler().fit(fit_y)

    fit_x, val_x, test_x = (
        scaler_x.transform(part) for part in (fit_x, val_x, population_x))
    fit_y, val_y, test_y = (
        scaler_y.transform(part) for part in (fit_y, val_y, population_y))

    def as_loader(features, targets):
        # Shuffled mini-batches of five samples over the given arrays.
        dataset = torch.utils.data.TensorDataset(
            torch.tensor(features), torch.tensor(targets))
        return torch.utils.data.DataLoader(dataset, batch_size=5, shuffle=True)

    train_dataloader = as_loader(fit_x, fit_y)
    validation_dataloader = as_loader(val_x, val_y)

    training_output = {
        'size_train': len(fit_x),
        'size_validation': len(val_x),
        'size_test': len(test_x),
    }

    ensemble = train_NLL_ensemble(train_dataloader, validation_dataloader)

    # Each variant is built lazily, immediately before it is evaluated, so the
    # training order is: single model, ensemble, distillation, double
    # distillation.
    variants = (
        ('basic', get_information_basic_model,
         lambda: ensemble[0]),
        ('ensemble', get_information_ens,
         lambda: train_ensemble(ensemble)),
        ('distillation', get_information_distillation,
         lambda: train_distillation(
             ensemble, train_dataloader, validation_dataloader)),
        ('doubledistillation', get_information_double_distillation,
         lambda: train_double_distillation(
             ensemble, train_dataloader, validation_dataloader)),
    )
    for prefix, describe, build in variants:
        model = build()
        training_output.update(
            extend_info(prefix, describe, model, test_x, test_y))

    return training_output

In [6]:
# Column layout of one result row: the run identification fields first, then
# one "<model>_<loss>" column for every model/loss combination.
elements = ["name", "fun", "dim", "ker", "ins", "generation"]
for model, loss in itertools.product(
        ['basic', 'ensemble', 'distillation', 'doubledistillation'],
        ["MSE", "MAE", "RDE"]):
    elements.append(f"{model}_{loss}")

OUTPUT_TYPE = collections.namedtuple('ExperimentOutput', elements)

In [9]:
def process_inputfile(array, name):
    """Train all models on every usable generation of one run and write a CSV.

    The run is read from ``run_folder`` and the result is written to
    ``output_folder`` under the same base name with a ``.csv`` extension
    (both folders are module-level globals). One row per processed generation
    is written, laid out as ``OUTPUT_TYPE``; no header row is emitted.

    Parameters
    ----------
    array
        Object providing ``run_map(function, items)``, used to evaluate the
        generations (an ``ArrayMetacentrum`` in this notebook).
    name : str
        File name of the run. The last four ``_``-separated fields of the part
        before the first ``.`` are read as ``fun``, ``dim``, ``rid`` and
        ``ins``; the final character of ``dim`` is dropped before conversion
        to ``int``.
    """
    path_input = os.path.join(run_folder, name)
    basename = os.path.splitext(name)[0]
    path_output = os.path.join(output_folder, basename + '.csv')

    fun, dim, rid, ins = name.split('.')[0].split('_')[-4:]

    fun = int(fun)
    dim = int(dim[:-1])
    ker = (int(rid) - 1) % 9
    ins = int(ins)

    training_output = {
        'name': name,
        'fun': fun,
        'dim': dim,
        'ker': ker,
        'ins': ins
    }

    record = RecordedRun(path_input, training_output)

    # Skip generations whose archive holds fewer than four points.
    all_gens = [
        generation for generation in record.all_gens
        if len(generation[0]) >= 4
    ]

    # parallel run
    result = array.run_map(train_all_models_on_specific_state, all_gens)

    # Make sure the destination exists before writing.
    output_dir = os.path.dirname(path_output)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    metric_columns = [
        model + '_' + loss
        for model in ['basic', 'ensemble', 'distillation', 'doubledistillation']
        for loss in ["MSE", "MAE", "RDE"]
    ]

    # newline='' lets the csv module control line endings itself; without it
    # rows end in '\r\r\n' on platforms that translate '\n'.
    with open(path_output, 'w', newline='') as f:
        writer = csv.writer(f)

        # NOTE(review): ``ig`` is the position within the filtered list, not
        # the ``gen_num`` stored in the generation's stats -- confirm that this
        # is the intended meaning of the ``generation`` column.
        for ig, output in enumerate(result):
            row = OUTPUT_TYPE(
                **training_output,
                generation=ig,
                **{column: output[column] for column in metric_columns}
            )
            writer.writerow(tuple(row))
        


In [10]:
# NOTE(review): purpose of this delay is not visible here (possibly to stagger
# array-job start-up) -- confirm before removing.
time.sleep(5)

# NOTE(review): absolute, user-specific cluster path; consider moving it to a
# configuration cell or an environment variable.
run_folder = '/storage/brno2/home/kozajan/data/bbob-cmaes-runs/'
run_files = os.listdir(run_folder)
run_files = natsorted(run_files)
# Stable sort by the dimension field of the file name, so the natural order is
# kept within each dimension.
run_files = sorted(run_files, key=lambda name: int(name.split('.')[0].split('_')[-4:][1][:-1]))

output_folder = 'test_output/'

array = ArrayMetacentrum(slice_type='offset')

#out = process_inputfile(array, run_files[0])

# Each array job handles only the share of files that ``split_work`` yields.
for name in array.split_work(run_files):
    # The original cell called ``process_dataset``, which is not defined in
    # this notebook; ``process_inputfile`` is the function that was meant.
    process_inputfile(array, name)