In [1]:
import numpy as np
import scipy
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

import os, sys

import itertools, functools

from natsort import natsorted


import csv

sys.path.append('convert_full_data')

import tss

from parallization_metacentrum import ArrayIsMetacentrum, IsMetacentrum

import sklearn
import sklearn.model_selection

import torch

In [2]:
class RecordedRun:
    def __init__(self, npz_path, info):
        super().__init__()

        npz = np.load(npz_path)

        self.dim = npz['dimensions']
        self.fun = npz['function_id']

        self.xmeans = npz['surrogate_data_means'].T
        self.sigmas = npz['surrogate_data_sigmas']
        self.bds = npz['surrogate_data_bds']
        self.iruns = npz['iruns']
        self.evals = npz['evals']
        self.points = npz['points']
        self.fvalues = npz['fvalues']
        self.orig = npz['orig_evaled']
        self.coco = npz['fvalues_orig']
        self.gen_split = npz['gen_split']

        self.n_gen = len(self.gen_split)
        self.n_points = len(self.points)

        # tuple: x_train, y_train, x_test, y_test, tss_mask, stats
        self.all_gens = [self.get_gen(gen_i, info) for gen_i in range(1, self.n_gen)]
        self.stats_table = pd.DataFrame([gen[-1] for gen in self.all_gens])

        # for key, val in info.items():
        #     self.stats_table[key] = val
        # self.stats_table['ins'] = ins

    def get_gen(self, gen_i, info):
        # first point is initial guess
        # gen_split[0] = 0, gen_split[1] = 1, gen_split[2] = 1, ...
        low = self.gen_split[gen_i] + 1
        high = self.gen_split[gen_i + 1] + 1 if gen_i + 1 < self.n_gen else self.n_points

        x_test = self.points[low:high]
        y_test = self.coco[low:high]

        o = self.orig[:low]
        x_train = self.points[:low][o]
        y_train = self.coco[:low][o]

        pop = x_test

        mean = self.xmeans[gen_i]
        sigma = self.sigmas[gen_i]
        bd = self.bds[gen_i]
        mahalanobis_transf = np.linalg.inv(bd * sigma)

        maximum_distance = 4  # trainRange
        maximum_number = int(20 * self.dim)

        tss2 = tss.TSS2(pop, mahalanobis_transf, maximum_distance, maximum_number)
        tss_mask, _ = tss2(x_train, y_train)

        x_tss = x_train[tss_mask]
        y_tss = y_train[tss_mask]

        stats = {
            "gen_num": gen_i,
            "restarts": self.iruns[gen_i] - 1,
            "arch_len": len(x_train),
            "tss_len": np.count_nonzero(tss_mask),
            "var_y_tss": np.var(y_tss),

            "sigma": sigma,
            "bd": bd,
            "mean": mean,
            #"cond_num": condition_num(bd.T),
            #"fst_scnd_ratio": fst_scnd_ratio(bd.T),
            #"max_eigenvec": max(np.linalg.norm(bd.T, axis=1))
        }
        stats.update(info)

        return x_train, y_train, x_test, y_test, tss_mask, stats

In [3]:
run_folder = 'data_full/'
run_files = os.listdir(run_folder)
run_files = natsorted(run_files)
print(len(run_files))

5280


In [4]:
class LossRDE:
    name = 'RDE'
    cache = {}
    
    def __init__(self, mu):
        self.mu = mu

    @property
    def mu(self):
        return self._mu

    @mu.setter
    def mu(self, mu):
        assert isinstance(mu, int)
        self._mu = mu
        
    def _compute_normalization_coefficient(self, lam, mu):
        assert mu <= lam
        
        prvni_sloupec = np.arange(1, -mu, step=-1)[:, np.newaxis]
        assert len(prvni_sloupec) == mu + 1
        
        radek = np.arange(1,mu+1)[np.newaxis, :]
        radek_obraceny = np.arange(lam, lam-mu, step=-1)[np.newaxis, :]
        assert radek.shape[1] == mu
        assert radek_obraceny.shape[1] == mu
        
        tabulka = prvni_sloupec + (radek - 1)
        tabulka = np.where(tabulka > 0, tabulka, radek_obraceny)
        vysledek = np.amax(np.sum(np.abs(tabulka - radek), axis=1))
        return vysledek
    
    def __call__(self, predict, target):
        super().__call__(predict, target)
        lam = len(predict)
        try:
            err_max = self.cache[(lam, self._mu)]
        except KeyError:
            err_max = self._compute_normalization_coefficient(lam, self.mu)
            self.cache[(lam, self._mu)] = err_max
            
        si_predict = np.argsort(predict)
        si_target  = np.argsort(target)[:self._mu]
        
        inRank = np.zeros(lam)
        inRank[si_predict] = np.arange(lam)
        
        r1 = inRank[si_target[:self._mu]]
        r2 = np.arange(self._mu)
        return np.sum(np.abs(r1 - r2))/err_max

class LossRDE_auto(LossRDE):
    def __init__(self):
        pass
    def __call__(self, predict, target):
        lam = len(target)
        self.mu = int(math.floor(lam / 2))
        return super().__call__(predict, target)

### stuff used for ens and distillation

In [9]:
from probabilistic_models import extend_info, \
    get_information_basic_model, get_information_ens, \
    get_information_distillation, get_information_double_distillation, \
    train_NLL_ensemble, train_ensemble, train_distillation, train_double_distillation

In [10]:
def train_all_models_on_specific_state(state):
    training_output = {}
    
    # dataset handling
    x_train, y_train, x_test, y_test, tss_mask, stats = state
        
    x = x_train[tss_mask].astype(np.float32)
    y = y_train[tss_mask].astype(np.float32)
    x_test = x_test.astype(np.float32)
    y_test = y_test.astype(np.float32)
    
    training_output['train_size'] = len(x)
    training_output['test_size'] = len(x_test)

    x_train, x_validation, y_train, y_validation = \
        sklearn.model_selection.train_test_split(x,y)
    
    x_train = torch.tensor(x_train)
    y_train = torch.tensor(y_train)
    x_validation = torch.tensor(x_validation)
    y_validation = torch.tensor(y_validation)
    
    print(x_train.shape)
    print(y_train.shape)
    print(x_validation.shape)
    print(y_validation.shape)

    train_dataset = torch.utils.data.TensorDataset(x_train, y_train)
    validation_dataset = torch.utils.data.TensorDataset(x_validation, y_validation)
    
    train_dataloader = torch.utils.data.DataLoader(
            train_dataset, batch_size=5, shuffle=True
        )
    validation_dataloader = torch.utils.data.DataLoader(
            validation_dataset, batch_size=5, shuffle=True
        )
    
    # train ensemble
    ensemble = train_NLL_ensemble(train_dataloader, validation_dataloader)

    # ---------------------
    
    # 1. use only one model
    model = ensemble[0]
    training_output.update(
        extend_info('basic', get_information_basic_model, x_test, y_test)
    )
    
    # 2. use ensemble
    model = train_ensemble(ensemble)
    training_output.update(
        extend_info('ensemble', get_information_ens, x_test, y_test)
    )

    # 3. use distillation
    model = train_distillation(ensemble, train_dataloader, validation_dataloader)
    training_output.update(
        extend_info('distillation', get_information_distillation, x_test, y_test)
    )

    # 4. use double distillation
    model = train_double_distillation(ensemble, train_dataloader, validation_dataloader)
    training_output.update(
        extend_info('doubledistillation', get_information_double_distillation, x_test, y_test)
    )
    
    return training_output

In [11]:
def process_dataset(name):
    npz = np.load(run_folder + name)

    fun, dim, rid, ins = name.split('.')[0].split('_')[-4:]
    
    fun = int(fun)
    dim = int(dim[:-1])
    ker = (int(rid) - 1) % 9
    ins = int(ins)
    
    run_dict = {
                'fun': fun,
                'dim': dim,
                'ker': ker,
                'ins': ins
            }
    
    record = RecordedRun(run_folder +  name, run_dict)
    
    output_folder = 'test_output/'
    
    with open(output_folder + name, 'a') as f:
        writer = csv.writer(f)
        
        for ig, generation in enumerate(record.all_gens):
            output = train_all_models_on_specific_state(generation)
            return output
            fields = [name, ig, *output]
            # save results
            writer.writerow(fields)
            

In [12]:
process_dataset(run_files[0])

torch.Size([5, 2])
torch.Size([5])
torch.Size([2, 2])
torch.Size([2])
[tensor([[ 2.4467,  3.3691],
        [ 2.4219, -3.4063],
        [ 4.1540, -2.6965],
        [-1.8079, -3.2079],
        [ 2.3282, -2.0042]]), tensor([272.9194, 221.0556, 223.1950, 244.0245, 224.3768])]
Starting training for 600 steps
[tensor([[-1.8079, -3.2079],
        [ 2.3282, -2.0042],
        [ 2.4219, -3.4063],
        [ 4.1540, -2.6965],
        [ 2.4467,  3.3691]]), tensor([244.0245, 224.3768, 221.0556, 223.1950, 272.9194])]
Starting training for 600 steps
[tensor([[-1.8079, -3.2079],
        [ 2.3282, -2.0042],
        [ 2.4467,  3.3691],
        [ 4.1540, -2.6965],
        [ 2.4219, -3.4063]]), tensor([244.0245, 224.3768, 272.9194, 223.1950, 221.0556])]
Starting training for 600 steps
[tensor([[ 4.1540, -2.6965],
        [ 2.3282, -2.0042],
        [ 2.4467,  3.3691],
        [-1.8079, -3.2079],
        [ 2.4219, -3.4063]]), tensor([223.1950, 224.3768, 272.9194, 244.0245, 221.0556])]
Starting training for 

TypeError: extend_info() missing 1 required positional argument: 'y_test'

In [None]:
for tensor in train_dataloader:
    print(tensor)

In [None]:
kernel_names = ['lin', 'quad', 'se', 'matern5', 'rq', 'nn-arcsin', 'add', 'se+quad', 'gibbs']

for name in list(itertools.islice(run_files, 2, 10)):
    npz = np.load(run_folder + name)

    fun, dim, rid, ins = name.split('.')[0].split('_')[-4:]
    
    fun = int(fun)
    dim = int(dim[:-1])
    ker = (int(rid) - 1) % 9
    ins = int(ins)
    
    run_dict = {
                'fun': fun,
                'dim': dim,
                'ker': ker,
                'ins': ins
            }
    
    record = RecordedRun(run_folder +  name, run_dict)
    
    print(run_dict)

In [None]:
name

In [None]:
RecordedRun(run_files[0])

In [None]:
datapaths = list(map(lambda x: 'data/' + x, os.listdir('data')))
print(datapaths)

In [None]:
for path in itertools.islice(datapaths, 1):
    
    data = np.load(path, allow_pickle=True)
    
    for state in data:
        x_train, y_train, x_test, y_test, tss_mask, stats = state
        
        x = x_train[tss_mask]
        y = y_train[tss_mask]
        
        x_test
        y_test
        