In [1]:
import sys
sys.path.append('..')

In [2]:
import numpy as np
np.random.seed(420)

In [3]:
import pickle
import multiprocessing as mp

In [4]:
from tqdm import trange
from pathlib import Path
from datetime import datetime
from typing import NamedTuple, List

In [5]:
from src.genetic import GenerationResult, DataSet, EvolutionParams, run_evolution

In [6]:
def shrink_samples(samples: np.ndarray, size: int) -> np.ndarray:
    """Randomly reduce every row of ``samples`` to ``size`` distinct entries.

    Each row is sub-sampled independently with
    ``np.random.choice(row, size, replace=False)``, so the draws use (and
    advance) NumPy's global random state.

    Args:
        samples: Iterable of rows; each row is passed to ``np.random.choice``
            as the population to draw from.
        size: Number of entries to keep from each row.

    Returns:
        An ``int64`` array holding one row of ``size`` drawn entries per input
        row. When ``samples`` is empty the result has shape ``(0, size)``.

    Raises:
        ValueError: Propagated from ``np.random.choice`` when ``size`` is
            larger than a row (sampling without replacement).
    """
    shrunk_rows = [
        np.random.choice(sample, size, replace=False).astype('int64')
        for sample in samples
    ]
    if not shrunk_rows:
        # np.array([]) would yield a float64 array of shape (0,); keep the
        # (n_rows, size) int64 layout for empty input as well.
        return np.empty((0, size), dtype='int64')
    return np.array(shrunk_rows)

In [7]:
# --- Run configuration ---
# Number of worker processes given to the multiprocessing pool.
n_threads = 4

# Directory holding the train/valid .npy arrays, and the pickled results of an
# earlier run that the experiment starts from.
input_dir = '../data'
input_file = '../data/genetic-05-06-19-21-36-16.pkl'

# Results file; the strftime pattern stamps the name with the time this cell ran.
output_path = datetime.now().strftime(
    '../data/genetic-all-datasets-%d-%m-%y-%H-%M-%S.pkl'
)

In [8]:
# Turn the configured locations into absolute paths.
input_dir = Path(input_dir).resolve()
output_path = Path(output_path).resolve()

# Load the training and validation arrays from the data directory.
train_X = np.load(input_dir / 'train_X.npy')
train_y = np.load(input_dir / 'train_y.npy')
valid_X = np.load(input_dir / 'valid_X.npy')
valid_y = np.load(input_dir / 'valid_y.npy')

# Training samples get ids 0..n-1; validation samples get 0, -1, ..., -(n-1).
# NOTE(review): id 0 is therefore shared by the first training and the first
# validation sample -- confirm DataSet / run_evolution never mix the two ranges.
train_data = DataSet(train_X, train_y, np.arange(len(train_X)))
valid_data = DataSet(valid_X, valid_y, np.arange(len(valid_X)) * (-1))
params = EvolutionParams(
    n_models = 32,
    n_fits = 32,
    n_generations = 1,
    n_train_samples = 1500,
    n_valid_samples = 6000,
    train_ids = None,
    mutation_prob = 0.04,
    score_mode = "variance",
)

# Results keyed by training-set size; the pickle loaded from input_file is
# stored under key 1500 and provides the starting model samples.
results = {}
# NOTE: pickle.load can execute arbitrary code -- only load files you trust.
with open(input_file, 'rb') as file:
    results[1500] = pickle.load(file)
final_model_samples = results[1500][-1].model_samples

with mp.Pool(n_threads) as pool:
    # Sizes 1400, 1300, ..., 600: each run starts from a random sub-sample of
    # the model samples produced for the previous (larger) size.
    with trange(1400, 500, -100) as t:
        for dataset_size in t:
            t.set_description(f"Dataset {dataset_size}")
            start_model_samples = shrink_samples(final_model_samples, dataset_size)
            assert start_model_samples.shape == (final_model_samples.shape[0], dataset_size)
            params = params._replace(n_train_samples=dataset_size, train_ids=start_model_samples)
            results[dataset_size] = run_evolution(train_data, valid_data, pool, params)
            # Last element of the run's result seeds the next, smaller size.
            final_model_samples = results[dataset_size][-1].model_samples
    print(f"Saving results to {output_path}...")
    # Use a context manager so the file is flushed and closed even if dumping
    # fails; the bare open() previously left the handle open.
    with open(output_path, 'wb') as output_file:
        pickle.dump(results, output_file)
    print("Done")

Dataset 1400:   0%|          | 0/9 [00:00<?, ?it/s]
  0%|          | 0/1 [00:00<?, ?it/s][A
Generation 1:   0%|          | 0/1 [00:00<?, ?it/s][A
Generation 1:   0%|          | 0/1 [03:52<?, ?it/s, max_score=0.0699, mean_score=0.0346][A
Dataset 1300:  11%|█         | 1/9 [03:52<30:57, 232.18s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
Generation 1:   0%|          | 0/1 [00:00<?, ?it/s][A
Generation 1:   0%|          | 0/1 [03:41<?, ?it/s, max_score=0.064, mean_score=0.0317][A
Dataset 1200:  22%|██▏       | 2/9 [07:33<26:41, 228.85s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
Generation 1:   0%|          | 0/1 [00:00<?, ?it/s][A
Generation 1:   0%|          | 0/1 [03:23<?, ?it/s, max_score=0.0594, mean_score=0.0268][A
Dataset 1100:  33%|███▎      | 3/9 [10:56<22:07, 221.18s/it]
  0%|          | 0/1 [00:00<?, ?it/s][A
Generation 1:   0%|          | 0/1 [00:00<?, ?it/s][A
Generation 1:   0%|          | 0/1 [03:01<?, ?it/s, max_score=0.0512, mean_score=0.0244][A
Dataset 1000: 

Saving results to /home/tmiskow/Source/mim/sus/zad3/data/genetic-all-datasets-07-06-19-12-13-00.pkl...
Done
