In [1]:
# Add the sibling ../src directory to the import search path.
# NOTE(review): presumably this is where the project-local `genetic` module
# imported further down lives — confirm against the repository layout.
import sys
sys.path.append('../src')

In [2]:
import pickle
import multiprocessing as mp
from pathlib import Path
from datetime import datetime
from typing import NamedTuple, List

from tqdm import tqdm
import click
import numpy as np

from genetic import GenerationResult, DataSet, EvolutionParams, run_evolution, fit_svr

# Seed NumPy's global random state so np.random-based randomness repeats between runs.
np.random.seed(420)

In [3]:
# Candidate hyper-parameter values; prepare_submission hands this mapping to
# fit_svr as `params_dict`. gamma covers 1/80, 1/90, 1/100, 1/110 and 1/120.
search_params_dict = dict(
    kernel=['rbf'],
    gamma=[1 / denominator for denominator in range(80, 130, 10)],
    C=[0.9, 1.0, 1.1],
    epsilon=[1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1],
    shrinking=[True],
)

def find_best_model_samples(results):
    """Return the sample indices of the highest-scoring model across `results`.

    Args:
        results: iterable of objects exposing `model_scores` and
            `model_samples` NumPy arrays. It is traversed twice, so it must be
            re-iterable (e.g. a list).
            NOTE(review): assumes each object's `model_scores` has one entry
            per row of its `model_samples` — confirm against the producer.

    Returns:
        The row of the concatenated `model_samples` (cast to int64) located at
        the argmax of the concatenated `model_scores`. On ties the first
        maximum wins, as with `np.argmax`.
    """
    stacked_scores = np.concatenate([entry.model_scores for entry in results])
    winner = np.argmax(stacked_scores)
    # Cast each chunk to int64 before stacking so the row can be used as an index array.
    stacked_samples = np.concatenate(
        [entry.model_samples.astype('int64') for entry in results]
    )
    return stacked_samples[winner]

def prepare_submission(train_data, valid_data, results, path):
    """Write one submission line per dataset size to `path`.

    For every key of `results` (visited in sorted order) the best sample
    subset is chosen with `find_best_model_samples`, a model is obtained from
    `fit_svr` on that subset of the training data, and a line of the form
    `epsilon;C;gamma;idx1,idx2,...` is appended to the output file. The chosen
    hyper-parameters are also printed for each size.

    Args:
        train_data: object exposing `X` and `y`, both indexed by the chosen sample array.
        valid_data: object exposing `X` and `y`, passed to `fit_svr` unchanged.
        results: mapping from dataset size to the collection of results for that size.
        path: destination of the submission file; opened in text write mode,
            so an existing file is truncated.

    NOTE(review): `fit_svr` is opaque from here; it is only assumed to return
    an object with `get_params()`. Presumably `n_iter` bounds a hyper-parameter
    search over `search_params_dict` — confirm in the `genetic` module.
    """
    with open(path, 'w') as out, tqdm(sorted(results.keys())) as progress:
        for dataset_size in progress:
            chosen = find_best_model_samples(results[dataset_size])
            fitted = fit_svr(
                train_data.X[chosen],
                train_data.y[chosen],
                valid_data.X,
                valid_data.y,
                n_iter=32,
                params_dict=search_params_dict,
            )
            hyper = fitted.get_params()
            epsilon = hyper.get('epsilon')
            C = hyper.get('C')
            gamma = hyper.get('gamma')
            print(f"size: {dataset_size}, epsilon: {epsilon}, C: {C}, gamma: {gamma}")
            sample_ids = ",".join(map(str, chosen))
            out.write(f"{epsilon};{C};{gamma};{sample_ids}\n")

In [4]:
# Input/output locations, all relative to the notebook's working directory.
input_dir = Path('../data').resolve()
input_file = '../data/genetic-all-07-06-19-20-27-20.pkl'
# The output name embeds the current time (to the second), so separate runs get separate files.
# NOTE(review): "sumbission" is the original spelling; kept because it is the emitted filename.
submission_path = datetime.now().strftime('../data/sumbission-%d-%m-%y-%H-%M-%S.txt')

# Load the four arrays in the same order as before: train X/y, then valid X/y.
train_X, train_y, valid_X, valid_y = (
    np.load(input_dir / npy_name)
    for npy_name in ('train_X.npy', 'train_y.npy', 'valid_X.npy', 'valid_y.npy')
)

# The third DataSet argument is 0..n-1 for training rows and 0,-1,..,-(n-1) for validation rows.
train_data = DataSet(train_X, train_y, np.arange(len(train_X)))
valid_data = DataSet(valid_X, valid_y, -np.arange(len(valid_X)))

# pickle.load can execute arbitrary code; only point input_file at trusted files.
with open(input_file, 'rb') as file:
    results = pickle.load(file)

In [5]:
# Build the submission file at submission_path; progress and the chosen
# hyper-parameters for each dataset size are shown in the cell output.
prepare_submission(train_data, valid_data, results, submission_path)

 10%|█         | 1/10 [00:10<01:38, 10.95s/it]

size: 600, epsilon: 0.01, C: 1.0, gamma: 0.0125


 20%|██        | 2/10 [00:24<01:34, 11.79s/it]

size: 700, epsilon: 0.01, C: 1.1, gamma: 0.0125


 30%|███       | 3/10 [00:39<01:29, 12.82s/it]

size: 800, epsilon: 0.01, C: 1.1, gamma: 0.011111111111111112


 40%|████      | 4/10 [00:57<01:26, 14.37s/it]

size: 900, epsilon: 0.01, C: 1.1, gamma: 0.0125


 50%|█████     | 5/10 [01:19<01:22, 16.57s/it]

size: 1000, epsilon: 0.01, C: 1.0, gamma: 0.01


 60%|██████    | 6/10 [01:41<01:12, 18.17s/it]

size: 1100, epsilon: 0.01, C: 1.0, gamma: 0.008333333333333333


 70%|███████   | 7/10 [02:04<00:59, 19.75s/it]

size: 1200, epsilon: 0.01, C: 0.9, gamma: 0.0125


 80%|████████  | 8/10 [02:35<00:46, 23.12s/it]

size: 1300, epsilon: 0.03, C: 1.1, gamma: 0.0125


 90%|█████████ | 9/10 [03:11<00:26, 26.86s/it]

size: 1400, epsilon: 0.01, C: 1.1, gamma: 0.008333333333333333


100%|██████████| 10/10 [03:43<00:00, 28.33s/it]

size: 1500, epsilon: 0.03, C: 1.1, gamma: 0.01



