In [1]:
import pandas as pd
import numpy as np
from tqdm import trange
from numba import jit

from typing import NamedTuple, List, Tuple

np.random.seed(420)

In [2]:
import multiprocessing as mp
from itertools import repeat

from sklearn.svm import SVR
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import r2_score

In [3]:
"""
Naming:
- *_ids = array of numbers, each identifying a row in the dataset
- *_index = boolean array allowing for fast selection from the dataset
"""

class DataSet(NamedTuple):
    """A feature matrix together with its targets and per-row identifiers.

    The three arrays describe the same samples: the boolean masks built from
    ``ids`` elsewhere in this notebook are used to index ``X`` and ``y``.
    """
    X: np.ndarray  # (n_samples, n_features)
    y: np.ndarray  # 1d
    ids: np.ndarray  # 1d, one identifier per row

class GenerationParams(NamedTuple):
    """Inputs needed to fit the models of a single generation."""
    n_models: int  # = n datasets
    n_fits: int  # per each model during hyperparameter optimization
    train_data: DataSet
    n_train_samples: int  # rows drawn without replacement for each model's training subset
    train_probs: np.ndarray  # per-row sampling weights over train_data (passed as `p` to np.random.choice)
    valid_data: DataSet
    valid_index: np.ndarray  # boolean mask over valid_data selecting the rows used for scoring
        
class FitResult(NamedTuple):
    """Outcome of fitting one model on one sampled training subset."""
    train_index: np.ndarray  # boolean mask over the training set: rows the model was fitted on
    sample_scores: np.ndarray  # squared prediction error of the model for every training row
    model_score: float  # R^2 of the model on the selected validation rows
    model_params: dict  # hyperparameters of the fitted estimator (its get_params())
        
class GenerationResult(NamedTuple):
    """Aggregated outcome of all models fitted in one generation."""
    train_probs: np.ndarray  # (n_samples), per-row weights accumulated over the generation's models
    train_index: np.ndarray  # (n_samples), True for samples that were used in any of the models
    model_scores: np.ndarray  # (n_models), validation R^2 of each model
    model_params: List[dict]  # hyperparameters of each model, same order as model_scores
    model_samples: np.ndarray  # (n_models, n_train_samples) - IDs, NOT INDEX
    
class EvolutionParams(NamedTuple):
    """Settings for a complete evolution run (read by run_evolution)."""
    n_models: int  # = datasets per generation
    n_fits: int  # per each model during hyperparameter optimization
    n_generations: int  # number of generations to run
    n_train_samples: int  # training rows sampled for each model
    n_valid_samples: int  # validation rows sampled once, before the first generation
    mutation_prob: float  # between 0 and 1; weight factor for rows unused in a generation (used rows get 1 - mutation_prob)

In [4]:
def sample(n_samples: int, ids: np.array, weights: np.array=None) -> np.array:
    """Draw ``n_samples`` distinct entries of ``ids`` and return them as a mask.

    Args:
        n_samples: how many ids to draw (without replacement).
        ids: 1d array of ids to draw from; treated as unique.
        weights: optional selection probabilities, one per id; uniform if None.

    Returns:
        Boolean array with the same shape as ``ids``, True at the drawn entries,
        so it can be used directly to select rows.
    """
    chosen = np.random.choice(ids, size=n_samples, replace=False, p=weights)
    return np.isin(ids, chosen, assume_unique=True)

# Hyperparameter candidates for the SVR random search; used as the default
# search space of fit_svr.
params_dict = dict(
    kernel=['rbf'],
    gamma=[1 / denominator for denominator in (80, 90, 100, 110, 120)],
    C=[0.9, 1.0, 1.1],
    epsilon=[1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1],
    shrinking=[True],
)

def fit_svr(X_train: np.ndarray, y_train: np.ndarray, X_valid: np.ndarray, y_valid: np.ndarray, params_dict: dict=params_dict, n_iter: int=25):
    """Randomly search SVR hyperparameters and return the best fitted model.

    Parameter combinations are drawn from ``params_dict`` with
    ``ParameterSampler``; for each one an ``SVR`` is fitted on the training
    data and scored with R^2 on the validation data.

    Args:
        X_train, y_train: data the candidate models are fitted on.
        X_valid, y_valid: data the candidate models are scored on.
        params_dict: search space passed to ``ParameterSampler``.
        n_iter: number of parameter combinations requested from the sampler.

    Returns:
        The fitted ``SVR`` with the highest validation R^2 (the earliest one
        wins ties).

    Raises:
        ValueError: if the sampler yields no parameter combination at all.
    """
    # Only the running best is kept. Pre-filling a score array with zeros
    # would let a never-evaluated slot win whenever every real R^2 is negative
    # and the sampler yields fewer than n_iter candidates.
    best_model = None
    best_score = -np.inf
    for params in ParameterSampler(n_iter=n_iter, param_distributions=params_dict):
        svr = SVR(**params)
        svr.fit(X_train, y_train)
        score = r2_score(y_valid, svr.predict(X_valid))
        if best_model is None or score > best_score:
            best_model, best_score = svr, score
    if best_model is None:
        raise ValueError("no parameter combination was sampled; n_iter must be at least 1")
    return best_model


def fit_model(params: GenerationParams) -> FitResult:
    """Sample a training subset, tune an SVR on it and score the outcome.

    The training rows are drawn according to ``params.train_probs``; the
    hyperparameter search and the final model score both use the validation
    rows selected by ``params.valid_index``.

    Returns:
        FitResult holding the training mask, the squared error of the chosen
        model on every training row, its validation R^2 and its parameters.
    """
    train, valid = params.train_data, params.valid_data

    train_index = sample(params.n_train_samples, train.ids, params.train_probs)

    X_valid = valid.X[params.valid_index]
    y_valid = valid.y[params.valid_index]
    model = fit_svr(
        train.X[train_index],
        train.y[train_index],
        X_valid,
        y_valid,
        n_iter=params.n_fits
    )

    # Squared residuals over the WHOLE training set, not just the sampled rows.
    residuals = train.y - model.predict(train.X)
    sample_scores = np.power(residuals, 2)

    model_score = r2_score(y_valid, model.predict(X_valid))
    return FitResult(train_index, sample_scores, model_score, model.get_params())

def _fit_model_seeded(task: Tuple[GenerationParams, int]) -> FitResult:
    """Seed NumPy's global RNG with the task's seed, then fit one model.

    Defined at module level so it can be pickled and sent to pool workers.
    """
    task_params, seed = task
    np.random.seed(seed)
    return fit_model(task_params)


def run_generation(params: GenerationParams, n_models: int, pool: mp.Pool) -> GenerationResult:
    """Fit ``n_models`` models through ``pool`` and aggregate their results.

    Args:
        params: shared inputs for every model of this generation.
        n_models: number of models to fit; all per-model outputs are sized by
            this value.
        pool: object whose ``map`` runs the fits (a multiprocessing pool).

    Returns:
        GenerationResult with
        - train_probs: sum over models of ``sample_scores * exp(model_score)``,
        - train_index: mask of training rows used by at least one model,
        - model_scores / model_params: one entry per model,
        - model_samples: the training-row IDs each model was fitted on, stored
          with the dtype of ``params.train_data.ids``.
    """
    train_probs = np.zeros_like(params.train_probs)
    # Builtin bool: the np.bool alias was removed from recent NumPy releases.
    used_train_index = np.zeros_like(params.train_data.y, dtype=bool)
    model_params = []
    model_scores = np.zeros(n_models)
    # Integer storage keeps the IDs usable for indexing (a float array is not).
    model_samples = np.zeros(
        (n_models, params.n_train_samples),
        dtype=params.train_data.ids.dtype
    )

    # Forked workers start with a copy of the parent's global RNG state, so
    # without an explicit seed per task several workers can draw the same
    # training subset. Seeds come from the parent's RNG to stay reproducible.
    seeds = [int(seed) for seed in np.random.randint(0, 2**31 - 1, size=n_models)]
    results = pool.map(_fit_model_seeded, zip(repeat(params), seeds))
    # Serial fallback in case the Pool does not work in Jupyter:
    #     results = map(_fit_model_seeded, zip(repeat(params), seeds))
    for idx, fit_result in enumerate(results):
        used_train_index |= fit_result.train_index
        # Errors of better-scoring models weigh more via exp(R^2).
        train_probs += fit_result.sample_scores * np.exp(fit_result.model_score)
        model_params.append(fit_result.model_params)
        model_scores[idx] = fit_result.model_score
        model_samples[idx] = params.train_data.ids[fit_result.train_index]
    return GenerationResult(train_probs, used_train_index, model_scores, model_params, model_samples)

In [5]:
def run_evolution(train_data: DataSet, valid_data: DataSet, pool: mp.Pool, params: EvolutionParams):
    """Run ``params.n_generations`` generations and return their results.

    A validation subset is sampled once and reused for every generation. The
    per-row training probabilities start uniform; after each generation they
    are increased by that generation's accumulated weights and renormalised
    to sum to 1.

    Args:
        train_data: data the models are fitted on.
        valid_data: data the models are scored on.
        pool: passed through to run_generation to execute the fits.
        params: evolution settings.

    Returns:
        List with one GenerationResult per generation. The ``train_probs``
        array inside each result is scaled in place by the mutation weights
        below, so it holds the mutation-weighted values.
    """
    valid_index = sample(params.n_valid_samples, valid_data.ids)
    n_train = len(train_data.ids)
    train_probs = np.ones(n_train) / n_train
    results = []
    with trange(params.n_generations) as t:
        for generation_idx in t:
            t.set_description(f"Generation {generation_idx+1}")
            gen_results = run_generation(
                GenerationParams(
                    params.n_models,
                    params.n_fits,
                    train_data,
                    params.n_train_samples,
                    train_probs,
                    valid_data,
                    valid_index
                ),
                params.n_models,
                pool
            )
            # we simulate selecting samples for mutation by altering their probabilities:
            # rows used this generation are scaled by (1 - mutation_prob), unused rows by mutation_prob.
            gen_train_probs = gen_results.train_probs  # same array as stored in gen_results
            gen_train_probs[gen_results.train_index] *= (1. - params.mutation_prob)
            gen_train_probs[~gen_results.train_index] *= params.mutation_prob
            # NumPy reductions instead of the builtins, which walk the array
            # element by element in Python.
            train_probs = train_probs + gen_train_probs
            train_probs = train_probs / train_probs.sum()
            results.append(gen_results)
            t.set_postfix(
                mean_score=np.mean(gen_results.model_scores),
                max_score=np.max(gen_results.model_scores)
            )
    return results

## Performing actual evolution

In [6]:
# Load the training and validation arrays from .npy files; the paths are
# relative to the notebook's working directory.
train_X = np.load('../data/train_X.npy')
train_y = np.load('../data/train_y.npy')
valid_X = np.load('../data/valid_X.npy')
valid_y = np.load('../data/valid_y.npy')

In [7]:
# Wrap the raw arrays; training rows get ids 0, 1, 2, ...
train_data = DataSet(train_X, train_y, np.arange(len(train_X)))
# Validation rows get the negated row numbers as ids (0, -1, -2, ...).
valid_data = DataSet(valid_X, valid_y, np.arange(len(valid_X)) * (-1))
# NOTE(review): this pool is not closed or joined in any cell visible here -
# confirm it is cleaned up once the run has finished.
pool = mp.Pool(4)
params = EvolutionParams(
    n_models = 16,
    n_fits = 16,
    n_generations = 2,
    n_train_samples = 1500,
    n_valid_samples = 6000,
    mutation_prob = 0.25
)

In [8]:
# Run the full evolution; `results` holds one GenerationResult per generation.
# The progress bar recorded below shows roughly 17 minutes for the 2 generations.
results = run_evolution(train_data, valid_data, pool, params)

Generation 2: 100%|████████████████████████████████| 2/2 [17:24<00:00, 523.52s/it, max_score=-.194, mean_score=-.44]


In [16]:
import pickle

In [17]:
?pickle.dump

Signature: pickle.dump(obj, file, protocol=None, *, fix_imports=True)
Docstring:
Write a pickled representation of obj to the open file object file.

This is equivalent to ``Pickler(file, protocol).dump(obj)``, but may
be more efficient.

The optional *protocol* argument tells the pickler to use the given
protocol supported protocols are 0, 1, 2, 3 and 4.  The default
protocol is 3; a backward-incompatible protocol designed for Python 3.

Specifying a negative protocol version selects the highest protocol
version supported.  The higher the protocol used, the more recent the
version of Python needed to read the pickle produced.

The *file* argument must have a write() method that accepts a single
bytes argument.  It can thus be a file object opened for binary
wri