# Genetic search approach


In [8]:
import numpy as np
np.random.seed(420)

In [9]:
train_X = np.load('../data/train_X.npy')
train_y = np.load('../data/train_y.npy')
valid_X = np.load('../data/valid_X.npy')
valid_y = np.load('../data/valid_y.npy')

In [10]:
N = 1500  # dataset size
T = 20  # number of datasets per generation
M = 0.1  # fraction of dataset samples dropped during mutation phase

In [11]:
from typing import List, Tuple
from dataclasses import dataclass

@dataclass
class DataSet(object):
    X: np.array
    y: np.array

def sample_datasets(
        n_samples: int=N, 
        n_datasets: int=T, 
        n_validation_samples: int=10*N, 
        source_X: np.array=train_X, 
        source_y: np.array=train_y
) -> Tuple[List[DataSet], DataSet]:
    ids = [
        np.random.choice(
            len(source_X), 
            n_samples, 
            replace=False
        ) for i in range(n_datasets)
    ]
    rest_ids = np.array(list(set(range(len(source_X))) - set(list(np.concatenate(ids)))))
    validation_ids = np.random.choice(rest_ids, n_validation_samples)
    return [DataSet(source_X[i], source_y[i]) for i in ids], DataSet(source_X[validation_ids], source_y[validation_ids])

In [12]:
from sklearn.svm import SVR
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import r2_score

params_dict = {
    'kernel': ['rbf'],
    'gamma': [1 / i for i in range(80, 130, 10)],
    'C': [0.9, 1.0, 1.1],
    'epsilon': [1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1],
    'shrinking': [True]
}

def train_svr(train_set: DataSet, param_validation_set: DataSet, params_dict: dict=params_dict, n_iter: int=20):
    ps = ParameterSampler(n_iter=n_iter, param_distributions=params_dict)
    scores = np.zeros(n_iter)
    models = list()
    for idx, params in enumerate(ps):
        svr = SVR(**params)
        svr.fit(train_set.X, train_set.y)
        preds = svr.predict(param_validation_set.X)
        scores[idx] = r2_score(param_validation_set.y, preds)
        models.append(svr)
    return models[np.argmax(scores)]

In [13]:
from tqdm import tqdm

class VerboseMixin(object):
    def _progress(self, iterator, total):
        if self.verbose:
            return tqdm(iterator, total=total, desc=self.__class__.__name__)
        else:
            return iterator

    def _log(self, message):
        if self.verbose:
            print(f"[{self.__class__.__name__}] {message}")

class Evolution(VerboseMixin):
    def __init__(
            self, 
            T=T, 
            train_X=train_X, 
            train_y=train_y, 
            valid_X=valid_X, 
            valid_y=valid_y, 
            n_generations: int=100,
            verbose: bool=False
    ):
        self.training_sets, self.param_validation_set = sample_datasets(T)
        self.entire_dataset = DataSet(train_X, train_y)
        self.model_validation_dataset = DataSet(valid_X, valid_y)
        self.n_generations = n_generations
        self.verbose = verbose
        
    def _train(self):
        sample_scores = np.zeros(
            (len(self.training_sets), len(self.entire_dataset.X))
        )  # sample_scores[model_id][sample] = model's uncertainty of this sample
        model_validation_scores = np.zeros(len(self.training_sets))
        model_params = list()
        for model_id, ds in self._progress(
                enumerate(self.training_sets), total=len(self.training_sets)
        ):
            model = train_svr(ds, self.param_validation_set)
            preds = model.predict(self.entire_dataset.X)
            sample_scores[model_id] = np.abs(self.entire_dataset.y - preds)
            preds = model.predict(self.model_validation_dataset.X)
            model_validation_scores[model_id] = r2_score(self.model_validation_dataset.y, preds)
            model_params.append(model.get_params())
        return np.mean(sample_scores * model_validation_scores.reshape((-1,1)), axis=0), model_params, model_validation_scores
    
    def __iter__(self):
        self.generation = 0
        return self
    
    def __next__(self):
        if self.generation >= self.n_generations:
            raise StopIteration()
        weighted_sample_scores, model_params, model_validation_scores = self._train()
        self.generation += 1
        return model_validation_scores

In [14]:
ev = Evolution(n_generations=1, verbose=True)
for scores in ev:
    print(scores)

[-1.93444112e-01 -1.77103082e-01  8.38956020e-03  5.37983506e-03
 -3.11891474e-03 -1.05870403e-04 -9.37304797e-02 -1.68101538e-02
 -1.24521524e-01  1.11805802e-02 -4.88779576e-01 -3.64735847e-01
 -5.32156671e-02 -2.40628035e-01 -1.42374381e-01 -4.30398633e-01
 -3.12083635e-02 -4.24948952e-02 -1.22556100e+00 -1.54009001e-01]
