# Genetic search approach


In [35]:
import numpy as np
np.random.seed(420)

In [36]:
train_X = np.load('../data/train_X.npy')
train_y = np.load('../data/train_y.npy')
valid_X = np.load('../data/valid_X.npy')
valid_y = np.load('../data/valid_y.npy')

In [37]:
N = 1500  # dataset size
T = 20  # number of datasets per generation
M = 0.1  # fraction of dataset samples dropped during mutation phase

In [53]:
from typing import List, Tuple
from dataclasses import dataclass

@dataclass
class DataSet(object):
    X: np.array
    y: np.array
    ids: np.array
    
    @classmethod
    def combine(cls, ds1, ds2):
        assert(ds1.X.shape == ds2.X.shape)
        intersection = np.intersect1d(ds1.ids, ds2.ids)
        other = np.random.choice(
            np.setxor1d(ds1.ids, ds2.ids), len(ds1.ids)-len(intersection)
        )
        new_ids = np.concatenate([intersection, other])
        take_from_left = np.isin(ds1.ids, new_ids)
        take_from_right = np.isin(ds2.ids, new_ids)
        # new ids need to be re-ordered to match the order 
        # in which X and y are selected for the new DataSet:
        new_ids = np.select(
            [take_from_left, take_from_right],
            [ds1.ids, ds2.ids]
        )
        new_X = np.select(
            [
                np.repeat(take_from_left, ds1.X.shape[1]).reshape(ds1.X.shape),
                np.repeat(take_from_right, ds2.X.shape[1]).reshape(ds2.X.shape)
            ],
            [ds1.X, ds2.X]
        )
        new_y = np.select(
            [take_from_left, take_from_right],
            [ds1.y, ds2.y]
        )
        return cls(new_X, new_y, new_ids)
        

def sample_datasets(
        n_samples: int=N, 
        n_datasets: int=T, 
        n_validation_samples: int=10*N, 
        source_X: np.array=train_X, 
        source_y: np.array=train_y
) -> Tuple[List[DataSet], DataSet]:
    ids = [
        np.random.choice(
            len(source_X), 
            n_samples, 
            replace=False
        ) for i in range(n_datasets)
    ]
    rest_ids = np.array(list(set(range(len(source_X))) - set(list(np.concatenate(ids)))))
    validation_ids = np.random.choice(rest_ids, n_validation_samples)
    return [DataSet(source_X[i], source_y[i], i) for i in ids], DataSet(source_X[validation_ids], source_y[validation_ids], validation_ids)

In [54]:
from sklearn.svm import SVR
from sklearn.model_selection import ParameterSampler
from sklearn.metrics import r2_score

params_dict = {
    'kernel': ['rbf'],
    'gamma': [1 / i for i in range(80, 130, 10)],
    'C': [0.9, 1.0, 1.1],
    'epsilon': [1e-3, 3e-3, 1e-2, 3e-2, 1e-1, 3e-1],
    'shrinking': [True]
}

def train_svr(train_set: DataSet, param_validation_set: DataSet, params_dict: dict=params_dict, n_iter: int=20):
    ps = ParameterSampler(n_iter=n_iter, param_distributions=params_dict)
    scores = np.zeros(n_iter)
    models = list()
    for idx, params in enumerate(ps):
        svr = SVR(**params)
        svr.fit(train_set.X, train_set.y)
        preds = svr.predict(param_validation_set.X)
        scores[idx] = r2_score(param_validation_set.y, preds)
        models.append(svr)
    return models[np.argmax(scores)]

In [58]:
from tqdm import tqdm
from itertools import combinations

class VerboseMixin(object):
    def _progress(self, iterator, total):
        if self.verbose:
            return tqdm(iterator, total=total, desc=self.__class__.__name__)
        else:
            return iterator

    def _log(self, message):
        if self.verbose:
            print(f"[{self.__class__.__name__}] {message}")

class Evolution(VerboseMixin):
    def __init__(
            self, 
            T=T, 
            train_X=train_X, 
            train_y=train_y, 
            valid_X=valid_X, 
            valid_y=valid_y, 
            n_generations: int=100,
            verbose: bool=False
    ):
        self.training_sets, self.param_validation_set = sample_datasets(T)
        self.entire_dataset = DataSet(train_X, train_y, np.arange(len(train_X)))
        self.model_validation_dataset = DataSet(valid_X, valid_y, None)
        self.n_generations = n_generations
        self.verbose = verbose

    def _train(self):
        sample_scores = np.zeros(
            (len(self.training_sets), len(self.entire_dataset.X))
        )  # sample_scores[model_id][sample] = model's uncertainty of this sample
        model_validation_scores = np.zeros(len(self.training_sets))
        model_params = list()
        for model_id, ds in self._progress(
                enumerate(self.training_sets), total=len(self.training_sets)
        ):
            model = train_svr(ds, self.param_validation_set)
            preds = model.predict(self.entire_dataset.X)
            sample_scores[model_id] = np.abs(self.entire_dataset.y - preds)
            preds = model.predict(self.model_validation_dataset.X)
            model_validation_scores[model_id] = r2_score(self.model_validation_dataset.y, preds)
            model_params.append(model.get_params())
        return np.mean(sample_scores * model_validation_scores.reshape((-1,1)), axis=0), model_params, model_validation_scores
    
    def _select_sets(self, model_validation_scores: np.array):
        normalized_scores = model_validation_scores / np.sum(model_validation_scores)
        sorted_order = np.argsort(-normalized_scores)  # sort by DESCENDING SCORE
        self.training_sets = list(np.array(self.training_sets)[sorted_order])
        cum_scores = np.cumsum(normalized_scores[sorted_order])
        fitness_threshold = 0.
        while fitness_threshold < cum_scores[1]:
            fitness_threshold = np.random.random()
        fit_datasets = [
            ds for ds, is_fit 
            in zip(self.training_sets, cum_scores >= fitness_threshold) 
            if is_fit
        ]
        return fit_datasets  # sorted by model score, descending
    
    def _crossover(self, fit_datasets: List[np.array]):
        new_datasets = []
        while len(new_datasets) < len(self.training_sets):
            # if there is a really small number of fit_datasets, 
            # we want to resample T new datasets from what we have
            for ds1, ds2 in combinations(fit_datasets, 2):
                new_datasets.append(DataSet.combine(ds1, ds2))
                if len(new_datasets) >= len(self.training_sets):
                    break
        self.training_sets = new_datasets
        
    def _mutate(self):
        raise NotImplemented()
    
    def __iter__(self):
        self.generation = 0
        return self
    
    def __next__(self):
        if self.generation >= self.n_generations:
            raise StopIteration()
        weighted_sample_scores, model_params, model_validation_scores = self._train()
        fit_datasets = self._select_sets(model_validation_scores)
        self._crossover(fit_datasets)
#         self._mutate()  # TODO
        self.generation += 1
        return model_validation_scores

In [59]:
ev = Evolution(n_generations=2, verbose=True)
for scores in ev:
    print(scores)

Evolution: 100%|████████████████████████████████████████████████████████████████████| 20/20 [00:21<00:00,  1.07s/it]


[-0.08068123 -0.06492513 -0.12409585 -0.28681169 -0.30541897 -0.05833036
 -0.11678386 -0.05277021 -0.29675168 -0.29117177 -0.09809191 -0.2452606
 -0.01433659 -0.12487273 -0.26254136 -0.32474264 -0.17549072 -0.03588393
 -0.08146849 -0.01013139]


Evolution: 100%|████████████████████████████████████████████████████████████████████| 20/20 [00:30<00:00,  1.60s/it]


[ -1.55287063  -8.8277393  -13.11085259  -3.3199114   -1.16000664
  -2.32638548  -0.62343323  -1.05061934  -0.49077914 -15.81230812
  -6.27466876  -1.4552272   -8.00177109  -0.41777791  -1.04832412
  -1.64854863 -15.88610032  -5.98064997  -4.58690519  -3.68839728]
