In [1]:
import json
import os
from collections import Counter

import numpy as np

In [2]:
from invargen.data.expression import *
from invargen.data.tokens import *
from invargen.data.tree import *

In [3]:
from invargen.data.expression import *
from invargen_generic.operators import funcs as generic_funcs
from invargen_generic.features import *

In [4]:
from gplearn.fitness import make_fitness
from gplearn.functions import make_function
from gplearn.genetic import SymbolicRegressor

# Build the gplearn function set: each entry of `generic_funcs` is expanded
# (via its `_asdict()`) into keyword arguments for `make_function`.
# NOTE(review): presumably the entries are namedtuples whose fields match
# make_function's parameters -- confirm in invargen_generic.operators.
funcs = [make_function(**func._asdict()) for func in generic_funcs]

In [5]:
# Random seed for the genetic search; passed as `random_state` to SymbolicRegressor.
seed = 4

In [6]:
import torch
from torch import Tensor

# Device handed to every PolyData instance created in this notebook.
device = torch.device('cpu')

In [7]:
from invargen_qlib.poly_data import PolyData

# Three separately constructed PolyData instances. `data` is kept untouched as
# the baseline; `data_1` and `data_2` are each modified in place by the two
# transform cells that follow.
data, data_1, data_2 = (PolyData(device=device) for _ in range(3))

In [8]:
data.data

tensor([[[  1.,   1.,   1.,  ...,  49.,  51.,  51.],
         [  0.,   1.,   0.,  ...,   1., -35.,  35.],
         [  1.,   2.,   2.,  ...,  51.,  55.,  55.]]])

In [9]:
# First transform, applied in place to data_1: substitution matrix
# (0, 1 // -1, 0), i.e. rows [0, 1] and [-1, 0].
# NOTE(review): presumably rows 0, 1, 2 of data[0] hold the coefficients
# a, b, c of a*x^2 + 2*b*x*y + c*y^2 -- confirm against PolyData.
# NOTE: this cell mutates data_1; running it a second time swaps and negates
# again, which restores the original values.

# Swap rows 0 and 2. The right-hand side is evaluated (as a copy) before the
# assignment, so neither row is read after it has been overwritten.
data_1.data[0][[0, 2]] = data_1.data[0][[2, 0]]

# Flip the sign of row 1.
data_1.data[0][[1]] = -data_1.data[0][[1]]

data_1.data

tensor([[[  1.,   2.,   2.,  ...,  51.,  55.,  55.],
         [ -0.,  -1.,  -0.,  ...,  -1.,  35., -35.],
         [  1.,   1.,   1.,  ...,  49.,  51.,  51.]]])

In [10]:
# Second transform, applied in place to data_2: substitution matrix
# (1, 1 // 0, 1), i.e. rows [1, 1] and [0, 1].
# NOTE(review): presumably rows 0, 1, 2 of data[0] hold the coefficients
# a, b, c of a*x^2 + 2*b*x*y + c*y^2 -- confirm against PolyData.
# NOTE: this cell mutates data_2 and is not idempotent; every re-run applies
# the shift once more.

# Row 2 is updated first because it needs the ORIGINAL row 1:
# row2 <- row2 + 2*row1 + row0.
data_2.data[0][[2]] = data_2.data[0][[2]] + 2 * data_2.data[0][[1]] + data_2.data[0][[0]]

# Only then row 1: row1 <- row1 + row0. Row 0 is left unchanged.
data_2.data[0][[1]] = data_2.data[0][[1]] + data_2.data[0][[0]]

data_2.data

tensor([[[  1.,   1.,   1.,  ...,  49.,  51.,  51.],
         [  1.,   2.,   1.,  ...,  50.,  16.,  86.],
         [  2.,   5.,   3.,  ..., 102.,  36., 176.]]])

In [11]:
# Reference expression b^2 - a*c, built from the feature symbols a, b, c
# (presumably provided by the star imports above -- confirm). `_metric` later
# compares every candidate against this expression evaluated on `data`.
exprs = b**2 - 1 * a * c

# Evaluate the reference on the baseline and on both transformed data sets;
# the three outputs shown below are identical.
exprs.evaluate(data), exprs.evaluate(data_1), exprs.evaluate(data_2)

(tensor([[-1.0000e+00, -1.0000e+00, -2.0000e+00,  ..., -2.4980e+03,
          -1.5800e+03, -1.5800e+03]]),
 tensor([[-1.0000e+00, -1.0000e+00, -2.0000e+00,  ..., -2.4980e+03,
          -1.5800e+03, -1.5800e+03]]),
 tensor([[-1.0000e+00, -1.0000e+00, -2.0000e+00,  ..., -2.4980e+03,
          -1.5800e+03, -1.5800e+03]]))

In [12]:
cache = {}

def _metric(x, y, w):
    """Score one candidate expression by how often it survives both transforms.

    ``y[0]`` is treated as the candidate's source text: it is used as the
    cache key, its parentheses are counted, and it is passed to ``eval`` to
    build the expression object.  ``x`` and ``w`` are accepted only to match
    the metric signature and are not used.

    The score is ``0.5 * m1 / data.n_polys + 0.5 * m2 / data.n_polys`` where
    ``m1`` (``m2``) counts the entries at which the expression is non-zero on
    ``data`` and takes the same value on ``data_1`` (``data_2``).

    Candidates whose text contains more than 100 parentheses are rejected
    with ``-1.0`` before being parsed; that result is not cached.

    As a side effect, any candidate that matches the reference expression
    ``exprs`` (non-zero and equal on ``data``) at more than half of
    ``data.n_polys`` entries is printed.

    Returns:
        float: a plain Python float on every path (cache hit, rejection and
        fresh evaluation), so callers never receive a tensor on the first
        evaluation and a float on later ones.
    """
    key = y[0]

    if key in cache:
        return cache[key]

    # Reject oversized programs before parsing them, so no expression object
    # is built for a candidate that is discarded anyway.
    token_len = key.count('(') + key.count(')')
    if token_len > 100:
        return -1.

    # NOTE(review): eval() executes the program text as Python. That is fine
    # while the text only ever comes from the GP search itself; do not feed
    # this function strings from an untrusted source.
    expr = eval(key)

    factor = expr.evaluate(data)
    factor_1 = expr.evaluate(data_1)
    factor_2 = expr.evaluate(data_2)

    # Entries where the candidate is zero never count as a match; this keeps
    # the trivial all-zero expression from scoring as "invariant".
    nonzero = factor != 0

    ic = 0.5 * torch.sum((factor == factor_1) & nonzero) / data.n_polys \
        + 0.5 * torch.sum((factor == factor_2) & nonzero) / data.n_polys

    # Report candidates that agree with the reference expression on more than
    # half of the entries.
    factor_s = exprs.evaluate(data)
    if torch.sum((factor == factor_s) & nonzero) > data.n_polys / 2:
        print(expr)

    # Store and return the same plain float so first and repeated evaluations
    # of a program yield the same type (and the cache stays JSON-serialisable).
    score = ic.item()
    cache[key] = score

    return score


# Wrap `_metric` as the fitness object passed to SymbolicRegressor(metric=...);
# greater_is_better=True declares that larger scores are preferred.
Metric = make_fitness(function=_metric, greater_is_better=True)

In [13]:
generation = 0

def ev():
    """Callback handed to ``est_gp.fit``: count the call and snapshot ``cache``.

    Each call increments the module-level ``generation`` counter, makes sure
    the ``results`` directory exists, and writes ``{"cache": cache}`` to
    ``results/<generation>.json``.
    """
    global generation
    generation = generation + 1

    dump_every = 1  # snapshot period in calls; 1 means every call
    dir_ = 'results'
    os.makedirs(dir_, exist_ok=True)

    if generation % dump_every != 0:
        return

    snapshot = {'cache': cache}
    with open(f'{dir_}/{generation}.json', 'w') as f:
        json.dump(snapshot, f)

In [14]:
# Names of the terminal symbols; `terminals` is the very same list object.
terminals = features = ['a', 'b', 'c']

# Placeholder training arrays: one row holding the terminal names and a
# single dummy target. `_metric` never reads the numeric inputs it is given.
X_train = np.array(terminals, ndmin=2)
y_train = np.array(1, ndmin=2)

In [15]:
# Start this run with an empty score memo, so the per-generation JSON
# snapshots written by `ev` contain only programs scored in this run.
cache = {}

est_gp = SymbolicRegressor(
    # --- search budget and selection ---
    population_size=1000,
    generations=5,
    tournament_size=600,
    stopping_criteria=1.,
    # --- program shape ---
    function_set=funcs,
    init_depth=(2, 8),
    const_range=None,
    parsimony_coefficient=0.,
    # --- genetic operators ---
    p_crossover=0.3,
    p_subtree_mutation=0.1,
    p_hoist_mutation=0.01,
    p_point_mutation=0.1,
    p_point_replace=0.6,
    # --- fitness evaluation ---
    metric=Metric,
    max_samples=0.9,
    # --- run control ---
    random_state=seed,
    n_jobs=1,
    verbose=1,
)

est_gp.fit(X_train, y_train, callback=ev)

    |   Population Average    |             Best Individual              |
---- ------------------------- ------------------------------------------ ----------
 Gen   Length          Fitness   Length          Fitness      OOB Fitness  Time Left
   0    82.05        -0.190537       15         0.509406         0.509406     16.79s
   1     9.16         0.369376        3         0.509406         0.509406     14.25s
   2     9.17         0.345731       11         0.525668         0.525668      5.66s
Sub(Mul($b,$b),Mul($c,$a))
   3    10.28         0.390405        7                1                1      2.52s


In [17]:
# Rebuild the expression that the search printed before generation 3 from its
# text form; the repr shown below matches that printed program.
eval('Sub(Mul(b,b),Mul(c,a))')

Sub(Mul($b,$b),Mul($c,$a))

2.0.1+cpu
