In [1]:
# Load symbolicregression model

import torch
import os, sys
import symbolicregression
import sympytorch
import requests
from sympy.core.rules import Transform

# Location of the pretrained symbolic-regression checkpoint; downloaded on first use.
model_path = "ckpt/model.pt"
try:
    if not os.path.isfile(model_path):
        print("Downloading model...")
        url = "https://dl.fbaipublicfiles.com/symbolicregression/model1.pt"
        r = requests.get(url, allow_redirects=True)
        # Fail here on an HTTP error instead of saving an error page as the checkpoint.
        r.raise_for_status()
        # The target directory may not exist on a fresh checkout.
        os.makedirs(os.path.dirname(model_path) or ".", exist_ok=True)
        with open(model_path, 'wb') as model_file:
            model_file.write(r.content)
    # NOTE(review): torch.load unpickles arbitrary objects; only load checkpoints
    # from a trusted source.
    if not torch.cuda.is_available():
        sr_model = torch.load(model_path, map_location=torch.device('cpu'))
    else:
        sr_model = torch.load(model_path)
        sr_model = sr_model.cuda()
    print(sr_model.device)
    print("Model successfully loaded!")

except Exception as e:
    print("ERROR: model not loaded! path was: {}".format(model_path))
    print(e)
    # Re-raise: without a model the regressor below cannot be built, and continuing
    # would only surface later as a confusing NameError on `sr_model`.
    raise

# Regressor wrapper around the loaded transformer; used by the cells below as `est`.
est = symbolicregression.model.SymbolicTransformerRegressor(
                        model=sr_model,
                        max_input_points=10001,
                        n_trees_to_refine=5,
                        rescale=True
                        )

cuda:0
Model successfully loaded!


In [2]:
# from transformers import AutoTokenizer
# CONTEXT_LENGTH = 256
# tokenizer = AutoTokenizer.from_pretrained("xhyi/PT_GPTNEO350_ATG") 
# tokenizer.pad_token = tokenizer.eos_token
# from transformers import AutoModelForCausalLM, TrainingArguments, Trainer
# model = AutoModelForCausalLM.from_pretrained("datasets/normalize_symbolic_regression_results_20231215/gptneo-350m-5500.model")


from transformers import T5Tokenizer, DataCollatorForSeq2Seq
from transformers import T5ForConditionalGeneration, Seq2SeqTrainingArguments, Seq2SeqTrainer

# Tokenizer comes from the base checkpoint name; the model weights come from a
# checkpoint directory under datasets/.
MODEL_NAME = "google/flan-t5-base"

tokenizer = T5Tokenizer.from_pretrained(MODEL_NAME)
model = T5ForConditionalGeneration.from_pretrained(
    "datasets/normalize_symbolic_regression_results_flant5_20231219/flant5-base-36000-loss0.097.model"
)
# Collator is built from the tokenizer/model pair loaded above.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
)

  from .autonotebook import tqdm as notebook_tqdm
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
import numpy as np
import sympy as sp
from sympy import sympify, lambdify, symbols, integrate, Interval, Symbol, I, S, oo, plot
from IPython.display import display

# Given an expr f (of variable t), returns its integral, together with t's and y's for regression
def integrate_expr(f, min_x=-4.0, max_x=4.0, increment=0.002, verbose=False):
    """Integrate ``f`` with respect to ``t`` and sample the result on a regular grid.

    Args:
        f: SymPy expression in the symbol ``t``.
        min_x: First sample point of the grid.
        max_x: End of the grid, passed to ``np.arange`` as the stop value.
        increment: Spacing between consecutive sample points.
        verbose: When true, display the integrand and its integral.

    Returns:
        Tuple ``(fi, ts, ys)``: the symbolic integral, the sample points, and the
        integral evaluated at those points. ``ys`` always has the shape of ``ts``.
    """
    if verbose:
        print("Running integration on")
        display(f)
    # Compute integration
    t = symbols('t')
    fi = integrate(f, t)
    if verbose:
        display(fi)
    # Generate data for symbolic regression
    fl = lambdify(t, fi, "numpy")
    ts = np.arange(min_x, max_x, increment)
    ys = fl(ts)
    if np.ndim(ys) == 0:
        # When the integral does not depend on t (e.g. f == 0) the lambdified function
        # returns a scalar; expand it so every sample point has a matching y value.
        ys = np.full(ts.shape, ys, dtype=np.result_type(ts, ys))
    return fi, ts, ys
    
# Quick check: integrate a quadratic verbosely; the returned (integral, ts, ys) tuple is the cell output.
integrate_expr(sympify("2*t**2-t*2+2.5"), verbose=True)

Running integration on


2*t**2 - 2*t + 2.5

0.666666666666667*t**3 - 1.0*t**2 + 2.5*t

(0.666666666666667*t**3 - 1.0*t**2 + 2.5*t,
 array([-4.   , -3.998, -3.996, ...,  3.994,  3.996,  3.998]),
 array([-68.66666667, -68.58170266, -68.49681062, ...,  36.50791852,
         36.56077862,  36.61369466]))

In [4]:
from utils.utils import *

def round_expr(expr, num_digits=2):
    """Return ``expr`` with every ``sp.Float`` node replaced by ``node.round(num_digits)``."""
    def _is_float(node):
        return isinstance(node, sp.Float)

    def _rounded(node):
        return node.round(num_digits)

    rounding_rule = Transform(_rounded, _is_float)
    return expr.xreplace(rounding_rule)

# Run symbolic regression on given data
# Returns: (raw regressed expr, rounded expr, model refined expr)
@timeout(15)
def symbolic_regress(sr_model, xs, ys, generate_refinement=True, verbose=False):
    """Fit ``sr_model`` to the samples and return the recovered expression in ``t``.

    Args:
        sr_model: Regressor exposing ``fit`` and ``retrieve_tree`` (e.g. ``est``).
        xs: 1-D sequence of sample inputs.
        ys: 1-D sequence of sample outputs, same length as ``xs``.
        generate_refinement: When true, also pass the rounded expression through
            the module-level ``tokenizer``/``model`` pair and parse what it generates.
        verbose: When true, display intermediate expressions.

    Returns:
        Tuple ``(raw_expr, rounded_expr, generated_expr)``; ``generated_expr`` is
        ``None`` when ``generate_refinement`` is false.

    NOTE(review): the ``@timeout(15)`` decorator comes from ``utils.utils``;
    presumably it aborts the call after 15 seconds -- confirm against that module.
    """
    if verbose:
        print("Running Symbolic Regression...")
    # The regressor is fed column vectors.
    xs = np.reshape(xs, (len(xs),1))
    ys = np.reshape(ys, (len(ys),1))
    sr_model.fit(xs,ys)
    # Rewrite the tree's operator names into Python/SymPy operator syntax.
    replace_ops = {"add": "+", "mul": "*", "sub": "-", "pow": "**", "inv": "1/"}
    model_str = sr_model.retrieve_tree(with_infos=True)["relabed_predicted_tree"].infix()
    for op,replace_op in replace_ops.items():
        model_str = model_str.replace(op,replace_op)
    # The tree uses numpy-style inverse-trig names. Without this mapping parse_expr
    # treats e.g. `arctan` as an undefined function, so terms such as arctan(0.0)
    # never evaluate and later polynomial-coefficient extraction fails on them.
    numpy_to_sympy = {"arctan": sp.atan, "arcsin": sp.asin, "arccos": sp.acos}
    raw_expr = sp.parse_expr(model_str, local_dict=numpy_to_sympy)
    x_0, t = symbols(['x_0', 't'])
    raw_expr = raw_expr.subs(x_0, t)
    if verbose:
        display(raw_expr)
    #
    expr = sp.expand(raw_expr)
    rounded_expr = round_expr(expr)
    if verbose:
        display(rounded_expr)
    #
    if generate_refinement:
        # Encode the rounded expression as the prompt, on the same device as the model.
        prompt = str(rounded_expr)
        input_ids = tokenizer.encode(prompt, return_tensors='pt').to(model.device)
        #
        # Generate text
        # NOTE(review): `temperature` presumably has no effect unless sampling is
        # enabled (do_sample=True) -- confirm against the transformers version in use.
        output = model.generate(input_ids, max_length=50, num_return_sequences=1, temperature=0.1)
        #
        # Decode the output
        generated_text = tokenizer.decode(output[0], skip_special_tokens=True)
        # NOTE(review): sympify evaluates its input string; the text here is model
        # output, so treat this as executing generated code.
        generated_expr = sympify(generated_text)
        if verbose:
            display(generated_expr)
        #
    else:
        generated_expr = None
    return raw_expr, rounded_expr, generated_expr


# End-to-end check on a known quadratic: integrate it, regress the sampled integral,
# then compare the regression results against the rounded symbolic integral.
fi, ts, ys = integrate_expr(sympify("2*t**2-t*2+2.5"), verbose=False)
rounded_fi = round_expr(fi)
display(rounded_fi)
raw_expr, rounded_expr, generated_expr = symbolic_regress(est, ts, ys, verbose=False)
display(generated_expr)

# Diff1: rounded integral minus the rounded regressed expression.
print("Diff1:")
display(rounded_fi-rounded_expr)
# Diff2: rounded integral minus the expression produced by the refinement model.
print("Diff2:")
display(rounded_fi-generated_expr)

0.67*t**3 - 1.0*t**2 + 2.5*t

  warn_deprecated('grad')


0.67*t**3 - t**2 + 2.5*t

Diff1:


0

Diff2:


-7.812500000004e-5*t**3

In [5]:
import json
import random
import numpy as np

def load_expressions(filepaths):
    """Collect the distinct ``f_t``/``g_t`` expressions from JSON-lines files.

    Each non-blank line must be a JSON object. For every ``f_t`` or ``g_t`` key,
    element ``[1]`` of its value is parsed with ``sympify`` unless it contains
    ``sqrt``. Entries that cannot be read or parsed are skipped.

    Args:
        filepaths: Iterable of paths to JSON-lines files.

    Returns:
        Set of parsed expressions (empty when nothing qualifies).
    """
    lines = []
    for filepath in filepaths:
        # Context manager closes the file even if reading raises.
        with open(filepath, 'r') as fin:
            lines.extend(fin.readlines())
    # Kept from the original; the result is a set, so this does not change its contents.
    random.shuffle(lines)
    exprs = set()
    for line in lines:
        if not line.strip():
            # Blank lines (e.g. a trailing newline) are not JSON documents.
            continue
        data = json.loads(line)
        for k, v in data.items():
            if k not in ('f_t', 'g_t'):
                continue
            try:
                if 'sqrt' not in v[1]:
                    exprs.add(sympify(v[1]))
            except Exception:
                # Best-effort: skip malformed or unparsable entries, but do not
                # swallow KeyboardInterrupt/SystemExit as a bare except would.
                continue
    return exprs

#exprs = load_expressions(['datasets/parametric_equations.json'])  #'datasets/function_evaluation.json', 

In [6]:
# # Run symbolic regression on each case

# seen_exprs = set()
# fin = open("datasets/parametric_equations_integral_results.json", "r")
# lines = fin.readlines()
# for line in lines:
#     result = json.loads(line)
#     expr = result["original"]
#     seen_exprs.add(expr)
# fin.close()
# print(f"{len(seen_exprs)} exprs loaded")

# fout = open("datasets/parametric_equations_integral_results.json", "a")
# num_seen = 0
# num_seen_changed = False

# for f in exprs:
#     if str(f) in seen_exprs:
#         num_seen+=1
#         num_seen_changed = True
#         continue
#     else:
#         num_seen_changed = False
#     if num_seen_changed:
#         print(f"{num_seen} exprs ignored")
#     #print("Original expr and its integral:")
#     #display(f)
#     #print(f)
#     try:
#         fi, xs, ys = integrate_expr(f, verbose=False)
#         x, t = symbols(['x','t'])
#         fi = fi.subs({x:t})
#         rounded_fi = round_expr(fi)
#         #display(rounded_fi)
#         raw_expr, rounded_expr, generated_expr = symbolic_regress(est, xs, ys, generate_refinement=True, verbose=False)
# #         print("Generated expr:")
# #         display(generated_expr)
#         results = {"original":str(f),
#                    "integral":str(fi),
#                    "rounded_integral":str(rounded_fi),
#                    "regressed":str(raw_expr),
#                    "rounded_regressed":str(rounded_expr),
#                    "generated_regressed":str(generated_expr),
#                    "diff_rounded": str(rounded_fi-rounded_expr),
#                    "diff_generated": str(rounded_fi-generated_expr)
#                   }
#         fout.write(json.dumps(results))
#         fout.write('\n')
#         fout.flush()
#     except:
#         print("Failed to run symbolic regression")
#         continue
#     #     print("Diff1:")
#     #     display(rounded_fi-rounded_expr)
#     #     print("Diff2:")
#     #     display(rounded_fi-generated_expr)

# fout.close()

In [7]:
# # Check the accuracy of symbolic regression

# from sympy import evalf, N
# from utils.utils import *


# # Check if f1 and f2 are almost equal.
# # Note: Relative error is defined based on f1. Please use the original expression as f1.
# def almost_equal(f1, f2, max_abs_error=0.011, max_relative_error=0.011, verbose=False):
#     expr = f1-f2
#     coeff_pairs = None
#     try:
#         coeff_pairs = get_coefficients_and_exponents(expr)
#     except:
#         print("Cannot get_coefficients_and_exponents")
#         print(str(expr))
#     if coeff_pairs is None:
#         constants = get_all_constants(expr)
#     else:
#         constants = [p[0] for p in coeff_pairs]
#     # Check if all diffs are within max_abs_error
#     violators = [c for c in constants if c == sp.nan or abs(c) > max_abs_error]
#     if verbose:
#         print("Violating constants:", violators)
#     if len(violators) == 0:
#         return True
#     # Check if all violating diffs are within max_relative_error
#     try:
#         coeffs1 = get_polynomial_coeffs(f1)
#         coeffs2 = get_polynomial_coeffs(f2)
#     except:
#         return False
#     for i in range(len(coeffs1)):
#         if abs(coeffs1[i] - coeffs2[i]) > max_abs_error and \
#            abs(coeffs1[i] - coeffs2[i]) > max_relative_error*abs(coeffs1[i]):
#             return False
#     return True
    

# fin = open("datasets/parametric_equations_integral_results.json", "r")
# lines = fin.readlines()

# num_total, qualified_rounded, qualified_generated = 0, 0, 0
# for line in lines:
#     result = json.loads(line)
#     if "diff_rounded" not in result or "diff_generated" not in result:
#         continue
#     rounded_integral = N(sympify(result["rounded_integral"]))
#     rounded_regressed = N(sympify(result["rounded_regressed"]))
#     try:
#         rounded_regressed = filter_non_polynomial(rounded_regressed)
#     except:
#         print("Cannot filter non-polynomials on", str(rounded_regressed))
#     #generated_regressed = N(sympify(result["generated_regressed"]))
#     diff_rounded = rounded_integral - rounded_regressed
#     #diff_generated = sympify(result["diff_generated"])
#     num_total += 1
#     if almost_equal(rounded_integral, rounded_regressed, verbose=True):
#         qualified_rounded += 1
#     else:
#         display(rounded_integral)
#         display(rounded_regressed)
#         print(rounded_regressed)
#         display(diff_rounded)
# #     if is_close_to_zero(diff_generated, True):
# #         qualified_generated += 1
    
# fin.close()

# print(num_total, qualified_rounded, qualified_generated)

In [28]:
# Generate data for regression to infer the rules for integral

from sympy import evalf, N
from utils.utils import *

MAX_POWER = 6

# data_series[i] is a (feature rows, targets) pair for coefficient index i.
# Built with a comprehension so every pair owns its own two lists.
data_series = [([], []) for _ in range(MAX_POWER + 1)]
originals = []
integrals = []

# Read all records up front; the context manager closes the file even if the
# processing below raises.
with open("datasets/parametric_equations_integral_results.json", "r") as fin:
    lines = fin.readlines()

for line in lines:
    if not line.strip():
        # Blank lines are not JSON documents.
        continue
    result = json.loads(line)
    if "rounded_regressed" not in result:
        continue
    original = N(sympify(result["original"]))
    integral = N(sympify(result["rounded_regressed"]))
    try:
        original = filter_non_polynomial(original)
        integral = filter_non_polynomial(integral)
    except Exception:
        # Best-effort: report and carry on with whatever was filtered so far.
        print("Cannot filter non-polynomials on", str(integral))
    try:
        coeffs_original = get_polynomial_coeffs(original)
        coeffs_integral = get_polynomial_coeffs(integral)
    except Exception:
        print("Cannot get_coefficients_and_exponents")
        display(integral)
        continue
    if original.is_constant():
        print("Skipping", line)
        continue
    # Feature row interleaves index and coefficient: [0, c[0], 1, c[1], ...],
    # so coeffs_original[k] sits in column 2*k + 1.
    # NOTE(review): assumes get_polynomial_coeffs returns at least MAX_POWER+1
    # entries -- confirm against utils.utils.
    xs = []
    for i in range(MAX_POWER+1):
        xs.append(i)
        xs.append(coeffs_original[i])
    # Every series gets the same feature row; the target is that series' integral coefficient.
    for i in range(MAX_POWER+1):
        data_series[i][0].append(xs.copy())
        data_series[i][1].append(coeffs_integral[i])
    if len(data_series[0][1]) % 100 == 0:
        print(len(data_series[0][1]), "processed")
    originals.append(original)
    integrals.append(integral)


100 processed
200 processed
300 processed
Cannot get_coefficients_and_exponents


0.82*t**3 - 0.94*t**2 - 1.13*t**2/(9.82*cos(0.89*sqrt(arctan(0.0) + 0.85) + 0.08) - 0.07) + 0.39*t + 0.64*t/(9.82*cos(0.89*sqrt(arctan(0.0) + 0.85) + 0.08) - 0.07)

400 processed
Cannot get_coefficients_and_exponents


0.03*t**2*arctan(79.74) - 3.71*t**2 + 0.67*t

500 processed
Cannot get_coefficients_and_exponents


24.49*t**3 + 128.57*t**2 + 227.05*t + 187.99*t/(-90.2 + 6.2/(-0.08*arctan(0.0)**2 - 4.46))

600 processed
700 processed
Cannot get_coefficients_and_exponents


3.75*t**2 - 1.07*t*arctan(0.0) - 8.25*t

800 processed
900 processed
1000 processed
1100 processed
1200 processed
Cannot get_coefficients_and_exponents


-0.03*t**2*cos(0.01*arctan(-0.331321606892383) + 0.05) + 1.78*t**2 - 0.15*t*cos(0.01*arctan(-0.331321606892383) + 0.05) + 8.15*t

1300 processed
1400 processed
Cannot get_coefficients_and_exponents


1.33*t**3 - 11.14*t**2 + 31.05*t + 2.75*arctan(0.04) - 0.11

1500 processed
1600 processed
1700 processed
Cannot get_coefficients_and_exponents


5.33*t**3 - 27.95*t**2 - 0.58*t**2/(7.14*arctan(74.77) + 0.8) + 48.87*t + 1.53*t/(7.14*arctan(74.77) + 0.8)

1800 processed
1900 processed
2000 processed
2100 processed
2200 processed
2300 processed
2400 processed
2500 processed
2600 processed
2700 processed
2800 processed
2900 processed
3000 processed
3100 processed
3200 processed
3300 processed
3400 processed
3500 processed
3600 processed
Cannot filter non-polynomials on -2.5*t**2 + 0.14*t**2/(2.65*t*arctan(27.28*t + 0.04)**2 - 58.47*t*arctan(27.28*t + 0.04) - 531.59*t + 0.04*arctan(27.28*t + 0.04)**2 - 0.79*arctan(27.28*t + 0.04) - 7.19) - 1.5*t + 0.08*t/(2.65*t*arctan(27.28*t + 0.04)**2 - 58.47*t*arctan(27.28*t + 0.04) - 531.59*t + 0.04*arctan(27.28*t + 0.04)**2 - 0.79*arctan(27.28*t + 0.04) - 7.19)
Cannot get_coefficients_and_exponents


-2.5*t**2 + 0.14*t**2/(2.65*t*arctan(27.28*t + 0.04)**2 - 58.47*t*arctan(27.28*t + 0.04) - 531.59*t + 0.04*arctan(27.28*t + 0.04)**2 - 0.79*arctan(27.28*t + 0.04) - 7.19) - 1.5*t + 0.08*t/(2.65*t*arctan(27.28*t + 0.04)**2 - 58.47*t*arctan(27.28*t + 0.04) - 531.59*t + 0.04*arctan(27.28*t + 0.04)**2 - 0.79*arctan(27.28*t + 0.04) - 7.19)

3700 processed
3800 processed
3900 processed
4000 processed
4100 processed
4200 processed
Cannot get_coefficients_and_exponents


3.7*t**2 + 7.07*t - 3.2*t/(4.97 - 0.4*arctan(0.58))

4300 processed
Cannot get_coefficients_and_exponents


1.44*t**2 - 3.67*t*arctan(0.02) + 33.14*t + 0.01

Cannot get_coefficients_and_exponents


-11.59*t*arctan(0.05) - 8.58*t - 0.03*arctan(0.05)

4400 processed
4500 processed
Cannot get_coefficients_and_exponents


0.02*t**2*arctan(0.43) - 4.31*t**2

4600 processed
4700 processed
4800 processed
Cannot get_coefficients_and_exponents


-2.15*t**2*arctan(-5134.7509765625) + 0.22*t**2 + 0.01*t*arctan(-5134.7509765625) + 7.62*t

4900 processed
5000 processed
Cannot filter non-polynomials on 1.6*t**2 + 2.0*t + 1.26*t/(-1.43*t**2 + 1185.29*t*(0.48 - 1/(-0.37*t - 7.04))**0.5*arctan(-0.02*t - 93.9) - 13269.35*t*(0.48 - 1/(-0.37*t - 7.04))**0.5 + 0.17*t*arctan(-0.02*t - 93.9)**2 - 3.8*t*arctan(-0.02*t - 93.9) + 989609.4*t - 2066773.52*t/(-0.37*t - 7.04) + 0.85*(0.48 - 1/(-0.37*t - 7.04))**0.5*arctan(-0.02*t - 93.9) - 9.48*(0.48 - 1/(-0.37*t - 7.04))**0.5 + 707.11 - 1476.79/(-0.37*t - 7.04))
Cannot get_coefficients_and_exponents


1.6*t**2 + 2.0*t + 1.26*t/(-1.43*t**2 + 1185.29*t*(0.48 - 1/(-0.37*t - 7.04))**0.5*arctan(-0.02*t - 93.9) - 13269.35*t*(0.48 - 1/(-0.37*t - 7.04))**0.5 + 0.17*t*arctan(-0.02*t - 93.9)**2 - 3.8*t*arctan(-0.02*t - 93.9) + 989609.4*t - 2066773.52*t/(-0.37*t - 7.04) + 0.85*(0.48 - 1/(-0.37*t - 7.04))**0.5*arctan(-0.02*t - 93.9) - 9.48*(0.48 - 1/(-0.37*t - 7.04))**0.5 + 707.11 - 1476.79/(-0.37*t - 7.04))

5100 processed
Cannot get_coefficients_and_exponents


3.1*t**2 + 254.35*t*arctan(-0.0022066) - 2.85*t + 0.03*arctan(-0.0022066)

5200 processed
Cannot get_coefficients_and_exponents


-2.0*t**2 - 1.5*t + 0.05/(12.26*t + zoo*t + zoo)

5300 processed
5400 processed
5500 processed
Skipping {"original": "(11*t - 12)**2/3", "integral": "121*t**3/9 - 44*t**2 + 48*t", "rounded_integral": "121*t**3/9 - 44*t**2 + 48*t", "regressed": "-2.362714929808825*t + (1.3421100639658838*t - 0.034186530761226524)*(26.407514837270834*(0.6159048132400425*t - 1)**2 + 10.288924736951953) + 1.2545246034571302", "rounded_regressed": "13.44*t**3 - 44.0*t**2 + 48.0*t", "generated_regressed": "121*t**3/9 - 44*t**2 + 48*t", "diff_rounded": "0.004517*t**3", "diff_generated": "0"}

5600 processed
5700 processed
Cannot get_coefficients_and_exponents


-1.5*t**2 - 0.01*t*arctan(62.87) + 4.68*t

5800 processed
Cannot filter non-polynomials on 0.06*t**2*(0.93 + 1/(6415115.33*t**2*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2 + 158.34*t**2*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8)) - 5937.54*t*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2 - 0.15*t*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8)) + 1.37*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2))**0.5 + 1.77*t**2 - 0.07*t*(0.93 + 1/(6415115.33*t**2*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2 + 158.34*t**2*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8)) - 5937.54*t*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2 - 0.15*t*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8)) + 1.37*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2))**0.5 - 1.94*t
Cannot get_coefficients_and_exponents


0.06*t**2*(0.93 + 1/(6415115.33*t**2*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2 + 158.34*t**2*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8)) - 5937.54*t*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2 - 0.15*t*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8)) + 1.37*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2))**0.5 + 1.77*t**2 - 0.07*t*(0.93 + 1/(6415115.33*t**2*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2 + 158.34*t**2*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8)) - 5937.54*t*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2 - 0.15*t*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8)) + 1.37*Abs(-0.12*t + 2.02 + 0.05/(10.1*cos(0.88*t)**2 + 0.8))**2))**0.5 - 1.94*t

5900 processed
6000 processed
6100 processed
Cannot get_coefficients_and_exponents


3.6*t**2 + 0.02*t*arctan(0.01) - 1.8*t

6200 processed
6300 processed
6400 processed
Cannot get_coefficients_and_exponents


2.17*t**2 + 8.33*t + 0.55*arctan(0.0)

6500 processed
Cannot filter non-polynomials on 0.9*t**2 - 2.72*t**2/(-3275.62*t**2*tan(7.7 + 2215.0/t) + 69692.71*t**2 - 4.29*t*tan(7.7 + 2215.0/t) + 91.36*t - 9.75*tan(7.7 + 2215.0/t) + 207.34) - 1.8*t + 0.18*t/(-3275.62*t**2*tan(7.7 + 2215.0/t) + 69692.71*t**2 - 4.29*t*tan(7.7 + 2215.0/t) + 91.36*t - 9.75*tan(7.7 + 2215.0/t) + 207.34)
Cannot get_coefficients_and_exponents


0.9*t**2 - 2.72*t**2/(-3275.62*t**2*tan(7.7 + 2215.0/t) + 69692.71*t**2 - 4.29*t*tan(7.7 + 2215.0/t) + 91.36*t - 9.75*tan(7.7 + 2215.0/t) + 207.34) - 1.8*t + 0.18*t/(-3275.62*t**2*tan(7.7 + 2215.0/t) + 69692.71*t**2 - 4.29*t*tan(7.7 + 2215.0/t) + 91.36*t - 9.75*tan(7.7 + 2215.0/t) + 207.34)

Cannot get_coefficients_and_exponents


2.5*t**2 + 0.57*t*sin(0.55*arctan(60889.21) - 0.02) + 0.01*t

6600 processed
6700 processed
6800 processed
6900 processed
7000 processed
7100 processed
Cannot get_coefficients_and_exponents


0.55*t**3 + 1.46*t**2*arctan(75.4) - 21.56*t**2 + 0.96*t*arctan(75.4)**2 - 28.5*t*arctan(75.4) + 267.05*t

7200 processed
7300 processed
7400 processed
7500 processed
7600 processed
Cannot get_coefficients_and_exponents


3.62*t**2 + 0.4*t**2/(16.36 - 0.28*arctan(0.0)) + 0.43*t + 0.05*t/(16.36 - 0.28*arctan(0.0))

7700 processed
7800 processed
7900 processed
Cannot get_coefficients_and_exponents


-1.62*t**3*arctan(0.0) + 17.27*t**3 - 0.2674*t**2*arctan(0.0) + 2.878*t**2 - 0.01739*t*arctan(0.0) + 0.1621*t - 0.0008639

8000 processed
8100 processed
8200 processed
Cannot get_coefficients_and_exponents


0.08*t**3 + 0.0250292397660819*t**2*arctan(0.05)

8300 processed
8400 processed
8500 processed
8600 processed
Cannot get_coefficients_and_exponents


1.2*t**2 + 0.22*t*arctan(0.16) + 7.56*t

8700 processed
8800 processed
8900 processed
9000 processed


In [35]:
# Second regressor built on the same loaded `sr_model`, with the same settings as `est`.
est_simple = symbolicregression.model.SymbolicTransformerRegressor(
                        model=sr_model,
                        max_input_points=10001,
                        n_trees_to_refine=5,
                        rescale=True
                        )


# Regression input: column 1 of the series-1 feature rows, reshaped to an (n, 1) column.
x1s = np.reshape(np.asarray(data_series[1][0])[:,1], (len(data_series[1][1]),1))
xs = np.asarray(data_series[1][0])
# Regression target: the series-1 targets, reshaped to an (n, 1) column.
ys = np.reshape(data_series[1][1], (len(data_series[1][1]),1))
# Adds a known quadratic in the input (2*x**2 + 3.5) to the targets before fitting.
# NOTE(review): presumably a probe so the recovered formula is easy to recognise -- confirm intent.
ys = ys + 2*x1s*x1s + 3.5

# print(xs[0:10, 1])
# print(ys[0:10])

est_simple.fit(x1s,ys)

# Rewrite the tree's operator names into Python/SymPy operator syntax
# (same steps as inside symbolic_regress).
replace_ops = {"add": "+", "mul": "*", "sub": "-", "pow": "**", "inv": "1/"}
model_str = est_simple.retrieve_tree(with_infos=True)["relabed_predicted_tree"].infix()
for op,replace_op in replace_ops.items():
    model_str = model_str.replace(op,replace_op)
    
print(model_str)

# The simplified, expanded expression is the cell's displayed value.
raw_expr = sp.parse_expr(model_str)
sp.simplify(sp.expand(raw_expr))

  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')


((4.491977455886875 + (0.04277414834490412 * (-0.4268642389131769 + (0.01981438991014428 * x_0)))) - (1.0990225441131243 + (-0.9454860922513988 * ((31.69247952223734 + (73.39847807253604 * (-0.4268642389131769 + (0.01981438991014428 * x_0)))))**2)))


  warn_deprecated('grad')


1.999818880640652*x_0**2 + 0.994452388751196*x_0 + 3.4981136576698576

In [40]:
# Index of the data series whose targets are regressed in this cell.
y_idx=2

# Regression input: column 3 of the selected series' feature rows, reshaped to an (n, 1) column.
x3s = np.reshape(np.asarray(data_series[y_idx][0])[:,3], (len(data_series[y_idx][1]),1))
xs = np.asarray(data_series[y_idx][0])
# Regression target: the selected series' targets, reshaped to an (n, 1) column.
ys = np.reshape(data_series[y_idx][1], (len(data_series[y_idx][1]),1))
# Adds a known quadratic in the input (2*x**2 + 3*x + 3.5) to the targets before fitting.
# NOTE(review): presumably a probe so the recovered formula is easy to recognise -- confirm intent.
ys = ys + 2*x3s*x3s + 3*x3s + 3.5

# print(xs[0:10, 1])
# print(ys[0:10])

# Reuses the `est_simple` regressor created in an earlier cell.
est_simple.fit(x3s,ys)

# Rewrite the tree's operator names into Python/SymPy operator syntax
# (same steps as inside symbolic_regress).
replace_ops = {"add": "+", "mul": "*", "sub": "-", "pow": "**", "inv": "1/"}
model_str = est_simple.retrieve_tree(with_infos=True)["relabed_predicted_tree"].infix()
for op,replace_op in replace_ops.items():
    model_str = model_str.replace(op,replace_op)
    
print(model_str)

# The simplified, expanded expression is the cell's displayed value.
raw_expr = sp.parse_expr(model_str)
sp.simplify(sp.expand(raw_expr))

  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')
  warn_deprecated('grad')


(((-12.485996101796932 * 1/((59.428897125824335 + (0.4400843372809274 * (0.0010929793650849954 + (0.02030257522230901 * x_0)))))) - (733.5144801519266 + (-109.17606121067313 * (((0.32142396984495697 + (-6.709960133036752 * (0.0010929793650849954 + (0.02030257522230901 * x_0)))) + ((0.14249416222887892 + (1.8528134966362885e-08 * 1/((0.11767499432962569 + (0.0010929793650849954 + (0.02030257522230901 * x_0)))))) * (-2.915071371303532 + (0.24190824888269735 * (0.0010929793650849954 + (0.02030257522230901 * x_0)))))))**2))) + (-734.9131517254541 + (1482.716911587979 * sin((1.4452354128706124 + (0.13369423518579324 * (0.0010929793650849954 + (0.02030257522230901 * x_0))))))))


(1.0630107575163575e-5*x_0**6 + 0.0709077116370039*x_0**5 + 0.0078595914625261832*x_0**4*sin(0.0027143372666486393*x_0 + 1.4453815379109013) + 1.3400942828296085*x_0**4 + 52.415349971070804*x_0**3*sin(0.0027143372666486393*x_0 + 1.4453815379109013) - 42.761349494816911*x_0**3 + 918.25940179460633*x_0**2*sin(0.0027143372666486393*x_0 + 1.4453815379109013) - 883.84289377210346*x_0**2 + 5368.5760714351907*x_0*sin(0.0027143372666486393*x_0 + 1.4453815379109013) - 5292.3961039574214*x_0 + 10465.470886683563*sin(0.0027143372666486393*x_0 + 1.4453815379109013) - 10358.195351216068)/(5.3008038156849632e-6*x_0**4 + 0.035350881588673826*x_0**3 + 0.61930864524311468*x_0**2 + 3.6207694331114662*x_0 + 7.0583068183090459)

In [33]:
# Shape of the series-1 feature matrix.
# NOTE(review): execution counts in this notebook are out of order, so the recorded
# output may come from an earlier version of the data-building cell -- re-run to confirm.
np.asarray(data_series[1][0]).shape

(9048, 1)

In [50]:
# Targets of the selected series and column 3 of its feature rows.
ys = data_series[y_idx][1]
xs = np.asarray(data_series[y_idx][0])[:,3]

# Deviation of each target from half of the column-3 value.
diff = (ys-0.5*xs).flatten()
for i, delta in enumerate(diff):
    # Print only rows whose absolute deviation exceeds 0.02.
    if abs(delta) > 0.02:
        print(i, delta)

32 -0.026666666666666394
35 -0.11791731913884451
48 129.0
67 0.053333333333334565
110 4.393333333333333
174 7.5
203 -0.6319555555555549
355 -0.02300000000000002
413 0.0344444444444445
443 -0.03000000000000025
528 -0.5966666666666667
567 -44.44444444444444
698 0.040000000000000036
736 52.77333333333333
838 -0.04999999999999982
879 -0.040000000000000036
896 40.0
914 0.020000000000003126
935 0.07000000000000028
964 -0.023333333333333428
969 -0.28999999999999915
970 -0.5
992 -0.033333333333333215
1051 -0.020612244897959542
1059 -0.10714285714285765
1135 0.030000000000000027
1153 -0.03000000000000025
1197 0.07888888888888879
1207 -0.060000000000002274
1221 0.25
1238 -0.09999999999999964
1248 -0.04816326530612258
1263 -0.1999999999999993
1270 -0.13666666666666671
1293 2.2399999999999998
1439 -0.040000000000000036
1497 -0.13999999999999968
1646 0.04999999999999982
1668 -1.215
1677 -1.0799999999999998
1685 60.0
1690 0.021020408163265225
1889 -0.040000000000000036
1893 0.125
1912 6.2
1927 -0.23

In [44]:
# Column 3 of the selected series' feature rows (the input used for the regression above).
np.asarray(data_series[y_idx][0])[:,3]

array([  2.        , -17.        ,   2.75      , ...,  -2.        ,
       -26.66666667,  -3.5       ])

In [45]:
# Targets of the selected series.
# NOTE(review): this displays the entire list; consider slicing (e.g. [:10]) to keep the output short.
data_series[y_idx][1]

[1.0,
 -8.5,
 1.38,
 54.0,
 -56.0,
 -8.25,
 -3.0,
 -3.0,
 0.67,
 -4.25,
 5.35,
 3.88,
 -2.36,
 -2.0,
 120.0,
 -0.48,
 -42.0,
 -10.0,
 -18.75,
 -2.14,
 1.3,
 -2.11,
 -3.33,
 2.1,
 8.0,
 17.89,
 1.14,
 1.17,
 102.86,
 -28.89,
 0.0,
 -4.29,
 2.64,
 27.0,
 3.2,
 4.2154160141944885,
 41.56,
 -4.33,
 -10.5,
 -13.75,
 4.0,
 -12.92,
 5.2,
 -0.66,
 -0.5,
 42.67,
 -4.2,
 -2.57,
 0.0,
 3.25,
 9.35,
 -6.0,
 4.5,
 -11.52,
 -40.0,
 4.38,
 -1.25,
 3.43,
 27.13,
 3.17,
 -9.63,
 0.189,
 3.67,
 -4.17,
 0.25,
 2.5,
 -2.43,
 8.72,
 -1.5,
 1.83,
 -1.17,
 -3.14,
 1.5,
 1.57,
 -1.3,
 20.63,
 81.0,
 -7.49,
 13.5,
 -2.67,
 -35.75,
 -0.14,
 2.0,
 -1.43,
 -2.43,
 -1.38,
 20.22,
 -0.5,
 -1.83,
 2.36,
 -35.14,
 -0.56,
 3.0,
 -2.75,
 3.89,
 -27.86,
 3.57,
 -4.25,
 -19.25,
 18.0,
 -1.79,
 -30.0,
 -2.0,
 2.71,
 0.5,
 -28.11,
 -2.0,
 -5.2,
 -46.75,
 4.0,
 0.06,
 -52.5,
 -3.07,
 -3.0,
 -63.0,
 -4.4,
 -2.0,
 0.5,
 0.33,
 22.0,
 3.29,
 -3.75,
 -0.75,
 1.3556327819824219,
 -3.75,
 12.67,
 4.04,
 -1.67,
 3.68,
 3.17,
 -7.1