In this notebook we explore how well the algorithm is capable of recovering the following dependencies:
- Linear combination of 3 variables and bias with standard-normally distributed coefficients
  
  We can see that in 6 cases out of 10 the formulas are recovered exactly, while in the other 4 cases there is a strong similarity (the degree of one variable is a small number, and the resulting loss is compensated by the bias term).


- Linear combination of 3 squared variables and bias with standard-normally distributed coefficients

    We can clearly see that in 9 cases out of 10 the formulas are recovered exactly.


- Linear combination of 3 variables with powers uniformly distributed over {1, 2, 3, 4, 5} and standard-normally distributed coefficients

    We can see that in 6 cases out of 10 the formulas are recovered exactly, while in 3 other cases there is no similarity (the degree of one variable is a small number, and the resulting loss is compensated by the bias term).

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.metrics import mean_squared_error
from torch.nn import MSELoss
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import copy
import time
from IPython.display import display, Math, Latex

import Formula
import functions
from functions import *
import importlib
from hessian import hessian
from IPython.core.display import display, HTML
# Inject a CSS rule forcing elements with class "container" to 100% width, so that wide
# formulas and log lines get the full browser width in front-ends that use that class.
display(HTML("<style>.container { width:100% !important; }</style>"))

# Use the GPU when torch reports a usable CUDA runtime, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
# Re-execute the project modules so that edits made on disk since the first import are picked up
# without restarting the kernel.
# NOTE(review): reload() does not rebind names that were already copied into this namespace by
# `from functions import *`; re-run the import cell as well if those helpers changed.
importlib.reload(Formula)
importlib.reload(functions)

<module 'functions' from '/home/zybinmikhail/Documents/personal github projects/LearningFormulas/functions.py'>

In [3]:
def print_sharps():
    """Print an empty line followed by a full-width row of '#' characters as a visual separator."""
    separator = "################################################################################"
    print()
    print(separator)

In [4]:
def print_results(symbolic_mse_list, number_of_tested_formulas, recovery_threshold=1e-5):
    """Print the per-formula parameter MSEs and how many of them fall below the threshold.

    Args:
        symbolic_mse_list: sequence of MSE values, one per tested formula.
        number_of_tested_formulas: total number reported in the summary line.
        recovery_threshold: an error strictly below this value counts as a recovery.
    """
    print_sharps()
    print("MSEs between parameters:")
    print(symbolic_mse_list)

    errors = np.asarray(symbolic_mse_list)
    recovered = np.count_nonzero(errors < recovery_threshold)
    print(
        f"For {recovered} formulas out of {number_of_tested_formulas} "
        f"the error is less than {recovery_threshold}."
    )

In [5]:
def print_time(cnt, cnt_iteration):
    """Print the wall-clock time elapsed since the run started and since the iteration started.

    Args:
        cnt: `time.perf_counter()` reading taken when the whole run started.
        cnt_iteration: `time.perf_counter()` reading taken when the current iteration started.

    Each duration is rounded to whole seconds *before* it is split into minutes and seconds.
    Splitting first and rounding the seconds afterwards can print "60 seconds" (e.g. for an
    elapsed time of 59.6 s) instead of rolling over to the next minute.

    The minutes prefix is produced by `remove_zero_minutes` (defined outside this cell);
    presumably it returns an empty string when the minutes value is zero.
    """
    now = time.perf_counter()
    # float(...) keeps the values handed to remove_zero_minutes of the same type as before.
    min_from_start, sec_from_start = divmod(float(round(now - cnt)), 60)
    min_iteration, sec_iteration = divmod(float(round(now - cnt_iteration)), 60)
    print(f"{remove_zero_minutes(min_from_start)}{sec_from_start:.0f} seconds passed from the start, ", end="")
    print(f"the iteration took {remove_zero_minutes(min_iteration)}{sec_iteration:.0f} seconds")

In [6]:
def get_params(regressor, n_variables=3):
    """Collect the parameters of a learned formula into one flat array.

    The layout is ``[lambda_0..lambda_{n-1}, power_0..power_{n-1}, bias]``, i.e. the same
    ordering `generate_data` uses for its ``true_params`` so the two can be compared directly.

    Args:
        regressor: learned formula object exposing ``get_lambda(i)``, ``get_power(i)`` and
            ``last_subformula.lambda_0``, each returning an object with ``.item()``.
        n_variables: number of variables to read from the regressor. Defaults to 3, the
            value that used to be hard-coded.

    Returns:
        1-D ``numpy.ndarray`` with ``2 * n_variables + 1`` entries.
    """
    lambdas = [regressor.get_lambda(i).item() for i in range(n_variables)]
    powers = [regressor.get_power(i).item() for i in range(n_variables)]
    bias_term = [regressor.last_subformula.lambda_0.item()]
    return np.array(lambdas + powers + bias_term)

In [7]:
def generate_data(n_variables=3, m_samples=1000, min_power=1, max_power=2):
    """Sample a random formula ``y = sum_i c_i * x_i^{p_i} + b`` and a dataset generated by it.

    Args:
        n_variables: number of input variables (columns of ``X``).
        m_samples: number of rows in the dataset.
        min_power: smallest integer exponent that can be drawn (inclusive).
        max_power: upper bound for the exponents; exclusive, as in ``torch.randint``.

    Returns:
        Tuple ``(coeffs, powers, b, X, y, true_params)`` where ``coeffs`` has shape
        ``(n_variables, 1)``, ``powers`` has shape ``(1, n_variables)``, ``b`` has shape ``(1,)``,
        ``X`` has shape ``(m_samples, n_variables)``, ``y`` has shape ``(m_samples, 1)`` and
        ``true_params`` is the flat numpy array ``[coeffs..., powers..., b]``.
    """
    # The order of the random draws is kept fixed: it determines the result under a torch seed.
    inputs = torch.rand(m_samples, n_variables)
    bias = torch.randn(1)
    coefficients = torch.randn((n_variables, 1))
    exponents = torch.randint(min_power, max_power, (1, n_variables))

    # Raise every column to its own exponent, then combine the columns linearly.
    targets = torch.matmul(torch.pow(inputs, exponents), coefficients) + bias

    flat_parameters = [
        *coefficients.flatten().tolist(),
        *exponents.flatten().tolist(),
        bias.item(),
    ]
    return coefficients, exponents, bias, inputs, targets, np.array(flat_parameters)

In [8]:
def print_ground_truth(coeffs, powers, b):
    """Render the ground-truth formula ``sum_i c_i x_{i}^{p_i} + b`` and pass it to `PrintFormula`.

    Args:
        coeffs: tensor of coefficients, indexed per variable (``coeffs[i].item()`` is read).
        powers: tensor of exponents; it is flattened before use.
        b: single-element tensor holding the bias.

    Every number is rounded to 3 decimals. A "+" is inserted in front of every term after the
    first whose rendered text does not already start with "-". Deciding this from the rendered
    text rather than from ``value > 0`` matters when a value is (or rounds to) zero: it would
    otherwise be glued to the previous term without any sign, e.g. ``...x_{1}^{1}0.0x_{2}^{1}``.
    """

    def with_sign(text, is_leading):
        # The leading term carries no explicit "+"; negative numbers bring their own "-".
        if is_leading or text.startswith("-"):
            return text
        return "+" + text

    flat_powers = powers.view(-1,)
    pieces = []
    for i in range(len(coeffs)):
        coefficient = str(round(coeffs[i].item(), 3))
        exponent = str(round(flat_powers[i].item(), 3))
        term = coefficient + "x_{" + str(i + 1) + "}^{" + exponent + "}"
        pieces.append(with_sign(term, is_leading=(i == 0)))
    pieces.append(with_sign(str(round(b.item(), 3)), is_leading=False))
    PrintFormula("".join(pieces))

In [9]:
def explore(n_variables=3, m_samples=1000, min_power=1, max_power=1, number_of_tested_formulas=10, recovery_threshold=1e-5):
    """Generate random formulas, try to recover each one, and report how many were recovered.

    For every tested formula a fresh dataset is sampled with `generate_data`, a formula is
    fitted with `Formula.LearnFormula`, and the MSE between the true and the obtained
    parameter vectors is compared with ``recovery_threshold``.

    Args:
        n_variables: number of input variables of the generated formulas.
        m_samples: number of samples per generated dataset.
        min_power: smallest exponent that can be drawn (inclusive).
        max_power: largest exponent that can be drawn (inclusive).
        number_of_tested_formulas: how many random formulas to test.
        recovery_threshold: a parameter MSE strictly below this value counts as exact
            recovery, both in the per-formula verdict and in the final summary.

    Note: `get_params(regressor)` is called with its defaults, which cover three variables,
    so the parameter comparison is only meaningful for ``n_variables=3``.
    """
    cnt = time.perf_counter()
    symbolic_mse_list = []

    for i in range(number_of_tested_formulas):
        print(f"\n\n----------------------------Exploring new formula #{i + 1}----------------------------")
        # generate_data treats its upper bound as exclusive, hence the + 1.
        coeffs, powers, b, X, y, true_params = generate_data(n_variables, m_samples, min_power, max_power + 1)
        cnt_iteration = time.perf_counter()
        regressor, _ = Formula.LearnFormula(X, y, optimizer_for_formula=torch.optim.Rprop, n_init=10)
        print_time(cnt, cnt_iteration)
        print("ground truth and obtained formula")
        print_ground_truth(coeffs, powers, b)
        PrintFormula(regressor)

        obtained_params = get_params(regressor)
        symbolic_mse = mean_squared_error(true_params, obtained_params)
        symbolic_mse_list.append(symbolic_mse)
        print(f"MSE between formula parameters is {symbolic_mse}")
        if symbolic_mse < recovery_threshold:
            print("EXACT RECOVERY")
        else:
            print("FAILURE")

    # Forward the threshold so the summary counts recoveries with the same criterion as the
    # per-formula verdicts above (previously the summary always fell back to 1e-5).
    print_results(symbolic_mse_list, number_of_tested_formulas, recovery_threshold=recovery_threshold)

## Linear combination of 3 variables and bias with standard-normally distributed coefficients

In [10]:
# Defaults: min_power = max_power = 1, so every variable enters the formula linearly.
explore()



----------------------------Exploring new formula #1----------------------------
  Initialization #1
  Finished run #1, loss 1.9545899121453658e-08, best loss 1.9545899121453658e-08
loss is smaller than 0.001, terminating learning process
3 seconds passed from the start, the iteration took 3 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 1.0063899250464407e-06
EXACT RECOVERY


----------------------------Exploring new formula #2----------------------------
  Initialization #1
  Finished run #1, loss 2.3432189522054614e-09, best loss 2.3432189522054614e-09
loss is smaller than 0.001, terminating learning process
4 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 3.308736343005896e-07
EXACT RECOVERY


----------------------------Exploring new formula #3----------------------------
  Initialization #1
  Finished run #1, loss 0.012428538873791695, best loss 0.012428538873791695
  Initialization #2
  Finished run #2, loss 8.420081298865512e-10, best loss 8.420081298865512e-10
loss is smaller than 0.001, terminating learning process
9 seconds passed from the start, the iteration took 5 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 5.875073967003581e-08
EXACT RECOVERY


----------------------------Exploring new formula #4----------------------------
  Initialization #1
  Finished run #1, loss 0.04012183099985123, best loss 0.04012183099985123
  Initialization #2
  Finished run #2, loss 0.037687137722969055, best loss 0.037687137722969055
  Initialization #3
  Finished run #3, loss 0.0023925129789859056, best loss 0.0023925129789859056
  Initialization #4
  Finished run #4, loss 0.037928178906440735, best loss 0.0023925129789859056
  Initialization #5
  Finished run #5, loss 0.03764372691512108, best loss 0.0023925129789859056
  Initialization #6
  Finished run #6, loss 0.001723115099593997, best loss 0.001723115099593997
  Initialization #7
  Finished run #7, loss 0.02971377968788147, best loss 0.001723115099593997
  Initialization #8
  Finished run #8, loss 0.03281833976507187, best loss 0.001723115099593997
  Initialization #9
  Finished run #9, loss 0.0379251129925251, best lo

<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 0.40518855792418684
FAILURE


----------------------------Exploring new formula #5----------------------------
  Initialization #1
  Finished run #1, loss 1.114528025425443e-08, best loss 1.114528025425443e-08
loss is smaller than 0.001, terminating learning process
1 minutes 5 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 4.3907026754068976e-07
EXACT RECOVERY


----------------------------Exploring new formula #6----------------------------
  Initialization #1
  Finished run #1, loss 4.9057074647862464e-05, best loss 4.9057074647862464e-05
loss is smaller than 0.001, terminating learning process
1 minutes 7 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 0.2654291738345941
FAILURE


----------------------------Exploring new formula #7----------------------------
  Initialization #1
  Finished run #1, loss 1.9324502886775008e-07, best loss 1.9324502886775008e-07
loss is smaller than 0.001, terminating learning process
1 minutes 8 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 0.0004490945575166906
FAILURE


----------------------------Exploring new formula #8----------------------------
  Initialization #1
  Finished run #1, loss 1.8502367311157286e-05, best loss 1.8502367311157286e-05
loss is smaller than 0.001, terminating learning process
1 minutes 10 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 0.41407103044771354
FAILURE


----------------------------Exploring new formula #9----------------------------
  Initialization #1
  Finished run #1, loss 1.6655112844432551e-09, best loss 1.6655112844432551e-09
loss is smaller than 0.001, terminating learning process
1 minutes 12 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 2.2995982725180853e-07
EXACT RECOVERY


----------------------------Exploring new formula #10----------------------------
  Initialization #1
  Finished run #1, loss 0.0022249321918934584, best loss 0.0022249321918934584
  Initialization #2
  Finished run #2, loss 0.002683191327378154, best loss 0.0022249321918934584
  Initialization #3
  Finished run #3, loss 0.0011217775754630566, best loss 0.0011217775754630566
  Initialization #4
  Finished run #4, loss 5.520704449146763e-10, best loss 5.520704449146763e-10
loss is smaller than 0.001, terminating learning process
1 minutes 20 seconds passed from the start, the iteration took 8 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 2.7258411502264053e-06
EXACT RECOVERY

################################################################################
MSEs between parameters:
[1.0063899250464407e-06, 3.308736343005896e-07, 5.875073967003581e-08, 0.40518855792418684, 4.3907026754068976e-07, 0.2654291738345941, 0.0004490945575166906, 0.41407103044771354, 2.2995982725180853e-07, 2.7258411502264053e-06]
For 6 formulas out of 10 the error is less than 1e-05.


## Linear combination of 3 squared variables and bias with standard-normally distributed coefficients

In [11]:
# min_power = max_power = 2, so every variable is squared.
explore(min_power=2, max_power=2)



----------------------------Exploring new formula #1----------------------------
  Initialization #1
  Finished run #1, loss 0.19504092633724213, best loss 0.19504092633724213
  Initialization #2
  Finished run #2, loss 0.007013747934252024, best loss 0.007013747934252024
  Initialization #3
  Finished run #3, loss 0.009375261142849922, best loss 0.007013747934252024
  Initialization #4
  Finished run #4, loss 0.009800040163099766, best loss 0.007013747934252024
  Initialization #5
  Finished run #5, loss 1.8762023185070476e-12, best loss 1.8762023185070476e-12
loss is smaller than 0.001, terminating learning process
15 seconds passed from the start, the iteration took 15 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 6.69699216717033e-12
EXACT RECOVERY


----------------------------Exploring new formula #2----------------------------
  Initialization #1
  Finished run #1, loss 3.5127137743701242e-12, best loss 3.5127137743701242e-12
loss is smaller than 0.001, terminating learning process
16 seconds passed from the start, the iteration took 1 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 7.412026548081485e-11
EXACT RECOVERY


----------------------------Exploring new formula #3----------------------------
  Initialization #1
  Finished run #1, loss 0.1483605057001114, best loss 0.1483605057001114
  Initialization #2
  Finished run #2, loss 0.02518836408853531, best loss 0.02518836408853531
  Initialization #3
  Finished run #3, loss 1.3154793226943795e-11, best loss 1.3154793226943795e-11
loss is smaller than 0.001, terminating learning process
32 seconds passed from the start, the iteration took 15 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 3.558649565503304e-10
EXACT RECOVERY


----------------------------Exploring new formula #4----------------------------
  Initialization #1
  Finished run #1, loss 1.5556061647159503e-11, best loss 1.5556061647159503e-11
loss is smaller than 0.001, terminating learning process
33 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 4.6123866630815817e-10
EXACT RECOVERY


----------------------------Exploring new formula #5----------------------------
  Initialization #1
  Finished run #1, loss 0.0013860539766028523, best loss 0.0013860539766028523
  Initialization #2
  Finished run #2, loss 0.0016387712676078081, best loss 0.0013860539766028523
  Initialization #3
  Finished run #3, loss 5.024526801566953e-12, best loss 5.024526801566953e-12
loss is smaller than 0.001, terminating learning process
38 seconds passed from the start, the iteration took 5 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 3.520428805612953e-09
EXACT RECOVERY


----------------------------Exploring new formula #6----------------------------
  Initialization #1
  Finished run #1, loss 0.03510764613747597, best loss 0.03510764613747597
  Initialization #2
  Finished run #2, loss 0.03426942229270935, best loss 0.03426942229270935
  Initialization #3
  Finished run #3, loss 0.035176824778318405, best loss 0.03426942229270935
  Initialization #4
  Finished run #4, loss 0.023263581097126007, best loss 0.023263581097126007
  Initialization #5
    Epoch 5000, current loss 0.054, current formula \left(0.879x_1^{1.832}-8.612x_2^{-0.023} + 8.933x_3^{-0.019}-1.041\right)
    Epoch 10000, current loss 0.0535, current formula \left(0.879x_1^{1.835}-12.666x_2^{-0.016} + 12.986x_3^{-0.013}-1.041\right)
  Finished run #5, loss 0.05348413810133934, best loss 0.023263581097126007
  Initialization #6
  Finished run #6, loss 8.06729336111367e-12, best loss 8.06729336111367e-12
loss is smalle

<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 3.1134510842483516e-10
EXACT RECOVERY


----------------------------Exploring new formula #7----------------------------
  Initialization #1
  Finished run #1, loss 0.010849709622561932, best loss 0.010849709622561932
  Initialization #2
  Finished run #2, loss 0.038911230862140656, best loss 0.010849709622561932
  Initialization #3
  Finished run #3, loss 0.009395286440849304, best loss 0.009395286440849304
  Initialization #4
  Finished run #4, loss 0.0017017177306115627, best loss 0.0017017177306115627
  Initialization #5
  Finished run #5, loss 0.03575057163834572, best loss 0.0017017177306115627
  Initialization #6
  Finished run #6, loss 0.011514522135257721, best loss 0.0017017177306115627
  Initialization #7
    Epoch 5000, current loss 0.0114, current formula \left(-5.762x_1^{-0.020} + 0.945x_2^{1.895} + 3.524x_3^{-0.011} + 3.652\right)
  Finished run #7, loss 0.011352745816111565, best loss 0.0017017177306115627
  Initialization #8
  Finished

<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 6.36695601003672e-09
EXACT RECOVERY


----------------------------Exploring new formula #8----------------------------
  Initialization #1
  Finished run #1, loss 0.00043949144310317934, best loss 0.00043949144310317934
loss is smaller than 0.001, terminating learning process
2 minutes 48 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 2.306655452500273
FAILURE


----------------------------Exploring new formula #9----------------------------
  Initialization #1
  Finished run #1, loss 2.1731370175581688e-11, best loss 2.1731370175581688e-11
loss is smaller than 0.001, terminating learning process
2 minutes 50 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 7.782828465030533e-08
EXACT RECOVERY


----------------------------Exploring new formula #10----------------------------
  Initialization #1
  Finished run #1, loss 7.292567591465904e-12, best loss 7.292567591465904e-12
loss is smaller than 0.001, terminating learning process
2 minutes 55 seconds passed from the start, the iteration took 6 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 5.295644557788495e-10
EXACT RECOVERY

################################################################################
MSEs between parameters:
[6.69699216717033e-12, 7.412026548081485e-11, 3.558649565503304e-10, 4.6123866630815817e-10, 3.520428805612953e-09, 3.1134510842483516e-10, 6.36695601003672e-09, 2.306655452500273, 7.782828465030533e-08, 5.295644557788495e-10]
For 9 formulas out of 10 the error is less than 1e-05.


## Linear combination of 3 variables with powers uniformly distributed over {1, 2, 3, 4, 5} and standard-normally distributed coefficients

In [12]:
# min_power keeps its default of 1, so each exponent is drawn from the integers 1..5.
explore(max_power=5)



----------------------------Exploring new formula #1----------------------------
  Initialization #1
  Finished run #1, loss 0.2498483806848526, best loss 0.2498483806848526
  Initialization #2
  Finished run #2, loss 0.24947260320186615, best loss 0.24947260320186615
  Initialization #3
  Finished run #3, loss 4.875495623543613e-13, best loss 4.875495623543613e-13
loss is smaller than 0.001, terminating learning process
14 seconds passed from the start, the iteration took 14 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 2.6391587328232295e-13
EXACT RECOVERY


----------------------------Exploring new formula #2----------------------------
  Initialization #1
  Finished run #1, loss 0.00034576840698719025, best loss 0.00034576840698719025
loss is smaller than 0.001, terminating learning process
16 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 0.31358897096567057
FAILURE


----------------------------Exploring new formula #3----------------------------
  Initialization #1
  Finished run #1, loss 0.0013306501787155867, best loss 0.0013306501787155867
  Initialization #2
  Finished run #2, loss 4.9519394661190486e-12, best loss 4.9519394661190486e-12
loss is smaller than 0.001, terminating learning process
20 seconds passed from the start, the iteration took 4 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 1.9162608008319435e-11
EXACT RECOVERY


----------------------------Exploring new formula #4----------------------------
  Initialization #1
  Finished run #1, loss 0.16619305312633514, best loss 0.16619305312633514
  Initialization #2
  Finished run #2, loss 0.2476569265127182, best loss 0.16619305312633514
  Initialization #3
  Finished run #3, loss 0.008306590840220451, best loss 0.008306590840220451
  Initialization #4
  Finished run #4, loss 1.8903615651988392e-12, best loss 1.8903615651988392e-12
loss is smaller than 0.001, terminating learning process
37 seconds passed from the start, the iteration took 18 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 2.532678828564323e-11
EXACT RECOVERY


----------------------------Exploring new formula #5----------------------------
  Initialization #1
  Finished run #1, loss 3.509463553097447e-11, best loss 3.509463553097447e-11
loss is smaller than 0.001, terminating learning process
39 seconds passed from the start, the iteration took 2 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 5.2594433253005946e-09
EXACT RECOVERY


----------------------------Exploring new formula #6----------------------------
  Initialization #1
  Finished run #1, loss 0.003747019451111555, best loss 0.003747019451111555
  Initialization #2
  Finished run #2, loss 0.0032388486433774233, best loss 0.0032388486433774233
  Initialization #3
  Finished run #3, loss 0.0034776830580085516, best loss 0.0032388486433774233
  Initialization #4
  Finished run #4, loss 8.082959539024159e-05, best loss 8.082959539024159e-05
loss is smaller than 0.001, terminating learning process
45 seconds passed from the start, the iteration took 5 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 2.27516452866415
FAILURE


----------------------------Exploring new formula #7----------------------------
  Initialization #1
  Finished run #1, loss 4.859391101313948e-12, best loss 4.859391101313948e-12
loss is smaller than 0.001, terminating learning process
46 seconds passed from the start, the iteration took 1 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 1.6838589950956314e-08
EXACT RECOVERY


----------------------------Exploring new formula #8----------------------------
  Initialization #1
  Finished run #1, loss 0.0006477777496911585, best loss 0.0006477777496911585
loss is smaller than 0.001, terminating learning process
47 seconds passed from the start, the iteration took 1 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 3.337249927827354
FAILURE


----------------------------Exploring new formula #9----------------------------
  Initialization #1
  Finished run #1, loss 0.00010155868949368596, best loss 0.00010155868949368596
loss is smaller than 0.001, terminating learning process
49 seconds passed from the start, the iteration took 1 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 0.7749553351816612
FAILURE


----------------------------Exploring new formula #10----------------------------
  Initialization #1
  Finished run #1, loss 0.00399942509829998, best loss 0.00399942509829998
  Initialization #2
  Finished run #2, loss 3.6368278180654023e-12, best loss 3.6368278180654023e-12
loss is smaller than 0.001, terminating learning process
53 seconds passed from the start, the iteration took 4 seconds
ground truth and obtained formula


<IPython.core.display.Math object>

<IPython.core.display.Math object>

MSE between formula parameters is 1.8920433473925057e-10
EXACT RECOVERY

################################################################################
MSEs between parameters:
[2.6391587328232295e-13, 0.31358897096567057, 1.9162608008319435e-11, 2.532678828564323e-11, 5.2594433253005946e-09, 2.27516452866415, 1.6838589950956314e-08, 3.337249927827354, 0.7749553351816612, 1.8920433473925057e-10]
For 6 formulas out of 10 the error is less than 1e-05.
