In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


Linearna regresija

In [4]:
def linearna_regresija(X, y, meja=1e-2):
    imena = X.columns
    beta = np.linalg.pinv(X.T.dot(X)).dot(X.T).dot(y)

    izraz = ""
    for i,b in enumerate(beta):
        if b > meja:
            if len(izraz) > 0:
                izraz += " + "
            izraz +=  f"{b:.3f}*{imena[i]}"
    return izraz

In [5]:

data = pd.read_csv('DN4_1_podatki.csv')

X = data.drop('Q', axis=1)
Y = data['Q']

f = linearna_regresija(X, Y, meja=1e-5)
print(f)

0.075*Tw


In [6]:
from sklearn.preprocessing import PolynomialFeatures

poly = PolynomialFeatures(3)
Xp = poly.fit_transform(X)
imena_stolpcev = poly.get_feature_names_out()
Xp = pd.DataFrame(Xp, columns=imena_stolpcev)

fp = linearna_regresija(Xp, Y, meja=1e-1)
print(fp)

1.163*eta + 0.151*Tw theta + 0.194*theta^2 + 0.550*theta^2 eta + 0.311*eta^3


Rige regresija

In [7]:
def ridge_regresija(X, y, lam=1, meja=1e-2):
    imena = X.columns
    beta = np.linalg.pinv(X.T.dot(X) + lam*np.identity(X.shape[1])).dot(X.T).dot(y)

    izraz = ""
    for i,b in enumerate(beta):
        if b > meja:
            if len(izraz) > 0:
                izraz += " + "
            izraz +=  f"{b:.3f}*{imena[i]}"
    return izraz


In [8]:


fr = ridge_regresija(Xp, Y, lam=0.5)
print(fr)

0.048*Ta + 0.129*eta + 0.142*Tw theta + 0.053*Ta eta + 0.148*theta^2 + 0.123*eta^2 + 0.010*Tw theta eta + 0.045*Tw eta^2 + 0.051*Ta theta^2 + 0.461*theta^2 eta + 0.178*eta^3


Lasso

In [9]:
from scipy.optimize import minimize
def lasso_regresija(X, y, lam=1, meja=1e-2):
    imena = X.columns

    def f(beta):
        yhat = X.dot(beta)
        return np.sum((yhat-y)**2) + lam*np.sum(np.abs(beta))
    beta = minimize(f, np.random.random(X.shape[1]))["x"]
    
    izraz = ""
    for i,b in enumerate(beta):
        if b > meja:
            if len(izraz) > 0:
                izraz += " + "
            izraz +=  f"{b:.3f}*{imena[i]}"
    return izraz

In [10]:
fl = lasso_regresija(Xp, Y, lam=1)
print(fl)

0.080*Ta + 0.130*Tw theta + 0.040*Ta eta + 0.109*theta^2 + 0.040*Tw eta^2 + 0.051*Ta theta^2 + 0.013*theta^3 + 0.435*theta^2 eta


Bacon

In [11]:
def bacon(df, max_iter=20):
    for iteracija in range(max_iter):
        stolpci = list(df.columns)
        df_array = np.array(df)
        # ocenimo, ali obstaja konstanta
        sig = np.std(df_array, axis=0)
        # testirajmo se trivialnost
        if min(sig) < 10 ** -12:  # pazimo na skalo?
            break
        # izracunajmo vrstne rede, najdemo korelacije med njimi
        vrstni_redi = np.array(df.rank(axis=0)).T
        korelacije = np.corrcoef(vrstni_redi)  # korelacije[i, j]
        n = korelacije.shape[0]
        korelacije = [(abs(korelacije[i, j]), (i, j), korelacije[i, j] < 0) for i in range(n) for j in range(i + 1, n)]
        korelacije.sort(reverse=True)
        for kakovost, (i, j), je_mnozenje in korelacije:
            if je_mnozenje:
                ime_novega = f"({stolpci[i]}) * ({stolpci[j]})"
                vrednosti_novega = df_array[:, i] * df_array[:, j]
            else:
                ime_novega = f"({stolpci[i]}) / ({stolpci[j]})"
                vrednosti_novega = df_array[:, i] / df_array[:, j]
            if ime_novega not in stolpci:
                df[ime_novega] = vrednosti_novega
                break
    # najdemo "konstanto"
    df_array = np.array(df)
    sig = np.std(df_array, axis=0)
    i = np.argmin(sig)
    const = np.mean(df_array[:, i])
    print(f"{const:.5e} = {df.columns[i]} (napaka: {sig[i]})")

In [12]:
podatki = data
print(data.columns)
bacon(podatki)

Index(['Q', 'Tw', 'Ta', 'theta', 'eta'], dtype='object')
1.00000e+00 = ((Tw) / (Ta)) / ((Q) / ((Q) / ((Tw) / (Ta)))) (napaka: 4.213000162292041e-17)


PySR

In [17]:
from pysr import PySRRegressor
import sympy 

model = PySRRegressor(
    niterations=1000,  # < Increase me for better results
    binary_operators=["+", "*", "-", "/"],
    unary_operators=[
        "cos",
        "sin",
        "inv(x) = 1/x",
        "abs",
        "square",
        "cube",
        "cos2(x)=cos(x)^2",
        "sin2(x)=sin(x)^2"
        # ^ Custom operator (julia syntax)
    ],
    extra_sympy_mappings={"inv": lambda x: 1 / x,
                          "cos2": lambda x: sympy.cos(x)**2,
                          "sin2": lambda x: sympy.sin(x)**2},
    # ^ Define operator for SymPy as well
    loss="loss(prediction, target) = (prediction - target)^2",
    # ^ Custom loss function (julia syntax)
    timeout_in_seconds=60 * 5,
)

In [20]:
model.fit(X, Y)



Started!

Expressions evaluated per second: 2.690e+05
Head worker occupation: 6.5%
Progress: 591 / 15000 total iterations (3.940%)
Hall of Fame:
---------------------------------------------------------------------------------------------------
Complexity  Loss       Score     Equation
1           1.150e+00  3.109e-07  0.6830067
2           1.026e+00  1.144e-01  sin(theta)
4           9.466e-01  4.011e-02  cos2(Ta / Tw)
5           6.005e-01  4.551e-01  ((Tw + -21.53733) / Ta)
6           5.398e-01  1.066e-01  ((cube(Tw) * 0.00025319468) / Ta)
7           5.158e-01  4.534e-02  (((Tw - 12.057189) / Ta) - eta)
8           3.226e-01  4.695e-01  (((Tw - Ta) * sin(theta)) / 12.2925205)
9           1.624e-01  6.862e-01  square(((Tw - Ta) * sin(theta)) / -16.332458)
10          1.436e-01  1.229e-01  square(((Tw - Ta) * sin(sin(theta))) / 14.23498)
11          1.364e-01  5.130e-02  square(((Tw - Ta) * sin(sin(sin(theta)))) / 12.82538)
12          1.161e-01  1.613e-01  square((((Tw - Ta) * sin(

In [19]:
from sklearn.metrics import mean_squared_error

#print(model)
#print(model.predict(X, 11))

for k in range(17):
    pred = model.predict(X, k)
    err = mean_squared_error(Y, pred)
    print(k, err)


0 1.1499304045516108
1 1.0256187590418555
2 1.0096372068679893
3 0.9442137703362646
4 0.5716818689835105
5 0.429826258396103
6 0.315939704512936
7 0.1251105061590083
8 0.056142070846614725
9 0.019176886828392945
10 0.016050304642276473
11 0.012722027249519175
12 0.012720963597696837
13 0.012716963162983834
14 0.012660242267847838
15 0.011126912235004902
16 0.01098580436394301
