In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import (
    LinearRegression,
    Lasso,
    Ridge,
    HuberRegressor,
    TheilSenRegressor,
    ElasticNet,
)
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVR, SVR
from sklearn.tree import DecisionTreeRegressor
import joblib
from figrecipes import PlotlyFig

from figures.helpers import featureMap, unitsMap, bounds
from workFiles.functions.coordinator import (
    fit_model,
    extract_data_to_fit,
    predicted_properties,
)
from workFiles.functions.getDF import get_df
from workFiles.functions.helpers import (
    calculate_total_time_left,
    count_combinations,
)
from workFiles.types import Data_splitted

In [None]:
# Build the two objects the rest of the notebook works from:
#   df   - whatever get_df() returns; later indexed as df['formula_pretty']
#          to label the points in the plots.
#   data - the fit inputs derived from df for `predicted_properties`; later
#          cells read its `.X` and `.y` attributes (presumably features and
#          one target column per predicted property - confirm in
#          workFiles.functions.coordinator).
df = get_df()
data = extract_data_to_fit(df, predicted_properties)

In [None]:
# Seed handed to every estimator that draws random numbers, so that a fresh
# Restart & Run All reproduces the same fitted models. The value is the same
# one used as random_state for the train/test split.
RANDOM_STATE = 1234

# (estimator class, hyper-parameter grid) pairs to search over.
# Each grid maps a constructor argument to the list of values to try.
# Single-element lists pin an argument to one value without enlarging the
# search space; "random_state" is pinned this way for every estimator that
# has a stochastic component.
models = [
    (LinearRegression, {
        "fit_intercept": [True, False],
        "positive": [True, False],
    }),
    (Lasso, {
        "alpha": [0.001, 0.01, 0.1, 1.0, 10],
        "tol": [0.0001, 0.001, 0.01, 0.1],
        "copy_X": [True],
    }),
    (Ridge, {
        "alpha": [0.001, 0.01, 0.1, 1.0, 10.0, 100],
        "solver": ["svd", "cholesky", "lsqr", "sparse_cg", "sag", "saga"],
        "copy_X": [True],
        "max_iter": [100_000],
        "fit_intercept": [True],
        # The 'sag' and 'saga' solvers shuffle the data.
        "random_state": [RANDOM_STATE],
    }),
    (ElasticNet, {
        "alpha": [0.001, 0.01, 0.1, 1.0, 10],
        "tol": [0.00001, 0.0001, 0.001, 0.01, 0.1],
        "l1_ratio": [0.1, 0.2, 0.4, 0.6, 0.8, 0.9],
        "fit_intercept": [True],
        "copy_X": [True],
        "selection": ["cyclic", "random"],
        # Used when selection == 'random'.
        "random_state": [RANDOM_STATE],
    }),
    (HuberRegressor, {
        "max_iter": [10_000],
        "epsilon": [1.0, 1.5, 2.0, 2.5, 3.0],
        "alpha": [0.00001, 0.0001, 0.001, 0.01, 0.1, 1.0],
        "fit_intercept": [True],
    }),
    (SVR, {
        "max_iter": [10_000_000],
        "kernel": ["poly", "rbf", "sigmoid"],
        # Only relevant for the 'poly' kernel.
        "degree": [2],
        "tol": [1e-04, 1e-03, 1e-02, 1e-01, 1e+00],
        "epsilon": [0.0, 0.1, 0.3, 1.0, 3.0, 10.0, 30],
    }),
    (LinearSVR, {
        "max_iter": [100_000],
        "loss": ["epsilon_insensitive", "squared_epsilon_insensitive"],
        "epsilon": [0.0, 0.1, 0.3, 0.6, 1.0, 3.0, 10.0],
        "tol": [1e-04, 1e-03, 1e-02, 1e-01, 1e+00],
        "random_state": [RANDOM_STATE],
    }),
    (TheilSenRegressor, {
        "max_iter": [100],
        "max_subpopulation": [500, 700, 900, 1100],
        "fit_intercept": [True],
        "tol": [1e-05, 1e-04, 1e-03, 1e-02],
        # Subsets are sampled at random when the population is capped.
        "random_state": [RANDOM_STATE],
    }),
    (DecisionTreeRegressor, {
        "max_depth": [3, 7, 10, 30, 60, 100, 150, 200],
        "min_samples_split": [2, 4, 8, 16, 32],
        "min_samples_leaf": [1, 2, 4, 8],
        "splitter": ["best", "random"],
        "random_state": [RANDOM_STATE],
    }),
    (RandomForestRegressor, {
        "n_estimators": [50, 75, 100, 150, 200],
        "max_leaf_nodes": [32, 64, 128, 256],
        "max_depth": [3, 9, 27, 81],
        "bootstrap": [True],
        "random_state": [RANDOM_STATE],
    }),
]


In [None]:
# Map each model name to its combination count. count_combinations is applied
# to every (estimator, grid) entry of `models` and yields one
# (model_name, count) pair per entry, which dict() consumes directly.
model_combinations = dict(map(count_combinations, models))
model_combinations

In [None]:
# Results per property and per model: the fitted object, timings and test
# metrics. Previously computed results are reused from Results.joblib so that
# expensive fits are not repeated.
# NOTE: joblib.load unpickles arbitrary objects - only load a Results.joblib
# that was written by this notebook.
try:
    Results = joblib.load("Results.joblib")
    print("Loaded Results.joblib")
except FileNotFoundError as e:
    print(e)
    Results = {}

# Make sure a slot exists for every (property, model) pair. With no cache this
# builds the full empty table; with a cache it back-fills pairs added to
# `data.y` or `models` after the cache was written, so the training cell can
# index Results[column][model] without a KeyError. Existing entries are kept.
model_names = [model.__name__ for (model, _) in models]
for column_name in data.y:
    results_for_column = Results.setdefault(column_name, {})
    for model_name in model_names:
        results_for_column.setdefault(
            model_name,
            {
                # None marks "not fitted yet"; the training cell fills these in.
                "instance": None,
                "time_taken": None,
                "time_taken_per_run": None,
                "r2": None,
                "rmse": None,
            },
        )

In [None]:
# Fit every model on every target property. Pairs that already hold a fitted
# instance in Results are skipped, and Results is written back to disk after
# each fit so an interrupted run can resume where it stopped.
for i, column_name in enumerate(data.y):
    # One 90/10 split with a fixed seed, shared by all models for this property.
    X_train, X_test, y_train, y_test = train_test_split(
        data.X,
        data.y[column_name],
        test_size=0.1,
        random_state=1234,
    )
    data_split = Data_splitted(X_train, X_test, y_train, y_test)

    print(f"Property {i}: {column_name}")
    for j, (model, options) in enumerate(models):
        # Alias of the result slot; writes below land in Results itself.
        entry = Results[column_name][model.__name__]
        if entry["instance"] is None:
            print(i, j)

            model_instance, time_taken = fit_model(model, data_split, options)

            entry["instance"] = model_instance.instance
            entry["r2"] = model_instance.r2_test
            entry["rmse"] = model_instance.rmse_test

            # Average cost of one hyper-parameter combination for this model.
            time_taken_per_run = time_taken / model_combinations[model.__name__]
            entry["time_taken_per_run"] = time_taken_per_run
            entry["time_taken"] = time_taken

            print(f"Time taken: {time_taken:.2f}[min]")
            print(f"Time per run: {time_taken_per_run:.2f}[min]", end="\n\n")
            print(f"Estimated Time Left: {calculate_total_time_left(Results):.2f}[min]")

            # Checkpoint after every fit.
            joblib.dump(Results, "Results.joblib")


In [None]:
# Draw one actual-vs-predicted scatter plot per (property, model) pair, with
# the reference line taken from `bounds[feature]` drawn dashed in black.
# Predictions are made on the full data.X, not only on the held-out split.
for feature, fitted_models in Results.items():
    # Everything that depends only on the property is computed once per feature.
    property_name = f'{featureMap[feature]} {unitsMap[feature]}'
    y = data.y[feature]
    X = data.X

    for model_name, fitted_model in fitted_models.items():
        instance = fitted_model['instance']
        if instance is None:
            # Slot was never filled (e.g. training was interrupted); there is
            # no estimator to predict with, so skip instead of crashing.
            print(f"Skipping {feature} / {model_name}: model has not been fitted yet")
            continue

        pf = PlotlyFig(x_title=f'Wartość rzeczywista:\n{property_name}',
                       y_title=f'Wartość przewidziana:\n{property_name}',
                       title=model_name,
                       filename=f"figures/{feature}-{model_name}.html",
                       fontsize=20)

        y_pred = instance.best_estimator_.predict(X)

        pf.xy(xy_pairs=[(y, y_pred), bounds[feature]],
              labels=df['formula_pretty'],
              modes=['markers', 'lines'],
              lines=[{}, {'color': 'black', 'dash': 'dash'}],
              showlegends=False
              )
