In [None]:
!pip install autogluon
!pip install flaml

Collecting autogluon
  Downloading autogluon-1.2-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.core==1.2 (from autogluon.core[all]==1.2->autogluon)
  Downloading autogluon.core-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.features==1.2 (from autogluon)
  Downloading autogluon.features-1.2-py3-none-any.whl.metadata (11 kB)
Collecting autogluon.tabular==1.2 (from autogluon.tabular[all]==1.2->autogluon)
  Downloading autogluon.tabular-1.2-py3-none-any.whl.metadata (14 kB)
Collecting autogluon.multimodal==1.2 (from autogluon)
  Downloading autogluon.multimodal-1.2-py3-none-any.whl.metadata (12 kB)
Collecting autogluon.timeseries==1.2 (from autogluon.timeseries[all]==1.2->autogluon)
  Downloading autogluon.timeseries-1.2-py3-none-any.whl.metadata (12 kB)
Collecting scikit-learn<1.5.3,>=1.4.0 (from autogluon.core==1.2->autogluon.core[all]==1.2->autogluon)
  Downloading scikit_learn-1.5.2-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collec

Collecting flaml
  Downloading FLAML-2.3.4-py3-none-any.whl.metadata (16 kB)
Downloading FLAML-2.3.4-py3-none-any.whl (314 kB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/314.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.2/314.2 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flaml
Successfully installed flaml-2.3.4


In [None]:
!pip install openml
import numpy as np
import openml
import pandas as pd
import time
from pathlib import Path

from sklearn.metrics import mean_squared_error
from functools import wraps
from sklearn.model_selection import train_test_split


def get_dataset_from_openml(dataset_id: int) -> pd.DataFrame:
    """
    Download a dataset from OpenML and hand back its data table.

    Parameters:
        dataset_id (int): Identifier of the OpenML dataset to fetch.

    Returns:
        pd.DataFrame: The first element of the 4-tuple returned by
        ``dataset.get_data(dataset_format='dataframe')``.
    """
    openml_dataset = openml.datasets.get_dataset(dataset_id)
    # Splitting into features/target is done by the callers, so only the first
    # element of the returned 4-tuple is kept; the other three are discarded.
    frame, _second, _third, _fourth = openml_dataset.get_data(dataset_format='dataframe')
    return frame


def load_dataset(dataset_id: int, dataset_path: str) -> pd.DataFrame:
    """
    Return the dataset stored at ``dataset_path``, downloading it first if needed.

    Parameters:
        dataset_id (int): OpenML id used for the download when no local file exists.
        dataset_path (str): Location of the CSV file used as a local cache.

    Returns:
        pd.DataFrame: The content of the CSV file, read with ``pd.read_csv``.
    """
    csv_file = Path(dataset_path)

    # Fast path: the cached CSV already exists.
    if csv_file.exists():
        return pd.read_csv(csv_file)

    # Cache miss: create the folder, download, write the cache, then read it back
    # so both paths return a frame that went through the same CSV parsing.
    # NOTE(review): to_csv() is called with its defaults, which also write the row
    # index; the frame read back therefore carries it as an extra leading column.
    # Confirm that downstream code expects this.
    csv_file.parent.mkdir(parents=True, exist_ok=True)
    downloaded = get_dataset_from_openml(dataset_id)
    downloaded.to_csv(csv_file)
    return pd.read_csv(csv_file)


def get_train_test_data(df: pd.DataFrame, target: str, split_size, random_state=None):
    """
    Split a frame into train/test features and targets.

    Parameters:
        df (pd.DataFrame): Full dataset including the target column.
        target (str): Name of the target column; it is removed from the features.
        split_size: Forwarded to ``train_test_split`` as ``test_size``.
        random_state: Optional seed forwarded to ``train_test_split``. The default
            ``None`` keeps the previous unseeded behaviour; pass an int to make
            the split reproducible across runs.

    Returns:
        The result of ``train_test_split``: ``x_train, x_test, y_train, y_test``.
    """
    features = df.drop(columns=[target])
    labels = df[target]
    return train_test_split(features, labels, test_size=split_size, random_state=random_state)


def timer(func):
    """
    A decorator to measure and print the execution time of a function.

    Args:
    - func (function): The function to be wrapped by the timer decorator.

    Returns:
    - wrapper (function): A wrapped function that calculates and prints the time
                           taken to execute the original function, then returns
                           the original function's result unchanged.

    The elapsed time is printed in seconds with four decimals. Nothing is
    printed when the wrapped function raises.
    """

    @wraps(func)
    def wrapper(*args, **kwargs):
        # perf_counter() is monotonic, so the measured interval cannot be
        # distorted by system clock adjustments the way time.time() can.
        started = time.perf_counter()
        result = func(*args, **kwargs)
        duration = time.perf_counter() - started
        print(f"{func.__name__} executed in {duration:.4f} seconds\n")
        return result

    return wrapper


def get_rmse(y_pred, y_true):
    """
    Root mean squared error between predictions and ground truth.

    Note the argument order: predictions come first, true values second.
    """
    mse = mean_squared_error(y_true, y_pred)
    return np.sqrt(mse)


def convert_to_numpy(*args: 'pd.DataFrame | np.ndarray | pd.Series') -> tuple[np.ndarray, ...]:
    """
    Convert every pandas DataFrame/Series argument to a numpy array.

    Arguments that are neither a DataFrame nor a Series are passed through
    untouched. The result is a tuple in the same order as the inputs.
    """
    converted = []
    for item in args:
        if isinstance(item, (pd.DataFrame, pd.Series)):
            converted.append(item.to_numpy())
        else:
            converted.append(item)
    return tuple(converted)


def train_test_to_numpy(x_train, x_test, y_train, y_test) -> tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Call ``.to_numpy()`` on each of the four split parts and return them in the
    same order (x_train, x_test, y_train, y_test).
    """
    x_train_arr = x_train.to_numpy()
    x_test_arr = x_test.to_numpy()
    y_train_arr = y_train.to_numpy()
    y_test_arr = y_test.to_numpy()
    return x_train_arr, x_test_arr, y_train_arr, y_test_arr




In [None]:
from pathlib import Path
from typing import Tuple
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder
from sklearn.preprocessing import StandardScaler

#from assignment3.util.data_utils import load_dataset, get_train_test_data, timer, convert_to_numpy

# Module-level settings. The values match the cars dataset; the prepare_*
# functions in this cell define their own local copies of most of these names.
_DATASET_ID = 44994  # OpenML dataset id; printed by the debug statement further down in this cell
_DATASET_PATH = 'data/cars.csv'  # local CSV cache location
_TEST_SPLIT_SIZE = 0.2  # fraction of rows held out for testing
_TARGET_VARIABLE = 'Price'  # name of the target column
_CORRELATION_DROP_THRESHOLD = 1.0  # not referenced by the code visible in this notebook
# Read as a global by every prepare_* function that does not define its own
# copy: when True, only the first 100 rows of a dataset are used.
_TEST_RUN = True

# Output locations; not referenced by the code visible in this notebook.
_OUTPUT_FOLDER = Path('output/cars')
_OUTPUT_HYPERPARAMETERS_FOLDER = _OUTPUT_FOLDER / 'parameter_permutation'
_OUTPUT_HYPERPARAMETERS_RESULTS = _OUTPUT_HYPERPARAMETERS_FOLDER / 'results.csv'


@timer
def prepare_cars_dataset() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load the cars dataset and return preprocessed train/test arrays.

    Numerical feature columns (int64/float64) are standardised and categorical
    columns (object/bool) are one-hot encoded. The preprocessing is fitted on
    the training part only and then applied to the test part.

    Returns:
        (x_train, x_test, y_train, y_test) with the target column 'Price'
        removed from the feature arrays.
    """
    dataset_id = 44994
    dataset_path = 'data/cars.csv'
    test_split_size = 0.2
    target_variable = 'Price'

    df = load_dataset(dataset_id, dataset_path)

    # Use the notebook-wide _TEST_RUN switch, as the other prepare_* functions
    # do, instead of a local copy that silently ignored the global setting.
    if _TEST_RUN:
        df = df.iloc[:100, :]

    print("Dimensions for training:", df.shape)

    x_train, x_test, y_train, y_test = get_train_test_data(
        df=df, target=target_variable, split_size=test_split_size)

    # Column groups are chosen by dtype, which the row-wise split does not change.
    numerical_cols = x_train.select_dtypes(include=['int64', 'float64']).columns
    categorical_cols = x_train.select_dtypes(include=['object', 'bool']).columns

    preprocessor = ColumnTransformer(
        transformers=[
            ('num', StandardScaler(), numerical_cols),
            ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ],
        # OneHotEncoder produces sparse output; force a dense result so the
        # function really returns numpy arrays as its annotation promises.
        sparse_threshold=0,
    )
    pipeline = Pipeline(steps=[('preprocessor', preprocessor)])

    x_train = pipeline.fit_transform(x_train)
    x_test = pipeline.transform(x_test)

    x_train, x_test, y_train, y_test = convert_to_numpy(x_train, x_test, y_train, y_test)

    return x_train, x_test, y_train, y_test

from sklearn.impute import SimpleImputer



# Debug output: shows the module-level _DATASET_ID defined near the top of this cell.
print("dataset id",_DATASET_ID)
@timer
def prepare_employee_salaries_dataset() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load the employee salaries dataset and return preprocessed train/test arrays.

    Steps:
      * optionally truncate to the first 100 rows (global ``_TEST_RUN``),
      * if present, split 'date_first_hired' into year/month/day columns and
        drop 'date_first_hired' and 'full_name',
      * split into train/test,
      * impute and encode the columns listed below; all remaining columns are
        passed through unchanged.

    Returns:
        (x_train, x_test, y_train, y_test) with target 'current_annual_salary'.
    """
    dataset_id = 42125
    dataset_path = 'data/employee_salaries.csv'
    test_split_size = 0.2
    target_variable = 'current_annual_salary'

    df = load_dataset(dataset_id, dataset_path)

    if _TEST_RUN:
        df = df.iloc[:100, :]
    print("Dimensions for training:", df.shape)

    if 'date_first_hired' in df.columns:
        # Split date_first_hired into year, month, day. A new frame is built
        # with assign()/drop() instead of mutating in place, because `df` may
        # be an iloc slice here (in-place writes trigger SettingWithCopyWarning).
        hired = pd.to_datetime(df['date_first_hired'])
        df = df.assign(
            year_first_hired=hired.dt.year,
            month_first_hired=hired.dt.month,
            day_first_hired=hired.dt.day,
        ).drop(columns=['date_first_hired', 'full_name'])

    x_train, x_test, y_train, y_test = get_train_test_data(
        df=df, target=target_variable, split_size=test_split_size)

    # Columns with missing values:
    # ---------------------------
    # gender
    # 2016_gross_pay_received
    # 2016_overtime_pay
    # underfilled_job_title

    gender_preprocessing_pipeline = Pipeline([
        ('constant_imputed', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('one_hot_encoded', OneHotEncoder(handle_unknown="ignore"))
    ])

    job_title_preprocessing_pipeline = Pipeline([
        ('constant_imputed', SimpleImputer(strategy='constant', fill_value='unknown')),
        ('ordinal_encoded', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    # Gender variable is one-hot-encoded
    # Other categorical variables are ordinal encoded due to high cardinality
    # constant imputation for categorical variables
    # median imputation for numerical variables
    #
    # scaling / value transformations should be unnecessary for tree-based models
    preprocessing_pipeline = Pipeline([
        ('column transformations', ColumnTransformer([
            ('gender', gender_preprocessing_pipeline, ['gender']),
            ('job', job_title_preprocessing_pipeline, ['underfilled_job_title']),
            # Other transformations
            ('median_imputed', SimpleImputer(strategy='median'), ['2016_gross_pay_received', '2016_overtime_pay']),
            ('ordinal_encoded', OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1),
             ['department', 'department_name', 'division', 'assignment_category', 'employee_position_title'])],
            remainder='passthrough',
            verbose_feature_names_out=False
        )),
    ])

    # Fit on the training part only, then apply the fitted transform to the test part.
    x_train = preprocessing_pipeline.fit_transform(x_train)
    x_test = preprocessing_pipeline.transform(x_test)

    x_train, x_test, y_train, y_test = convert_to_numpy(x_train, x_test, y_train, y_test)

    return x_train, x_test, y_train, y_test





@timer
def prepare_energy_efficiency_dataset() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load the energy efficiency dataset and return train/test arrays.

    No feature preprocessing is applied; the frame is optionally truncated to
    the first 100 rows (global ``_TEST_RUN``), split, and converted to numpy.

    Returns:
        (x_train, x_test, y_train, y_test) with target 'Y1'.
    """
    dataset_id = 43918
    dataset_path = 'data/energy_efficiency.csv'
    test_split_size = 0.2
    target_variable = 'Y1'

    df = load_dataset(dataset_id, dataset_path)

    if _TEST_RUN:
        df = df.iloc[:100, :]
    print("Dimensions for training:", df.shape)

    # NOTE(review): only the 'Y1' column is removed from the features. If the
    # dataset contains a second target column (e.g. 'Y2') it stays in X and
    # may leak target information. Confirm this is intended.
    x_train, x_test, y_train, y_test = get_train_test_data(
        df=df, target=target_variable, split_size=test_split_size)

    x_train, x_test, y_train, y_test = convert_to_numpy(x_train, x_test, y_train, y_test)

    return x_train, x_test, y_train, y_test





@timer
def prepare_toronto_rental_dataset() -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """
    Load the Toronto rental dataset and return preprocessed train/test arrays.

    Steps:
      * optionally truncate to the first 100 rows (global ``_TEST_RUN``),
      * drop the first column,
      * parse 'Price' from strings with thousands separators into floats,
      * split into train/test,
      * ordinal-encode 'Address' (unseen addresses become -1); all remaining
        columns are passed through unchanged.

    Returns:
        (x_train, x_test, y_train, y_test) with target 'Price'.
    """
    dataset_id = 43723
    dataset_path = 'data/toronto_rental.csv'
    test_split_size = 0.2
    target_variable = 'Price'

    df = load_dataset(dataset_id, dataset_path)

    if _TEST_RUN:
        df = df.iloc[:100, :]
    print("Dimensions for training:", df.shape)

    # Drop the first column (presumably an index column; confirm against the
    # cached CSV). An explicit copy is taken so the assignment below writes to
    # an independent frame rather than to a slice of the loaded one.
    df = df.iloc[:, 1:].copy()
    # Prices are stored as text with comma separators, e.g. "1,500".
    df[target_variable] = df[target_variable].str.replace(',', '', regex=False).astype(float)

    x_train, x_test, y_train, y_test = get_train_test_data(
        df=df, target=target_variable, split_size=test_split_size)

    address_preprocessing_pipeline = Pipeline([
        ('ordinal_encoded', OrdinalEncoder(
            handle_unknown='use_encoded_value', unknown_value=-1))
    ])

    preprocessing_pipeline = Pipeline([
        ('column transformations', ColumnTransformer([
            ('address', address_preprocessing_pipeline, ['Address'])
        ], remainder='passthrough', verbose_feature_names_out=False))
    ])

    # Fit on the training part only, then apply the fitted transform to the test part.
    x_train = preprocessing_pipeline.fit_transform(x_train)
    x_test = preprocessing_pipeline.transform(x_test)

    x_train, x_test, y_train, y_test = convert_to_numpy(x_train, x_test, y_train, y_test)

    return x_train, x_test, y_train, y_test


dataset id 44994


In [None]:
import pandas as pd
import numpy as np
from autogluon.tabular import TabularPredictor
from flaml import AutoML
from sklearn.metrics import mean_squared_error

# These two names were already assigned in an earlier cell; from here on
# _OUTPUT_FOLDER is a plain string rather than a Path.
_TARGET_VARIABLE = "target"  # Example target variable name
_OUTPUT_FOLDER = "results"  # Example output folder


def train_autogluon(x_train, y_train, x_test, y_test):
    """
    Fit an AutoGluon TabularPredictor on the training data and score it on the test data.

    Parameters:
        x_train, x_test: Feature matrices accepted by ``pd.DataFrame``.
        y_train, y_test: Target values accepted by ``pd.DataFrame`` as a single column.

    Returns:
        tuple: ("AutoGluon", name of the top leaderboard model, R2, RMSE), with
        both metrics computed on the test data.
    """
    train_data = pd.concat(
        [pd.DataFrame(x_train), pd.DataFrame(y_train, columns=[_TARGET_VARIABLE])], axis=1)
    test_data = pd.concat(
        [pd.DataFrame(x_test), pd.DataFrame(y_test, columns=[_TARGET_VARIABLE])], axis=1)

    # The problem type is stated explicitly: the code below reads the 'r2'
    # metric, which only exists for regression, so it must not depend on
    # AutoGluon's automatic inference from the label values.
    predictor = TabularPredictor(
        label=_TARGET_VARIABLE,
        problem_type='regression',
        # Path() accepts both str and Path, so this works whichever type
        # _OUTPUT_FOLDER currently has.
        path=str(Path(_OUTPUT_FOLDER) / 'autogluon'),
    ).fit(train_data)

    # The first leaderboard row is taken as the best-performing model.
    leaderboard = predictor.leaderboard(silent=True)
    best_model = leaderboard.iloc[0]['model']

    y_true = test_data[_TARGET_VARIABLE]
    predictions = predictor.predict(test_data.drop(columns=[_TARGET_VARIABLE]))

    r2 = predictor.evaluate_predictions(y_true=y_true, y_pred=predictions)['r2']
    rmse = np.sqrt(mean_squared_error(y_true, predictions))

    return "AutoGluon", best_model, r2, rmse


def train_flaml(x_train, y_train, x_test, y_test, time_budget=300):
    """
    Fit a FLAML AutoML regressor on the training data and score it on the test data.

    Parameters:
        x_train, x_test: Feature matrices.
        y_train, y_test: Target values; ``y_train`` is flattened to 1-D before fitting.
        time_budget: Search time budget in seconds passed to ``AutoML.fit``.
            Defaults to 300, the value that was previously hard-coded.

    Returns:
        tuple: ("FLAML", name of the best estimator, R2, RMSE), with both
        metrics computed on the test data.
    """
    automl = AutoML()
    # np.ravel also handles targets that are not ndarrays (lists, Series).
    automl.fit(X_train=x_train, y_train=np.ravel(y_train), task="regression", time_budget=time_budget)

    best_model = automl.best_estimator  # name of the best learner found by FLAML
    predictions = automl.predict(x_test)

    r2 = automl.score(x_test, y_test)
    rmse = np.sqrt(mean_squared_error(y_test, predictions))

    return "FLAML", best_model, r2, rmse


def get_datasets():
    """
    Return the dataset registry: a dict mapping each dataset name to the
    zero-argument function that loads and prepares it.
    """
    registry = {}
    registry['toronto_rental'] = prepare_toronto_rental_dataset
    registry['employee_salaries'] = prepare_employee_salaries_dataset
    registry['energy_efficiency'] = prepare_energy_efficiency_dataset
    registry['cars'] = prepare_cars_dataset
    return registry


def train_on_all_datasets():
    """
    Run AutoGluon and FLAML on every registered dataset and save the winners.

    For each dataset both frameworks are trained and evaluated on the same
    train/test split. The framework with the higher test R2 wins (FLAML wins
    ties), and its best model name, R2 and RMSE are recorded together. The
    summary table is written to ``<_OUTPUT_FOLDER>/automl_results.csv``.
    """
    results_on_all_datasets = []

    for name, dataset_func in get_datasets().items():
        print(f"\nTraining on dataset: {name}")

        x_train, x_test, y_train, y_test = dataset_func()

        autogluon_run = train_autogluon(x_train, y_train, x_test, y_test)
        flaml_run = train_flaml(x_train, y_train, x_test, y_test)

        # Each run is (framework, best_model, r2, rmse). Pick ONE winning run by
        # R2 and report all of its values, so the row never combines the model
        # and R2 of one framework with the RMSE of the other.
        winner = autogluon_run if autogluon_run[2] > flaml_run[2] else flaml_run
        _, best_overall_model, best_r2, best_rmse = winner

        results_on_all_datasets.append([name, best_overall_model, best_r2, best_rmse])

    results_df = pd.DataFrame(results_on_all_datasets, columns=["Dataset", "Best Algorithm", "R2", "RMSE"])

    # Make sure the output folder exists before writing; Path() accepts both
    # str and Path values of _OUTPUT_FOLDER.
    output_dir = Path(_OUTPUT_FOLDER)
    output_dir.mkdir(parents=True, exist_ok=True)
    results_df.to_csv(output_dir / "automl_results.csv", index=False)

    print("Results saved to automl_results.csv")


In [None]:
# Run the full benchmark: trains AutoGluon and FLAML on every registered dataset
# and writes the summary CSV.
train_on_all_datasets()

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       10.54 GB / 12.67 GB (83.1%)
Disk Space Avail:   78.40 GB / 112.64 GB (69.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with ver


Training on dataset: toronto_rental
Dimensions for training: (100, 8)
prepare_toronto_rental_dataset executed in 0.0164 seconds



	-546.1627	 = Validation score   (-root_mean_squared_error)
	0.48s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBM ...
	-525.6826	 = Validation score   (-root_mean_squared_error)
	0.48s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-446.8256	 = Validation score   (-root_mean_squared_error)
	0.42s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-434.0977	 = Validation score   (-root_mean_squared_error)
	0.2s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-418.4943	 = Validation score   (-root_mean_squared_error)
	0.42s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-365.8592	 = Validation score   (-root_mean_squared_error)
	0.56s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-454.3904	 = Validation score   (-root_mean_squared_error)
	0.13s	 = Training   runtime
	0.0s	 = Validation r

[flaml.automl.logger: 02-24 12:17:06] {1728} INFO - task = regression
[flaml.automl.logger: 02-24 12:17:06] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 02-24 12:17:06] {1838} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 02-24 12:17:06] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 02-24 12:17:06] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 02-24 12:17:06] {2393} INFO - Estimated sufficient time budget=230s. Estimated necessary time budget=2s.
[flaml.automl.logger: 02-24 12:17:06] {2442} INFO -  at 0.0s,	estimator lgbm's best error=0.9026,	best estimator lgbm's best error=0.9026
[flaml.automl.logger: 02-24 12:17:06] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 02-24 12:17:06] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.9026,	best estimator lgbm's best error=0.9026
[flaml.automl.logger: 02-24 12:17:0

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 02-24 12:17:06] {2442} INFO -  at 0.1s,	estimator sgd's best error=15.4534,	best estimator lgbm's best error=0.5425
[flaml.automl.logger: 02-24 12:17:06] {2258} INFO - iteration 4, current learner xgboost
[flaml.automl.logger: 02-24 12:17:06] {2442} INFO -  at 0.2s,	estimator xgboost's best error=0.6932,	best estimator lgbm's best error=0.5425
[flaml.automl.logger: 02-24 12:17:06] {2258} INFO - iteration 5, current learner lgbm
[flaml.automl.logger: 02-24 12:17:06] {2442} INFO -  at 0.2s,	estimator lgbm's best error=0.4872,	best estimator lgbm's best error=0.4872
[flaml.automl.logger: 02-24 12:17:06] {2258} INFO - iteration 6, current learner lgbm
[flaml.automl.logger: 02-24 12:17:06] {2442} INFO -  at 0.2s,	estimator lgbm's best error=0.4872,	best estimator lgbm's best error=0.4872
[flaml.automl.logger: 02-24 12:17:06] {2258} INFO - iteration 7, current learner lgbm
[flaml.automl.logger: 02-24 12:17:06] {2442} INFO -  at 0.2s,	estimator lgbm's best error=0.4872,	

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       10.52 GB / 12.67 GB (83.0%)
Disk Space Avail:   78.40 GB / 112.64 GB (69.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with ver

[flaml.automl.logger: 02-24 12:22:22] {1728} INFO - task = regression
[flaml.automl.logger: 02-24 12:22:22] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 02-24 12:22:22] {1838} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 02-24 12:22:22] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 02-24 12:22:22] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 02-24 12:22:22] {2393} INFO - Estimated sufficient time budget=865s. Estimated necessary time budget=7s.
[flaml.automl.logger: 02-24 12:22:22] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.7090,	best estimator lgbm's best error=0.7090
[flaml.automl.logger: 02-24 12:22:22] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 02-24 12:22:22] {2442} INFO -  at 0.2s,	estimator lgbm's best error=0.7090,	best estimator lgbm's best error=0.7090
[flaml.automl.logger: 02-24 12:22:2

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 02-24 12:22:22] {2442} INFO -  at 0.3s,	estimator sgd's best error=7.3469,	best estimator lgbm's best error=0.3914
[flaml.automl.logger: 02-24 12:22:22] {2258} INFO - iteration 4, current learner lgbm
[flaml.automl.logger: 02-24 12:22:23] {2442} INFO -  at 0.6s,	estimator lgbm's best error=0.3914,	best estimator lgbm's best error=0.3914
[flaml.automl.logger: 02-24 12:22:23] {2258} INFO - iteration 5, current learner xgboost
[flaml.automl.logger: 02-24 12:22:23] {2442} INFO -  at 0.7s,	estimator xgboost's best error=0.7126,	best estimator lgbm's best error=0.3914
[flaml.automl.logger: 02-24 12:22:23] {2258} INFO - iteration 6, current learner xgboost
[flaml.automl.logger: 02-24 12:22:23] {2442} INFO -  at 0.8s,	estimator xgboost's best error=0.7126,	best estimator lgbm's best error=0.3914
[flaml.automl.logger: 02-24 12:22:23] {2258} INFO - iteration 7, current learner extra_tree
[flaml.automl.logger: 02-24 12:22:23] {2442} INFO -  at 1.0s,	estimator extra_tree's be

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       10.52 GB / 12.67 GB (83.0%)
Disk Space Avail:   78.40 GB / 112.64 GB (69.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with ver


Training on dataset: energy_efficiency
Dimensions for training: (100, 10)
prepare_energy_efficiency_dataset executed in 0.0049 seconds



	Types of features in processed data (raw dtype, special dtypes):
		('float', [])     : 7 | ['0', '1', '2', '3', '4', ...]
		('int', ['bool']) : 2 | ['5', '7']
	0.0s = Fit runtime
	9 features in original data used to generate 9 features in processed data.
	Train Data (Processed) Memory Usage: 0.00 MB (0.0% of available memory)
Data preprocessing and feature engineering runtime = 0.06s ...
AutoGluon will gauge predictive performance using evaluation metric: 'root_mean_squared_error'
	This metric's sign has been flipped to adhere to being higher_is_better. The metric score can be multiplied by -1 to get the metric value.
	To change this, specify the eval_metric parameter of Predictor()
Automatically generating train/validation split with holdout_frac=0.2, Train Rows: 64, Val Rows: 16
User-specified model hyperparameters to be fit:
{
	'NN_TORCH': [{}],
	'GBM': [{'extra_trees': True, 'ag_args': {'name_suffix': 'XT'}}, {}, {'learning_rate': 0.03, 'num_leaves': 128, 'feature_fraction': 0.9, 

[1000]	valid_set's rmse: 1.55574
[2000]	valid_set's rmse: 1.38831
[3000]	valid_set's rmse: 1.33149
[4000]	valid_set's rmse: 1.3128
[5000]	valid_set's rmse: 1.32477


	-1.3077	 = Validation score   (-root_mean_squared_error)
	1.27s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBM ...


[1000]	valid_set's rmse: 1.64787
[2000]	valid_set's rmse: 1.47828
[3000]	valid_set's rmse: 1.3844
[4000]	valid_set's rmse: 1.30664
[5000]	valid_set's rmse: 1.23276
[6000]	valid_set's rmse: 1.24654


	-1.2247	 = Validation score   (-root_mean_squared_error)
	1.35s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-1.1917	 = Validation score   (-root_mean_squared_error)
	0.44s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: CatBoost ...
	-1.214	 = Validation score   (-root_mean_squared_error)
	33.31s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-0.965	 = Validation score   (-root_mean_squared_error)
	0.56s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-0.9966	 = Validation score   (-root_mean_squared_error)
	0.78s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: XGBoost ...
	-0.9626	 = Validation score   (-root_mean_squared_error)
	0.21s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: NeuralNetTorch ...
	-0.6409	 = Validation score   (-root_mean_squared_error)
	1.25s	 = Training   runtime
	0.0s	 = Validation runtime


[1000]	valid_set's rmse: 1.14285


	-0.5776	 = Validation score   (-root_mean_squared_error)
	0.02s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 40.59s ... Best model: WeightedEnsemble_L2 | Estimated inference throughput: 836.7 rows/s (16 batch size)
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("/content/results/autogluon")


[flaml.automl.logger: 02-24 12:28:03] {1728} INFO - task = regression
[flaml.automl.logger: 02-24 12:28:03] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 02-24 12:28:03] {1838} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 02-24 12:28:03] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 02-24 12:28:03] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 02-24 12:28:03] {2393} INFO - Estimated sufficient time budget=1329s. Estimated necessary time budget=11s.
[flaml.automl.logger: 02-24 12:28:03] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.6792,	best estimator lgbm's best error=0.6792
[flaml.automl.logger: 02-24 12:28:03] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 02-24 12:28:03] {2442} INFO -  at 0.2s,	estimator lgbm's best error=0.6792,	best estimator lgbm's best error=0.6792
[flaml.automl.logger: 02-24 12:28

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 02-24 12:28:03] {2442} INFO -  at 0.5s,	estimator sgd's best error=1.1843,	best estimator lgbm's best error=0.0298
[flaml.automl.logger: 02-24 12:28:03] {2258} INFO - iteration 9, current learner lgbm
[flaml.automl.logger: 02-24 12:28:03] {2442} INFO -  at 0.6s,	estimator lgbm's best error=0.0298,	best estimator lgbm's best error=0.0298
[flaml.automl.logger: 02-24 12:28:03] {2258} INFO - iteration 10, current learner sgd
[flaml.automl.logger: 02-24 12:28:03] {2442} INFO -  at 0.6s,	estimator sgd's best error=1.1843,	best estimator lgbm's best error=0.0298
[flaml.automl.logger: 02-24 12:28:03] {2258} INFO - iteration 11, current learner sgd
[flaml.automl.logger: 02-24 12:28:03] {2442} INFO -  at 0.6s,	estimator sgd's best error=1.1843,	best estimator lgbm's best error=0.0298
[flaml.automl.logger: 02-24 12:28:03] {2258} INFO - iteration 12, current learner lgbm
[flaml.automl.logger: 02-24 12:28:03] {2442} INFO -  at 0.7s,	estimator lgbm's best error=0.0298,	best est

Verbosity: 2 (Standard Logging)
AutoGluon Version:  1.2
Python Version:     3.11.11
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP PREEMPT_DYNAMIC Thu Jun 27 21:05:47 UTC 2024
CPU Count:          2
Memory Avail:       10.52 GB / 12.67 GB (83.0%)
Disk Space Avail:   78.39 GB / 112.64 GB (69.6%)
No presets specified! To achieve strong results with AutoGluon, it is recommended to use the available presets. Defaulting to `'medium'`...
	Recommended Presets (For more details refer to https://auto.gluon.ai/stable/tutorials/tabular/tabular-essentials.html#presets):
	presets='experimental' : New in v1.2: Pre-trained foundation model + parallel fits. The absolute best accuracy without consideration for inference speed. Does not support GPU.
	presets='best'         : Maximize accuracy. Recommended for most users. Use in competitions and benchmarks.
	presets='high'         : Strong accuracy with fast inference speed.
	presets='good'         : Good accuracy with ver


Training on dataset: cars
Dimensions for training: (100, 19)
prepare_cars_dataset executed in 0.0140 seconds



	-5099.0242	 = Validation score   (-root_mean_squared_error)
	0.0s	 = Training   runtime
	0.02s	 = Validation runtime
Fitting model: KNeighborsDist ...
	-5249.3257	 = Validation score   (-root_mean_squared_error)
	0.0s	 = Training   runtime
	0.01s	 = Validation runtime
Fitting model: LightGBMXT ...
	-6050.589	 = Validation score   (-root_mean_squared_error)
	0.65s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: LightGBM ...
	-6328.029	 = Validation score   (-root_mean_squared_error)
	0.63s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-1676.1769	 = Validation score   (-root_mean_squared_error)
	0.5s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: CatBoost ...
	-1080.5898	 = Validation score   (-root_mean_squared_error)
	1.89s	 = Training   runtime
	0.0s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-1498.0235	 = Validation score   (-root_mean_squared_error)
	0.46s	 = Training   runtime
	0.04s	 = Valida

[flaml.automl.logger: 02-24 12:33:39] {1728} INFO - task = regression
[flaml.automl.logger: 02-24 12:33:39] {1739} INFO - Evaluation method: cv
[flaml.automl.logger: 02-24 12:33:39] {1838} INFO - Minimizing error metric: 1-r2
[flaml.automl.logger: 02-24 12:33:39] {1955} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'sgd', 'catboost']
[flaml.automl.logger: 02-24 12:33:39] {2258} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 02-24 12:33:39] {2393} INFO - Estimated sufficient time budget=345s. Estimated necessary time budget=3s.
[flaml.automl.logger: 02-24 12:33:39] {2442} INFO -  at 0.0s,	estimator lgbm's best error=0.9441,	best estimator lgbm's best error=0.9441
[flaml.automl.logger: 02-24 12:33:39] {2258} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 02-24 12:33:39] {2442} INFO -  at 0.1s,	estimator lgbm's best error=0.9441,	best estimator lgbm's best error=0.9441
[flaml.automl.logger: 02-24 12:33:3

INFO:flaml.tune.searcher.blendsearch:No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


[flaml.automl.logger: 02-24 12:33:40] {2442} INFO -  at 0.2s,	estimator sgd's best error=11.7393,	best estimator lgbm's best error=0.9441
[flaml.automl.logger: 02-24 12:33:40] {2258} INFO - iteration 3, current learner lgbm
[flaml.automl.logger: 02-24 12:33:40] {2442} INFO -  at 0.3s,	estimator lgbm's best error=0.3392,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 02-24 12:33:40] {2258} INFO - iteration 4, current learner xgboost
[flaml.automl.logger: 02-24 12:33:40] {2442} INFO -  at 0.4s,	estimator xgboost's best error=0.5869,	best estimator lgbm's best error=0.3392
[flaml.automl.logger: 02-24 12:33:40] {2258} INFO - iteration 5, current learner extra_tree
[flaml.automl.logger: 02-24 12:33:40] {2442} INFO -  at 0.5s,	estimator extra_tree's best error=0.2012,	best estimator extra_tree's best error=0.2012
[flaml.automl.logger: 02-24 12:33:40] {2258} INFO - iteration 6, current learner rf
[flaml.automl.logger: 02-24 12:33:40] {2442} INFO -  at 0.7s,	estimator rf's best 