In [2]:
!pip install mlflow optuna

Collecting mlflow
  Downloading mlflow-2.13.1-py3-none-any.whl (25.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m25.0/25.0 MB[0m [31m37.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting optuna
  Downloading optuna-3.6.1-py3-none-any.whl (380 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m380.1/380.1 kB[0m [31m26.6 MB/s[0m eta [36m0:00:00[0m
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.1-py3-none-any.whl (233 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m233.4/233.4 kB[0m [31m20.2 MB/s[0m eta [36m0:00:00[0m
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl (147 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m147.8/147.8 kB[0m [31m15.9 MB/s[0m eta [36m0:00:00[0m
Collecting gitpython<4,>=3.1.9 (from mlflow)
  Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m

In [3]:
import mlflow
import mlflow.sklearn
import optuna
from sklearn.metrics import mean_squared_error

import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from xgboost import XGBRegressor

подключаем диск

In [24]:
from google.colab import drive
drive.mount('/content/drive/')
%cd /content/drive/MyDrive

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
/content/drive/MyDrive


Скачайте датасет, выберите столбец Price в качестве целевого, удалите его из X, затем сделайте сплит.

In [25]:
df = pd.read_csv('/content/drive/MyDrive/fourthsemestr/pipeline/Laptop_price.csv')
X = df.drop('Price', axis=1)
y = df['Price']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=11)

Создайте переменные для категориальных и количественных типов данных и запишите в них данные.

In [26]:
categoricalFeatures = X.select_dtypes(include=['object']).columns.tolist()
numericalFeatures = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

Далее создайте пайплайн

In [27]:
numericalTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler())
])

categoricalTransformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

In [28]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numericalTransformer, numericalFeatures),
        ('cat', categoricalTransformer, categoricalFeatures)
    ])

In [29]:
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', XGBRegressor(random_state=42))
])

Использование Optuna для подбора гиперпараметров

In [30]:
def objective(trial):
    params = {
        'model__n_estimators': trial.suggest_int('model__n_estimators', 50, 200),
        'model__learning_rate': trial.suggest_float('model__learning_rate', 0.01, 0.2),
        'model__max_depth': trial.suggest_int('model__max_depth', 3, 7),
        'model__gamma': trial.suggest_float('model__gamma', 0, 0.2),
        'model__subsample': trial.suggest_float('model__subsample', 0.8, 1.0)
    }

    pipeline.set_params(**params)
    pipeline.fit(X_train, y_train)
    preds = pipeline.predict(X_test)
    mse = mean_squared_error(y_test, preds)

    return mse

study = optuna.create_study(direction='minimize')
study.optimize(objective, n_trials=50)

bestTrial = study.best_trial
bestParams = bestTrial.params

print("Лучшие параметры: ", bestParams)

[I 2024-06-03 23:40:18,237] A new study created in memory with name: no-name-3571df2d-e8e7-4da0-9165-9ada8aad6e93
[I 2024-06-03 23:40:18,389] Trial 0 finished with value: 40534.43828866106 and parameters: {'model__n_estimators': 193, 'model__learning_rate': 0.04733555284067461, 'model__max_depth': 4, 'model__gamma': 0.039344082181617474, 'model__subsample': 0.931510083036462}. Best is trial 0 with value: 40534.43828866106.
[I 2024-06-03 23:40:18,570] Trial 1 finished with value: 46761.34937157814 and parameters: {'model__n_estimators': 141, 'model__learning_rate': 0.12225138641853722, 'model__max_depth': 6, 'model__gamma': 0.013137925923389094, 'model__subsample': 0.8118165398817139}. Best is trial 0 with value: 40534.43828866106.
[I 2024-06-03 23:40:18,744] Trial 2 finished with value: 41212.13149702245 and parameters: {'model__n_estimators': 105, 'model__learning_rate': 0.13158871741122063, 'model__max_depth': 4, 'model__gamma': 0.0016758780191741041, 'model__subsample': 0.8988589157

Лучшие параметры:  {'model__n_estimators': 93, 'model__learning_rate': 0.08703780540336377, 'model__max_depth': 3, 'model__gamma': 0.14259393914227161, 'model__subsample': 0.8616945568272715}


Применение этих параметров

In [31]:
pipeline.set_params(**bestParams)
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
mse = mean_squared_error(y_test, preds)

print(f"Mean Squared Error: {mse}")

Mean Squared Error: 38284.71449160577


In [32]:
mlflow.set_experiment("XGBRegressor Optimization with Optuna")

with mlflow.start_run():
    mlflow.log_params(bestParams)
    mlflow.log_metric("mse", mse)
    mlflow.sklearn.log_model(pipeline, "model")

print("Модель и результаты сохранены в MLflow")

Модель и результаты сохранены в MLflow
