# Препроцессинг

In [101]:
import numpy as np
import pandas as pd
import re
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score
import optuna
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import cross_val_score
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor

In [83]:
def convert_to_number(val):
    if pd.isna(val):
        return np.nan
    val = str(val).replace(' ', '')  # удаляем пробелы
    # обработка значений '<число'
    if val.startswith('<'):
        num = re.findall(r'<(\d+\.?\d*)', val)
        return float(num[0]) if num else np.nan
    # обработка значений с ±
    elif '±' in val:
        nums = re.findall(r'([\d\.]+)±([\d\.]+)', val)
        if nums:
            main, uncertainty = nums[0]
            return float(main)  # берём только среднее (первое число)
        else:
            return np.nan
    # обработка значений с '/'
    elif '/' in val:
        nums = re.findall(r'([\d\.]+)/([\d\.]+)', val)
        if nums:
            num1, num2 = nums[0]
            return (float(num1) + float(num2)) / 2
        else:
            return np.nan
    # пробуем просто преобразовать в число
    else:
        try:
            return float(val)
        except:
            return np.nan

In [84]:
# Load dataset
df = pd.read_csv("for_regr_descriptors_full.csv")
df['raw_efficiency'] = df['raw_efficiency'].apply(convert_to_number)

# Load embeddings
blomap_embeddings = np.load("blomap_regr.npy")
fingerprints_embeddings = np.load("fingerprints_regr.npy")
protbert_embeddings = np.load("protbert_regr.npy")

In [85]:
# Select numerical features
fp_path_index = df.columns.get_loc('fp_path')
selected_features = ['raw_efficiency', 'uptake_type'] + list(df.columns[fp_path_index + 1:])

X_numerical = df[selected_features].copy()

In [86]:
# One-hot encoding for cell_line
if "cell_line" in df.columns:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cell_line_encoded = enc.fit_transform(df[['cell_line']])
    cell_line_feature_names = enc.get_feature_names_out(["cell_line"])
    X_cell_line = pd.DataFrame(cell_line_encoded, columns=cell_line_feature_names)

In [87]:
# Функция для удаления выбросов методом IQR
def remove_outliers(df, target_column):
    """
    Удаляет выбросы из числовых колонок методом межквартильного размаха (IQR).

    Аргументы:
    df — pandas DataFrame с числовыми признаками.
    target_column — название столбца с таргетом (raw_efficiency).

    Возвращает:
    Очищенный DataFrame без выбросов.
    """
    df_clean = df.copy()

    Q1 = df_clean['raw_efficiency'].quantile(0.25)
    Q3 = df_clean['raw_efficiency'].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df_clean = df_clean[(df_clean['raw_efficiency'] >= lower_bound) & (df_clean['raw_efficiency'] <= upper_bound)]

    return df_clean

In [88]:
# Фильтруем датафрейм по столбцу 'uptake_type'
X_numerical_filtered = X_numerical[X_numerical['uptake_type'].isin(['Mean Fluorescence intensity', 'Fluorescence intensity'])].copy()
# X_numerical_filtered = X_numerical_filtered[X_numerical_filtered['raw_efficiency'] <= 50000]

# Удаляем выбросы из числовых признаков
X_numerical_filtered_no_outliers = remove_outliers(X_numerical_filtered, 'raw_efficiency')

# Проверяем размер до и после удаления выбросов
print(f"Размер данных ДО удаления выбросов: {X_numerical_filtered.shape}")
print(f"Размер данных ПОСЛЕ удаления выбросов: {X_numerical_filtered_no_outliers.shape}")

# Получаем индексы отфильтрованных молекул
filtered_indices = X_numerical_filtered_no_outliers.index

# Фильтруем массивы эмбеддингов
blomap_embeddings_filtered = blomap_embeddings[filtered_indices]
fingerprints_embeddings_filtered = fingerprints_embeddings[filtered_indices]
protbert_embeddings_filtered = protbert_embeddings[filtered_indices]
X_cell_line_filtered = X_cell_line.loc[filtered_indices]

Размер данных ДО удаления выбросов: (312, 225)
Размер данных ПОСЛЕ удаления выбросов: (268, 225)


In [89]:
target = 'raw_efficiency'
rdkit_descriptors = X_numerical_filtered_no_outliers.drop(columns=['uptake_type', target])

In [90]:
imputer = SimpleImputer(strategy="mean")
rdkit_descriptors = pd.DataFrame(imputer.fit_transform(rdkit_descriptors))

In [91]:
rdkit_descriptors = rdkit_descriptors.reset_index(drop=True)
blomap_embeddings_filtered = pd.DataFrame(blomap_embeddings_filtered).reset_index(drop=True)
fingerprints_embeddings_filtered = pd.DataFrame(fingerprints_embeddings_filtered).reset_index(drop=True)
protbert_embeddings_filtered = pd.DataFrame(protbert_embeddings_filtered).reset_index(drop=True)
X_cell_line_filtered = pd.DataFrame(X_cell_line_filtered).reset_index(drop=True)

In [92]:
list_of_dfs = [
rdkit_descriptors,
blomap_embeddings_filtered,
fingerprints_embeddings_filtered,
protbert_embeddings_filtered
]

# Объединяем датафреймы из списка по столбцам
combined_df_concat = pd.concat(list_of_dfs, axis=1)

print("Успешно объединенный датафрейм (pd.concat):")
print(combined_df_concat)
print("\nИнформация об объединенном датафрейме:")
combined_df_concat.info()

Успешно объединенный датафрейм (pd.concat):
          0         1          2          3         4         5         6     \
0    3151.8362 -1.492308  11.608322  12.577710  0.483758  0.192308  1.021711   
1    2785.1939 -1.272727  11.839377   5.637233  0.256238  0.136364  1.000826   
2    2078.3658 -1.123529  11.824485   3.657547  0.215150  0.000000  0.983251   
3    3208.5313 -0.066667   4.139095  -2.395473 -0.079849  0.100000  0.993229   
4    2624.9800  0.100000   4.783081  -1.396580 -0.053715  0.000000  0.999597   
..         ...       ...        ...        ...       ...       ...       ...   
263   424.5177  0.266667   8.249713   0.532924  0.177641  0.333333  0.996992   
264   390.5015  0.833333   8.249713   0.532924  0.177641  0.000000  0.996992   
265    89.0932  1.800000   5.570017  -0.392198 -0.392198  0.000000  0.996992   
266  3712.9716 -3.611538  11.999968   8.179901  0.314612  0.000000  0.967821   
267  3712.9716 -3.611538  11.999968   8.179901  0.314612  0.000000  0.967821

In [106]:
X = pd.concat([rdkit_descriptors, X_cell_line_filtered], axis=1)
y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

In [45]:
# Вычисляем корреляции каждого признака с целевой переменной
correlations = X.apply(lambda col: col.corr(y))
# Фильтруем признаки по модулю корреляции
selected_features = correlations[correlations.abs() >= 0.2].index
# Оставляем только отобранные признаки в X
X_corr = X[selected_features]
# Выводим результат (по желанию)
print("Оставленные признаки:", len(list(selected_features)), list(selected_features))

Оставленные признаки: 26 [5, 7, 18, 31, 34, 37, 38, 68, 87, 94, 109, 114, 115, 118, 123, 124, 133, 134, 158, 160, 174, 202, 209, 'cell_line_CHO cells', 'cell_line_Cal 27', 'cell_line_NIH-3T3 cells']


  c /= stddev[:, None]
  c /= stddev[None, :]


In [107]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [108]:
def apply_scaler (train, test):
    train.columns = train.columns.astype(str)
    test.columns = test.columns.astype(str)

    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = pd.DataFrame(scaler.fit_transform(train))
    test_scaled = pd.DataFrame(scaler.transform(test))
    return train_scaled, test_scaled

In [96]:
def apply_pca (X_train, X_test, threshold=0.95):
    pca = PCA(n_components=threshold, svd_solver='full')
    train_transformed = pd.DataFrame(pca.fit_transform(X_train))
    test_transformed = pd.DataFrame(pca.transform(X_test))
    dispersion=round((sum(pca.explained_variance_ratio_)*100),2)
    number_of_comp = pca.n_components_
    print(number_of_comp,dispersion)

    return train_transformed, test_transformed

In [109]:
X_train_scaled, X_test_scaled = apply_scaler (X_train, X_test)

In [110]:
X_train_transformed, X_test_transformed = apply_pca (X_train_scaled, X_test_scaled)

29 95.28


## Describe

In [18]:
y_train.describe()

count    240.000000
mean       5.263885
std        2.535125
min        0.000000
25%        3.179944
50%        5.354222
75%        7.191704
max       10.732781
Name: raw_efficiency, dtype: float64

In [19]:
y_test.describe()

count    61.000000
mean      5.693010
std       2.559058
min       0.000000
25%       3.713572
50%       6.274762
75%       7.650169
max      10.070738
Name: raw_efficiency, dtype: float64

In [20]:
X_train_transformed.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
count,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,...,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0,240.0
mean,-1.480297e-17,1.110223e-16,5.921189000000001e-17,-5.551115e-18,-2.312965e-18,5.551115e-18,-1.480297e-17,-1.050086e-16,-3.23815e-17,5.1810410000000004e-17,...,-6.938894e-18,3.4694470000000005e-17,2.2204460000000003e-17,3.515706e-17,-3.7932620000000004e-17,-2.2435760000000003e-17,-3.851086e-17,-2.312965e-17,-1.480297e-17,2.6830390000000003e-17
std,1.272786,0.8186397,0.7376091,0.5797919,0.5195548,0.4817326,0.4394338,0.4151679,0.3866189,0.3497617,...,0.182031,0.1768045,0.1649148,0.1535916,0.1521741,0.1477393,0.1413964,0.1388675,0.1348582,0.1280971
min,-1.96544,-1.379869,-2.864736,-1.686678,-1.405151,-2.585827,-1.129805,-0.9184562,-1.413026,-1.096961,...,-0.6527295,-0.6869134,-0.4855656,-0.4161874,-0.4479006,-0.6320621,-0.4223442,-0.3943208,-0.5615375,-0.3236971
25%,-0.8618459,-0.8219735,-0.2332418,-0.3275711,-0.2466079,-0.174987,-0.2481082,-0.2719088,-0.17008,-0.1637774,...,-0.1065281,-0.06893674,-0.09215177,-0.07090407,-0.06426667,-0.06689714,-0.0751658,-0.0633355,-0.07163601,-0.08053163
50%,-0.1773972,0.05699123,0.07311377,0.1218503,-0.03167169,0.008145083,-0.0002538494,-0.01635784,-0.009801625,-0.05251453,...,-0.006726567,-0.002213239,-0.02267681,-0.004670004,0.01262641,0.001057987,-0.003116924,-0.01551345,0.002177639,-0.007635684
75%,0.4316677,0.7688228,0.4255782,0.3018353,0.1754976,0.1868054,0.2780078,0.2646072,0.1519039,0.1450116,...,0.08635289,0.08054209,0.07386376,0.07731287,0.07334667,0.05460041,0.06505729,0.0445758,0.06900418,0.07286904
max,6.289197,1.704504,1.800595,1.637605,1.900389,1.663029,1.617632,0.9819365,0.817667,1.108351,...,0.4892399,0.4924286,0.6584128,0.7565124,0.3814527,0.4342774,0.5933998,0.8514134,0.4089468,0.5327979


In [21]:
X_test_transformed.describe()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,19,20,21,22,23,24,25,26,27,28
count,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,...,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0,61.0
mean,-0.12162,-0.132343,-0.09751,-0.04092,0.016798,0.023605,0.153955,0.013409,-0.065572,0.014037,...,-0.005066,-0.023699,-0.042413,-0.008293,-0.028573,-0.022994,-0.036142,0.02737,-0.007746,-0.007914
std,1.002221,0.820522,0.597391,0.72452,0.491218,0.568724,0.577743,0.389276,0.513935,0.377261,...,0.230022,0.220255,0.133789,0.138368,0.188349,0.133305,0.18393,0.148096,0.167668,0.150598
min,-2.244204,-1.385896,-1.738986,-2.206963,-1.106289,-2.161688,-0.835065,-0.754857,-2.082711,-1.463621,...,-0.68239,-0.573973,-0.484112,-0.376816,-0.505614,-0.335953,-0.587269,-0.225886,-0.736731,-0.42148
25%,-0.772537,-0.847,-0.415368,-0.316927,-0.348895,-0.184905,-0.177802,-0.278426,-0.190907,-0.161895,...,-0.127145,-0.118033,-0.106933,-0.090636,-0.140948,-0.102766,-0.133699,-0.060107,-0.066284,-0.109115
50%,-0.435549,-0.390371,-0.096181,0.077986,-0.027697,0.058828,0.068481,0.008451,-0.028296,0.037461,...,-0.006666,-0.007286,-0.058839,-0.010459,-0.003165,-0.011713,-0.029561,0.013217,-0.005586,0.000167
75%,0.331,0.381126,0.228356,0.296788,0.191613,0.216392,0.33223,0.200616,0.139683,0.201162,...,0.117236,0.089889,0.040441,0.070244,0.094913,0.060823,0.043551,0.082175,0.064995,0.085054
max,2.805501,1.822887,1.50323,1.462923,1.512882,2.146285,2.444992,0.756069,0.830455,0.900849,...,0.44896,0.400708,0.36166,0.281639,0.393675,0.334128,0.509213,0.669058,0.415241,0.343931


# Облучение

In [24]:
from sklearn.svm import SVR

In [None]:
from sklearn.inspection import permutation_importance

In [67]:
# Определяем функцию для оценки модели
def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_r2 = r2_score(y_train, y_pred_train)
    train_rmse = mean_squared_error(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_rmse = mean_squared_error(y_test, y_pred_test)

    print(f'Train RMSE: {train_rmse:.4f}')
    print(f'Train R2: {train_r2:.4f}')
    print(f'Test RMSE: {test_rmse:.4f}')
    print(f'Test R2: {test_r2:.4f}')

In [68]:
# Инициализируем модель SVR и оцениваем её
def svr_learning(X_train, y_train, X_test, y_test):
    model = SVR()
    model.fit(X_train, y_train)
    evaluate_model(model, X_train, y_train, X_test, y_test)

In [117]:
def svr_optuna(X_train, y_train, X_test, y_test):
    # Целевая функция для оптимизации
    def objective(trial):
        params = {
            "C": trial.suggest_float("C", 1, 100, log=True),
            "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True),
            'kernel': trial.suggest_categorical('kernel', ['rbf'])
        }
        model = SVR(**params)
        score = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1
        )
        return score.mean()

    # Создание и оптимизация исследования
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    print("Лучшие гиперпараметры:", study.best_params)
    print("Лучшая средняя ошибка (CV):", -study.best_value)

    # Обучаем модель с лучшими гиперпараметрами
    best_model = SVR(**study.best_params)
    best_model.fit(X_train, y_train)

    # Оцениваем модель (предположим, у тебя есть функция evaluate_model)
    evaluate_model(best_model, X_train, y_train, X_test, y_test)

    result = permutation_importance(
    best_model, X_test, y_test, n_repeats=10, random_state=42, scoring='neg_root_mean_squared_error'
    )

    # Вывод важности признаков
    feature_importance = pd.DataFrame({
        'feature': X_test.columns,
        'importance_mean': result.importances_mean,
        'importance_std': result.importances_std
    }).sort_values(by='importance_mean', ascending=False)

    print(feature_importance)    
    # Сортируем по убыванию важности
    feature_importance_sorted = feature_importance.reset_index(drop=True)
    total_importance = feature_importance_sorted['importance_mean'].sum()
    feature_importance_sorted['cumulative_importance'] = feature_importance_sorted['importance_mean'].cumsum() / total_importance

    # Оставляем признаки, дающие в сумме до 80% важности
    top_features = feature_importance_sorted[feature_importance_sorted['cumulative_importance'] <= 0.8]

    # Если последний признак недобрал до 80%, добавим ещё один
    if top_features['cumulative_importance'].iloc[-1] < 0.95:
        top_features = feature_importance_sorted.iloc[:len(top_features)+1]

    print("Признаки, дающие 80% важности:")
    print(top_features[['feature', 'importance_mean', 'cumulative_importance']])

    # Формируем новые обучающие и тестовые выборки с отобранными признаками
    selected_features = top_features['feature'].tolist()
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Переобучаем модель на отобранных признаках
    final_model = SVR(**study.best_params)
    final_model.fit(X_train_selected, y_train)

    # Финальная оценка модели
    print("\nОценка модели на отобранных признаках:")
    evaluate_model(final_model, X_train_selected, y_train, X_test_selected, y_test)

    return final_model

In [118]:
model = svr_optuna(X_train_scaled, y_train, X_test_scaled, y_test)

[I 2025-04-17 02:58:27,128] A new study created in memory with name: no-name-348041db-0f6f-4674-83a1-a4bface5c410
[I 2025-04-17 02:58:27,166] Trial 0 finished with value: -1.8253670864409735 and parameters: {'C': 1.1056522156186428, 'epsilon': 0.012481763333814268, 'kernel': 'rbf'}. Best is trial 0 with value: -1.8253670864409735.
[I 2025-04-17 02:58:27,204] Trial 1 finished with value: -1.7971550900078932 and parameters: {'C': 1.8263021090800395, 'epsilon': 0.0018682593248433692, 'kernel': 'rbf'}. Best is trial 1 with value: -1.7971550900078932.
[I 2025-04-17 02:58:27,231] Trial 2 finished with value: -1.7229917189030395 and parameters: {'C': 4.184431517014224, 'epsilon': 0.4788796640653082, 'kernel': 'rbf'}. Best is trial 2 with value: -1.7229917189030395.
[I 2025-04-17 02:58:27,269] Trial 3 finished with value: -1.7917626752399354 and parameters: {'C': 1.8767421505662099, 'epsilon': 0.03673113748407373, 'kernel': 'rbf'}. Best is trial 2 with value: -1.7229917189030395.
[I 2025-04-17

Лучшие гиперпараметры: {'C': 15.077242047189756, 'epsilon': 0.6621827961314292, 'kernel': 'rbf'}
Лучшая средняя ошибка (CV): 1.6626057525623879
Train RMSE: 0.9115
Train R2: 0.8136
Test RMSE: 2.7528
Test R2: 0.5012
     feature  importance_mean  importance_std
237      237         0.187030        0.069598
161      161         0.177170        0.065562
266      266         0.095257        0.053384
221      221         0.053500        0.037485
241      241         0.044124        0.031379
..       ...              ...             ...
267      267        -0.004994        0.003610
104      104        -0.005973        0.007803
6          6        -0.013388        0.010300
279      279        -0.028463        0.027678
240      240        -0.045952        0.012682

[318 rows x 3 columns]
Признаки, дающие 80% важности:
    feature  importance_mean  cumulative_importance
0       237         0.187030               0.198176
1       161         0.177170               0.385904
2       266         0.0

In [113]:
def run_model_with_optuna(model_name, X_train, y_train, X_test, y_test):

    def objective(trial):
        if model_name == "svm":
            params = {
                "C": trial.suggest_float("C", 1e-1, 100, log=True),
                "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True),
                "kernel": trial.suggest_categorical("kernel", ["rbf"])
            }
            model = SVR(**params)

        elif model_name == "sgd":
            params = {
                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
                "penalty": trial.suggest_categorical("penalty", ["l2", "elasticnet"]),
                "max_iter": 1000
            }
            model = SGDRegressor(**params)

        elif model_name == "knn":
            params = {
                "n_neighbors": trial.suggest_int("n_neighbors", 3, 20),
                "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
            }
            model = KNeighborsRegressor(**params)

        elif model_name == "dt":
            params = {
                "max_depth": trial.suggest_int("max_depth", 2, 20),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10)
            }
            model = DecisionTreeRegressor(**params)

        elif model_name == "rf":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 20),
            }
            model = RandomForestRegressor(**params)

        elif model_name == "et":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 20),
            }
            model = ExtraTreesRegressor(**params)

        elif model_name == "xgb":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            }
            model = XGBRegressor(**params, objective='reg:squarederror', verbosity=0)

        elif model_name == "lgbm":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", -1, 20),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            }
            model = LGBMRegressor(**params)

        elif model_name == "mlp":
            params = {
                "hidden_layer_sizes": trial.suggest_categorical("hidden_layer_sizes", [(64,), (128,), (64, 64)]),
                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
                "learning_rate_init": trial.suggest_float("learning_rate_init", 1e-4, 1e-2),
                "max_iter": 1000
            }
            model = MLPRegressor(**params)

        else:
            raise ValueError(f"Unknown model: {model_name}")

        # Кросс-валидация
        score = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1
        )
        return score.mean()

    print(f"\n🔍 Оптимизация модели: {model_name}")
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)

    print("🏆 Лучшие гиперпараметры:", study.best_params)
    print("📉 Лучшая средняя ошибка (CV):", -study.best_value)

    # Обучаем модель с лучшими параметрами

    model_classes = {
        "svm": SVR,
        "sgd": SGDRegressor,
        "knn": KNeighborsRegressor,
        "dt": DecisionTreeRegressor,
        "rf": RandomForestRegressor,
        "et": ExtraTreesRegressor,
        "xgb": XGBRegressor,
        "lgbm": LGBMRegressor,
        "mlp": MLPRegressor
    }
    best_model = model_classes[model_name](**study.best_params)

    best_model.fit(X_train, y_train)

    # 🔍 Оценка модели
    print("📊 Оценка модели на train/test:")
    evaluate_model(best_model, X_train, y_train, X_test, y_test)


In [114]:
for model in ["lgbm"]:
    run_model_with_optuna(model, X_train_transformed, y_train, X_test_transformed, y_test)

[I 2025-04-17 02:48:15,649] A new study created in memory with name: no-name-df10ada3-c973-4a8c-8442-1a9857f52a3d
[I 2025-04-17 02:48:15,821] Trial 0 finished with value: -1.6960896423980512 and parameters: {'n_estimators': 226, 'max_depth': 19, 'learning_rate': 0.16344993638147193, 'num_leaves': 122, 'subsample': 0.5216219378836007, 'colsample_bytree': 0.6092546502178895}. Best is trial 0 with value: -1.6960896423980512.



🔍 Оптимизация модели: lgbm


[I 2025-04-17 02:48:15,939] Trial 1 finished with value: -1.7828403799838721 and parameters: {'n_estimators': 125, 'max_depth': 8, 'learning_rate': 0.23183321761462347, 'num_leaves': 36, 'subsample': 0.6167854428141397, 'colsample_bytree': 0.5474085100679746}. Best is trial 0 with value: -1.6960896423980512.
[I 2025-04-17 02:48:16,068] Trial 2 finished with value: -1.654327358506092 and parameters: {'n_estimators': 141, 'max_depth': 13, 'learning_rate': 0.12245592721145833, 'num_leaves': 89, 'subsample': 0.9229162404440199, 'colsample_bytree': 0.6833690768132368}. Best is trial 2 with value: -1.654327358506092.
[I 2025-04-17 02:48:16,162] Trial 3 finished with value: -1.7479825187568578 and parameters: {'n_estimators': 95, 'max_depth': 17, 'learning_rate': 0.2742393313169498, 'num_leaves': 44, 'subsample': 0.7478510679912089, 'colsample_bytree': 0.5495123166150259}. Best is trial 2 with value: -1.654327358506092.
[I 2025-04-17 02:48:16,312] Trial 4 finished with value: -1.6840289838842

🏆 Лучшие гиперпараметры: {'n_estimators': 152, 'max_depth': 11, 'learning_rate': 0.07759053090052767, 'num_leaves': 50, 'subsample': 0.9318155523102657, 'colsample_bytree': 0.7137131727645704}
📉 Лучшая средняя ошибка (CV): 1.6518158163223498
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2117
[LightGBM] [Info] Number of data points in the train set: 214, number of used features: 29
[LightGBM] [Info] Start training from score 4.786516
📊 Оценка модели на train/test:
Train RMSE: 0.0997
Train R2: 0.9796
Test RMSE: 3.4065
Test R2: 0.3828


In [None]:
for model in ["svm", "sgd", "knn", "dt", "rf", "et", "xgb", "lgbm", "mlp"]:
    run_model_with_optuna(model, X_train_transformed, y_train, X_test_transformed, y_test)

[I 2025-04-17 02:23:57,001] A new study created in memory with name: no-name-7d0ef6f1-379c-414d-a0a3-852cb871fa5c



🔍 Оптимизация модели: svm


[I 2025-04-17 02:24:01,299] Trial 0 finished with value: -1.9704878188372874 and parameters: {'C': 0.17126744015861523, 'epsilon': 0.0338872491502521, 'kernel': 'rbf'}. Best is trial 0 with value: -1.9704878188372874.
[I 2025-04-17 02:24:03,045] Trial 1 finished with value: -1.7850185622083377 and parameters: {'C': 46.61337700036316, 'epsilon': 0.07289497194479191, 'kernel': 'rbf'}. Best is trial 1 with value: -1.7850185622083377.
[I 2025-04-17 02:24:04,827] Trial 2 finished with value: -1.7608667663395896 and parameters: {'C': 7.08555392387036, 'epsilon': 0.0037180610940864955, 'kernel': 'rbf'}. Best is trial 2 with value: -1.7608667663395896.
[I 2025-04-17 02:24:06,716] Trial 3 finished with value: -1.7938288964420437 and parameters: {'C': 56.01148707188686, 'epsilon': 0.03502153285287117, 'kernel': 'rbf'}. Best is trial 2 with value: -1.7608667663395896.
[I 2025-04-17 02:24:06,757] Trial 4 finished with value: -1.7948116559311296 and parameters: {'C': 40.3394533348504, 'epsilon': 0.

🏆 Лучшие гиперпараметры: {'C': 5.733136090860661, 'epsilon': 0.7571120107141919, 'kernel': 'rbf'}
📉 Лучшая средняя ошибка (CV): 1.7078269261764987
📊 Оценка модели на train/test:
Train RMSE: 0.6463
Train R2: 0.8678
Test RMSE: 3.2340
Test R2: 0.4141

🔍 Оптимизация модели: sgd


[I 2025-04-17 02:24:11,329] Trial 6 finished with value: -3.3252180430216733 and parameters: {'alpha': 0.00022694098586585574, 'penalty': 'l2'}. Best is trial 6 with value: -3.3252180430216733.
[I 2025-04-17 02:24:11,360] Trial 7 finished with value: -5.3268465977483315 and parameters: {'alpha': 0.0011040026707761568, 'penalty': 'elasticnet'}. Best is trial 6 with value: -3.3252180430216733.
[I 2025-04-17 02:24:11,392] Trial 8 finished with value: -4.632746471069512 and parameters: {'alpha': 5.724240741297575e-05, 'penalty': 'elasticnet'}. Best is trial 6 with value: -3.3252180430216733.
[I 2025-04-17 02:24:11,422] Trial 9 finished with value: -5.126750882395365 and parameters: {'alpha': 0.008066547453117535, 'penalty': 'elasticnet'}. Best is trial 6 with value: -3.3252180430216733.
[I 2025-04-17 02:24:11,461] Trial 10 finished with value: -3.7000249964335303 and parameters: {'alpha': 1.5658442248230646e-05, 'penalty': 'elasticnet'}. Best is trial 6 with value: -3.3252180430216733.
[I 

🏆 Лучшие гиперпараметры: {'alpha': 0.04898901289127776, 'penalty': 'l2'}
📉 Лучшая средняя ошибка (CV): 2.1677238590961005
📊 Оценка модели на train/test:
Train RMSE: 72.2007
Train R2: -13.7657
Test RMSE: 4.3826
Test R2: 0.2059

🔍 Оптимизация модели: knn


[I 2025-04-17 02:24:15,255] Trial 0 finished with value: -1.9723820421989458 and parameters: {'n_neighbors': 12, 'weights': 'distance'}. Best is trial 0 with value: -1.9723820421989458.
[I 2025-04-17 02:24:15,502] Trial 1 finished with value: -1.9623771495984847 and parameters: {'n_neighbors': 20, 'weights': 'distance'}. Best is trial 1 with value: -1.9623771495984847.
[I 2025-04-17 02:24:15,736] Trial 2 finished with value: -2.0096887540988395 and parameters: {'n_neighbors': 17, 'weights': 'uniform'}. Best is trial 1 with value: -1.9623771495984847.
[I 2025-04-17 02:24:15,901] Trial 3 finished with value: -1.9623771495984847 and parameters: {'n_neighbors': 20, 'weights': 'distance'}. Best is trial 1 with value: -1.9623771495984847.
[I 2025-04-17 02:24:15,929] Trial 4 finished with value: -1.9752694795474846 and parameters: {'n_neighbors': 14, 'weights': 'distance'}. Best is trial 1 with value: -1.9623771495984847.
[I 2025-04-17 02:24:15,959] Trial 5 finished with value: -1.96416098243

🏆 Лучшие гиперпараметры: {'n_neighbors': 3, 'weights': 'distance'}
📉 Лучшая средняя ошибка (CV): 1.8668547172222734
📊 Оценка модели на train/test:
Train RMSE: 0.0000
Train R2: 1.0000
Test RMSE: 3.3282
Test R2: 0.3970

🔍 Оптимизация модели: dt


[I 2025-04-17 02:24:19,843] Trial 2 finished with value: -2.4304805524276025 and parameters: {'max_depth': 11, 'min_samples_split': 3}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:19,894] Trial 3 finished with value: -2.4916752895021994 and parameters: {'max_depth': 14, 'min_samples_split': 5}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:19,933] Trial 4 finished with value: -2.432850043273622 and parameters: {'max_depth': 9, 'min_samples_split': 8}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:19,972] Trial 5 finished with value: -2.4308768544035395 and parameters: {'max_depth': 16, 'min_samples_split': 6}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:20,001] Trial 6 finished with value: -2.4627234742088 and parameters: {'max_depth': 14, 'min_samples_split': 10}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:20,040] Trial 7 finished with value: -2.4406904805649043 an

🏆 Лучшие гиперпараметры: {'max_depth': 3, 'min_samples_split': 2}
📉 Лучшая средняя ошибка (CV): 2.0136835876777144
📊 Оценка модели на train/test:
Train RMSE: 2.7539
Train R2: 0.4368
Test RMSE: 5.7947
Test R2: -0.0499

🔍 Оптимизация модели: rf


[I 2025-04-17 02:24:25,198] Trial 0 finished with value: -1.883224555323904 and parameters: {'n_estimators': 101, 'max_depth': 4}. Best is trial 0 with value: -1.883224555323904.
[I 2025-04-17 02:24:26,165] Trial 1 finished with value: -1.8867020231198754 and parameters: {'n_estimators': 149, 'max_depth': 4}. Best is trial 0 with value: -1.883224555323904.
[I 2025-04-17 02:24:28,078] Trial 2 finished with value: -1.8757950766597116 and parameters: {'n_estimators': 252, 'max_depth': 6}. Best is trial 2 with value: -1.8757950766597116.
[I 2025-04-17 02:24:30,437] Trial 3 finished with value: -1.869465220104918 and parameters: {'n_estimators': 261, 'max_depth': 17}. Best is trial 3 with value: -1.869465220104918.
[I 2025-04-17 02:24:32,082] Trial 4 finished with value: -1.868020648187586 and parameters: {'n_estimators': 186, 'max_depth': 11}. Best is trial 4 with value: -1.868020648187586.
[I 2025-04-17 02:24:32,447] Trial 5 finished with value: -1.9113840020423425 and parameters: {'n_est

🏆 Лучшие гиперпараметры: {'n_estimators': 155, 'max_depth': 18}
📉 Лучшая средняя ошибка (CV): 1.8374692392081358


[I 2025-04-17 02:27:06,498] A new study created in memory with name: no-name-a95c8354-ec5e-4a4c-99e1-a26cc7915b6b


📊 Оценка модели на train/test:
Train RMSE: 0.4608
Train R2: 0.9058
Test RMSE: 3.5887
Test R2: 0.3498

🔍 Оптимизация модели: et


[I 2025-04-17 02:27:07,682] Trial 0 finished with value: -1.799399672288144 and parameters: {'n_estimators': 214, 'max_depth': 19}. Best is trial 0 with value: -1.799399672288144.
[I 2025-04-17 02:27:08,634] Trial 1 finished with value: -1.8147346305256875 and parameters: {'n_estimators': 176, 'max_depth': 20}. Best is trial 0 with value: -1.799399672288144.
[I 2025-04-17 02:27:09,671] Trial 2 finished with value: -1.7984290996125196 and parameters: {'n_estimators': 204, 'max_depth': 13}. Best is trial 2 with value: -1.7984290996125196.
[I 2025-04-17 02:27:10,946] Trial 3 finished with value: -1.7875090108175844 and parameters: {'n_estimators': 290, 'max_depth': 9}. Best is trial 3 with value: -1.7875090108175844.
[I 2025-04-17 02:27:12,151] Trial 4 finished with value: -1.80630502692599 and parameters: {'n_estimators': 222, 'max_depth': 17}. Best is trial 3 with value: -1.7875090108175844.
[I 2025-04-17 02:27:13,119] Trial 5 finished with value: -1.8054122286513046 and parameters: {'n

🏆 Лучшие гиперпараметры: {'n_estimators': 251, 'max_depth': 15}
📉 Лучшая средняя ошибка (CV): 1.7792212073669398


[I 2025-04-17 02:28:26,087] A new study created in memory with name: no-name-9cf0583a-bec4-40a3-9c76-04d6130758f3


📊 Оценка модели на train/test:
Train RMSE: 0.0004
Train R2: 0.9999
Test RMSE: 3.2298
Test R2: 0.4148

🔍 Оптимизация модели: xgb


[I 2025-04-17 02:28:28,608] Trial 0 finished with value: -1.896445077700036 and parameters: {'n_estimators': 265, 'max_depth': 7, 'learning_rate': 0.20839597865041376, 'subsample': 0.5034636159792547, 'colsample_bytree': 0.8765085964031631}. Best is trial 0 with value: -1.896445077700036.
[I 2025-04-17 02:28:30,886] Trial 1 finished with value: -1.938958121935319 and parameters: {'n_estimators': 79, 'max_depth': 8, 'learning_rate': 0.1136685155894583, 'subsample': 0.9749595059880913, 'colsample_bytree': 0.8818714372636052}. Best is trial 0 with value: -1.896445077700036.
[I 2025-04-17 02:28:32,391] Trial 2 finished with value: -2.0466749656743226 and parameters: {'n_estimators': 225, 'max_depth': 4, 'learning_rate': 0.29730179249337274, 'subsample': 0.7419444049250039, 'colsample_bytree': 0.7606984184747096}. Best is trial 0 with value: -1.896445077700036.
[I 2025-04-17 02:28:32,911] Trial 3 finished with value: -2.0064434449235975 and parameters: {'n_estimators': 149, 'max_depth': 2, 

🏆 Лучшие гиперпараметры: {'n_estimators': 285, 'max_depth': 7, 'learning_rate': 0.01883083413917217, 'subsample': 0.6675772164904088, 'colsample_bytree': 0.5134562254608362}
📉 Лучшая средняя ошибка (CV): 1.8297309654860912


[I 2025-04-17 02:31:37,323] A new study created in memory with name: no-name-d8caa16f-8d3b-47f1-9703-baed819d2641


📊 Оценка модели на train/test:
Train RMSE: 0.0342
Train R2: 0.9930
Test RMSE: 3.5027
Test R2: 0.3654

🔍 Оптимизация модели: mlp


[I 2025-04-17 02:31:37,724] Trial 0 finished with value: -1.9326934060397094 and parameters: {'hidden_layer_sizes': (64,), 'alpha': 0.03447388659083492, 'learning_rate_init': 0.00543144836381885}. Best is trial 0 with value: -1.9326934060397094.
[I 2025-04-17 02:31:38,072] Trial 1 finished with value: -1.843023666449539 and parameters: {'hidden_layer_sizes': (64,), 'alpha': 0.06310826942842783, 'learning_rate_init': 0.009816109714056293}. Best is trial 1 with value: -1.843023666449539.
[I 2025-04-17 02:31:38,585] Trial 2 finished with value: -1.9502665600514895 and parameters: {'hidden_layer_sizes': (64, 64), 'alpha': 0.00018024094524191436, 'learning_rate_init': 0.0045779543119559805}. Best is trial 1 with value: -1.843023666449539.
[I 2025-04-17 02:31:39,060] Trial 3 finished with value: -1.8291021493541648 and parameters: {'hidden_layer_sizes': (64,), 'alpha': 0.024237383514900727, 'learning_rate_init': 0.002931903757276672}. Best is trial 3 with value: -1.8291021493541648.
[I 2025-

🏆 Лучшие гиперпараметры: {'hidden_layer_sizes': (64,), 'alpha': 0.02834235225512128, 'learning_rate_init': 0.00047747003027425947}
📉 Лучшая средняя ошибка (CV): 1.7160655540100223
📊 Оценка модели на train/test:
Train RMSE: 1.4186
Train R2: 0.7099
Test RMSE: 3.9238
Test R2: 0.2891




In [53]:
svr_learning(X_train_scaled, y_train, X_test_scaled, y_test)

Train RMSE: 3.7003
Train R2: 0.2432
Test RMSE: 4.5344
Test R2: 0.1784
