# Препроцессинг

In [None]:
import numpy as np
import pandas as pd
import re
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import VarianceThreshold
from sklearn.decomposition import PCA
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error, r2_score, make_scorer
import optuna
from sklearn.linear_model import SGDRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.manifold import TSNE
from sklearn.base import BaseEstimator, RegressorMixin
from sklearn.model_selection import RandomizedSearchCV, KFold
from sklearn.pipeline import Pipeline
from lightgbm import LGBMRegressor
import umap
from scipy.stats import randint, uniform, loguniform
from sklearn.svm import SVR
from sklearn.inspection import permutation_importance

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def convert_to_number(val):
    if pd.isna(val):
        return np.nan
    val = str(val).replace(' ', '')  # удаляем пробелы
    # обработка значений '<число'
    if val.startswith('<'):
        num = re.findall(r'<(\d+\.?\d*)', val)
        return float(num[0]) if num else np.nan
    # обработка значений с ±
    elif '±' in val:
        nums = re.findall(r'([\d\.]+)±([\d\.]+)', val)
        if nums:
            main, uncertainty = nums[0]
            return float(main)  # берём только среднее (первое число)
        else:
            return np.nan
    # обработка значений с '/'
    elif '/' in val:
        nums = re.findall(r'([\d\.]+)/([\d\.]+)', val)
        if nums:
            num1, num2 = nums[0]
            return (float(num1) + float(num2)) / 2
        else:
            return np.nan
    # пробуем просто преобразовать в число
    else:
        try:
            return float(val)
        except:
            return np.nan

In [3]:
# Load dataset
df = pd.read_csv("for_regr_descriptors_full.csv")
df['raw_efficiency'] = df['raw_efficiency'].apply(convert_to_number)

# Load embeddings
blomap_embeddings = np.load("blomap_regr.npy")
fingerprints_embeddings = np.load("fingerprints_regr.npy")
protbert_embeddings = np.load("protbert_regr.npy")

In [4]:
# Select numerical features
fp_path_index = df.columns.get_loc('fp_path')
selected_features = ['raw_efficiency', 'uptake_type'] + list(df.columns[fp_path_index + 1:])

X_numerical = df[selected_features].copy()

In [5]:
# One-hot encoding for cell_line
if "cell_line" in df.columns:
    enc = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
    cell_line_encoded = enc.fit_transform(df[['cell_line']])
    cell_line_feature_names = enc.get_feature_names_out(["cell_line"])
    X_cell_line = pd.DataFrame(cell_line_encoded, columns=cell_line_feature_names)

In [6]:
# Функция для удаления выбросов методом IQR
def remove_outliers(df, target_column):
    """
    Удаляет выбросы из числовых колонок методом межквартильного размаха (IQR).

    Аргументы:
    df — pandas DataFrame с числовыми признаками.
    target_column — название столбца с таргетом (raw_efficiency).

    Возвращает:
    Очищенный DataFrame без выбросов.
    """
    df_clean = df.copy()

    Q1 = df_clean['raw_efficiency'].quantile(0.25)
    Q3 = df_clean['raw_efficiency'].quantile(0.75)
    IQR = Q3 - Q1

    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR

    df_clean = df_clean[(df_clean['raw_efficiency'] >= lower_bound) & (df_clean['raw_efficiency'] <= upper_bound)]

    return df_clean

In [7]:
# Фильтруем датафрейм по столбцу 'uptake_type'
X_numerical_filtered = X_numerical[X_numerical['uptake_type'].isin(['Mean Fluorescence intensity', 'Fluorescence intensity'])].copy()
# X_numerical_filtered = X_numerical_filtered[X_numerical_filtered['raw_efficiency'] <= 50000]

# Удаляем выбросы из числовых признаков
X_numerical_filtered_no_outliers = remove_outliers(X_numerical_filtered, 'raw_efficiency')

# Проверяем размер до и после удаления выбросов
print(f"Размер данных ДО удаления выбросов: {X_numerical_filtered.shape}")
print(f"Размер данных ПОСЛЕ удаления выбросов: {X_numerical_filtered_no_outliers.shape}")

# Получаем индексы отфильтрованных молекул
filtered_indices = X_numerical_filtered_no_outliers.index

# Фильтруем массивы эмбеддингов
blomap_embeddings_filtered = blomap_embeddings[filtered_indices]
fingerprints_embeddings_filtered = fingerprints_embeddings[filtered_indices]
protbert_embeddings_filtered = protbert_embeddings[filtered_indices]
X_cell_line_filtered = X_cell_line.loc[filtered_indices]

Размер данных ДО удаления выбросов: (312, 225)
Размер данных ПОСЛЕ удаления выбросов: (268, 225)


In [8]:
target = 'raw_efficiency'
rdkit_descriptors = X_numerical_filtered_no_outliers.drop(columns=['uptake_type', target])

In [9]:
imputer = SimpleImputer(strategy="mean")
rdkit_descriptors = pd.DataFrame(imputer.fit_transform(rdkit_descriptors))

In [10]:
rdkit_descriptors = rdkit_descriptors.reset_index(drop=True)
blomap_embeddings_filtered = pd.DataFrame(blomap_embeddings_filtered).reset_index(drop=True)
fingerprints_embeddings_filtered = pd.DataFrame(fingerprints_embeddings_filtered).reset_index(drop=True)
protbert_embeddings_filtered = pd.DataFrame(protbert_embeddings_filtered).reset_index(drop=True)
X_cell_line_filtered = pd.DataFrame(X_cell_line_filtered).reset_index(drop=True)

In [11]:
list_of_dfs_named = {
    "rdkit": rdkit_descriptors,
    "blomap": blomap_embeddings_filtered,
    "fingerprints": fingerprints_embeddings_filtered,
    "protbert": protbert_embeddings_filtered,
}

# Объединяем датафреймы из списка по столбцам
def rename_columns_with_suffix(dfs: dict) -> dict:
    renamed = {}
    for name, df in dfs.items():
        df_ = df.copy()
        df_.columns = [f"{col}_{name}" for col in df.columns]
        renamed[name] = df_
    return renamed

renamed_dfs = rename_columns_with_suffix(list_of_dfs_named)
combined_df_concat = pd.concat(renamed_dfs.values(), axis=1)

print("\nИнформация об объединенном датафрейме:")
combined_df_concat.info()


Информация об объединенном датафрейме:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 268 entries, 0 to 267
Columns: 4640 entries, 0_rdkit to 1023_protbert
dtypes: float32(1024), float64(1568), int64(2048)
memory usage: 8.4 MB


In [12]:
dfs_dict = {
    'RDKit_Descriptors': rdkit_descriptors,
    'BLOMAP_Embeddings': blomap_embeddings_filtered,
    'Fingerprints_Embeddings': fingerprints_embeddings_filtered,
    'ProtBERT_Embeddings': protbert_embeddings_filtered,
    'Combined_All_Features': combined_df_concat
}

In [None]:
X = pd.concat([rdkit_descriptors, X_cell_line_filtered], axis=1)
y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

X.columns = pd.Index(X.columns).map(str)
X.columns = X.columns.to_series().reset_index(drop=True).duplicated().cumsum().astype(str) + '_' + X.columns

In [13]:
def apply_varThreshold (X, threshold=0): 

    # 1. Создаем объект VarianceThreshold
    selector = VarianceThreshold(threshold)

    # 2. Применяем селектор к числовым данным
    X_transformed_array = selector.fit_transform(X)
    selected_columns_mask = selector.get_support()
    selected_columns_names = X.columns[selected_columns_mask]

    # Создаем новый датафрейм только с выбранными столбцами
    X_filtered_var = pd.DataFrame(X_transformed_array, columns=selected_columns_names)

    print("\nДатафрейм после отсева по дисперсии (метод Scikit-learn):")
    print(X_filtered_var.head())
    print("\nОписание отфильтрованного датафрейма (метод Scikit-learn):")
    print(X_filtered_var.describe())

    return X_filtered_var

    '''# Проверка, какие столбцы были удалены sklearn методом
    all_numeric_columns = X.columns.tolist()
    removed_columns_sklearn = [col for col in all_numeric_columns if col not in selected_columns_names]
    print(f"\nСтолбцы, удаленные методом Scikit-learn (порог {selector.threshold}):")
    print(removed_columns_sklearn)'''

In [14]:
def apply_corr(X, threshold = 0.2):
    # Вычисляем корреляции каждого признака с целевой переменной
    correlations = X.apply(lambda col: col.corr(y))
    # Фильтруем признаки по модулю корреляции
    selected_features = correlations[correlations.abs() >= threshold].index
    # Оставляем только отобранные признаки в X
    X_corr = X[selected_features]
    # Выводим результат
    print("Оставленные признаки:", len(list(selected_features)), list(selected_features))

    return X_corr

In [230]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [15]:
def apply_scaler (train, test):
    train.columns = train.columns.astype(str)
    test.columns = test.columns.astype(str)

    scaler = MinMaxScaler(feature_range=(0, 1))
    train_scaled = pd.DataFrame(scaler.fit_transform(train))
    test_scaled = pd.DataFrame(scaler.transform(test))
    return train_scaled, test_scaled

In [None]:
def apply_pca (X_train, X_test, threshold=0.95):
    pca = PCA(n_components=threshold, svd_solver='full')
    train_transformed = pd.DataFrame(pca.fit_transform(X_train))
    test_transformed = pd.DataFrame(pca.transform(X_test))
    dispersion=round((sum(pca.explained_variance_ratio_)*100),2)
    number_of_comp = pca.n_components_
    print(number_of_comp,dispersion)

    return train_transformed, test_transformed

In [17]:
def apply_tsne(X_train, X_test, n_components=2, perplexity=30, n_iter=5000, random_state=42):
    # Объединяем данные
    X_combined = np.vstack((X_train, X_test))
    
    # Применяем t-SNE ко всему датасету
    tsne = TSNE(n_components=n_components, perplexity=perplexity, n_iter=n_iter, random_state=random_state)
    X_tsne = tsne.fit_transform(X_combined)
    
    # Разделяем обратно
    X_train_tsne = X_tsne[:len(X_train)]
    X_test_tsne = X_tsne[len(X_train):]
    
    return X_train_tsne, X_test_tsne

In [231]:
X_train_scaled, X_test_scaled = apply_scaler (X_train, X_test)

In [232]:
X_train_transformed, X_test_transformed = apply_pca (X_train_scaled, X_test_scaled, 0.95)

29 95.28


In [191]:
X_train_transformed, X_test_transformed = apply_tsne (X_train_scaled, X_test_scaled, perplexity=60)



In [195]:
def apply_umap(X_train: pd.DataFrame, 
                         X_test: pd.DataFrame, 
                         n_components: int = 10,
                         n_neighbors: int = 15,
                         min_dist: float = 0.1,
                         metric: str = 'euclidean',
                         random_state: int = 42):
    """
    Применяет UMAP для понижения размерности к обучающей и тестовой выборкам.

    Обучает UMAP только на обучающей выборке (X_train) и затем применяет
    обученный трансформер к обеим выборкам для предотвращения утечки данных.

    Args:
        X_train (pd.DataFrame): Обучающая выборка (признаки).
        X_test (pd.DataFrame): Тестовая выборка (признаки).
        n_components (int): Целевое количество компонент после понижения.
                           По умолчанию 10.
        n_neighbors (int): Параметр UMAP, влияющий на баланс локальной/глобальной
                           структуры. По умолчанию 15.
        min_dist (float): Параметр UMAP, влияющий на плотность точек.
                          По умолчанию 0.1.
        metric (str): Метрика расстояния для UMAP. По умолчанию 'euclidean'.
        random_state (int): Зерно для генератора случайных чисел для
                            воспроизводимости. По умолчанию 42.

    Returns:
        tuple: Кортеж из двух numpy массивов:
               - X_train_reduced (np.ndarray): Обучающая выборка после UMAP.
               - X_test_reduced (np.ndarray): Тестовая выборка после UMAP.
    """
    
    print(f"Применение UMAP с n_components={n_components}, n_neighbors={n_neighbors}, min_dist={min_dist}...")
    
    # 1. Создаем экземпляр UMAP трансформера
    # Важно: используем random_state для воспроизводимости
    umap_transformer = umap.UMAP(n_components=n_components,
                                 n_neighbors=n_neighbors,
                                 min_dist=min_dist,
                                 metric=metric,
                                 random_state=random_state,
                                 # Дополнительные опции, можно добавить
                                 # n_epochs=None, # Можно увеличить для больших датасетов
                                 # learning_rate=1.0,
                                 # init='spectral', # Или 'random'
                                )
    
    # 2. Обучаем трансформер ТОЛЬКО на обучающей выборке
    print("Обучение UMAP на X_train...")
    umap_transformer.fit(X_train)
    print("Обучение завершено.")
    
    # 3. Преобразуем обучающую выборку
    print("Преобразование X_train...")
    X_train_reduced = umap_transformer.transform(X_train)
    print(f"X_train размерность до: {X_train.shape}, после: {X_train_reduced.shape}")
    
    # 4. Преобразуем тестовую выборку, используя ТОТ ЖЕ обученный трансформер
    print("Преобразование X_test...")
    X_test_reduced = umap_transformer.transform(X_test)
    print(f"X_test размерность до: {X_test.shape}, после: {X_test_reduced.shape}")
    
    return X_train_reduced, X_test_reduced

In [196]:
X_train_umap, X_test_umap = apply_umap(X_train_scaled, X_test_scaled, n_components=30, n_neighbors=15, min_dist=0.1)

Применение UMAP с n_components=30, n_neighbors=15, min_dist=0.1...
Обучение UMAP на X_train...


  warn(


Обучение завершено.
Преобразование X_train...
X_train размерность до: (214, 4735), после: (214, 30)
Преобразование X_test...




X_test размерность до: (54, 4735), после: (54, 30)


# Облучение

In [21]:
# Определяем функцию для оценки модели
def evaluate_model(model, X_train, y_train, X_test, y_test):
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    train_r2 = r2_score(y_train, y_pred_train)
    train_mse = mean_squared_error(y_train, y_pred_train)
    test_r2 = r2_score(y_test, y_pred_test)
    test_mse = mean_squared_error(y_test, y_pred_test)

    print(f'Train MSE: {train_mse:.4f}')
    print(f'Train R2: {train_r2:.4f}')
    print(f'Test MSE: {test_mse:.4f}')
    print(f'Test R2: {test_r2:.4f}')

In [136]:
# Инициализируем модель SVR и оцениваем её
def svr_learning(X_train, y_train, X_test, y_test):
    model = SVR()
    model.fit(X_train, y_train)
    evaluate_model(model, X_train, y_train, X_test, y_test)

In [256]:
def svr_optuna(X_train, y_train, X_test, y_test):
    # Целевая функция для оптимизации
    def objective(trial):
        kernel = trial.suggest_categorical('kernel', ['poly', 'rbf'])
    
        params = {
            "C": trial.suggest_float("C", 1e-1, 1e3, log=True),
            "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True),
            "kernel": kernel,
        }

        # Добавляем gamma, если оно актуально для выбранного ядра
        if kernel in ['rbf', 'poly', 'sigmoid']:
            params["gamma"] = trial.suggest_categorical("gamma", ['scale', 'auto'])

        # Параметр degree актуален только для полиномиального ядра
        if kernel == 'poly':
            params["degree"] = trial.suggest_int("degree", 2, 5)

        # coef0 используется в poly и sigmoid
        if kernel in ['poly', 'sigmoid']:
            params["coef0"] = trial.suggest_float("coef0", -3.0, 3.0)

        model = SVR(**params)
        score = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1
        )
        return score.mean()

    # Создание и оптимизация исследования
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=100)

    print("Лучшие гиперпараметры:", study.best_params)
    print("Лучшая средняя ошибка (CV):", -study.best_value)

    # Обучаем модель с лучшими гиперпараметрами
    best_model = SVR(**study.best_params)
    best_model.fit(X_train, y_train)

    # Оцениваем модель
    evaluate_model(best_model, X_train, y_train, X_test, y_test)

    '''
    result = permutation_importance(
    best_model, X_test, y_test, n_repeats=10, random_state=42, scoring='neg_root_mean_squared_error'
    )

    # Вывод важности признаков
    feature_importance = pd.DataFrame({
        'feature': X_test.columns,
        'importance_mean': result.importances_mean,
        'importance_std': result.importances_std
    }).sort_values(by='importance_mean', ascending=False)

    print(feature_importance)    
    # Сортируем по убыванию важности
    feature_importance_sorted = feature_importance.reset_index(drop=True)
    total_importance = feature_importance_sorted['importance_mean'].sum()
    feature_importance_sorted['cumulative_importance'] = feature_importance_sorted['importance_mean'].cumsum() / total_importance

    # Оставляем признаки, дающие в сумме до 80% важности
    top_features = feature_importance_sorted[feature_importance_sorted['cumulative_importance'] <= 0.8]

    # Если последний признак недобрал до 80%, добавим ещё один
    if top_features['cumulative_importance'].iloc[-1] < 0.95:
        top_features = feature_importance_sorted.iloc[:len(top_features)+1]

    print("Признаки, дающие 80% важности:")
    print(top_features[['feature', 'importance_mean', 'cumulative_importance']])

    # Формируем новые обучающие и тестовые выборки с отобранными признаками
    selected_features = top_features['feature'].tolist()
    X_train_selected = X_train[selected_features]
    X_test_selected = X_test[selected_features]

    # Переобучаем модель на отобранных признаках
    final_model = SVR(**study.best_params)
    final_model.fit(X_train_selected, y_train)

    # Финальная оценка модели
    print("\nОценка модели на отобранных признаках:")
    evaluate_model(final_model, X_train_selected, y_train, X_test_selected, y_test)
    '''

    return best_model

In [24]:
y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

for name, df in dfs_dict.items():
    print(f"\n=== {name} ===")

    X = pd.concat([df, X_cell_line_filtered], axis=1)

    X.columns = pd.Index(X.columns).map(str)
    
    X_filtered_var = apply_varThreshold(X, 0)
    # X_corr = apply_corr(X_filtered_var, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(X_filtered_var, y, test_size=0.2, random_state=42)

    X_train_scaled, X_test_scaled = apply_scaler (X_train, X_test)

    # 1. Создаем пайплайн, объединяющий UMAP и SVR
    # Важно: UMAP должен быть первым шагом
    pipeline = Pipeline([
        ('umap', umap.UMAP(random_state=42)),
        ('svr', SVR(kernel = 'rbf'))
    ])

    # 2. Определяем пространство поиска гиперпараметров
    # Используем распределения для Randomized Search
    param_distributions = {
        'umap__n_components': randint(5, 151), # Пример значений
        'umap__n_neighbors': randint(5, 76),       # Пример значений
        'umap__min_dist': uniform(0.0, 0.5),         # Пример значений
        'umap__metric': ['euclidean', 'cosine', 'correlation'], # Пример метрик

        # Параметры SVR тоже включаем в поиск!
        'svr__C': loguniform(1.0, 1e2),          # от 1 до 100
        'svr__epsilon': loguniform(1e-3, 1.0),    # от 0.001 до 1.0
        'svr__gamma': ['scale', 'auto'] + list(np.logspace(-4, 1, 6)) # Например, 'scale', 'auto' и значения
    }

    # 3. Определяем метрику для оценки
    # Для регрессии часто используют отрицательные метрики ошибок для оптимизации
    # (чем меньше ошибка, тем лучше -> чем больше отрицательная ошибка, тем лучше)
    scoring_metric = make_scorer(mean_squared_error, greater_is_better=False) # Оптимизируем по MSE (минимизируем)

    # 4. Настраиваем кросс-валидацию
    kf = KFold(n_splits=5, shuffle=True, random_state=42) # Например, 5 фолдов

    # 5. Создаем объект RandomizedSearchCV
    # n_iter - количество комбинаций для проверки (увеличьте, если есть ресурсы)
    # cv - объект кросс-валидации или число фолдов
    # scoring - метрика для оценки
    # verbose - для вывода информации о процессе
    # n_jobs - количество ядер для параллельных вычислений (-1 использует все доступные)
    random_search = RandomizedSearchCV(
        pipeline,
        param_distributions=param_distributions,
        n_iter=200, # Проверим 50 случайных комбинаций
        cv=5,      # 5-кратная кросс-валидация
        scoring=scoring_metric,
        verbose=2,
        random_state=42,
        n_jobs=-1
    )

    # 6. Запускаем поиск гиперпараметров на тренировочных данных
    print("Запуск Randomized Search для пайплайна UMAP+SVR...")
    random_search.fit(X_train_scaled, y_train)
    print("Поиск завершен.")

    # 7. Получаем лучшие параметры и лучший результат CV
    print("\nЛучшие параметры найдены:")
    print(random_search.best_params_)
    print(f"\nЛучший результат кросс-валидации (Negative MSE): {random_search.best_score_:.4f}")
    print(f"Соответствующий MSE: {-random_search.best_score_:.4f}")

    # 8. Лучший обученный пайплайн доступен как random_search.best_estimator_
    # Он уже обучен на всем X_train с лучшими параметрами.
    # --- Добавляем оценку на данных ---
    print("\nОценка лучшей модели на тренировочных данных:")
    # Получаем предсказания лучшей модели на тренировочных данных
    y_train_pred = random_search.best_estimator_.predict(X_train_scaled)

    # Рассчитываем MSE на тренировочных данных
    train_mse = mean_squared_error(y_train, y_train_pred)
    # Рассчитываем R2 на тренировочных данных
    train_r2 = r2_score(y_train, y_train_pred)

    # Выводим метрики для тренировочных данных
    print(f"MSE на X_train: {train_mse:.4f}")
    print(f"R2 на X_train: {train_r2:.4f}")

    # Оцениваем его на отложенном тестовом наборе
    print("\nОценка лучшей модели на тестовых данных:")
    y_pred = random_search.best_estimator_.predict(X_test_scaled)
    final_mse = mean_squared_error(y_test, y_pred)
    print(f"MSE на X_test: {final_mse:.4f}")
    final_r2 = r2_score(y_test, y_pred)
    print(f"R2 на X_test: {final_r2:.4f}")


=== RDKit_Descriptors ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
           0         1          2          3         4         5         6  \
0  3151.8362 -1.492308  11.608322  12.577710  0.483758  0.192308  1.021711   
1  2785.1939 -1.272727  11.839377   5.637233  0.256238  0.136364  1.000826   
2  2078.3658 -1.123529  11.824485   3.657547  0.215150  0.000000  0.983251   
3  3208.5313 -0.066667   4.139095  -2.395473 -0.079849  0.100000  0.993229   
4  2624.9800  0.100000   4.783081  -1.396580 -0.053715  0.000000  0.999597   

          7         8     9  ...  cell_line_MDA-MB-435S cells  \
0  0.269231  1.000000   7.0  ...                          0.0   
1  0.272727  0.727273   7.0  ...                          0.0   
2  0.235294  0.647059   4.0  ...                          0.0   
3  0.366667  1.033333  13.0  ...                          0.0   
4  0.461538  1.192308  12.0  ...                          0.0   

   cell_line_N. tabacum cells  cell_line_NIH-3T3 cells

  sqr = np.multiply(arr, arr, out=arr, where=where)
  sqr = _ensure_numeric((avg - values) ** 2)


                 0           1           2           3           4  \
count   268.000000  268.000000  268.000000  268.000000  268.000000   
mean   1970.765109   -1.549609   10.064478    4.185431    0.294253   
std    1027.891708    1.349577    2.217002    4.505073    0.257694   
min      89.093200   -4.500000    4.050028   -6.131594   -0.392198   
25%    1321.481800   -2.355833    8.956109    0.652760    0.091271   
50%    1814.091950   -1.362121   10.650902    3.576524    0.284260   
75%    2333.846750   -0.684295   11.999968    5.679305    0.457545   
max    8511.835100    2.475000   11.999968   35.776307    0.944656   

                5           6           7           8           9  ...  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  ...   
mean     0.095923    0.996992    0.193904    0.681644    4.343284  ...   
std      0.112545    0.019976    0.182269    0.336047    3.891034  ...   
min      0.000000    0.933810    0.000000    0.000000    0.000000  ...   

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 12.814703696157855, 'svr__epsilon': 0.8089237879387583, 'svr__gamma': 'auto', 'umap__metric': 'correlation', 'umap__min_dist': 0.31469931906763127, 'umap__n_components': 18, 'umap__n_neighbors': 19}

Лучший результат кросс-валидации (Negative MSE): -3.2599
Соответствующий MSE: 3.2599

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 2.7961
R2 на X_train: 0.4282

Оценка лучшей модели на тестовых данных:
MSE на X_test: 4.1142
R2 на X_test: 0.2546

=== BLOMAP_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
      0     1    2     3     5     6    7     8    10   11  ...  \
0  0.62  0.29  0.0 -0.06 -1.50  0.00  2.9  0.00 -1.50  0.0  ...   
1  0.62  0.29  0.0 -0.06 -2.53  0.00  3.0  1.17 -1.50  0.0  ...   
2  0.62  0.29  0.0 -0.06 -0.74  0.00  1.4  0.00 -2.53  0.0  ...   
3  0.62  0.29  0.0 -0.06 -0.90  0.46  1.6  0.00  1.08 -0.1  ...   
4  0.62  0.29  0.0 -0.06 -0.74  0.00  1.4  0.00 -1.50  0.0  ...



                0           1           2           3           5           6  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean    -0.660112   -0.003433    1.272761    0.275821   -0.465746   -0.033284   
std      1.316615    0.166192    1.406847    0.501976    1.349090    0.169254   
min     -2.530000   -0.300000    0.000000   -0.060000   -2.530000   -0.300000   
25%     -1.500000   -0.100000    0.000000    0.000000   -1.500000   -0.100000   
50%     -0.400000    0.000000    0.200000    0.000000   -0.180000    0.000000   
75%      0.290000    0.000000    2.900000    0.000000    0.682500    0.000000   
max      1.380000    0.580000    3.000000    1.170000    1.380000    0.580000   

                7           8          10          11  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean     1.161940    0.238321   -0.448470   -0.025709  ...   
std      1.361306    0.474427    1.330883    0.225169  ...   
min      0.000000   -0

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 25.00238716980236, 'svr__epsilon': 0.0036954269299859697, 'svr__gamma': 0.01, 'umap__metric': 'correlation', 'umap__min_dist': 0.4806576440924537, 'umap__n_components': 35, 'umap__n_neighbors': 27}

Лучший результат кросс-валидации (Negative MSE): -3.5998
Соответствующий MSE: 3.5998

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 3.4976
R2 на X_train: 0.2847

Оценка лучшей модели на тестовых данных:
MSE на X_test: 5.2941
R2 на X_test: 0.0408

=== Fingerprints_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
    11   22   23   27   36   41   56   70   72   79  ...  \
0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  ...   
1  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  ...   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...   
3  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  ...   
4  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...   

   cell_line_MDA-MB-435S cells  



               11          22          23          27          36          41  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean     0.227612    0.003731    0.007463    0.037313    0.044776    0.294776   
std      0.420075    0.061085    0.086225    0.189883    0.207199    0.456795   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
50%      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
75%      0.000000    0.000000    0.000000    0.000000    0.000000    1.000000   
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   

               56          70          72          79  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean     0.003731    0.003731    0.014925    0.727612  ...   
std      0.061085    0.061085    0.121481    0.446021  ...   
min      0.000000    0

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 1.8536616154169725, 'svr__epsilon': 0.0012199668475623266, 'svr__gamma': 1.0, 'umap__metric': 'euclidean', 'umap__min_dist': 0.31015477567673233, 'umap__n_components': 70, 'umap__n_neighbors': 55}

Лучший результат кросс-валидации (Negative MSE): -3.6756
Соответствующий MSE: 3.6756

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 2.0501
R2 на X_train: 0.5807

Оценка лучшей модели на тестовых данных:
MSE на X_test: 4.4579
R2 на X_test: 0.1923

=== ProtBERT_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
         0         1         2         3         4         5         6  \
0  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
1  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
2  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
3  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
4  0.03056  0.024347  0.1363



                0           1           2           3           4           5  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean     0.030560    0.024347    0.136336    0.032939   -0.065260   -0.135982   
std      0.000003    0.000007    0.000007    0.000004    0.000014    0.000008   
min      0.030527    0.024347    0.136259    0.032896   -0.065424   -0.135983   
25%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
50%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
75%      0.030560    0.024347    0.136336    0.032939   -0.065259   -0.135983   
max      0.030560    0.024433    0.136337    0.032939   -0.065259   -0.135885   

                6           7           8           9  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean    -0.046606    0.032417    0.011023   -0.019774  ...   
std      0.000008    0.000007    0.000008    0.000008  ...   
min     -0.046607    0

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 5.203857985647047, 'svr__epsilon': 0.00219115336741892, 'svr__gamma': 'auto', 'umap__metric': 'cosine', 'umap__min_dist': 0.26015385045189665, 'umap__n_components': 122, 'umap__n_neighbors': 7}

Лучший результат кросс-валидации (Negative MSE): -3.4849
Соответствующий MSE: 3.4849

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 3.3635
R2 на X_train: 0.3121

Оценка лучшей модели на тестовых данных:
MSE на X_test: 4.5766
R2 на X_test: 0.1708

=== Combined_All_Features ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
     0_rdkit   1_rdkit    2_rdkit    3_rdkit   4_rdkit   5_rdkit   6_rdkit  \
0  3151.8362 -1.492308  11.608322  12.577710  0.483758  0.192308  1.021711   
1  2785.1939 -1.272727  11.839377   5.637233  0.256238  0.136364  1.000826   
2  2078.3658 -1.123529  11.824485   3.657547  0.215150  0.000000  0.983251   
3  3208.5313 -0.066667   4.139095  -2.395473 -0.079849  0.100000  0.993229   
4  2624.9

  sqr = np.multiply(arr, arr, out=arr, where=where)
  sqr = _ensure_numeric((avg - values) ** 2)


           0_rdkit     1_rdkit     2_rdkit     3_rdkit     4_rdkit  \
count   268.000000  268.000000  268.000000  268.000000  268.000000   
mean   1970.765109   -1.549609   10.064478    4.185431    0.294253   
std    1027.891708    1.349577    2.217002    4.505073    0.257694   
min      89.093200   -4.500000    4.050028   -6.131594   -0.392198   
25%    1321.481800   -2.355833    8.956109    0.652760    0.091271   
50%    1814.091950   -1.362121   10.650902    3.576524    0.284260   
75%    2333.846750   -0.684295   11.999968    5.679305    0.457545   
max    8511.835100    2.475000   11.999968   35.776307    0.944656   

          5_rdkit     6_rdkit     7_rdkit     8_rdkit     9_rdkit  ...  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  ...   
mean     0.095923    0.996992    0.193904    0.681644    4.343284  ...   
std      0.112545    0.019976    0.182269    0.336047    3.891034  ...   
min      0.000000    0.933810    0.000000    0.000000    0.000000  ...   

  warn(


Поиск завершен.

Лучшие параметры найдены:
{'svr__C': 2.54754843343881, 'svr__epsilon': 0.6738460883888424, 'svr__gamma': 1.0, 'umap__metric': 'correlation', 'umap__min_dist': 0.3473924665198523, 'umap__n_components': 32, 'umap__n_neighbors': 70}

Лучший результат кросс-валидации (Negative MSE): -3.6723
Соответствующий MSE: 3.6723

Оценка лучшей модели на тренировочных данных:
MSE на X_train: 1.6785
R2 на X_train: 0.6567

Оценка лучшей модели на тестовых данных:
MSE на X_test: 4.1997
R2 на X_test: 0.2391




In [260]:
y = np.log1p(X_numerical_filtered_no_outliers[target]).reset_index(drop=True)

for name, df in dfs_dict.items():
    print(f"\n=== {name} ===")

    X = pd.concat([df, X_cell_line_filtered], axis=1)

    X.columns = pd.Index(X.columns).map(str)
    
    X_filtered_var = apply_varThreshold(X, 0)
    X_corr = apply_corr(X_filtered_var, 0.2)
    X_train, X_test, y_train, y_test = train_test_split(X_corr, y, test_size=0.2, random_state=42)

    X_train_scaled, X_test_scaled = apply_scaler (X_train, X_test)
    # X_train_transformed, X_test_transformed = apply_pca (X_train_scaled, X_test_scaled, 0.95)

    model = svr_optuna(X_train_scaled, y_train, X_test_scaled, y_test)


=== RDKit_Descriptors ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
           0         1          2          3         4         5         6  \
0  3151.8362 -1.492308  11.608322  12.577710  0.483758  0.192308  1.021711   
1  2785.1939 -1.272727  11.839377   5.637233  0.256238  0.136364  1.000826   
2  2078.3658 -1.123529  11.824485   3.657547  0.215150  0.000000  0.983251   
3  3208.5313 -0.066667   4.139095  -2.395473 -0.079849  0.100000  0.993229   
4  2624.9800  0.100000   4.783081  -1.396580 -0.053715  0.000000  0.999597   

          7         8     9  ...  cell_line_MDA-MB-435S cells  \
0  0.269231  1.000000   7.0  ...                          0.0   
1  0.272727  0.727273   7.0  ...                          0.0   
2  0.235294  0.647059   4.0  ...                          0.0   
3  0.366667  1.033333  13.0  ...                          0.0   
4  0.461538  1.192308  12.0  ...                          0.0   

   cell_line_N. tabacum cells  cell_line_NIH-3T3 cells

  sqr = np.multiply(arr, arr, out=arr, where=where)
  sqr = _ensure_numeric((avg - values) ** 2)
  c /= stddev[:, None]
[I 2025-04-20 22:59:55,427] A new study created in memory with name: no-name-e5e46aaa-74ff-4884-9872-f8c3dd9e5625
[I 2025-04-20 22:59:55,454] Trial 0 finished with value: -2.0019216634322254 and parameters: {'kernel': 'rbf', 'C': 0.10245432441261401, 'epsilon': 0.0713114552043479, 'gamma': 'scale'}. Best is trial 0 with value: -2.0019216634322254.
[I 2025-04-20 22:59:55,482] Trial 1 finished with value: -5.361261418953569 and parameters: {'kernel': 'poly', 'C': 0.26157380594261775, 'epsilon': 0.0024919541621217585, 'gamma': 'scale', 'degree': 4, 'coef0': -1.5379530997174042}. Best is trial 0 with value: -2.0019216634322254.
[I 2025-04-20 22:59:55,510] Trial 2 finished with value: -2.057759469476707 and parameters: {'kernel': 'rbf', 'C': 0.2812800144270166, 'epsilon': 0.17130871574330717, 'gamma': 'auto'}. Best is trial 0 with value: -2.0019216634322254.


                 0           1           2           3           4  \
count   268.000000  268.000000  268.000000  268.000000  268.000000   
mean   1970.765109   -1.549609   10.064478    4.185431    0.294253   
std    1027.891708    1.349577    2.217002    4.505073    0.257694   
min      89.093200   -4.500000    4.050028   -6.131594   -0.392198   
25%    1321.481800   -2.355833    8.956109    0.652760    0.091271   
50%    1814.091950   -1.362121   10.650902    3.576524    0.284260   
75%    2333.846750   -0.684295   11.999968    5.679305    0.457545   
max    8511.835100    2.475000   11.999968   35.776307    0.944656   

                5           6           7           8           9  ...  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  ...   
mean     0.095923    0.996992    0.193904    0.681644    4.343284  ...   
std      0.112545    0.019976    0.182269    0.336047    3.891034  ...   
min      0.000000    0.933810    0.000000    0.000000    0.000000  ...   

[I 2025-04-20 22:59:56,235] Trial 3 finished with value: -9.061533272183276 and parameters: {'kernel': 'poly', 'C': 150.0857834502849, 'epsilon': 0.060968705833596044, 'gamma': 'scale', 'degree': 5, 'coef0': 2.318477497723989}. Best is trial 0 with value: -2.0019216634322254.
[I 2025-04-20 22:59:56,262] Trial 4 finished with value: -2.0650188877764135 and parameters: {'kernel': 'rbf', 'C': 0.24991513721365496, 'epsilon': 0.02426778900903614, 'gamma': 'auto'}. Best is trial 0 with value: -2.0019216634322254.
[I 2025-04-20 22:59:56,289] Trial 5 finished with value: -1.8543703067841242 and parameters: {'kernel': 'rbf', 'C': 3.516456088185003, 'epsilon': 0.07234277248887508, 'gamma': 'scale'}. Best is trial 5 with value: -1.8543703067841242.
[I 2025-04-20 22:59:56,316] Trial 6 finished with value: -1.8913123057066272 and parameters: {'kernel': 'rbf', 'C': 6.561114537308692, 'epsilon': 0.29632693940540256, 'gamma': 'auto'}. Best is trial 5 with value: -1.8543703067841242.
[I 2025-04-20 22:5

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 7.8969500527355665, 'epsilon': 0.7133393127840363, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 1.7917077228550284
Train RMSE: 2.0373
Train R2: 0.5834
Test RMSE: 3.2812
Test R2: 0.4055

=== BLOMAP_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
      0     1    2     3     5     6    7     8    10   11  ...  \
0  0.62  0.29  0.0 -0.06 -1.50  0.00  2.9  0.00 -1.50  0.0  ...   
1  0.62  0.29  0.0 -0.06 -2.53  0.00  3.0  1.17 -1.50  0.0  ...   
2  0.62  0.29  0.0 -0.06 -0.74  0.00  1.4  0.00 -2.53  0.0  ...   
3  0.62  0.29  0.0 -0.06 -0.90  0.46  1.6  0.00  1.08 -0.1  ...   
4  0.62  0.29  0.0 -0.06 -0.74  0.00  1.4  0.00 -1.50  0.0  ...   

   cell_line_MDA-MB-435S cells  cell_line_N. tabacum cells  \
0                          0.0                         0.0   
1                          0.0                         0.0   
2                          0.0                         0.0   
3                          0.0 

[I 2025-04-20 23:00:00,393] A new study created in memory with name: no-name-78256c65-f7cd-4ce4-b4ed-1fe2daf66312
[I 2025-04-20 23:00:00,422] Trial 0 finished with value: -1.9181844350740729 and parameters: {'kernel': 'rbf', 'C': 14.174044309446842, 'epsilon': 0.025933861785752122, 'gamma': 'scale'}. Best is trial 0 with value: -1.9181844350740729.
[I 2025-04-20 23:00:00,453] Trial 1 finished with value: -1.899025744306274 and parameters: {'kernel': 'rbf', 'C': 73.53518868580521, 'epsilon': 0.0022875260404394268, 'gamma': 'auto'}. Best is trial 1 with value: -1.899025744306274.
[I 2025-04-20 23:00:00,508] Trial 2 finished with value: -2.0385631279534553 and parameters: {'kernel': 'rbf', 'C': 120.32792546118212, 'epsilon': 0.020852688569727395, 'gamma': 'scale'}. Best is trial 1 with value: -1.899025744306274.


                0           1           2           3           5           6  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean    -0.660112   -0.003433    1.272761    0.275821   -0.465746   -0.033284   
std      1.316615    0.166192    1.406847    0.501976    1.349090    0.169254   
min     -2.530000   -0.300000    0.000000   -0.060000   -2.530000   -0.300000   
25%     -1.500000   -0.100000    0.000000    0.000000   -1.500000   -0.100000   
50%     -0.400000    0.000000    0.200000    0.000000   -0.180000    0.000000   
75%      0.290000    0.000000    2.900000    0.000000    0.682500    0.000000   
max      1.380000    0.580000    3.000000    1.170000    1.380000    0.580000   

                7           8          10          11  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean     1.161940    0.238321   -0.448470   -0.025709  ...   
std      1.361306    0.474427    1.330883    0.225169  ...   
min      0.000000   -0

[I 2025-04-20 23:00:00,552] Trial 3 finished with value: -1.9229815545350282 and parameters: {'kernel': 'rbf', 'C': 17.768405906605874, 'epsilon': 0.0355466917888135, 'gamma': 'scale'}. Best is trial 1 with value: -1.899025744306274.
[I 2025-04-20 23:00:00,584] Trial 4 finished with value: -1.9760418585744983 and parameters: {'kernel': 'rbf', 'C': 0.13221578780122498, 'epsilon': 0.4106965334662749, 'gamma': 'scale'}. Best is trial 1 with value: -1.899025744306274.
[I 2025-04-20 23:00:00,628] Trial 5 finished with value: -1.884215132556077 and parameters: {'kernel': 'poly', 'C': 3.02812525801711, 'epsilon': 0.8156973734359756, 'gamma': 'scale', 'degree': 2, 'coef0': 2.90407585762303}. Best is trial 5 with value: -1.884215132556077.
[I 2025-04-20 23:00:00,662] Trial 6 finished with value: -4008.823682836509 and parameters: {'kernel': 'poly', 'C': 183.6586573060865, 'epsilon': 0.8193789958596698, 'gamma': 'scale', 'degree': 3, 'coef0': -2.00842844516408}. Best is trial 5 with value: -1.88

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 202.67998591205898, 'epsilon': 0.5044543850454173, 'gamma': 'auto'}
Лучшая средняя ошибка (CV): 1.873871803808455
Train RMSE: 3.2536
Train R2: 0.3346
Test RMSE: 4.0254
Test R2: 0.2707

=== Fingerprints_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
    11   22   23   27   36   41   56   70   72   79  ...  \
0  0.0  0.0  1.0  0.0  0.0  1.0  0.0  0.0  0.0  1.0  ...   
1  1.0  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  ...   
2  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  0.0  1.0  ...   
3  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  1.0  1.0  ...   
4  0.0  0.0  0.0  0.0  0.0  1.0  0.0  0.0  0.0  0.0  ...   

   cell_line_MDA-MB-435S cells  cell_line_N. tabacum cells  \
0                          0.0                         0.0   
1                          0.0                         0.0   
2                          0.0                         0.0   
3                          0.0                         0.0   
4      

[I 2025-04-20 23:00:05,421] A new study created in memory with name: no-name-2963f9a3-f36b-45ec-b062-419ecc0a74b4
[I 2025-04-20 23:00:05,450] Trial 0 finished with value: -1.870686397440079 and parameters: {'kernel': 'rbf', 'C': 0.7167742006043022, 'epsilon': 0.13693827023909805, 'gamma': 'scale'}. Best is trial 0 with value: -1.870686397440079.
[I 2025-04-20 23:00:05,481] Trial 1 finished with value: -2.1079310663570574 and parameters: {'kernel': 'poly', 'C': 0.1136585076673189, 'epsilon': 0.11809029337549588, 'gamma': 'scale', 'degree': 4, 'coef0': 2.2592902967245907}. Best is trial 0 with value: -1.870686397440079.
[I 2025-04-20 23:00:05,510] Trial 2 finished with value: -2.0444489412361246 and parameters: {'kernel': 'rbf', 'C': 16.772786343991417, 'epsilon': 0.04700928327314749, 'gamma': 'scale'}. Best is trial 0 with value: -1.870686397440079.
[I 2025-04-20 23:00:05,540] Trial 3 finished with value: -2.0291154192669136 and parameters: {'kernel': 'poly', 'C': 0.4180393406242141, 'e

               11          22          23          27          36          41  \
count  268.000000  268.000000  268.000000  268.000000  268.000000  268.000000   
mean     0.227612    0.003731    0.007463    0.037313    0.044776    0.294776   
std      0.420075    0.061085    0.086225    0.189883    0.207199    0.456795   
min      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
25%      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
50%      0.000000    0.000000    0.000000    0.000000    0.000000    0.000000   
75%      0.000000    0.000000    0.000000    0.000000    0.000000    1.000000   
max      1.000000    1.000000    1.000000    1.000000    1.000000    1.000000   

               56          70          72          79  ...  \
count  268.000000  268.000000  268.000000  268.000000  ...   
mean     0.003731    0.003731    0.014925    0.727612  ...   
std      0.061085    0.061085    0.121481    0.446021  ...   
min      0.000000    0

[I 2025-04-20 23:00:05,569] Trial 4 finished with value: -1.9264738408320121 and parameters: {'kernel': 'rbf', 'C': 0.5650369947713533, 'epsilon': 0.005299178414434676, 'gamma': 'auto'}. Best is trial 0 with value: -1.870686397440079.
[I 2025-04-20 23:00:05,611] Trial 5 finished with value: -1.969694973587496 and parameters: {'kernel': 'rbf', 'C': 5.235446502752227, 'epsilon': 0.0024134536915859524, 'gamma': 'scale'}. Best is trial 0 with value: -1.870686397440079.
[I 2025-04-20 23:00:05,640] Trial 6 finished with value: -1.8933229425069267 and parameters: {'kernel': 'rbf', 'C': 9.320608031211792, 'epsilon': 0.8151640101088528, 'gamma': 'auto'}. Best is trial 0 with value: -1.870686397440079.
[I 2025-04-20 23:00:05,669] Trial 7 finished with value: -2.063467888768044 and parameters: {'kernel': 'poly', 'C': 0.1320255775915632, 'epsilon': 0.02114520003058961, 'gamma': 'auto', 'degree': 5, 'coef0': 0.3735517916293687}. Best is trial 0 with value: -1.870686397440079.
[I 2025-04-20 23:00:05

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 2.036898831990904, 'epsilon': 0.9145236744020449, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 1.8404803974792028
Train RMSE: 2.1290
Train R2: 0.5646
Test RMSE: 4.1269
Test R2: 0.2523

=== ProtBERT_Embeddings ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
         0         1         2         3         4         5         6  \
0  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
1  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
2  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
3  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   
4  0.03056  0.024347  0.136336  0.032939 -0.065259 -0.135983 -0.046607   

          7         8         9  ...  cell_line_MDA-MB-435S cells  \
0  0.032417  0.011024 -0.019775  ...                          0.0   
1  0.032417  0.011024 -0.019775  ...                          0.0   
2  0.032417  0.011024 -0.0197

[I 2025-04-20 23:00:11,335] A new study created in memory with name: no-name-c6ad9e58-6e95-414a-87c1-40e0d3cf9ab5
[I 2025-04-20 23:00:11,362] Trial 0 finished with value: -1.9605030049612489 and parameters: {'kernel': 'rbf', 'C': 0.2445630550592583, 'epsilon': 0.11321451171245421, 'gamma': 'scale'}. Best is trial 0 with value: -1.9605030049612489.
[I 2025-04-20 23:00:11,392] Trial 1 finished with value: -1.9072407885770524 and parameters: {'kernel': 'poly', 'C': 725.0250005696463, 'epsilon': 0.006813554286770167, 'gamma': 'scale', 'degree': 3, 'coef0': 2.7473441020207847}. Best is trial 1 with value: -1.9072407885770524.
[I 2025-04-20 23:00:11,421] Trial 2 finished with value: -1.9071376448275785 and parameters: {'kernel': 'poly', 'C': 7.985288128351298, 'epsilon': 0.010677677955467795, 'gamma': 'scale', 'degree': 5, 'coef0': 0.0905948087497217}. Best is trial 2 with value: -1.9071376448275785.
[I 2025-04-20 23:00:11,449] Trial 3 finished with value: -1.900945078111322 and parameters: 

Оставленные признаки: 3 ['cell_line_CHO cells', 'cell_line_Cal 27', 'cell_line_NIH-3T3 cells']


[I 2025-04-20 23:00:11,562] Trial 7 finished with value: -1.8965346108891161 and parameters: {'kernel': 'poly', 'C': 42.54180094378005, 'epsilon': 0.362251011586881, 'gamma': 'scale', 'degree': 4, 'coef0': 1.9267235388217738}. Best is trial 7 with value: -1.8965346108891161.
[I 2025-04-20 23:00:11,591] Trial 8 finished with value: -1.9073731991639682 and parameters: {'kernel': 'rbf', 'C': 8.11581765581353, 'epsilon': 0.0026671981026976715, 'gamma': 'scale'}. Best is trial 7 with value: -1.8965346108891161.
[I 2025-04-20 23:00:11,621] Trial 9 finished with value: -1.9073714849801715 and parameters: {'kernel': 'poly', 'C': 260.2056020652577, 'epsilon': 0.002462608364724859, 'gamma': 'scale', 'degree': 3, 'coef0': 0.7521849266901022}. Best is trial 7 with value: -1.8965346108891161.
[I 2025-04-20 23:00:11,671] Trial 10 finished with value: -998.1977996952439 and parameters: {'kernel': 'poly', 'C': 57.02757747599816, 'epsilon': 0.031311896642387045, 'gamma': 'auto', 'degree': 2, 'coef0': -

Лучшие гиперпараметры: {'kernel': 'poly', 'C': 884.6975007621129, 'epsilon': 0.5172764651109327, 'gamma': 'scale', 'degree': 5, 'coef0': 1.750228950782232}
Лучшая средняя ошибка (CV): 1.893712676307431
Train RMSE: 3.6077
Train R2: 0.2622
Test RMSE: 4.3398
Test R2: 0.2137

=== Combined_All_Features ===

Датафрейм после отсева по дисперсии (метод Scikit-learn):
     0_rdkit   1_rdkit    2_rdkit    3_rdkit   4_rdkit   5_rdkit   6_rdkit  \
0  3151.8362 -1.492308  11.608322  12.577710  0.483758  0.192308  1.021711   
1  2785.1939 -1.272727  11.839377   5.637233  0.256238  0.136364  1.000826   
2  2078.3658 -1.123529  11.824485   3.657547  0.215150  0.000000  0.983251   
3  3208.5313 -0.066667   4.139095  -2.395473 -0.079849  0.100000  0.993229   
4  2624.9800  0.100000   4.783081  -1.396580 -0.053715  0.000000  0.999597   

    7_rdkit   8_rdkit  9_rdkit  ...  cell_line_MDA-MB-435S cells  \
0  0.269231  1.000000      7.0  ...                          0.0   
1  0.272727  0.727273      7.0  .

  c /= stddev[:, None]
[I 2025-04-20 23:00:18,363] A new study created in memory with name: no-name-41ce4779-c419-4f7c-b482-0043fac1b817
[I 2025-04-20 23:00:18,390] Trial 0 finished with value: -1.83920210184638 and parameters: {'kernel': 'rbf', 'C': 6.231352575136066, 'epsilon': 0.009103451701689223, 'gamma': 'scale'}. Best is trial 0 with value: -1.83920210184638.
[I 2025-04-20 23:00:18,417] Trial 1 finished with value: -16.997717486530405 and parameters: {'kernel': 'poly', 'C': 18.776081745175517, 'epsilon': 0.01211084165431601, 'gamma': 'auto', 'degree': 3, 'coef0': -1.7726139836379056}. Best is trial 0 with value: -1.83920210184638.
[I 2025-04-20 23:00:18,454] Trial 2 finished with value: -2.034867540273633 and parameters: {'kernel': 'rbf', 'C': 561.4661455779523, 'epsilon': 0.017153639550163805, 'gamma': 'scale'}. Best is trial 0 with value: -1.83920210184638.
[I 2025-04-20 23:00:18,482] Trial 3 finished with value: -1.857993745127257 and parameters: {'kernel': 'poly', 'C': 0.176

Оставленные признаки: 83 ['5_rdkit', '7_rdkit', '18_rdkit', '31_rdkit', '34_rdkit', '37_rdkit', '38_rdkit', '68_rdkit', '87_rdkit', '94_rdkit', '109_rdkit', '114_rdkit', '115_rdkit', '118_rdkit', '123_rdkit', '124_rdkit', '133_rdkit', '134_rdkit', '158_rdkit', '160_rdkit', '174_rdkit', '202_rdkit', '209_rdkit', '26_blomap', '36_blomap', '57_blomap', '119_fingerprints', '229_fingerprints', '283_fingerprints', '294_fingerprints', '321_fingerprints', '328_fingerprints', '364_fingerprints', '376_fingerprints', '462_fingerprints', '545_fingerprints', '593_fingerprints', '600_fingerprints', '623_fingerprints', '648_fingerprints', '667_fingerprints', '680_fingerprints', '708_fingerprints', '759_fingerprints', '806_fingerprints', '894_fingerprints', '953_fingerprints', '983_fingerprints', '1057_fingerprints', '1088_fingerprints', '1104_fingerprints', '1199_fingerprints', '1349_fingerprints', '1357_fingerprints', '1431_fingerprints', '1451_fingerprints', '1459_fingerprints', '1466_fingerprints'

[I 2025-04-20 23:00:18,594] Trial 7 finished with value: -1.9314404464045232 and parameters: {'kernel': 'rbf', 'C': 92.6169047564318, 'epsilon': 0.15849113974693183, 'gamma': 'auto'}. Best is trial 4 with value: -1.7900781561305377.
[I 2025-04-20 23:00:18,627] Trial 8 finished with value: -1.9056340640268705 and parameters: {'kernel': 'poly', 'C': 2.907154504447093, 'epsilon': 0.3317087461254153, 'gamma': 'scale', 'degree': 2, 'coef0': 0.3093450434303593}. Best is trial 4 with value: -1.7900781561305377.
[I 2025-04-20 23:00:18,679] Trial 9 finished with value: -2.2631839252279726 and parameters: {'kernel': 'rbf', 'C': 929.0179478133224, 'epsilon': 0.0011072280122440827, 'gamma': 'auto'}. Best is trial 4 with value: -1.7900781561305377.
[I 2025-04-20 23:00:18,716] Trial 10 finished with value: -1.8086333646146298 and parameters: {'kernel': 'rbf', 'C': 1.2846693454037, 'epsilon': 0.06834636300585353, 'gamma': 'scale'}. Best is trial 4 with value: -1.7900781561305377.
[I 2025-04-20 23:00:

Лучшие гиперпараметры: {'kernel': 'rbf', 'C': 12.095307005498434, 'epsilon': 0.9974093376424371, 'gamma': 'scale'}
Лучшая средняя ошибка (CV): 1.7581858612078727
Train RMSE: 0.9651
Train R2: 0.8026
Test RMSE: 3.9876
Test R2: 0.2775


In [233]:
model = svr_optuna(X_train_scaled, y_train, X_test_scaled, y_test)

[I 2025-04-20 15:59:23,036] A new study created in memory with name: no-name-bfbe5d22-ed17-4593-9517-276647867d58
[I 2025-04-20 15:59:23,306] Trial 0 finished with value: -1.7120523389124205 and parameters: {'C': 13.035255162984711, 'epsilon': 0.03730097116246337, 'kernel': 'rbf'}. Best is trial 0 with value: -1.7120523389124205.
[I 2025-04-20 15:59:23,474] Trial 1 finished with value: -1.7159074409419854 and parameters: {'C': 7.070461329939782, 'epsilon': 0.2556291142643974, 'kernel': 'rbf'}. Best is trial 0 with value: -1.7120523389124205.
[I 2025-04-20 15:59:23,643] Trial 2 finished with value: -1.7673078358851804 and parameters: {'C': 2.7699494220727288, 'epsilon': 0.1453293064808481, 'kernel': 'rbf'}. Best is trial 0 with value: -1.7120523389124205.
[I 2025-04-20 15:59:23,800] Trial 3 finished with value: -1.8245612590408147 and parameters: {'C': 1.0947937764206288, 'epsilon': 0.043708963986457856, 'kernel': 'rbf'}. Best is trial 0 with value: -1.7120523389124205.
[I 2025-04-20 15

Лучшие гиперпараметры: {'C': 12.862918844000914, 'epsilon': 0.6713343238724396, 'kernel': 'rbf'}
Лучшая средняя ошибка (CV): 1.6625375429926663
Train RMSE: 0.9757
Train R2: 0.8005
Test RMSE: 2.7727
Test R2: 0.4976


In [113]:
def run_model_with_optuna(model_name, X_train, y_train, X_test, y_test):

    def objective(trial):
        if model_name == "svm":
            params = {
                "C": trial.suggest_float("C", 1e-1, 100, log=True),
                "epsilon": trial.suggest_float("epsilon", 1e-3, 1.0, log=True),
                "kernel": trial.suggest_categorical("kernel", ["rbf"])
            }
            model = SVR(**params)

        elif model_name == "sgd":
            params = {
                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
                "penalty": trial.suggest_categorical("penalty", ["l2", "elasticnet"]),
                "max_iter": 1000
            }
            model = SGDRegressor(**params)

        elif model_name == "knn":
            params = {
                "n_neighbors": trial.suggest_int("n_neighbors", 3, 20),
                "weights": trial.suggest_categorical("weights", ["uniform", "distance"]),
            }
            model = KNeighborsRegressor(**params)

        elif model_name == "dt":
            params = {
                "max_depth": trial.suggest_int("max_depth", 2, 20),
                "min_samples_split": trial.suggest_int("min_samples_split", 2, 10)
            }
            model = DecisionTreeRegressor(**params)

        elif model_name == "rf":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 20),
            }
            model = RandomForestRegressor(**params)

        elif model_name == "et":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 20),
            }
            model = ExtraTreesRegressor(**params)

        elif model_name == "xgb":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", 2, 10),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            }
            model = XGBRegressor(**params, objective='reg:squarederror', verbosity=0)

        elif model_name == "lgbm":
            params = {
                "n_estimators": trial.suggest_int("n_estimators", 50, 300),
                "max_depth": trial.suggest_int("max_depth", -1, 20),
                "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
                "num_leaves": trial.suggest_int("num_leaves", 20, 150),
                "subsample": trial.suggest_float("subsample", 0.5, 1.0),
                "colsample_bytree": trial.suggest_float("colsample_bytree", 0.5, 1.0),
            }
            model = LGBMRegressor(**params)

        elif model_name == "mlp":
            params = {
                "hidden_layer_sizes": trial.suggest_categorical("hidden_layer_sizes", [(64,), (128,), (64, 64)]),
                "alpha": trial.suggest_float("alpha", 1e-5, 1e-1, log=True),
                "learning_rate_init": trial.suggest_float("learning_rate_init", 1e-4, 1e-2),
                "max_iter": 1000
            }
            model = MLPRegressor(**params)

        else:
            raise ValueError(f"Unknown model: {model_name}")

        # Кросс-валидация
        score = cross_val_score(
            model,
            X_train,
            y_train,
            cv=5,
            scoring="neg_root_mean_squared_error",
            n_jobs=-1
        )
        return score.mean()

    print(f"\n🔍 Оптимизация модели: {model_name}")
    study = optuna.create_study(direction="maximize")
    study.optimize(objective, n_trials=50)

    print("🏆 Лучшие гиперпараметры:", study.best_params)
    print("📉 Лучшая средняя ошибка (CV):", -study.best_value)

    # Обучаем модель с лучшими параметрами

    model_classes = {
        "svm": SVR,
        "sgd": SGDRegressor,
        "knn": KNeighborsRegressor,
        "dt": DecisionTreeRegressor,
        "rf": RandomForestRegressor,
        "et": ExtraTreesRegressor,
        "xgb": XGBRegressor,
        "lgbm": LGBMRegressor,
        "mlp": MLPRegressor
    }
    best_model = model_classes[model_name](**study.best_params)

    best_model.fit(X_train, y_train)

    # 🔍 Оценка модели
    print("📊 Оценка модели на train/test:")
    evaluate_model(best_model, X_train, y_train, X_test, y_test)


In [114]:
for model in ["lgbm"]:
    run_model_with_optuna(model, X_train_transformed, y_train, X_test_transformed, y_test)

[I 2025-04-17 02:48:15,649] A new study created in memory with name: no-name-df10ada3-c973-4a8c-8442-1a9857f52a3d
[I 2025-04-17 02:48:15,821] Trial 0 finished with value: -1.6960896423980512 and parameters: {'n_estimators': 226, 'max_depth': 19, 'learning_rate': 0.16344993638147193, 'num_leaves': 122, 'subsample': 0.5216219378836007, 'colsample_bytree': 0.6092546502178895}. Best is trial 0 with value: -1.6960896423980512.



🔍 Оптимизация модели: lgbm


[I 2025-04-17 02:48:15,939] Trial 1 finished with value: -1.7828403799838721 and parameters: {'n_estimators': 125, 'max_depth': 8, 'learning_rate': 0.23183321761462347, 'num_leaves': 36, 'subsample': 0.6167854428141397, 'colsample_bytree': 0.5474085100679746}. Best is trial 0 with value: -1.6960896423980512.
[I 2025-04-17 02:48:16,068] Trial 2 finished with value: -1.654327358506092 and parameters: {'n_estimators': 141, 'max_depth': 13, 'learning_rate': 0.12245592721145833, 'num_leaves': 89, 'subsample': 0.9229162404440199, 'colsample_bytree': 0.6833690768132368}. Best is trial 2 with value: -1.654327358506092.
[I 2025-04-17 02:48:16,162] Trial 3 finished with value: -1.7479825187568578 and parameters: {'n_estimators': 95, 'max_depth': 17, 'learning_rate': 0.2742393313169498, 'num_leaves': 44, 'subsample': 0.7478510679912089, 'colsample_bytree': 0.5495123166150259}. Best is trial 2 with value: -1.654327358506092.
[I 2025-04-17 02:48:16,312] Trial 4 finished with value: -1.6840289838842

🏆 Лучшие гиперпараметры: {'n_estimators': 152, 'max_depth': 11, 'learning_rate': 0.07759053090052767, 'num_leaves': 50, 'subsample': 0.9318155523102657, 'colsample_bytree': 0.7137131727645704}
📉 Лучшая средняя ошибка (CV): 1.6518158163223498
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000189 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2117
[LightGBM] [Info] Number of data points in the train set: 214, number of used features: 29
[LightGBM] [Info] Start training from score 4.786516
📊 Оценка модели на train/test:
Train RMSE: 0.0997
Train R2: 0.9796
Test RMSE: 3.4065
Test R2: 0.3828


In [None]:
for model in ["svm", "sgd", "knn", "dt", "rf", "et", "xgb", "lgbm", "mlp"]:
    run_model_with_optuna(model, X_train_transformed, y_train, X_test_transformed, y_test)

[I 2025-04-17 02:23:57,001] A new study created in memory with name: no-name-7d0ef6f1-379c-414d-a0a3-852cb871fa5c



🔍 Оптимизация модели: svm


[I 2025-04-17 02:24:01,299] Trial 0 finished with value: -1.9704878188372874 and parameters: {'C': 0.17126744015861523, 'epsilon': 0.0338872491502521, 'kernel': 'rbf'}. Best is trial 0 with value: -1.9704878188372874.
[I 2025-04-17 02:24:03,045] Trial 1 finished with value: -1.7850185622083377 and parameters: {'C': 46.61337700036316, 'epsilon': 0.07289497194479191, 'kernel': 'rbf'}. Best is trial 1 with value: -1.7850185622083377.
[I 2025-04-17 02:24:04,827] Trial 2 finished with value: -1.7608667663395896 and parameters: {'C': 7.08555392387036, 'epsilon': 0.0037180610940864955, 'kernel': 'rbf'}. Best is trial 2 with value: -1.7608667663395896.
[I 2025-04-17 02:24:06,716] Trial 3 finished with value: -1.7938288964420437 and parameters: {'C': 56.01148707188686, 'epsilon': 0.03502153285287117, 'kernel': 'rbf'}. Best is trial 2 with value: -1.7608667663395896.
[I 2025-04-17 02:24:06,757] Trial 4 finished with value: -1.7948116559311296 and parameters: {'C': 40.3394533348504, 'epsilon': 0.

🏆 Лучшие гиперпараметры: {'C': 5.733136090860661, 'epsilon': 0.7571120107141919, 'kernel': 'rbf'}
📉 Лучшая средняя ошибка (CV): 1.7078269261764987
📊 Оценка модели на train/test:
Train RMSE: 0.6463
Train R2: 0.8678
Test RMSE: 3.2340
Test R2: 0.4141

🔍 Оптимизация модели: sgd


[I 2025-04-17 02:24:11,329] Trial 6 finished with value: -3.3252180430216733 and parameters: {'alpha': 0.00022694098586585574, 'penalty': 'l2'}. Best is trial 6 with value: -3.3252180430216733.
[I 2025-04-17 02:24:11,360] Trial 7 finished with value: -5.3268465977483315 and parameters: {'alpha': 0.0011040026707761568, 'penalty': 'elasticnet'}. Best is trial 6 with value: -3.3252180430216733.
[I 2025-04-17 02:24:11,392] Trial 8 finished with value: -4.632746471069512 and parameters: {'alpha': 5.724240741297575e-05, 'penalty': 'elasticnet'}. Best is trial 6 with value: -3.3252180430216733.
[I 2025-04-17 02:24:11,422] Trial 9 finished with value: -5.126750882395365 and parameters: {'alpha': 0.008066547453117535, 'penalty': 'elasticnet'}. Best is trial 6 with value: -3.3252180430216733.
[I 2025-04-17 02:24:11,461] Trial 10 finished with value: -3.7000249964335303 and parameters: {'alpha': 1.5658442248230646e-05, 'penalty': 'elasticnet'}. Best is trial 6 with value: -3.3252180430216733.
[I 

🏆 Лучшие гиперпараметры: {'alpha': 0.04898901289127776, 'penalty': 'l2'}
📉 Лучшая средняя ошибка (CV): 2.1677238590961005
📊 Оценка модели на train/test:
Train RMSE: 72.2007
Train R2: -13.7657
Test RMSE: 4.3826
Test R2: 0.2059

🔍 Оптимизация модели: knn


[I 2025-04-17 02:24:15,255] Trial 0 finished with value: -1.9723820421989458 and parameters: {'n_neighbors': 12, 'weights': 'distance'}. Best is trial 0 with value: -1.9723820421989458.
[I 2025-04-17 02:24:15,502] Trial 1 finished with value: -1.9623771495984847 and parameters: {'n_neighbors': 20, 'weights': 'distance'}. Best is trial 1 with value: -1.9623771495984847.
[I 2025-04-17 02:24:15,736] Trial 2 finished with value: -2.0096887540988395 and parameters: {'n_neighbors': 17, 'weights': 'uniform'}. Best is trial 1 with value: -1.9623771495984847.
[I 2025-04-17 02:24:15,901] Trial 3 finished with value: -1.9623771495984847 and parameters: {'n_neighbors': 20, 'weights': 'distance'}. Best is trial 1 with value: -1.9623771495984847.
[I 2025-04-17 02:24:15,929] Trial 4 finished with value: -1.9752694795474846 and parameters: {'n_neighbors': 14, 'weights': 'distance'}. Best is trial 1 with value: -1.9623771495984847.
[I 2025-04-17 02:24:15,959] Trial 5 finished with value: -1.96416098243

🏆 Лучшие гиперпараметры: {'n_neighbors': 3, 'weights': 'distance'}
📉 Лучшая средняя ошибка (CV): 1.8668547172222734
📊 Оценка модели на train/test:
Train RMSE: 0.0000
Train R2: 1.0000
Test RMSE: 3.3282
Test R2: 0.3970

🔍 Оптимизация модели: dt


[I 2025-04-17 02:24:19,843] Trial 2 finished with value: -2.4304805524276025 and parameters: {'max_depth': 11, 'min_samples_split': 3}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:19,894] Trial 3 finished with value: -2.4916752895021994 and parameters: {'max_depth': 14, 'min_samples_split': 5}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:19,933] Trial 4 finished with value: -2.432850043273622 and parameters: {'max_depth': 9, 'min_samples_split': 8}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:19,972] Trial 5 finished with value: -2.4308768544035395 and parameters: {'max_depth': 16, 'min_samples_split': 6}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:20,001] Trial 6 finished with value: -2.4627234742088 and parameters: {'max_depth': 14, 'min_samples_split': 10}. Best is trial 0 with value: -2.3011071974456945.
[I 2025-04-17 02:24:20,040] Trial 7 finished with value: -2.4406904805649043 an

🏆 Лучшие гиперпараметры: {'max_depth': 3, 'min_samples_split': 2}
📉 Лучшая средняя ошибка (CV): 2.0136835876777144
📊 Оценка модели на train/test:
Train RMSE: 2.7539
Train R2: 0.4368
Test RMSE: 5.7947
Test R2: -0.0499

🔍 Оптимизация модели: rf


[I 2025-04-17 02:24:25,198] Trial 0 finished with value: -1.883224555323904 and parameters: {'n_estimators': 101, 'max_depth': 4}. Best is trial 0 with value: -1.883224555323904.
[I 2025-04-17 02:24:26,165] Trial 1 finished with value: -1.8867020231198754 and parameters: {'n_estimators': 149, 'max_depth': 4}. Best is trial 0 with value: -1.883224555323904.
[I 2025-04-17 02:24:28,078] Trial 2 finished with value: -1.8757950766597116 and parameters: {'n_estimators': 252, 'max_depth': 6}. Best is trial 2 with value: -1.8757950766597116.
[I 2025-04-17 02:24:30,437] Trial 3 finished with value: -1.869465220104918 and parameters: {'n_estimators': 261, 'max_depth': 17}. Best is trial 3 with value: -1.869465220104918.
[I 2025-04-17 02:24:32,082] Trial 4 finished with value: -1.868020648187586 and parameters: {'n_estimators': 186, 'max_depth': 11}. Best is trial 4 with value: -1.868020648187586.
[I 2025-04-17 02:24:32,447] Trial 5 finished with value: -1.9113840020423425 and parameters: {'n_est

🏆 Лучшие гиперпараметры: {'n_estimators': 155, 'max_depth': 18}
📉 Лучшая средняя ошибка (CV): 1.8374692392081358


[I 2025-04-17 02:27:06,498] A new study created in memory with name: no-name-a95c8354-ec5e-4a4c-99e1-a26cc7915b6b


📊 Оценка модели на train/test:
Train RMSE: 0.4608
Train R2: 0.9058
Test RMSE: 3.5887
Test R2: 0.3498

🔍 Оптимизация модели: et


[I 2025-04-17 02:27:07,682] Trial 0 finished with value: -1.799399672288144 and parameters: {'n_estimators': 214, 'max_depth': 19}. Best is trial 0 with value: -1.799399672288144.
[I 2025-04-17 02:27:08,634] Trial 1 finished with value: -1.8147346305256875 and parameters: {'n_estimators': 176, 'max_depth': 20}. Best is trial 0 with value: -1.799399672288144.
[I 2025-04-17 02:27:09,671] Trial 2 finished with value: -1.7984290996125196 and parameters: {'n_estimators': 204, 'max_depth': 13}. Best is trial 2 with value: -1.7984290996125196.
[I 2025-04-17 02:27:10,946] Trial 3 finished with value: -1.7875090108175844 and parameters: {'n_estimators': 290, 'max_depth': 9}. Best is trial 3 with value: -1.7875090108175844.
[I 2025-04-17 02:27:12,151] Trial 4 finished with value: -1.80630502692599 and parameters: {'n_estimators': 222, 'max_depth': 17}. Best is trial 3 with value: -1.7875090108175844.
[I 2025-04-17 02:27:13,119] Trial 5 finished with value: -1.8054122286513046 and parameters: {'n

🏆 Лучшие гиперпараметры: {'n_estimators': 251, 'max_depth': 15}
📉 Лучшая средняя ошибка (CV): 1.7792212073669398


[I 2025-04-17 02:28:26,087] A new study created in memory with name: no-name-9cf0583a-bec4-40a3-9c76-04d6130758f3


📊 Оценка модели на train/test:
Train RMSE: 0.0004
Train R2: 0.9999
Test RMSE: 3.2298
Test R2: 0.4148

🔍 Оптимизация модели: xgb


[I 2025-04-17 02:28:28,608] Trial 0 finished with value: -1.896445077700036 and parameters: {'n_estimators': 265, 'max_depth': 7, 'learning_rate': 0.20839597865041376, 'subsample': 0.5034636159792547, 'colsample_bytree': 0.8765085964031631}. Best is trial 0 with value: -1.896445077700036.
[I 2025-04-17 02:28:30,886] Trial 1 finished with value: -1.938958121935319 and parameters: {'n_estimators': 79, 'max_depth': 8, 'learning_rate': 0.1136685155894583, 'subsample': 0.9749595059880913, 'colsample_bytree': 0.8818714372636052}. Best is trial 0 with value: -1.896445077700036.
[I 2025-04-17 02:28:32,391] Trial 2 finished with value: -2.0466749656743226 and parameters: {'n_estimators': 225, 'max_depth': 4, 'learning_rate': 0.29730179249337274, 'subsample': 0.7419444049250039, 'colsample_bytree': 0.7606984184747096}. Best is trial 0 with value: -1.896445077700036.
[I 2025-04-17 02:28:32,911] Trial 3 finished with value: -2.0064434449235975 and parameters: {'n_estimators': 149, 'max_depth': 2, 

🏆 Лучшие гиперпараметры: {'n_estimators': 285, 'max_depth': 7, 'learning_rate': 0.01883083413917217, 'subsample': 0.6675772164904088, 'colsample_bytree': 0.5134562254608362}
📉 Лучшая средняя ошибка (CV): 1.8297309654860912


[I 2025-04-17 02:31:37,323] A new study created in memory with name: no-name-d8caa16f-8d3b-47f1-9703-baed819d2641


📊 Оценка модели на train/test:
Train RMSE: 0.0342
Train R2: 0.9930
Test RMSE: 3.5027
Test R2: 0.3654

🔍 Оптимизация модели: mlp


[I 2025-04-17 02:31:37,724] Trial 0 finished with value: -1.9326934060397094 and parameters: {'hidden_layer_sizes': (64,), 'alpha': 0.03447388659083492, 'learning_rate_init': 0.00543144836381885}. Best is trial 0 with value: -1.9326934060397094.
[I 2025-04-17 02:31:38,072] Trial 1 finished with value: -1.843023666449539 and parameters: {'hidden_layer_sizes': (64,), 'alpha': 0.06310826942842783, 'learning_rate_init': 0.009816109714056293}. Best is trial 1 with value: -1.843023666449539.
[I 2025-04-17 02:31:38,585] Trial 2 finished with value: -1.9502665600514895 and parameters: {'hidden_layer_sizes': (64, 64), 'alpha': 0.00018024094524191436, 'learning_rate_init': 0.0045779543119559805}. Best is trial 1 with value: -1.843023666449539.
[I 2025-04-17 02:31:39,060] Trial 3 finished with value: -1.8291021493541648 and parameters: {'hidden_layer_sizes': (64,), 'alpha': 0.024237383514900727, 'learning_rate_init': 0.002931903757276672}. Best is trial 3 with value: -1.8291021493541648.
[I 2025-

🏆 Лучшие гиперпараметры: {'hidden_layer_sizes': (64,), 'alpha': 0.02834235225512128, 'learning_rate_init': 0.00047747003027425947}
📉 Лучшая средняя ошибка (CV): 1.7160655540100223
📊 Оценка модели на train/test:
Train RMSE: 1.4186
Train R2: 0.7099
Test RMSE: 3.9238
Test R2: 0.2891




In [53]:
svr_learning(X_train_scaled, y_train, X_test_scaled, y_test)

Train RMSE: 3.7003
Train R2: 0.2432
Test RMSE: 4.5344
Test R2: 0.1784
