# ML3 Validation Project


In [None]:
import numpy as np
import pandas as pd
import json
from sklearn.model_selection import KFold, GroupKFold, StratifiedKFold, TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import Lasso, ElasticNet
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.inspection import permutation_importance
import shap
import optuna

optuna.logging.set_verbosity(optuna.logging.WARNING)
np.random.seed(21)


In [None]:
def calculate_all_metrics(y_true, y_pred):
    return {
        'MAE': mean_absolute_error(y_true, y_pred),
        'RMSE': np.sqrt(mean_squared_error(y_true, y_pred)),
        'R2': r2_score(y_true, y_pred)
    }


1. Ответы на вопросы

1.1. Leave-one-out - это кросс-валидация где k=n (каждый образец используется как тест один раз).
Сильные стороны: использует максимум данных для обучения, нет случайности, несмещенная оценка.
Ограничения: вычислительно дорого (n моделей), высокая дисперсия, медленно для больших данных.


1.2. Grid Search - перебирает все комбинации параметров. Randomized Search - случайная выборка комбинаций. Bayesian Optimization - использует вероятностную модель (TPE) для предсказания лучших параметров на основе предыдущих оценок.


1.3. Классификация методов отбора признаков:
- Unsupervised: не используют целевую переменную
- Supervised: используют целевую переменную
  - Filter: статистические меры (быстро, независимо от модели)
  - Wrapper: производительность модели (медленно, зависит от модели)
  - Embedded: встроены в обучение (Lasso, Ridge)

Pearson - корреляция между признаком и целевой переменной (-1 до 1).
Chi2 - проверка независимости для категориальных признаков.
Lasso - добавляет штраф λ||w||₁, обнуляет неважные признаки.
Permutation Importance - важность через перестановку значений признака.
SHAP - объяснение предсказаний на основе Shapley values.


2. Предобработка данных


In [None]:
with open('data/train.json', 'r') as f:
    train_df = pd.DataFrame(json.load(f))
    
with open('data/test.json', 'r') as f:
    test_df = pd.DataFrame(json.load(f))

train_df['interest_level'] = train_df['interest_level'].replace({'low': 0, 'medium': 1, 'high': 2})


  train_df['interest_level'] = train_df['interest_level'].replace({'low': 0, 'medium': 1, 'high': 2})


In [None]:
feature_list = ['Elevator', 'HardwoodFloors', 'CatsAllowed', 'DogsAllowed', 'Doorman', 
                'Dishwasher', 'NoFee', 'LaundryinBuilding', 'FitnessCenter', 'Pre-War', 
                'LaundryinUnit', 'RoofDeck', 'OutdoorSpace', 'DiningRoom', 'HighSpeedInternet', 
                'Balcony', 'SwimmingPool', 'LaundryInBuilding', 'NewConstruction', 'Terrace']

for feat in feature_list:
    train_df[feat] = train_df['features'].apply(lambda x: int(feat in x))
    test_df[feat] = test_df['features'].apply(lambda x: int(feat in x))


3. Методы разбиения данных

3.1. Разбиение на 2 части случайно


In [None]:
def split_data_random_2(df, test_size=0.2, random_state=21):
    np.random.seed(random_state)
    n_samples = len(df)
    n_test = int(n_samples * test_size)
    indices = np.random.permutation(n_samples)
    test_indices = indices[:n_test]
    train_indices = indices[n_test:]
    return df.iloc[train_indices].copy(), df.iloc[test_indices].copy()

train_split, test_split = split_data_random_2(train_df, test_size=0.2, random_state=21)
print(f"Train: {len(train_split)}, Test: {len(test_split)}")


Train: 39482, Test: 9870


3.2. Разбиение на 3 части случайно


In [None]:
def split_data_random_3(df, validation_size=0.2, test_size=0.2, random_state=21):
    np.random.seed(random_state)
    n_samples = len(df)
    n_test = int(n_samples * test_size)
    n_val = int(n_samples * validation_size)
    indices = np.random.permutation(n_samples)
    test_indices = indices[:n_test]
    val_indices = indices[n_test:n_test+n_val]
    train_indices = indices[n_test+n_val:]
    return df.iloc[train_indices].copy(), df.iloc[val_indices].copy(), df.iloc[test_indices].copy()

train_split, val_split, test_split = split_data_random_3(train_df, validation_size=0.2, test_size=0.2, random_state=21)
print(f"Train: {len(train_split)}, Val: {len(val_split)}, Test: {len(test_split)}")


Train: 29612, Val: 9870, Test: 9870


3.3. Разбиение на 2 части по дате


In [None]:
def split_data_by_date_2(df, date_split, date_field='created'):
    df[date_field] = pd.to_datetime(df[date_field])
    date_split = pd.to_datetime(date_split)
    return df[df[date_field] < date_split].copy(), df[df[date_field] >= date_split].copy()

median_date = pd.to_datetime(train_df['created']).median()
train_date, test_date = split_data_by_date_2(train_df, median_date)
print(f"Train: {len(train_date)}, Test: {len(test_date)}")


Train: 24676, Test: 24676


3.4. Разбиение на 3 части по дате


In [None]:
def split_data_by_date_3(df, validation_date, test_date, date_field='created'):
    df[date_field] = pd.to_datetime(df[date_field])
    validation_date = pd.to_datetime(validation_date)
    test_date = pd.to_datetime(test_date)
    train_df = df[df[date_field] < validation_date].copy()
    val_df = df[(df[date_field] >= validation_date) & (df[date_field] < test_date)].copy()
    test_df = df[df[date_field] >= test_date].copy()
    return train_df, val_df, test_df

dates = pd.to_datetime(train_df['created'])
val_date = dates.quantile(0.6)
test_date = dates.quantile(0.8)
train_date, val_date_split, test_date_split = split_data_by_date_3(train_df, val_date, test_date)
print(f"Train: {len(train_date)}, Val: {len(val_date_split)}, Test: {len(test_date_split)}")


Train: 29611, Val: 9870, Test: 9871


3.5. Детерминированность - при одинаковом random_state результат всегда одинаковый


4. Методы кросс-валидации

4.1. K-Fold


In [None]:
def k_fold_cv(n_samples, k=5, random_state=21):
    np.random.seed(random_state)
    indices = np.random.permutation(n_samples)
    fold_size = n_samples // k
    folds = []
    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size if i < k - 1 else n_samples
        test_indices = indices[start:end]
        train_indices = np.concatenate([indices[:start], indices[end:]])
        folds.append((train_indices, test_indices))
    return folds

folds = k_fold_cv(len(train_df), k=5, random_state=21)
print(f"Фолдов: {len(folds)}, первый фолд: train={len(folds[0][0])}, test={len(folds[0][1])}")


Фолдов: 5, первый фолд: train=39482, test=9870


4.2. Grouped K-Fold


In [None]:
def grouped_k_fold_cv(df, k=5, group_field='listing_id', random_state=21):
    np.random.seed(random_state)
    groups = df[group_field].unique()
    np.random.shuffle(groups)
    n_groups = len(groups)
    fold_size = n_groups // k
    folds = []
    for i in range(k):
        start = i * fold_size
        end = (i + 1) * fold_size if i < k - 1 else n_groups
        test_groups = groups[start:end]
        test_mask = df[group_field].isin(test_groups)
        test_indices = df[test_mask].index.values
        train_indices = df[~test_mask].index.values
        folds.append((train_indices, test_indices))
    return folds

folds = grouped_k_fold_cv(train_df, k=5, group_field='listing_id', random_state=21)
print(f"Фолдов: {len(folds)}, первый фолд: train={len(folds[0][0])}, test={len(folds[0][1])}")


Фолдов: 5, первый фолд: train=39482, test=9870


4.3. Stratified K-Fold


In [None]:
def stratified_k_fold_cv(df, k=5, stratify_field='interest_level', random_state=21):
    np.random.seed(random_state)
    groups_by_class = {}
    for idx, value in df[stratify_field].items():
        if value not in groups_by_class:
            groups_by_class[value] = []
        groups_by_class[value].append(idx)
    for class_val in groups_by_class:
        np.random.shuffle(groups_by_class[class_val])
    folds = [[] for _ in range(k)]
    for class_val, indices in groups_by_class.items():
        fold_size = len(indices) // k
        for i in range(k):
            start = i * fold_size
            end = (i + 1) * fold_size if i < k - 1 else len(indices)
            folds[i].extend(indices[start:end])
    result = []
    for i in range(k):
        test_indices = np.array(folds[i])
        train_indices = np.concatenate([np.array(folds[j]) for j in range(k) if j != i])
        result.append((train_indices, test_indices))
    return result

folds = stratified_k_fold_cv(train_df, k=5, stratify_field='interest_level', random_state=21)
print(f"Фолдов: {len(folds)}, первый фолд: train={len(folds[0][0])}, test={len(folds[0][1])}")


Фолдов: 5, первый фолд: train=39484, test=9868


4.4. Time Series Split


In [None]:
def time_series_split(df, k=5, date_field='created'):
    df_sorted = df.sort_values(date_field).reset_index(drop=True)
    n_samples = len(df_sorted)
    fold_size = n_samples // (k + 1)
    folds = []
    for i in range(1, k + 1):
        train_end = i * fold_size
        test_end = (i + 1) * fold_size if i < k else n_samples
        train_indices = df_sorted.index[:train_end].values
        test_indices = df_sorted.index[train_end:test_end].values
        folds.append((train_indices, test_indices))
    return folds

folds = time_series_split(train_df, k=5, date_field='created')
print(f"Фолдов: {len(folds)}, первый фолд: train={len(folds[0][0])}, test={len(folds[0][1])}")


Фолдов: 5, первый фолд: train=8225, test=8225


5. Сравнение с sklearn


In [None]:
our_kfold = k_fold_cv(len(train_df), k=5, random_state=21)
sklearn_kfold = list(KFold(n_splits=5, shuffle=True, random_state=21).split(train_df))
print(f"K-Fold: наш={len(our_kfold)}, sklearn={len(sklearn_kfold)}")

our_grouped = grouped_k_fold_cv(train_df, k=5, group_field='listing_id', random_state=21)
y = train_df['interest_level'].values
groups = train_df['listing_id'].values
sklearn_grouped = list(GroupKFold(n_splits=5).split(train_df, y, groups))
print(f"Grouped K-Fold: наш={len(our_grouped)}, sklearn={len(sklearn_grouped)}")

our_stratified = stratified_k_fold_cv(train_df, k=5, stratify_field='interest_level', random_state=21)
sklearn_stratified = list(StratifiedKFold(n_splits=5, shuffle=True, random_state=21).split(train_df, y))
print(f"Stratified K-Fold: наш={len(our_stratified)}, sklearn={len(sklearn_stratified)}")

our_timeseries = time_series_split(train_df, k=5)
sklearn_timeseries = list(TimeSeriesSplit(n_splits=5).split(train_df))
print(f"Time Series Split: наш={len(our_timeseries)}, sklearn={len(sklearn_timeseries)}")


K-Fold: наш=5, sklearn=5
Grouped K-Fold: наш=5, sklearn=5
Stratified K-Fold: наш=5, sklearn=5
Time Series Split: наш=5, sklearn=5


5.4. Выбрать лучшую схему валидации


In [None]:
feature_cols_cv = feature_list + ['bathrooms', 'bedrooms']
X_cv = train_df[feature_cols_cv].fillna(0).values
y_cv = train_df['interest_level'].values
scaler_cv = StandardScaler()
X_cv_scaled = scaler_cv.fit_transform(X_cv)

idx_map = {idx: i for i, idx in enumerate(train_df.index)}

cv_results = {}
for name, folds in [('K-Fold', our_kfold), ('Stratified K-Fold', our_stratified), ('Time Series Split', our_timeseries)]:
    scores = []
    for train_idx, val_idx in folds:
        if name == 'K-Fold':
            train_idx = np.asarray(train_idx, dtype=int)
            val_idx = np.asarray(val_idx, dtype=int)
        elif name == 'Stratified K-Fold':
            train_idx = np.array([idx_map[idx] for idx in train_idx], dtype=int)
            val_idx = np.array([idx_map[idx] for idx in val_idx], dtype=int)
        else:
            train_idx = np.asarray(train_idx, dtype=int)
            val_idx = np.asarray(val_idx, dtype=int)
        model = Lasso(alpha=0.1, random_state=21)
        model.fit(X_cv_scaled[train_idx], y_cv[train_idx])
        scores.append(mean_squared_error(y_cv[val_idx], model.predict(X_cv_scaled[val_idx])))
    cv_results[name] = {'mean': np.mean(scores), 'std': np.std(scores)}

best_cv = min(cv_results.items(), key=lambda x: x[1]['mean'])
print(f"Лучшая схема: {best_cv[0]} (MSE={best_cv[1]['mean']:.4f}±{best_cv[1]['std']:.4f})")
for name, res in cv_results.items():
    print(f"{name}: MSE={res['mean']:.4f}±{res['std']:.4f}")


Лучшая схема: Time Series Split (MSE=0.3918±0.0085)
K-Fold: MSE=0.3920±0.0067
Stratified K-Fold: MSE=0.3919±0.0003
Time Series Split: MSE=0.3918±0.0085


6. Отбор признаков

6.1. Lasso модель с нормализацией (60/20/20)


In [None]:
feature_cols = feature_list + ['bathrooms', 'bedrooms']
train_split, val_split, test_split = split_data_random_3(train_df, validation_size=0.2, test_size=0.2, random_state=21)

X_train = train_split[feature_cols].fillna(0).values
X_val = val_split[feature_cols].fillna(0).values
X_test = test_split[feature_cols].fillna(0).values
y_train = train_split['interest_level'].values
y_val = val_split['interest_level'].values
y_test = test_split['interest_level'].values

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)
X_test_scaled = scaler.transform(X_test)

lasso = Lasso(alpha=0.1, random_state=21)
lasso.fit(X_train_scaled, y_train)
print(f"MSE: train={mean_squared_error(y_train, lasso.predict(X_train_scaled)):.4f}, val={mean_squared_error(y_val, lasso.predict(X_val_scaled)):.4f}, test={mean_squared_error(y_test, lasso.predict(X_test_scaled)):.4f}")


MSE: train=0.3918, val=0.3900, test=0.3941


6.2. Топ 10 признаков по коэффициентам Lasso


In [None]:
feature_importance = pd.DataFrame({
    'feature': feature_cols,
    'coefficient': np.abs(lasso.coef_)
}).sort_values('coefficient', ascending=False)

top_10_lasso = feature_importance.head(10)['feature'].tolist()
X_train_top10 = X_train_scaled[:, [feature_cols.index(f) for f in top_10_lasso]]
X_val_top10 = X_val_scaled[:, [feature_cols.index(f) for f in top_10_lasso]]
X_test_top10 = X_test_scaled[:, [feature_cols.index(f) for f in top_10_lasso]]

lasso_top10 = Lasso(alpha=0.1, random_state=21)
lasso_top10.fit(X_train_top10, y_train)
print(f"Топ 10: {top_10_lasso[:10]}")
print(f"MSE: val={mean_squared_error(y_val, lasso_top10.predict(X_val_top10)):.4f}, test={mean_squared_error(y_test, lasso_top10.predict(X_test_top10)):.4f}")

Топ 10: ['Elevator', 'HardwoodFloors', 'CatsAllowed', 'DogsAllowed', 'Doorman', 'Dishwasher', 'NoFee', 'LaundryinBuilding', 'FitnessCenter', 'Pre-War']
MSE: val=0.3900, test=0.3941


6.3. Простой отбор по NaN-ratio и корреляции


In [None]:
def simple_feature_selection(df, target_col, feature_cols, top_n=10):
    scores = []
    for feat in feature_cols:
        nan_ratio = df[feat].isna().sum() / len(df)
        nan_score = 1 - nan_ratio
        try:
            corr = abs(df[[feat, target_col]].corr().iloc[0, 1])
            if np.isnan(corr):
                corr = 0
        except:
            corr = 0
        combined_score = 0.3 * nan_score + 0.7 * corr
        scores.append((feat, combined_score))
    scores_df = pd.DataFrame(scores, columns=['feature', 'score']).sort_values('score', ascending=False)
    return scores_df.head(top_n)['feature'].tolist()

top_10_simple = simple_feature_selection(train_df, 'interest_level', feature_cols, top_n=10)
X_train_simple = X_train_scaled[:, [feature_cols.index(f) for f in top_10_simple]]
X_val_simple = X_val_scaled[:, [feature_cols.index(f) for f in top_10_simple]]
X_test_simple = X_test_scaled[:, [feature_cols.index(f) for f in top_10_simple]]

lasso_simple = Lasso(alpha=0.1, random_state=21)
lasso_simple.fit(X_train_simple, y_train)
print(f"Топ 10: {top_10_simple[:10]}")
print(f"MSE: val={mean_squared_error(y_val, lasso_simple.predict(X_val_simple)):.4f}, test={mean_squared_error(y_test, lasso_simple.predict(X_test_simple)):.4f}")


Топ 10: ['Doorman', 'bathrooms', 'Dishwasher', 'Pre-War', 'bedrooms', 'Terrace', 'Balcony', 'Elevator', 'HardwoodFloors', 'DogsAllowed']
MSE: val=0.3900, test=0.3941


6.4. Permutation importance


In [None]:
base_model = Lasso(alpha=0.1, random_state=21)
base_model.fit(X_train_scaled, y_train)

perm_importance = permutation_importance(base_model, X_val_scaled, y_val, n_repeats=10, random_state=21, n_jobs=-1)
perm_df = pd.DataFrame({
    'feature': feature_cols,
    'importance_mean': perm_importance.importances_mean
}).sort_values('importance_mean', ascending=False)

top_10_perm = perm_df.head(10)['feature'].tolist()
X_train_perm = X_train_scaled[:, [feature_cols.index(f) for f in top_10_perm]]
X_val_perm = X_val_scaled[:, [feature_cols.index(f) for f in top_10_perm]]
X_test_perm = X_test_scaled[:, [feature_cols.index(f) for f in top_10_perm]]

lasso_perm = Lasso(alpha=0.1, random_state=21)
lasso_perm.fit(X_train_perm, y_train)
print(f"Топ 10: {top_10_perm[:10]}")
print(f"MSE: val={mean_squared_error(y_val, lasso_perm.predict(X_val_perm)):.4f}, test={mean_squared_error(y_test, lasso_perm.predict(X_test_perm)):.4f}")


Топ 10: ['Elevator', 'HardwoodFloors', 'CatsAllowed', 'DogsAllowed', 'Doorman', 'Dishwasher', 'NoFee', 'LaundryinBuilding', 'FitnessCenter', 'Pre-War']
MSE: val=0.3900, test=0.3941


6.5. SHAP


In [None]:
explainer = shap.LinearExplainer(lasso, X_train_scaled)
shap_values = explainer.shap_values(X_val_scaled[:100])
shap_importance = np.abs(shap_values).mean(axis=0)
shap_df = pd.DataFrame({
    'feature': feature_cols,
    'shap_importance': shap_importance
}).sort_values('shap_importance', ascending=False)

top_10_shap = shap_df.head(10)['feature'].tolist()
X_train_shap = X_train_scaled[:, [feature_cols.index(f) for f in top_10_shap]]
X_val_shap = X_val_scaled[:, [feature_cols.index(f) for f in top_10_shap]]
X_test_shap = X_test_scaled[:, [feature_cols.index(f) for f in top_10_shap]]

lasso_shap = Lasso(alpha=0.1, random_state=21)
lasso_shap.fit(X_train_shap, y_train)
print(f"Топ 10: {top_10_shap[:10]}")
print(f"MSE: val={mean_squared_error(y_val, lasso_shap.predict(X_val_shap)):.4f}, test={mean_squared_error(y_test, lasso_shap.predict(X_test_shap)):.4f}")


Топ 10: ['Elevator', 'HardwoodFloors', 'CatsAllowed', 'DogsAllowed', 'Doorman', 'Dishwasher', 'NoFee', 'LaundryinBuilding', 'FitnessCenter', 'Pre-War']
MSE: val=0.3900, test=0.3941


6.6. Сравнить методы отбора признаков


In [None]:
comparison = pd.DataFrame({
    'Метод': ['Lasso coefficients', 'Simple selection', 'Permutation', 'SHAP'],
    'Val MSE': [
        mean_squared_error(y_val, lasso_top10.predict(X_val_top10)),
        mean_squared_error(y_val, lasso_simple.predict(X_val_simple)),
        mean_squared_error(y_val, lasso_perm.predict(X_val_perm)),
        mean_squared_error(y_val, lasso_shap.predict(X_val_shap))
    ],
    'Test MSE': [
        mean_squared_error(y_test, lasso_top10.predict(X_test_top10)),
        mean_squared_error(y_test, lasso_simple.predict(X_test_simple)),
        mean_squared_error(y_test, lasso_perm.predict(X_test_perm)),
        mean_squared_error(y_test, lasso_shap.predict(X_test_shap))
    ]
})
print(comparison.to_string(index=False))
best_method = comparison.loc[comparison['Test MSE'].idxmin(), 'Метод']
print(f"\nЛучший метод: {best_method} (Test MSE={comparison['Test MSE'].min():.4f})")


             Метод  Val MSE  Test MSE
Lasso coefficients 0.389953  0.394122
  Simple selection 0.389953  0.394122
       Permutation 0.389953  0.394122
              SHAP 0.389953  0.394122

Лучший метод: Lasso coefficients (Test MSE=0.3941)


7. Оптимизация гиперпараметров

7.1. Grid Search и Random Search


In [None]:
def grid_search_elasticnet(X_train, y_train, X_val, y_val, alpha_grid, l1_ratio_grid):
    best_score = float('inf')
    best_params = None
    for alpha in alpha_grid:
        for l1_ratio in l1_ratio_grid:
            model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=21, max_iter=1000)
            model.fit(X_train, y_train)
            val_mse = mean_squared_error(y_val, model.predict(X_val))
            if val_mse < best_score:
                best_score = val_mse
                best_params = {'alpha': alpha, 'l1_ratio': l1_ratio}
    return best_params, best_score

def random_search_elasticnet(X_train, y_train, X_val, y_val, alpha_range, l1_ratio_range, n_iter=25):
    best_score = float('inf')
    best_params = None
    np.random.seed(21)
    for _ in range(n_iter):
        alpha = np.random.choice(alpha_range)
        l1_ratio = np.random.choice(l1_ratio_range)
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=21, max_iter=1000)
        model.fit(X_train, y_train)
        val_mse = mean_squared_error(y_val, model.predict(X_val))
        if val_mse < best_score:
            best_score = val_mse
            best_params = {'alpha': alpha, 'l1_ratio': l1_ratio}
    return best_params, best_score

alpha_grid = [0.001, 0.01, 0.1, 1.0, 10.0]
l1_ratio_grid = [0.1, 0.3, 0.5, 0.7, 0.9]
grid_params, grid_score = grid_search_elasticnet(X_train_scaled, y_train, X_val_scaled, y_val, alpha_grid, l1_ratio_grid)
random_params, random_score = random_search_elasticnet(X_train_scaled, y_train, X_val_scaled, y_val, alpha_grid, l1_ratio_grid, n_iter=25)
print(f"Grid Search: {grid_params}, MSE={grid_score:.4f}")
print(f"Random Search: {random_params}, MSE={random_score:.4f}")


Grid Search: {'alpha': 0.001, 'l1_ratio': 0.1}, MSE=0.3791
Random Search: {'alpha': np.float64(0.001), 'l1_ratio': np.float64(0.1)}, MSE=0.3791


7.2. Optuna


In [None]:
def objective(trial):
    alpha = trial.suggest_float('alpha', 0.001, 10.0, log=True)
    l1_ratio = trial.suggest_float('l1_ratio', 0.1, 0.9)
    model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=21, max_iter=1000)
    model.fit(X_train_scaled, y_train)
    return mean_squared_error(y_val, model.predict(X_val_scaled))

study = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=21))
study.optimize(objective, n_trials=50, show_progress_bar=False)
print(f"Optuna: {study.best_params}, MSE={study.best_value:.4f}")


Optuna: {'alpha': 0.0021097123443160645, 'l1_ratio': 0.14654038301385044}, MSE=0.3791


7.3. Optuna с кросс-валидацией


In [23]:
def objective_cv(trial):
    alpha = trial.suggest_float('alpha', 0.001, 10.0, log=True)
    l1_ratio = trial.suggest_float('l1_ratio', 0.1, 0.9)
    cv_scores = []
    folds = k_fold_cv(len(train_df), k=5, random_state=21)
    for train_idx, val_idx in folds:
        X_train_fold = train_df.iloc[train_idx][feature_cols].fillna(0).values
        X_val_fold = train_df.iloc[val_idx][feature_cols].fillna(0).values
        y_train_fold = train_df.iloc[train_idx]['interest_level'].values
        y_val_fold = train_df.iloc[val_idx]['interest_level'].values
        scaler_fold = StandardScaler()
        X_train_fold_scaled = scaler_fold.fit_transform(X_train_fold)
        X_val_fold_scaled = scaler_fold.transform(X_val_fold)
        model = ElasticNet(alpha=alpha, l1_ratio=l1_ratio, random_state=21, max_iter=1000)
        model.fit(X_train_fold_scaled, y_train_fold)
        cv_scores.append(mean_squared_error(y_val_fold, model.predict(X_val_fold_scaled)))
    return np.mean(cv_scores)

study_cv = optuna.create_study(direction='minimize', sampler=optuna.samplers.TPESampler(seed=21))
study_cv.optimize(objective_cv, n_trials=30, show_progress_bar=False)
print(f"Optuna с CV: {study_cv.best_params}, CV MSE={study_cv.best_value:.4f}")


Optuna с CV: {'alpha': 0.0011333377686672955, 'l1_ratio': 0.21910664136704278}, CV MSE=0.3806


7.4. Сравнение методов оптимизации


In [None]:
grid_iter = len(alpha_grid) * len(l1_ratio_grid)
random_iter = 25

models = {
    'Grid Search': (grid_params, grid_score),
    'Random Search': (random_params, random_score),
    'Optuna': (study.best_params, study.best_value),
    'Optuna с CV': (study_cv.best_params, study_cv.best_value)
}

results = []
for name, (params, val_score) in models.items():
    m = ElasticNet(**params, random_state=21, max_iter=1000)
    m.fit(X_train_scaled, y_train)
    test_metrics = calculate_all_metrics(y_test, m.predict(X_test_scaled))
    results.append({
        'Метод': name,
        'Итераций': grid_iter if name == 'Grid Search' else (random_iter if name == 'Random Search' else (len(study.trials) if name == 'Optuna' else len(study_cv.trials))),
        'Val RMSE': np.sqrt(val_score),
        'Test RMSE': test_metrics['RMSE'],
        'Test MAE': test_metrics['MAE'],
        'Test R²': test_metrics['R2']
    })

opt_comparison = pd.DataFrame(results)
print(opt_comparison.to_string(index=False))
best_opt = opt_comparison.loc[opt_comparison['Test RMSE'].idxmin(), 'Метод']
print(f"\nЛучший метод: {best_opt} (Test RMSE={opt_comparison['Test RMSE'].min():.4f})")
print(f"\nВывод: Optuna нашла оптимальные параметры за {len(study.trials)} итераций против {grid_iter} у Grid Search.")

        Метод  Итераций  Val RMSE  Test RMSE  Test MAE  Test R²
  Grid Search        25  0.615688   0.618884  0.516090 0.028070
Random Search        25  0.615688   0.618884  0.516090 0.028070
       Optuna        50  0.615687   0.618884  0.516207 0.028071
  Optuna с CV        30  0.616907   0.618885  0.516157 0.028069

Лучший метод: Optuna (Test RMSE=0.6189)

Вывод: Optuna нашла оптимальные параметры за 50 итераций против 25 у Grid Search.
