In [3]:
!pip install xgboost lightgbm --quiet

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
import warnings
warnings.filterwarnings('ignore')

In [4]:
# Загрузка данных
train_df = pd.read_csv('diamonds_train.csv')
test_df = pd.read_csv('diamonds_test.csv')

print("Train shape:", train_df.shape)
print("Test shape:", test_df.shape)

Train shape: (43018, 10)
Test shape: (5379, 10)


In [5]:
def enhanced_preprocess_data(df, is_train=True, density_median=None):
    """Улучшенная функция предобработки данных"""
    df_processed = df.copy()

    # Создание новых признаков
    df_processed['volume'] = df_processed['x'] * df_processed['y'] * df_processed['z']
    df_processed['density'] = df_processed['carat'] / (df_processed['volume'] + 1e-8)  # Защита от деления на 0

    # Обработка бесконечных значений в density
    df_processed['density'] = df_processed['density'].replace([np.inf, -np.inf], np.nan)

    if is_train:
        density_median = df_processed['density'].median()
        df_processed['density'] = df_processed['density'].fillna(density_median)
    else:
        df_processed['density'] = df_processed['density'].fillna(density_median)

    # Дополнительные признаки
    df_processed['table_depth_ratio'] = df_processed['table'] / df_processed['depth']
    df_processed['carat_volume_ratio'] = df_processed['carat'] / (df_processed['volume'] + 1e-8)
    df_processed['size_index'] = (df_processed['x'] + df_processed['y'] + df_processed['z']) / 3
    df_processed['symmetry'] = (df_processed['x'] / df_processed['y']).abs()

    # Порядковая кодировка категориальных признаков
    cut_order = {'Fair': 1, 'Good': 2, 'Very Good': 3, 'Premium': 4, 'Ideal': 5}
    color_order = {'J': 1, 'I': 2, 'H': 3, 'G': 4, 'F': 5, 'E': 6, 'D': 7}
    clarity_order = {'I1': 1, 'SI2': 2, 'SI1': 3, 'VS2': 4, 'VS1': 5, 'VVS2': 6, 'VVS1': 7, 'IF': 8}

    df_processed['cut_encoded'] = df_processed['cut'].map(cut_order)
    df_processed['color_encoded'] = df_processed['color'].map(color_order)
    df_processed['clarity_encoded'] = df_processed['clarity'].map(clarity_order)

    # Удаление исходных категориальных колонок и ID
    columns_to_drop = ['id', 'cut', 'color', 'clarity']
    for col in columns_to_drop:
        if col in df_processed.columns:
            df_processed = df_processed.drop(col, axis=1)

    return df_processed, density_median

# Применяем улучшенную предобработку
train_processed, density_median = enhanced_preprocess_data(train_df, is_train=True)
test_processed, _ = enhanced_preprocess_data(test_df, is_train=False, density_median=density_median)

print("Processed train shape:", train_processed.shape)
print("Processed test shape:", test_processed.shape)


Processed train shape: (43018, 16)
Processed test shape: (5379, 15)


In [6]:
# Разделение на признаки и целевую переменную
X = train_processed.drop('price', axis=1)
y = train_processed['price']

# Разделение на train/validation
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

print(f"Train size: {X_train.shape}, Validation size: {X_val.shape}")

# Расширенный список моделей для тестирования
models = {
    'Linear Regression': LinearRegression(),
    'Ridge': Ridge(alpha=1.0),
    'Lasso': Lasso(alpha=0.1),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42),
    'XGBoost': XGBRegressor(n_estimators=100, random_state=42),
    'LightGBM': LGBMRegressor(n_estimators=100, random_state=42)
}

# Масштабирование для линейных моделей
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_val_scaled = scaler.transform(X_val)

results = {}

print("=" * 60)
print("СРАВНЕНИЕ МОДЕЛЕЙ")
print("=" * 60)

Train size: (34414, 15), Validation size: (8604, 15)
СРАВНЕНИЕ МОДЕЛЕЙ


In [9]:
for name, model in models.items():
    if name in ['Linear Regression', 'Ridge', 'Lasso']:
        # Линейные модели используют масштабированные данные
        model.fit(X_train_scaled, y_train)
        y_pred = model.predict(X_val_scaled)
    else:
        # Деревья используют исходные данные
        model.fit(X_train, y_train)
        y_pred = model.predict(X_val)

    mse = mean_squared_error(y_val, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_val, y_pred)
    mae = mean_absolute_error(y_val, y_pred)

    results[name] = {
        'RMSE': rmse,
        'R2': r2,
        'MAE': mae
    }

    print(f"{name:20} | R²: {r2:.4f} | RMSE: ${rmse:.2f} | MAE: ${mae:.2f}")

results_df = pd.DataFrame(results).T
results_df = results_df.sort_values('RMSE')
print("\n ЛУЧШИЕ МОДЕЛИ:")
print(results_df.head())

Linear Regression    | R²: -10.0785 | RMSE: $13380.37 | MAE: $957.09
Ridge                | R²: -4.7499 | RMSE: $9639.52 | MAE: $904.94
Lasso                | R²: 0.7404 | RMSE: $2048.24 | MAE: $808.34
Random Forest        | R²: 0.9785 | RMSE: $588.76 | MAE: $275.11
Gradient Boosting    | R²: 0.9752 | RMSE: $633.47 | MAE: $339.46
XGBoost              | R²: 0.9814 | RMSE: $548.54 | MAE: $277.42
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003871 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2792
[LightGBM] [Info] Number of data points in the train set: 34414, number of used features: 15
[LightGBM] [Info] Start training from score 3916.755071
LightGBM             | R²: 0.9814 | RMSE: $548.72 | MAE: $275.26

 ЛУЧШИЕ МОДЕЛИ:
                          RMSE        R2         MAE
XGBoost             548.542643  0.981381  277.418518
LightGBM            548.722108  0.981368  275.263926
Random Forest    

In [15]:
# Выбор и оптимизация лучшей модели
best_model_name = results_df.index[0]
print(f"\n Оптимизируем лучшую модель: {best_model_name}")

if best_model_name == 'Random Forest':
    final_model = RandomForestRegressor(
        n_estimators=200,
        max_depth=15,
        min_samples_split=5,
        random_state=42,
        n_jobs=-1
    )
elif best_model_name == 'XGBoost':
    final_model = XGBRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    )
elif best_model_name == 'LightGBM':
    final_model = LGBMRegressor(
        n_estimators=200,
        max_depth=6,
        learning_rate=0.1,
        random_state=42
    )
else:
    final_model = models[best_model_name]

# Финальное обучение на всех данных
print(f"\n Обучаем финальную модель на всех данных...")
if best_model_name in ['Linear Regression', 'Ridge', 'Lasso']:
    X_full_scaled = scaler.fit_transform(X)
    final_model.fit(X_full_scaled, y)
else:
    final_model.fit(X, y)


 Оптимизируем лучшую модель: XGBoost

 Обучаем финальную модель на всех данных...


In [16]:
# Предсказание на тестовых данных
print(" Делаем предсказания на тестовых данных...")
if best_model_name in ['Linear Regression', 'Ridge', 'Lasso']:
    test_scaled = scaler.transform(test_processed)
    test_predictions = final_model.predict(test_scaled)
else:
    test_predictions = final_model.predict(test_processed)

# Создание submission файла
submission = pd.DataFrame({
    'id': test_df['id'],
    'price': test_predictions
})

# Проверка предсказаний
print("\n Статистика предсказаний:")
print(f"Min price: ${submission['price'].min():.2f}")
print(f"Max price: ${submission['price'].max():.2f}")
print(f"Mean price: ${submission['price'].mean():.2f}")
print(f"Median price: ${submission['price'].median():.2f}")

# Сохранение результатов
submission_file = 'improved_diamond_predictions.csv'
submission.to_csv(submission_file, index=False)
print(f"\n Файл {submission_file} успешно сохранен!")

# Дополнительная проверка качества
if best_model_name in ['Linear Regression', 'Ridge', 'Lasso']:
    y_val_pred = final_model.predict(X_val_scaled)
else:
    y_val_pred = final_model.predict(X_val)

final_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
final_r2 = r2_score(y_val, y_val_pred)
print(f"\n Финальное качество на валидации:")
print(f"RMSE: ${final_rmse:.2f}")
print(f"R²: {final_r2:.4f}")

 Делаем предсказания на тестовых данных...

 Статистика предсказаний:
Min price: $324.53
Max price: $18299.49
Mean price: $3933.04
Median price: $2441.09

 Файл improved_diamond_predictions.csv успешно сохранен!

 Финальное качество на валидации:
RMSE: $377.80
R²: 0.9912
