In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error
from xgboost import XGBRegressor
import warnings
import joblib

# Suppress every warning category for the rest of the session so that library
# warnings do not clutter cell output.
# NOTE(review): this also hides deprecation and numerical warnings — consider
# narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

In [2]:
# Load the training data; the bare last expression renders a preview of the
# first rows.
# NOTE(review): the file's first column has no header, so pandas names it
# 'Unnamed: 0' and keeps it as a regular column of df.
df = pd.read_csv('diamonds_train.csv')
df.head()

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.51,Good,D,SI2,63.9,55.0,1180,5.04,5.1,3.24
1,0.72,Ideal,E,VS2,60.8,57.0,3091,5.79,5.82,3.53
2,0.7,Very Good,D,VVS2,62.8,60.0,4022,5.65,5.69,3.56
3,0.36,Ideal,D,SI1,61.2,57.0,663,4.59,4.63,2.82
4,0.54,Very Good,D,SI1,60.0,59.8,1593,5.3,5.34,3.18


**Предварительная обработка данных**

In [3]:
# Count nulls per column, then show only the affected columns, or a short
# message when the frame has no nulls at all.
missing = df.isna().sum()
if missing.sum() > 0:
    display(missing[missing > 0])
else:
    display("Пропущенных значений нет")


'Пропущенных значений нет'

In [4]:
# Print the sorted distinct values of each categorical column that is present
# in the frame.
categorical_cols = ['cut', 'color', 'clarity']
for col in categorical_cols:
    if col not in df.columns:
        continue
    print(f"{col}: {sorted(df[col].unique())}")

cut: ['Fair', 'Good', 'Ideal', 'Premium', 'Very Good']
color: ['D', 'E', 'F', 'G', 'H', 'I', 'J']
clarity: ['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2']


In [5]:
# Columns that must never be fed to the model: the target itself plus
# row-identifier columns. 'Unnamed: 0' is the unnamed index column that
# read_csv keeps from diamonds_train.csv. It is not present in the inputs built
# from diamonds_test.csv (which carries 'id' instead, dropped before
# predicting), so treating it as a feature would let the model learn from row
# order and would make the pipeline expect a column the test file lacks.
non_feature_cols = ['price', 'Unnamed: 0', 'id']
numeric_features = [
    col for col in df.columns
    if col not in categorical_cols and col not in non_feature_cols
]
categorical_features = categorical_cols

**Подготовка признаков**

In [6]:
# Standardise the numeric columns and one-hot encode the categorical ones;
# categories unseen during fit are ignored instead of raising
# (handle_unknown='ignore').
preprocessor = ColumnTransformer(transformers=[
    ('num', StandardScaler(), numeric_features),
    ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
])

In [7]:
# Chain preprocessing and the gradient-boosted regressor so both are fitted
# and applied together.
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', XGBRegressor(random_state=42, objective='reg:squarederror')),
])

**Разделение на train/test**

In [8]:
# Separate the target from the predictors.
y = df['price']
X = df.drop(columns='price')

# Hold out 20% of the rows (shuffled with a fixed seed) for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=42
)

# Report the size of each split and its share of the full data set.
print(f"Train: {X_train.shape[0]:,} образцов ({X_train.shape[0]/len(X)*100:.1f}%)")
print(f"Test: {X_test.shape[0]:,} образцов ({X_test.shape[0]/len(X)*100:.1f}%)")

Train: 34,414 образцов (80.0%)
Test: 8,604 образцов (20.0%)


**Подбор гиперпараметров, обучение модели**

In [9]:
# Candidate hyperparameter values for the search. The 'regressor__' prefix
# routes each setting to the pipeline step named 'regressor'.
param_dist = dict(
    regressor__n_estimators=[100, 200, 300],
    regressor__max_depth=[3, 5, 7],
    regressor__learning_rate=[0.01, 0.1, 0.2],
)

In [10]:
# Randomized search: 10 sampled parameter combinations, each scored by
# 5-fold cross-validated R², with a fixed seed for the sampling.
search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    scoring='r2',
    cv=5,
    n_iter=10,
    random_state=42,
)

In [11]:
# Run the hyperparameter search on the training split only; the held-out
# test split is not passed to the search.
search.fit(X_train, y_train)

In [12]:
# Keep the best pipeline found by the search.
# NOTE(review): presumably refit on the whole training split (the search's
# default refit behaviour) — confirm.
best_model = search.best_estimator_


In [13]:
# Predict on both splits so that train and test metrics can be compared.
y_train_pred, y_test_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

In [14]:
# Quality metrics: R² and RMSE on the train and test splits.
train_r2, test_r2 = r2_score(y_train, y_train_pred), r2_score(y_test, y_test_pred)
train_rmse, test_rmse = (
    np.sqrt(mean_squared_error(actual, predicted))
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)

print("МЕТРИКИ КАЧЕСТВА:")
print(f"R² Score - Train: {train_r2:.4f}, Test: {test_r2:.4f}")
print(f"RMSE - Train: ${train_rmse:,.2f}, Test: ${test_rmse:,.2f}")

МЕТРИКИ КАЧЕСТВА:
R² Score - Train: 0.9918, Test: 0.9798
RMSE - Train: $360.37, Test: $571.37


In [15]:
# 5-fold cross-validated R² of the selected pipeline on the training split;
# the reported spread is two standard deviations of the fold scores.
cv_scores = cross_val_score(
    best_model, X_train, y_train, scoring='r2', cv=5
)
print(f"Кросс-валидация R² (5-fold): {cv_scores.mean():.4f} (+/- {cv_scores.std() * 2:.4f})")

Кросс-валидация R² (5-fold): 0.9810 (+/- 0.0022)


In [16]:
# Gap between train and test R²; a larger positive value means the model fits
# the training split better than the held-out split.
overfitting = train_r2 - test_r2
print(f"Переобучение (разница R²): {overfitting:.4f}")

Переобучение (разница R²): 0.0120


**Финальная модель для соревнований**

In [17]:
# Save the model: persist the fitted pipeline (preprocessing + regressor) to disk.
joblib.dump(best_model, 'best_model.pkl')

['best_model.pkl']

In [18]:
# Load the competition test data set and print a plain-text preview of its
# first rows.
df_test = pd.read_csv('diamonds_test.csv')
print(df_test.head())

   id  carat        cut color clarity  depth  table     x     y     z
0   0   1.02       Good     F     SI2   59.2   58.0  6.51  6.56  3.87
1   1   0.70  Very Good     I    VVS1   59.5   58.0  5.78  5.81  3.45
2   2   0.32  Very Good     H    VVS2   63.4   56.0  4.37  4.34  2.76
3   3   0.42      Ideal     F    VVS2   62.2   56.0  4.79  4.82  2.99
4   4   0.40      Ideal     F     VS2   62.3   54.0  4.74  4.77  2.96


In [19]:
# Keep the ids for the submission file; they identify rows and are not model
# inputs.
test_ids = df_test['id'].copy()

# Predict prices from every remaining column of the test file.
X_test_final = df_test.drop(columns='id', errors='ignore')
predictions = best_model.predict(X_test_final)

In [23]:
# Pair each test id with its predicted price and write the submission file
# without the DataFrame index.
df_result = test_ids.to_frame(name='id').assign(price=predictions)
df_result.to_csv('submission.csv', index=False)

In [22]:
# Read the written file back and summarise it as a sanity check on the row
# count, column names and dtypes.
df_sub = pd.read_csv('submission.csv')
df_sub.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5379 entries, 0 to 5378
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   id      5379 non-null   int64  
 1   price   5379 non-null   float64
dtypes: float64(1), int64(1)
memory usage: 84.2 KB
