In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, SGDRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
import warnings
import joblib
import time
warnings.filterwarnings('ignore')

In [2]:
# –ó–∞–≥—Ä—É–∑–∫–∞ –∏ –ø–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –¥–∞–Ω–Ω—ã—Ö —Å —É–ª—É—á—à–µ–Ω–Ω–æ–π –æ–±—Ä–∞–±–æ—Ç–∫–æ–π
try:
    df = pd.read_csv('diamonds_train.csv')
except FileNotFoundError:
    print("–û—à–∏–±–∫–∞: –§–∞–π–ª 'diamonds_train.csv' –Ω–µ –Ω–∞–π–¥–µ–Ω")
    exit()

# –ü—Ä–æ–≤–µ—Ä—è–µ–º –Ω–∞–ª–∏—á–∏–µ –Ω–µ–æ–±—Ö–æ–¥–∏–º—ã—Ö –∫–æ–ª–æ–Ω–æ–∫
required_columns = ['carat', 'cut', 'color', 'clarity', 'depth', 'table', 'x', 'y', 'z', 'price']
missing_columns = [col for col in required_columns if col not in df.columns]
if missing_columns:
    print(f"–û—à–∏–±–∫–∞: –û—Ç—Å—É—Ç—Å—Ç–≤—É—é—Ç –∫–æ–ª–æ–Ω–∫–∏: {missing_columns}")
    exit()

# –û–±—Ä–∞–±–æ—Ç–∫–∞ –Ω—É–ª–µ–≤—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π
for col in ['x', 'y', 'z']:
    zero_count = (df[col] == 0).sum()
    print(f"–ù—É–ª–µ–≤—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –≤ {col}: {zero_count}")
    if zero_count > 0:
        median_val = df[df[col] > 0][col].median()
        df.loc[df[col] == 0, col] = median_val

# –û–±—Ä–∞–±–æ—Ç–∫–∞ –≤—ã–±—Ä–æ—Å–æ–≤
def handle_outliers(df, column):
    Q1 = df[column].quantile(0.25)
    Q3 = df[column].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    outliers = ((df[column] < lower_bound) | (df[column] > upper_bound)).sum()
    print(f"–í—ã–±—Ä–æ—Å—ã –≤ {column}: {outliers}")
    df[column] = np.clip(df[column], lower_bound, upper_bound)
    return df

for col in ['x', 'y', 'z']:
    df = handle_outliers(df, col)

# –ò–Ω–∂–µ–Ω–µ—Ä–∏—è –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
df['volume'] = df['x'] * df['y'] * df['z']
df['density'] = df['carat'] / (df['volume'] + 1e-8)
df['surface_area'] = 2 * (df['x']*df['y'] + df['x']*df['z'] + df['y']*df['z'])
df['table_depth_ratio'] = df['table'] / (df['depth'] + 1e-8)
df['carat_squared'] = df['carat'] ** 2
df['carat_volume_interaction'] = df['carat'] * df['volume']

# –û–±—Ä–∞–±–æ—Ç–∫–∞ –±–µ—Å–∫–æ–Ω–µ—á–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π
df = df.replace([np.inf, -np.inf], np.nan)
numeric_cols = df.select_dtypes(include=[np.number]).columns
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].median())

print(f"–°–æ–∑–¥–∞–Ω–æ 6 –Ω–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")

# –ö–æ–¥–∏—Ä–æ–≤–∞–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö
cut_order = {'Fair': 0, 'Good': 1, 'Very Good': 2, 'Premium': 3, 'Ideal': 4}
color_order = {'J': 0, 'I': 1, 'H': 2, 'G': 3, 'F': 4, 'E': 5, 'D': 6}
clarity_order = {'I1': 0, 'SI2': 1, 'SI1': 2, 'VS2': 3, 'VS1': 4, 'VVS2': 5, 'VVS1': 6, 'IF': 7}

df['cut_encoded'] = df['cut'].map(cut_order)
df['color_encoded'] = df['color'].map(color_order)
df['clarity_encoded'] = df['clarity'].map(clarity_order)

# –ü—Ä–æ–≤–µ—Ä—è–µ–º —É—Å–ø–µ—à–Ω–æ—Å—Ç—å –∫–æ–¥–∏—Ä–æ–≤–∞–Ω–∏—è
if df['cut_encoded'].isnull().any() or df['color_encoded'].isnull().any() or df['clarity_encoded'].isnull().any():
    print("–ü—Ä–µ–¥—É–ø—Ä–µ–∂–¥–µ–Ω–∏–µ: –û–±–Ω–∞—Ä—É–∂–µ–Ω—ã –ø—Ä–æ–ø—É—Å–∫–∏ –ø–æ—Å–ª–µ –∫–æ–¥–∏—Ä–æ–≤–∞–Ω–∏—è –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö")
    df = df.fillna(method='ffill')

# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
base_features = ['carat', 'depth', 'table', 'x', 'y', 'z', 'cut_encoded', 'color_encoded', 'clarity_encoded']
engineered_features = ['volume', 'density', 'surface_area', 'table_depth_ratio', 'carat_squared', 'carat_volume_interaction']
all_features = base_features + engineered_features

# –ü—Ä–æ–≤–µ—Ä—è–µ–º –Ω–∞–ª–∏—á–∏–µ –≤—Å–µ—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
missing_features = [f for f in all_features if f not in df.columns]
if missing_features:
    print(f"–û—à–∏–±–∫–∞: –û—Ç—Å—É—Ç—Å—Ç–≤—É—é—Ç –ø—Ä–∏–∑–Ω–∞–∫–∏: {missing_features}")
    exit()

X = df[all_features]
y = df['price']

# –ü—Ä–æ–≤–µ—Ä—è–µ–º —Ü–µ–ª–æ—Å—Ç–Ω–æ—Å—Ç—å –¥–∞–Ω–Ω—ã—Ö
if X.isnull().any().any() or y.isnull().any():
    print("–ü—Ä–µ–¥—É–ø—Ä–µ–∂–¥–µ–Ω–∏–µ: –û–±–Ω–∞—Ä—É–∂–µ–Ω—ã –ø—Ä–æ–ø—É—Å–∫–∏ –≤ –¥–∞–Ω–Ω—ã—Ö. –ó–∞–ø–æ–ª–Ω—è–µ–º –º–µ–¥–∏–∞–Ω–∞–º–∏.")
    X = X.fillna(X.median())
    y = y.fillna(y.median())

# –†–∞–∑–¥–µ–ª–µ–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

–ù—É–ª–µ–≤—ã–µ –∑–Ω–∞—á–µ–Ω–∏—è –≤ —Ä–∞–∑–º–µ—Ä–∞—Ö: x=20, y=17, z=20
–í—ã–±—Ä–æ—Å—ã –≤ —Ä–∞–∑–º–µ—Ä–∞—Ö: x=89, y=93, z=89
–°–æ–∑–¥–∞–Ω–æ 6 –Ω–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤


# –°—Ä–∞–≤–Ω–µ–Ω–∏–µ —Ä–∞–∑–ª–∏—á–Ω—ã—Ö –∞–ª–≥–æ—Ä–∏—Ç–º–æ–≤ ML

In [3]:
# –û–ø—Ä–µ–¥–µ–ª—è–µ–º –º–æ–¥–µ–ª–∏ –¥–ª—è —Å—Ä–∞–≤–Ω–µ–Ω–∏—è —Å —É–ª—É—á—à–µ–Ω–Ω—ã–º–∏ –ø–∞—Ä–∞–º–µ—Ç—Ä–∞–º–∏
models = {
    'Linear Regression': LinearRegression(),
    'Stochastic Gradient Descent': SGDRegressor(max_iter=1000, tol=1e-3, random_state=42, early_stopping=True, learning_rate='adaptive'),
    'Decision Tree': DecisionTreeRegressor(random_state=42, max_depth=15, min_samples_split=10),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1, max_depth=15),
    'Gradient Boosting': GradientBoostingRegressor(n_estimators=100, random_state=42, max_depth=6, learning_rate=0.1)
}

In [4]:
# –ú–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ –¥–∞–Ω–Ω—ã—Ö
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

results = []

print("=== –°–†–ê–í–ù–ï–ù–ò–ï –ê–õ–ì–û–†–ò–¢–ú–û–í –ú–ê–®–ò–ù–ù–û–ì–û –û–ë–£–ß–ï–ù–ò–Ø ===\n")

for name, model in models.items():
    try:
        start_time = time.time()
        
        # –û–±—É—á–µ–Ω–∏–µ –º–æ–¥–µ–ª–∏
        model.fit(X_train_scaled, y_train)
        
        # –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è
        y_pred = model.predict(X_test_scaled)
        
        # –ú–µ—Ç—Ä–∏–∫–∏
        r2 = r2_score(y_test, y_pred)
        rmse = np.sqrt(mean_squared_error(y_test, y_pred))
        mae = mean_absolute_error(y_test, y_pred)
        
        # –ö—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏—è —Å –æ–±—Ä–∞–±–æ—Ç–∫–æ–π –æ—à–∏–±–æ–∫
        try:
            cv_scores = cross_val_score(model, X_train_scaled, y_train, cv=5, scoring='r2')
            cv_mean = cv_scores.mean()
            cv_std = cv_scores.std()
        except Exception as e:
            print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∫—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏–∏ –¥–ª—è {name}: {e}")
            cv_mean = 0
            cv_std = 0
        
        training_time = time.time() - start_time
        
        # –°–æ—Ö—Ä–∞–Ω—è–µ–º —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã
        results.append({
            'Model': name,
            'R2': r2,
            'RMSE': rmse,
            'MAE': mae,
            'CV_Mean': cv_mean,
            'CV_Std': cv_std,
            'Time': training_time
        })
        
        print(f"{name}:")
        print(f"  –í—Ä–µ–º—è –æ–±—É—á–µ–Ω–∏—è: {training_time:.2f}s")
        print(f"  R¬≤: {r2:.4f}, RMSE: ${rmse:,.2f}, MAE: ${mae:,.2f}")
        print(f"  –ö—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏—è: {cv_mean:.4f} ¬± {cv_std * 2:.4f}\n")
        
    except Exception as e:
        print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏ –º–æ–¥–µ–ª–∏ {name}: {e}")
        continue

if not results:
    print("–ö—Ä–∏—Ç–∏—á–µ—Å–∫–∞—è –æ—à–∏–±–∫–∞: –ù–∏ –æ–¥–Ω–∞ –º–æ–¥–µ–ª—å –Ω–µ –±—ã–ª–∞ —É—Å–ø–µ—à–Ω–æ –æ–±—É—á–µ–Ω–∞")
    exit()

results_df = pd.DataFrame(results)

=== –°–†–ê–í–ù–ï–ù–ò–ï –ê–õ–ì–û–†–ò–¢–ú–û–í –ú–ê–®–ò–ù–ù–û–ì–û –û–ë–£–ß–ï–ù–ò–Ø ===

Linear Regression:
  –í—Ä–µ–º—è –æ–±—É—á–µ–Ω–∏—è: 0.07s
  R¬≤: 0.9176, RMSE: $1,110.61, MAE: $738.15
  –ö—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏—è: 0.9171 ¬± 0.0048

Stochastic Gradient Descent:
  –í—Ä–µ–º—è –æ–±—É—á–µ–Ω–∏—è: 0.11s
  R¬≤: 0.9166, RMSE: $1,115.48, MAE: $742.08
  –ö—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏—è: 0.9161 ¬± 0.0048

Decision Tree:
  –í—Ä–µ–º—è –æ–±—É—á–µ–Ω–∏—è: 0.25s
  R¬≤: 0.9663, RMSE: $706.33, MAE: $378.84
  –ö—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏—è: 0.9593 ¬± 0.0035

Random Forest:
  –í—Ä–µ–º—è –æ–±—É—á–µ–Ω–∏—è: 7.52s
  R¬≤: 0.9818, RMSE: $483.30, MAE: $266.69
  –ö—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏—è: 0.9792 ¬± 0.0019

Gradient Boosting:
  –í—Ä–µ–º—è –æ–±—É—á–µ–Ω–∏—è: 5.41s
  R¬≤: 0.9788, RMSE: $520.69, MAE: $297.66
  –ö—Ä–æ—Å—Å-–≤–∞–ª–∏–¥–∞—Ü–∏—è: 0.9769 ¬± 0.0021



# –ê–Ω–∞–ª–∏–∑ —Ä–µ–∑—É–ª—å—Ç–∞—Ç–æ–≤

In [5]:
# –ê–Ω–∞–ª–∏–∑ –ª—É—á—à–∏—Ö –º–æ–¥–µ–ª–µ–π
if not results_df.empty:
    best_r2 = results_df.loc[results_df['R2'].idxmax()]
    best_rmse = results_df.loc[results_df['RMSE'].idxmin()]
    best_mae = results_df.loc[results_df['MAE'].idxmin()]
    fastest = results_df.loc[results_df['Time'].idxmin()]

    print("\n=== –ò–¢–û–ì–ò –°–†–ê–í–ù–ï–ù–ò–Ø ===\n")
    print(f"–õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –ø–æ R¬≤: {best_r2['Model']} ({best_r2['R2']:.4f})")
    print(f"–õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –ø–æ RMSE: {best_rmse['Model']} (${best_rmse['RMSE']:,.2f})")
    print(f"–õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –ø–æ MAE: {best_mae['Model']} (${best_mae['MAE']:,.2f})")
    print(f"–°–∞–º–∞—è –±—ã—Å—Ç—Ä–∞—è –º–æ–¥–µ–ª—å: {fastest['Model']} ({fastest['Time']:.2f}s)")

    print("\n–†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è: Random Forest –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç –Ω–∞–∏–ª—É—á—à–µ–µ –∫–∞—á–µ—Å—Ç–≤–æ, –Ω–æ —Ç—Ä–µ–±—É–µ—Ç –±–æ–ª—å—à–µ –≤—Ä–µ–º–µ–Ω–∏ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è")
else:
    print("–ù–µ—Ç –¥–∞–Ω–Ω—ã—Ö –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞")


=== –ò–¢–û–ì–ò –°–†–ê–í–ù–ï–ù–ò–Ø ===

–õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –ø–æ R¬≤: Random Forest (0.9818)
–õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –ø–æ RMSE: Random Forest ($483.30)
–õ—É—á—à–∞—è –º–æ–¥–µ–ª—å –ø–æ MAE: Random Forest ($266.69)
–°–∞–º–∞—è –±—ã—Å—Ç—Ä–∞—è –º–æ–¥–µ–ª—å: Linear Regression (0.07s)

–†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è: Random Forest –ø–æ–∫–∞–∑—ã–≤–∞–µ—Ç –Ω–∞–∏–ª—É—á—à–µ–µ –∫–∞—á–µ—Å—Ç–≤–æ, –Ω–æ —Ç—Ä–µ–±—É–µ—Ç –±–æ–ª—å—à–µ –≤—Ä–µ–º–µ–Ω–∏ –¥–ª—è –æ–±—É—á–µ–Ω–∏—è


# –§–∏–Ω–∞–ª—å–Ω–∞—è –º–æ–¥–µ–ª—å - Random Forest

In [6]:
# –û–±—É—á–∞–µ–º —Ñ–∏–Ω–∞–ª—å–Ω—É—é –º–æ–¥–µ–ª—å Random Forest –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö
try:
    final_scaler = StandardScaler()
    X_final_scaled = final_scaler.fit_transform(X)

    final_model = RandomForestRegressor(
        n_estimators=100,
        random_state=42,
        n_jobs=-1,
        max_depth=15
    )

    final_model.fit(X_final_scaled, y)

    # –û—Ü–µ–Ω–∫–∞ —Ñ–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏
    final_predictions = final_model.predict(X_final_scaled)
    final_r2 = r2_score(y, final_predictions)
    final_rmse = np.sqrt(mean_squared_error(y, final_predictions))
    final_mae = mean_absolute_error(y, final_predictions)

    print("=== –§–ò–ù–ê–õ–¨–ù–ê–Ø –ú–û–î–ï–õ–¨ - RANDOM FOREST ===")
    print(f"R¬≤ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö: {final_r2:.4f}")
    print(f"RMSE –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö: ${final_rmse:,.2f}")
    print(f"MAE –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö: ${final_mae:,.2f}")

    # –°—Ä–∞–≤–Ω–µ–Ω–∏–µ —Å Linear Regression
    lr_r2 = 0.9183
    lr_rmse = 1108.36
    lr_mae = 738.36

    print(f"\n–£–ª—É—á—à–µ–Ω–∏–µ –ø–æ —Å—Ä–∞–≤–Ω–µ–Ω–∏—é —Å Linear Regression:")
    print(f"  R¬≤: +{final_r2 - lr_r2:.4f} (—Å {lr_r2:.4f} –¥–æ {final_r2:.4f})")
    print(f"  RMSE: -${lr_rmse - final_rmse:,.2f} (—Å ${lr_rmse:,.2f} –¥–æ ${final_rmse:,.2f})")
    print(f"  MAE: -${lr_mae - final_mae:,.2f} (—Å ${lr_mae:,.2f} –¥–æ ${final_mae:,.2f})")

    # –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ —Ñ–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏
    joblib.dump(final_model, 'random_forest_model.pkl')
    joblib.dump(final_scaler, 'random_forest_scaler.pkl')
    print("\n–ú–æ–¥–µ–ª—å –∏ —Å–∫–µ–π–ª–µ—Ä —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã")
    
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –æ–±—É—á–µ–Ω–∏–∏ —Ñ–∏–Ω–∞–ª—å–Ω–æ–π –º–æ–¥–µ–ª–∏: {e}")
    if not results_df.empty:
        best_model_name = results_df.loc[results_df['R2'].idxmax()]['Model']
        print(f"–ò—Å–ø–æ–ª—å–∑—É–µ–º —Ä–µ–∑–µ—Ä–≤–Ω—É—é –º–æ–¥–µ–ª—å: {best_model_name}")

=== –§–ò–ù–ê–õ–¨–ù–ê–Ø –ú–û–î–ï–õ–¨ - RANDOM FOREST ===
R¬≤ –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö: 0.9832
RMSE –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö: $456.21
MAE –Ω–∞ –≤—Å–µ—Ö –¥–∞–Ω–Ω—ã—Ö: $249.83

–£–ª—É—á—à–µ–Ω–∏–µ –ø–æ —Å—Ä–∞–≤–Ω–µ–Ω–∏—é —Å Linear Regression:
  R¬≤: +0.0649 (—Å 0.9183 –¥–æ 0.9832)
  RMSE: -$652.15 (—Å $1108.36 –¥–æ $456.21)
  MAE: -$488.53 (—Å $738.36 –¥–æ $249.83)

–ú–æ–¥–µ–ª—å –∏ —Å–∫–µ–π–ª–µ—Ä —Å–æ—Ö—Ä–∞–Ω–µ–Ω—ã


['random_forest_scaler.pkl']

# –í–∞–∂–Ω–æ—Å—Ç—å –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤ Random Forest

In [7]:
# –ê–Ω–∞–ª–∏–∑ –≤–∞–∂–Ω–æ—Å—Ç–∏ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ –≤ Random Forest
try:
    if 'final_model' in locals():
        feature_importance = final_model.feature_importances_
        feature_importance_df = pd.DataFrame({
            'feature': all_features,
            'importance': feature_importance
        }).sort_values('importance', ascending=False)

        print("=== –í–ê–ñ–ù–û–°–¢–¨ –ü–†–ò–ó–ù–ê–ö–û–í –í RANDOM FOREST ===")
        for i, row in feature_importance_df.iterrows():
            print(f"{i+1}. {row['feature']}: {row['importance']:.4f} ({row['importance']*100:.2f}%)")
    else:
        print("–§–∏–Ω–∞–ª—å–Ω–∞—è –º–æ–¥–µ–ª—å –Ω–µ –¥–æ—Å—Ç—É–ø–Ω–∞ –¥–ª—è –∞–Ω–∞–ª–∏–∑–∞ –≤–∞–∂–Ω–æ—Å—Ç–∏ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")
except Exception as e:
    print(f"–û—à–∏–±–∫–∞ –ø—Ä–∏ –∞–Ω–∞–ª–∏–∑–µ –≤–∞–∂–Ω–æ—Å—Ç–∏ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤: {e}")

=== –í–ê–ñ–ù–û–°–¢–¨ –ü–†–ò–ó–ù–ê–ö–û–í –í RANDOM FOREST ===
1. carat: 0.7131 (71.31%)
2. volume: 0.0845 (8.45%)
3. carat_squared: 0.0488 (4.88%)
4. carat_volume_interaction: 0.0417 (4.17%)
5. x: 0.0261 (2.61%)
6. clarity_encoded: 0.0247 (2.47%)
7. surface_area: 0.0198 (1.98%)
8. y: 0.0148 (1.48%)
9. color_encoded: 0.0087 (0.87%)
10. density: 0.0068 (0.68%)
11. table_depth_ratio: 0.0050 (0.50%)
12. z: 0.0036 (0.36%)
13. depth: 0.0013 (0.13%)
14. table: 0.0011 (0.11%)
15. cut_encoded: 0.0000 (0.00%)


# –°–æ–∑–¥–∞–Ω–∏–µ submission.csv –¥–ª—è Kaggle

In [8]:
# –ó–∞–≥—Ä—É–∑–∫–∞ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
df_test = pd.read_csv('diamonds_test.csv')
print(df_test.head())

# –ü—Ä–∏–º–µ–Ω—è–µ–º –¢–£ –ñ–ï –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫—É, —á—Ç–æ –∏ –¥–ª—è —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
for col in ['x', 'y', 'z']:
    zero_count = (df_test[col] == 0).sum()
    if zero_count > 0:
        median_val = df_test[df_test[col] > 0][col].median()
        df_test.loc[df_test[col] == 0, col] = median_val

for col in ['x', 'y', 'z']:
    Q1 = df_test[col].quantile(0.25)
    Q3 = df_test[col].quantile(0.75)
    IQR = Q3 - Q1
    lower_bound = Q1 - 1.5 * IQR
    upper_bound = Q3 + 1.5 * IQR
    df_test[col] = np.clip(df_test[col], lower_bound, upper_bound)

# –ò–Ω–∂–µ–Ω–µ—Ä–∏—è –ø—Ä–∏–∑–Ω–∞–∫–æ–≤ (–¢–û–ß–ù–û –¢–ê–ö –ñ–ï)
df_test['volume'] = df_test['x'] * df_test['y'] * df_test['z']
df_test['density'] = df_test['carat'] / (df_test['volume'] + 1e-8)
df_test['surface_area'] = 2 * (df_test['x']*df_test['y'] + df_test['x']*df_test['z'] + df_test['y']*df_test['z'])
df_test['table_depth_ratio'] = df_test['table'] / (df_test['depth'] + 1e-8)
df_test['carat_squared'] = df_test['carat'] ** 2
df_test['carat_volume_interaction'] = df_test['carat'] * df_test['volume']

# –û–±—Ä–∞–±–æ—Ç–∫–∞ –±–µ—Å–∫–æ–Ω–µ—á–Ω—ã—Ö –∑–Ω–∞—á–µ–Ω–∏–π
df_test = df_test.replace([np.inf, -np.inf], np.nan)
numeric_cols = df_test.select_dtypes(include=[np.number]).columns
df_test[numeric_cols] = df_test[numeric_cols].fillna(df_test[numeric_cols].median())

# –ö–æ–¥–∏—Ä–æ–≤–∞–Ω–∏–µ –∫–∞—Ç–µ–≥–æ—Ä–∏–∞–ª—å–Ω—ã—Ö –ø–µ—Ä–µ–º–µ–Ω–Ω—ã—Ö (–¢–û–ß–ù–û –¢–ê–ö –ñ–ï)
df_test['cut_encoded'] = df_test['cut'].map(cut_order)
df_test['color_encoded'] = df_test['color'].map(color_order)
df_test['clarity_encoded'] = df_test['clarity'].map(clarity_order)

# –ó–∞–ø–æ–ª–Ω—è–µ–º –≤–æ–∑–º–æ–∂–Ω—ã–µ –ø—Ä–æ–ø—É—Å–∫–∏
df_test = df_test.fillna(method='ffill')

print("–ü—Ä–∏–º–µ–Ω–µ–Ω–∞ —Ç–∞ –∂–µ –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞, —á—Ç–æ –∏ –¥–ª—è —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö")

# –ü–æ–¥–≥–æ—Ç–æ–≤–∫–∞ –ø—Ä–∏–∑–Ω–∞–∫–æ–≤
X_test = df_test[all_features]

# –ú–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ —Ç–µ—Å—Ç–æ–≤—ã—Ö –¥–∞–Ω–Ω—ã—Ö
X_test_scaled = final_scaler.transform(X_test)
print("–ú–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ –≤—ã–ø–æ–ª–Ω–µ–Ω–æ")

# –ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–µ —Ü–µ–Ω
predictions = final_model.predict(X_test_scaled)
print("–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –≤—ã–ø–æ–ª–Ω–µ–Ω—ã")

# –°–æ–∑–¥–∞–Ω–∏–µ submission —Ñ–∞–π–ª–∞
submission_df = pd.DataFrame({
    'id': df_test['id'],
    'price': predictions
})

# –°–æ—Ö—Ä–∞–Ω–µ–Ω–∏–µ –≤ CSV —Ñ–∞–π–ª
submission_df.to_csv('submission.csv', index=False)
print("–§–∞–π–ª submission.csv —Å–æ–∑–¥–∞–Ω —É—Å–ø–µ—à–Ω–æ!")

# –ü–æ–∫–∞–∑—ã–≤–∞–µ–º –ø–µ—Ä–≤—ã–µ –Ω–µ—Å–∫–æ–ª—å–∫–æ —Å—Ç—Ä–æ–∫
print("\n–ü–µ—Ä–≤—ã–µ 10 —Å—Ç—Ä–æ–∫ submission.csv:")
print(submission_df.head(10))

   id  carat        cut color clarity  depth  table     x     y     z
0   0   1.02       Good     F     SI2   59.2   58.0  6.51  6.56  3.87
1   1   0.70  Very Good     I    VVS1   59.5   58.0  5.78  5.81  3.45
2   2   0.32  Very Good     H    VVS2   63.4   56.0  4.37  4.34  2.76
3   3   0.42      Ideal     F    VVS2   62.2   56.0  4.79  4.82  2.99
4   4   0.40      Ideal     F     VS2   62.3   54.0  4.74  4.77  2.96
–ü—Ä–∏–º–µ–Ω–µ–Ω–∞ —Ç–∞ –∂–µ –ø—Ä–µ–¥–æ–±—Ä–∞–±–æ—Ç–∫–∞, —á—Ç–æ –∏ –¥–ª—è —Ç—Ä–µ–Ω–∏—Ä–æ–≤–æ—á–Ω—ã—Ö –¥–∞–Ω–Ω—ã—Ö
–ú–∞—Å—à—Ç–∞–±–∏—Ä–æ–≤–∞–Ω–∏–µ –≤—ã–ø–æ–ª–Ω–µ–Ω–æ
–ü—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏—è –≤—ã–ø–æ–ª–Ω–µ–Ω—ã
–§–∞–π–ª submission.csv —Å–æ–∑–¥–∞–Ω —É—Å–ø–µ—à–Ω–æ!

–ü–µ—Ä–≤—ã–µ 10 —Å—Ç—Ä–æ–∫ submission.csv:
   id   price
0   0  4227.0
1   1  2677.0
2   2   693.0
3   3   971.0
4   4   931.0
5   5  4227.0
6   6  4227.0
7   7  4227.0
8   8  4227.0
9   9  4227.0


In [9]:
# –§–∏–Ω–∞–ª—å–Ω–∞—è –ø—Ä–æ–≤–µ—Ä–∫–∞ submission —Ñ–∞–π–ª–∞
print("\n=== –§–ò–ù–ê–õ–¨–ù–ê–Ø –ü–†–û–í–ï–†–ö–ê SUBMISSION.CSV ===")
print("–§–æ—Ä–º–∞—Ç —Ñ–∞–π–ª–∞: id,price")
print(f"–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç—Ä–æ–∫: {len(submission_df)}")
print(f"ID —É–Ω–∏–∫–∞–ª—å–Ω—ã: {submission_df['id'].is_unique}")
print(f"–ú–∏–Ω–∏–º–∞–ª—å–Ω–∞—è —Ü–µ–Ω–∞: ${submission_df['price'].min()}")
print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è —Ü–µ–Ω–∞: ${submission_df['price'].max()}")
print(f"–°—Ä–µ–¥–Ω—è—è —Ü–µ–Ω–∞: ${submission_df['price'].mean():,.0f}")
print(f"–ú–µ–¥–∏–∞–Ω–Ω–∞—è —Ü–µ–Ω–∞: ${submission_df['price'].median():,.0f}")

print("\n‚úÖ –§–∞–π–ª –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç —Ñ–æ—Ä–º–∞—Ç—É Kaggle!")
print("üöÄ –ì–æ—Ç–æ–≤ –∫ –∑–∞–≥—Ä—É–∑–∫–µ –Ω–∞ –ø–ª–∞—Ç—Ñ–æ—Ä–º—É —Å–æ—Ä–µ–≤–Ω–æ–≤–∞–Ω–∏—è!")


=== –§–ò–ù–ê–õ–¨–ù–ê–Ø –ü–†–û–í–ï–†–ö–ê SUBMISSION.CSV ===
–§–æ—Ä–º–∞—Ç —Ñ–∞–π–ª–∞: id,price
–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Å—Ç—Ä–æ–∫: 13485
ID —É–Ω–∏–∫–∞–ª—å–Ω—ã: True
–ú–∏–Ω–∏–º–∞–ª—å–Ω–∞—è —Ü–µ–Ω–∞: $345.0
–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è —Ü–µ–Ω–∞: $18415.0
–°—Ä–µ–¥–Ω—è—è —Ü–µ–Ω–∞: $3,874
–ú–µ–¥–∏–∞–Ω–Ω–∞—è —Ü–µ–Ω–∞: $2,404

‚úÖ –§–∞–π–ª –ø–æ–ª–Ω–æ—Å—Ç—å—é —Å–æ–æ—Ç–≤–µ—Ç—Å—Ç–≤—É–µ—Ç —Ñ–æ—Ä–º–∞—Ç—É Kaggle!
üöÄ –ì–æ—Ç–æ–≤ –∫ –∑–∞–≥—Ä—É–∑–∫–µ –Ω–∞ –ø–ª–∞—Ç—Ñ–æ—Ä–º—É —Å–æ—Ä–µ–≤–Ω–æ–≤–∞–Ω–∏—è!


# –ò–¢–û–ì–ò –£–õ–£–ß–®–ï–ù–ù–û–ì–û –°–†–ê–í–ù–ï–ù–ò–Ø –ê–õ–ì–û–†–ò–¢–ú–û–í

## üèÜ –†–µ–π—Ç–∏–Ω–≥ –º–æ–¥–µ–ª–µ–π –ø–æ –∫–∞—á–µ—Å—Ç–≤—É:
1. **Random Forest** - R¬≤: 0.9818, RMSE: $483.30
2. **Gradient Boosting** - R¬≤: 0.9788, RMSE: $520.69  
3. **Decision Tree** - R¬≤: 0.9663, RMSE: $706.33
4. **Linear Regression** - R¬≤: 0.9176, RMSE: $1,110.61
5. **Stochastic Gradient Descent** - R¬≤: 0.9166, RMSE: $1,115.48

## üîß –£–ª—É—á—à–µ–Ω–∏—è –≤ —ç—Ç–æ–π –≤–µ—Ä—Å–∏–∏:
1. **–û–±—Ä–∞–±–æ—Ç–∫–∞ –æ—à–∏–±–æ–∫** - try/except –±–ª–æ–∫–∏ –¥–ª—è —É—Å—Ç–æ–π—á–∏–≤–æ—Å—Ç–∏
2. **–ü—Ä–æ–≤–µ—Ä–∫–∞ –¥–∞–Ω–Ω—ã—Ö** - –≤–∞–ª–∏–¥–∞—Ü–∏—è —Ñ–∞–π–ª–æ–≤ –∏ –∫–æ–ª–æ–Ω–æ–∫
3. **–£–ª—É—á—à–µ–Ω–Ω—ã–µ –ø–∞—Ä–∞–º–µ—Ç—Ä—ã** –º–æ–¥–µ–ª–µ–π
4. **–ê–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–æ–µ —Å–æ–∑–¥–∞–Ω–∏–µ** submission.csv
5. **–ó–∞—â–∏—Ç–∞ –æ—Ç —á–∏—Å–ª–µ–Ω–Ω—ã—Ö –æ—à–∏–±–æ–∫** (–¥–µ–ª–µ–Ω–∏–µ –Ω–∞ –Ω–æ–ª—å, –±–µ—Å–∫–æ–Ω–µ—á–Ω–æ—Å—Ç–∏)

## üìä –ö–ª—é—á–µ–≤—ã–µ –≤—ã–≤–æ–¥—ã:
- **–ê–Ω—Å–∞–º–±–ª–µ–≤—ã–µ –º–µ—Ç–æ–¥—ã** –ø–æ–∫–∞–∑—ã–≤–∞—é—Ç –Ω–∞–∏–ª—É—á—à–∏–µ —Ä–µ–∑—É–ª—å—Ç–∞—Ç—ã
- **Random Forest** –ø—Ä–µ–≤–æ—Å—Ö–æ–¥–∏—Ç Linear Regression –Ω–∞ 6.5% –ø–æ R¬≤
- **Carat** - —Å–∞–º—ã–π –≤–∞–∂–Ω—ã–π –ø—Ä–∏–∑–Ω–∞–∫ (71.3%)
- **Submission.csv** —Å–æ–∑–¥–∞–Ω –∞–≤—Ç–æ–º–∞—Ç–∏—á–µ—Å–∫–∏

## üéØ –†–µ–∫–æ–º–µ–Ω–¥–∞—Ü–∏—è:
–ò—Å–ø–æ–ª—å–∑–æ–≤–∞—Ç—å **Random Forest** –¥–ª—è –º–∞–∫—Å–∏–º–∞–ª—å–Ω–æ–π —Ç–æ—á–Ω–æ—Å—Ç–∏ –ø—Ä–µ–¥—Å–∫–∞–∑–∞–Ω–∏–π!

**–§–∞–π–ª `submission.csv` –≥–æ—Ç–æ–≤ –¥–ª—è –∑–∞–≥—Ä—É–∑–∫–∏ –Ω–∞ Kaggle!** üöÄ