# In [None]:  (notebook cell marker left over from export)

import time

# Timer scaffold for the hyperparameter search. As written, nothing executes
# between the two clock readings, so the printed duration is ~0 until the
# RandomizedSearchCV call is placed where the marker comment sits.
# perf_counter() is monotonic, so the measured interval cannot come out
# negative or skewed if the system clock is adjusted mid-run (time.time()
# gives no such guarantee).
start = time.perf_counter()
# your RandomizedSearchCV call
end = time.perf_counter()
print(f"⏱ Total tuning time: {(end - start)/60:.2f} minutes")

# ✅ Step 1: Upload Dataset (Colab)
# google.colab is only importable inside a Colab runtime. Anywhere else, skip
# the upload widget and rely on salaries.csv already being in the working
# directory instead of crashing on the import.
try:
    from google.colab import files
except ImportError:
    files = None   # keep the name bound so later code can test for it
    uploaded = {}  # nothing was uploaded
    print("google.colab not available; expecting salaries.csv in the working directory.")
else:
    uploaded = files.upload()

# ✅ Step 2: Imports
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, RandomizedSearchCV
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from scipy.stats import zscore
from xgboost import XGBRegressor

# ✅ Step 3: Load Dataset
# Read the CSV and discard every row that contains a missing value.
df = pd.read_csv("salaries.csv").dropna()

# ✅ Step 4: Feature/Target split
# NOTE(review): every column other than `target` becomes a feature; if the CSV
# carries another column derived from the salary, it would leak the target —
# confirm against the dataset schema.
target = 'salary'
y = df[target]
X = df.drop(target, axis=1)
# Model the target on a log scale; log1p(y) = log(1 + y) stays finite at y == 0.
y_log = np.log1p(y)

# ✅ Step 5: Train-test split
# Split BEFORE any outlier filtering. Trimming the held-out rows by their own
# target value (and with statistics computed over the full dataset) would make
# the final test metrics look better than the model really is.
X_train, X_test, y_train_log, y_test_log = train_test_split(X, y_log, test_size=0.2, random_state=42)

# ✅ Step 6: Remove outliers in the training target using Z-score
# Only training rows are filtered; the test split keeps every row.
z_scores = np.abs(zscore(y_train_log))
mask = z_scores < 3  # keep only values within 3 standard deviations
X_train = X_train[mask]
y_train_log = y_train_log[mask]

# ✅ Step 7: Identify column types
# Columns whose dtype matches neither selection end up in neither list and are
# dropped by ColumnTransformer (its default is remainder='drop').
categorical_cols = X.select_dtypes(include='object').columns.tolist()
numerical_cols = X.select_dtypes(include=['int64', 'float64']).columns.tolist()

# ✅ Step 8: Preprocessing
# Scale numeric columns and one-hot encode categorical ones. Categories unseen
# during fit are encoded as all-zero rows (handle_unknown='ignore').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_cols),
        ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), categorical_cols)
    ]
)

# ✅ Step 9: Transform data
# Fit on the (filtered) training split only, then apply the fitted transformer
# to the test split so no test-set statistics enter preprocessing.
X_train_processed = preprocessor.fit_transform(X_train)
X_test_processed = preprocessor.transform(X_test)

# ✅ Step 10: Hyperparameter tuning using RandomizedSearchCV
# Candidate values per hyperparameter; the search samples combinations of these.
param_grid = dict(
    n_estimators=[100, 200, 300],
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.7, 0.8, 1.0],
    colsample_bytree=[0.7, 0.8, 1.0],
)

# Base regressor whose hyperparameters are tuned.
xgb_model = XGBRegressor(objective='reg:squarederror', eval_metric='rmse', random_state=42)

# 10 sampled candidates, each scored by 3-fold cross-validated R² on the
# log-transformed target, using all available cores.
search = RandomizedSearchCV(
    xgb_model,
    param_grid,
    n_iter=10,
    scoring='r2',
    cv=3,
    n_jobs=-1,
    random_state=42,
    verbose=1,
)

# fit() returns the search object itself, so the winning estimator can be read
# straight off the call.
best_model = search.fit(X_train_processed, y_train_log).best_estimator_

print("\n🔍 Best Parameters Found:", search.best_params_, sep="\n")

# ✅ Step 11: Cross-validation on best model
# 5-fold R² for the tuned configuration, computed on the same training data
# that was passed to the search.
cv_scores = cross_val_score(
    estimator=best_model,
    X=X_train_processed,
    y=y_train_log,
    scoring='r2',
    cv=5,
)
print("\n📈 Cross-Validation R² Scores:", cv_scores)
print(f"Average R²: {cv_scores.mean():.4f}")

# ✅ Step 12: Final Evaluation on Test Set
# The model predicts on the log1p scale; expm1 inverts that transform so the
# errors below are expressed in the original salary units.
y_pred_log = best_model.predict(X_test_processed)
y_pred = np.expm1(y_pred_log)
y_true = np.expm1(y_test_log)

mae = mean_absolute_error(y_true, y_pred)
rmse = np.sqrt(mean_squared_error(y_true, y_pred))
r2 = r2_score(y_true, y_pred)

print("\n📊 Final Evaluation on Test Set:")
print(f"MAE : {mae:.2f}")
print(f"RMSE: {rmse:.2f}")
# R² is the share of target variance the model explains, not a classification
# "accuracy"; it can even be negative for a model worse than predicting the mean.
print(f"R²   : {r2:.4f} (≈ {r2 * 100:.2f}% of variance explained)")
