In [None]:
# Import necessary libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import mean_squared_error
import xgboost as xgb

# Synthetic stand-in data -- swap this section out for your real dataset.
# Whatever replaces it must still define X (feature matrix) and y (target).
N_SAMPLES, N_FEATURES = 1000, 10
np.random.seed(42)  # fixed seed so the generated data is reproducible
X = np.random.rand(N_SAMPLES, N_FEATURES)
# Target = linear combination of the features plus Gaussian noise scaled by 0.5.
# The weights are drawn before the noise so the random stream is consumed
# in the same order every run.
true_weights = np.random.rand(N_FEATURES)
noise = 0.5 * np.random.randn(N_SAMPLES)
y = X @ true_weights + noise

# Hold out 20% of the samples for testing; the fixed random_state keeps
# the partition identical from run to run
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Base regressor: squared-error objective, RMSE as the evaluation metric
xgb_model = xgb.XGBRegressor(
    objective='reg:squarederror',
    eval_metric='rmse',
)

# Candidate values for each hyperparameter explored by the search
param_grid = dict(
    learning_rate=[0.01, 0.05, 0.1],
    max_depth=[3, 5, 7],
    subsample=[0.8, 1.0],
    colsample_bytree=[0.8, 1.0],
    reg_alpha=[0, 0.1, 0.5],   # Regularization alpha
    reg_lambda=[1, 1.5, 2],    # Regularization lambda
    n_estimators=[100, 200],   # Number of boosting rounds
)

# Exhaustive search over the grid: 3-fold cross-validation, candidates ranked
# by negative MSE, work spread over all available cores (n_jobs=-1)
grid_search = GridSearchCV(
    xgb_model,
    param_grid,
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    n_jobs=-1,
)

# Run the search on the training split only
grid_search.fit(X_train, y_train)

# Keep the winning estimator and the hyperparameters that produced it
best_model = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best Parameters:", best_params)

# Predict on both splits so training and testing error can be compared
y_train_pred, y_test_pred = (
    best_model.predict(features) for features in (X_train, X_test)
)

# Mean squared error per split; RMSE is its square root
train_mse, test_mse = (
    mean_squared_error(actual, predicted)
    for actual, predicted in ((y_train, y_train_pred), (y_test, y_test_pred))
)
train_rmse, test_rmse = train_mse ** 0.5, test_mse ** 0.5

print(f"Training MSE: {train_mse:.4f}, RMSE: {train_rmse:.4f}")
print(f"Testing MSE: {test_mse:.4f}, RMSE: {test_rmse:.4f}")

# Save feature importance to CSV.
# 'weight' importance counts how many times a feature is used to split.
# Booster.get_score() leaves out every feature that was never used in a split,
# so the scores are re-indexed over the booster's full feature list and the
# unused features are reported with an importance of 0 instead of silently
# disappearing from the report.
booster = best_model.get_booster()
importance = booster.get_score(importance_type='weight')
feature_names = booster.feature_names
if feature_names is None:
    # No names were attached to the training data (e.g. a plain NumPy array),
    # in which case XGBoost keys the scores as f0, f1, ...
    feature_names = [f"f{i}" for i in range(booster.num_features())]
importance_df = pd.DataFrame(
    {
        'Feature': list(feature_names),
        'Importance': [importance.get(name, 0) for name in feature_names],
    }
).sort_values(by='Importance', ascending=False)
importance_csv_path = './FeatureImportance_GridSearchCV.csv'
importance_df.to_csv(importance_csv_path, index=False)

print(f"Feature importance saved to {importance_csv_path}")
print("\nTop Features:")
print(importance_df)
