# In [None]:
import numpy as np
import pandas as pd
import shap
import xgboost as xgb
from scipy.stats import randint, uniform
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.preprocessing import MinMaxScaler

# Module-level accumulators. The four lists are created here but are not
# populated anywhere in this section of the script.
datasets, best_parameters = [], []
r2_scores, rmse_scores = [], []
results_data = pd.DataFrame()

# Load the table and discard rows whose target ('parameter') is missing.
data = pd.read_excel('../parameter.xlsx').dropna(subset=['parameter'])

# Target is the 'parameter' column; every remaining column is a feature.
y = data['parameter']
X = data.drop(columns='parameter')

# Rescale each feature to [0, 1]. The result is a NumPy array, so column
# names are not carried along with X_scaled.
# NOTE(review): the scaler is fitted on the full data set before any
# cross-validation split, so held-out folds influence the min/max used.
scaler = MinMaxScaler()
X_scaled = scaler.fit(X).transform(X)

# Hyper-parameter search space for RandomizedSearchCV.
#
# `randint` and `uniform` are scipy.stats distributions and must be imported
# (`from scipy.stats import randint, uniform`); without that import this
# block raises NameError.
#
# scipy semantics worth remembering when editing the ranges:
#   * randint(low, high) samples integers from the half-open range [low, high)
#   * uniform(loc, scale) samples floats from [loc, loc + scale]
# Plain lists are sampled uniformly from the listed values.
param_dist = {
    'n_estimators': randint(100, 1000),      # integers 100..999
    'max_depth': randint(3, 10),             # integers 3..9
    'learning_rate': uniform(0.01, 0.1),     # floats in [0.01, 0.11]
    'subsample': uniform(0.7, 0.3),          # floats in [0.7, 1.0]
    'min_child_weight': randint(1, 10),      # integers 1..9
    'colsample_bytree': uniform(0.7, 0.3),   # floats in [0.7, 1.0]
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0.01, 0.1, 1, 10],
}

# Base regressor whose hyper-parameters are tuned below.
# NOTE(review): `gpu_id` appears to be deprecated in recent XGBoost releases
# in favour of `device` — confirm against the installed version.
model = xgb.XGBRegressor(objective='reg:squarederror', gpu_id=0)

# Shuffled 10-fold splitter with a fixed seed; the same object is reused
# later in the script so the fold assignment is reproducible.
kf = KFold(n_splits=10, shuffle=True, random_state=120)

# Randomized search: 100 sampled candidates, each scored with `kf`.
# No `scoring` is given, so the estimator's own `score` method ranks them.
search_settings = {
    'param_distributions': param_dist,
    'n_iter': 100,
    'cv': kf,
    'verbose': 2,
    'random_state': 120,
    'n_jobs': -1,
}
random_search = RandomizedSearchCV(model, **search_settings)
random_search.fit(X_scaled, y)

# Keep the winning configuration and the estimator refitted with it.
best_params = random_search.best_params_
best_model = random_search.best_estimator_
print("Best parameters found: ", best_params)
# Re-fit the tuned model on each fold of `kf` and record per-fold R^2, RMSE
# and the held-out predictions.
# NOTE(review): these are the same folds that were used to select the
# hyper-parameters, so the fold scores are likely optimistic.
r2_fold_scores = []
rmse_fold_scores = []
fold_frames = []  # one DataFrame per fold, concatenated once after the loop

for fold_no, (train_index, test_index) in enumerate(kf.split(X_scaled), start=1):
    X_train, X_test = X_scaled[train_index], X_scaled[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Fitting replaces whatever `best_model` had learned previously.
    best_model.fit(X_train, y_train)
    y_pred = best_model.predict(X_test)

    # Calculate metrics
    r2_fold = r2_score(y_test, y_pred)
    rmse_fold = np.sqrt(mean_squared_error(y_test, y_pred))

    r2_fold_scores.append(r2_fold)
    rmse_fold_scores.append(rmse_fold)

    # Collect results for each fold; the scalar fold label is broadcast to
    # every row, and the index is inherited from `y_test`.
    fold_results = pd.DataFrame({
        'Actual': y_test,
        'Predicted': y_pred,
        'Fold': f'Fold_{fold_no}',
    })
    fold_frames.append(fold_results)

# A single concat avoids re-copying all earlier rows on every iteration and
# avoids concatenating onto an empty placeholder DataFrame.
results_data = pd.concat(fold_frames)

# Final model: refit the tuned estimator on the complete data set and compute
# its in-sample predictions.
best_model.fit(X_scaled, y)
y_train_pred = best_model.predict(X_scaled)

# SHAP explanation of the final model, using the scaled data as background.
# `check_additivity` is an option of the explainer *call*, not of the
# constructor, so it is passed where the SHAP values are computed.
explainer = shap.Explainer(best_model, X_scaled)
shap_values = explainer(X_scaled, check_additivity=False)

# Append the in-sample predictions below the per-fold rows. These rows have
# no 'Fold' value, so that column is empty for them in the exported sheet.
final_results = pd.DataFrame({
    'Actual': y.values,
    'Predicted': y_train_pred
})
results_data = pd.concat([results_data, final_results], ignore_index=True)
results_data.to_excel('../result.xlsx', index=False)

# X_scaled is a plain array, so pass the original column names explicitly;
# otherwise the plot has no real feature names to show.
shap.summary_plot(shap_values, X_scaled, feature_names=list(X.columns))
