Weighted average method

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.feature_selection import RFE
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import IterativeImputer
import numpy as np


# ---------------------------------------------------------------------------
# Weighted-average ensemble of an SVR and a random forest that predicts
# 'EDC_delta13C' from the remaining columns of the spreadsheet.
#
# Steps: load -> clean -> split -> scale -> RFE -> PCA -> fit SVR (grid
# search) and RF -> average the two predictions -> report R² / MSE / RMSE on
# the held-out test set.
#
# NOTE: every preprocessing step (scaler, RFE, PCA) is fitted on the training
# rows only and then applied to the test rows. Fitting them on the full
# dataset before splitting lets test rows (and, through RFE, their targets)
# influence the features, which makes the held-out metrics optimistic.
# Metrics from runs that fitted the preprocessing on all rows are therefore
# not comparable with the ones printed here.
# ---------------------------------------------------------------------------

file_path = '/content/drive/MyDrive/9900data/EDCPCE-replace0toNan.xlsx'
data = pd.read_excel(file_path)

# replace 'ND' to NaN
data.replace('ND', np.nan, inplace=True)

# drop rows whose target 'EDC_delta13C' is missing
data.dropna(subset=['EDC_delta13C'], inplace=True)

# drop every column that still contains a missing value
data.dropna(axis=1, inplace=True)

X = data.drop(['EDC_delta13C', 'mine'], axis=1)
y = data['EDC_delta13C'].astype(float)

# Divide the raw dataset into training and testing sets BEFORE any fitting.
# Same random_state and sample count => same row partition as splitting the
# transformed matrix would give.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Standardized features (statistics learned from the training rows only)
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_raw)
X_test_scaled = scaler.transform(X_test_raw)

# Using RFE for feature selection (supervised: must only see y_train)
svr = SVR(kernel='linear')
selector = RFE(svr, n_features_to_select=30, step=1)
X_train_selected = selector.fit_transform(X_train_scaled, y_train)
X_test_selected = selector.transform(X_test_scaled)

# Using PCA for dimensionality reduction (keep 95% of the training variance)
pca = PCA(n_components=0.95)
X_train = pca.fit_transform(X_train_selected)
X_test = pca.transform(X_test_selected)

# Full-dataset views under their original names, kept so that any later code
# relying on them still works. They are produced by the train-fitted
# transformers, so they carry no information learned from the test rows.
X_scaled = scaler.transform(X)
X_selected = selector.transform(X_scaled)
X_pca = pca.transform(X_selected)

# Using Grid Search for SVR Model Optimization
# NOTE(review): the preprocessing above is fitted once on the whole training
# set, so the inner CV folds still share it; wrapping the steps in a
# sklearn Pipeline inside GridSearchCV would remove that as well.
param_grid = {
    'kernel': ['linear', 'rbf'],
    'C': [0.1, 1, 10, 100],
    'gamma': ['scale', 'auto']
}
grid_search = GridSearchCV(SVR(), param_grid, cv=5, scoring='r2')
grid_search.fit(X_train, y_train)

# GridSearchCV refits the best parameter combination on all of X_train
best_svr = grid_search.best_estimator_

# RF model
rf = RandomForestRegressor(
    n_estimators=35,
    max_depth=4,
    min_samples_split=8,
    min_samples_leaf=4,
    max_features='sqrt',
    bootstrap=True,
    oob_score=True,
    random_state=42
)
rf.fit(X_train, y_train)

# SVR prediction
y_pred_svr = best_svr.predict(X_test)

# RF prediction
y_pred_rf = rf.predict(X_test)

# Weighted average method for integration
weight_svr = 0.5
weight_rf = 0.5

y_pred_ensemble = (weight_svr * y_pred_svr) + (weight_rf * y_pred_rf)

# Evaluate each model once; RMSE is derived from the already computed MSE
mse_svr = mean_squared_error(y_test, y_pred_svr)
mse_rf = mean_squared_error(y_test, y_pred_rf)

# Evaluate the integrated model
r2_ensemble = r2_score(y_test, y_pred_ensemble)
mse_ensemble = mean_squared_error(y_test, y_pred_ensemble)
rmse_ensemble = np.sqrt(mse_ensemble)

print(f'Best SVR Parameters: {grid_search.best_params_}')
print(f'SVR R²: {r2_score(y_test, y_pred_svr)}')
print(f'SVR MSE: {mse_svr}')
print(f'SVR RMSE: {np.sqrt(mse_svr)}')
print()

print(f'RF R²: {r2_score(y_test, y_pred_rf)}')
print(f'RF MSE: {mse_rf}')
print(f'RF RMSE: {np.sqrt(mse_rf)}')
print()

print('Ensemble Model Performance:')
print(f'R²: {r2_ensemble}')
print(f'MSE: {mse_ensemble}')
print(f'RMSE: {rmse_ensemble}')


Best SVR Parameters: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
SVR R²: 0.7546222064344412
SVR MSE: 9.266647665402532
SVR RMSE: 3.0441168941751453

RF R²: 0.5690614045800794
RF MSE: 16.27431753767453
RF RMSE: 4.0341439659083225

Ensemble Model Performance:
R²: 0.684257665551053
MSE: 11.923951731224445
RMSE: 3.4531075470110175
