In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/ML TRAIN DATASETS/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of variance
X_pca = pca.fit_transform(X)

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# List of regression models up to Decision Tree
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('KNN', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor())
]

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with 10 folds
    cv_scores_rmse = cross_val_score(model, X, y, cv=10, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=10, scoring='r2')

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Hyperparameter tuning using GridSearchCV for the selected models
param_grids = {
    'Linear Regression': {},  # No hyperparameters for linear regression
    'Ridge Regression': {'alpha': [0.1, 1, 10, 100]},
    'Lasso Regression': {'alpha': [0.1, 1, 10]},
    'KNN': {'n_neighbors': [3, 5, 10, 15], 'weights': ['uniform', 'distance']},
    'Decision Tree': {'max_depth': [None, 5, 10, 20], 'min_samples_split': [2, 5, 10]}
}

# Perform hyperparameter tuning and evaluation for each model
for name, model in models:
    print(f"\nTraining and hyperparameter tuning for {name}...")
    param_grid = param_grids.get(name, {})

    # Skip models with no parameters to tune
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=10, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        print(f"Best {name} model: {grid_search.best_params_}")
    else:
        best_model = model
        best_model.fit(X_train, y_train)  # Explicitly fit the model if no hyperparameters are tuned

    # Cross-validation after tuning
    rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train)
    print(f"Cross-validation after tuning for {name}:")
    print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
    print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}")

    # Evaluate the model on the test data
    y_pred = best_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    test_r2 = r2_score(y_test, y_pred)

    # Cross-validation on the test data
    test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test)

    print(f"\nTest RMSE: {test_rmse}")
    print(f"Test R²: {test_r2}")
    print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
    print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}")



Training and hyperparameter tuning for Linear Regression...
Cross-validation after tuning for Linear Regression:
CV Mean RMSE (after tuning): 0.34735736022123026, CV RMSE Std: 0.0511733615527798
CV Mean R² (after tuning): 0.8537205153844616, CV R² Std: 0.02436128878256942

Test RMSE: 0.6043519793460793
Test R²: 0.8565307413397948
Test CV Mean RMSE: 0.6765945963874704, Test CV RMSE Std: 0.2131053283920084
Test CV Mean R²: 0.7244934723658905, Test CV R² Std: 0.09209450511537234

Training and hyperparameter tuning for Ridge Regression...
Best Ridge Regression model: {'alpha': 100}
Cross-validation after tuning for Ridge Regression:
CV Mean RMSE (after tuning): 0.34208842353388536, CV RMSE Std: 0.05107372275185737
CV Mean R² (after tuning): 0.8559935336690273, CV R² Std: 0.023783216423366355

Test RMSE: 0.6029482161740294
Test R²: 0.8571964559248534
Test CV Mean RMSE: 0.5030530961336431, Test CV RMSE Std: 0.18011660990414793
Test CV Mean R²: 0.7940899617380046, Test CV R² Std: 0.080531480

In [2]:
import pandas as pd
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA
import numpy as np

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/ML TRAIN DATASETS/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of variance
X_pca = pca.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# XGBoost model
model = XGBRegressor(random_state=42, n_jobs=-1)

# Reduced hyperparameter tuning grid (fewer combinations)
param_grid = {
    'n_estimators': [100, 200],               # Fewer estimators
    'max_depth': [3, 6],                       # Shallower trees
    'learning_rate': [0.01, 0.1],             # Different learning rates
    'subsample': [0.8, 1.0]                    # Fraction of samples used for training
}

# Hyperparameter tuning and evaluation with fewer cross-validation folds
print(f"Training and hyperparameter tuning for XGBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced CV folds
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best XGBoost model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    # Cross-validation with fewer folds
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')  # Reduced folds
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')  # Reduced folds

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Cross-validation performance
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train)

print(f"Cross-validation after tuning for XGBoost:")
print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}\n")

# Evaluate the model on the test data
y_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

# Cross-validation performance on the test set
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for XGBoost...
Best XGBoost model: {'learning_rate': 0.1, 'max_depth': 6, 'n_estimators': 200, 'subsample': 0.8}
Cross-validation after tuning for XGBoost:
CV Mean RMSE (after tuning): 0.2481026887887942, CV RMSE Std: 0.03686741521643318
CV Mean R² (after tuning): 0.8965616226196289, CV R² Std: 0.013336679315897253

Test RMSE: 0.5017688404643853
Test R²: 0.9011022448539734
Test CV Mean RMSE: 0.6818448973939629, Test CV RMSE Std: 0.17073820678551446
Test CV Mean R²: 0.7290929317474365, Test CV R² Std: 0.0569218551728663



In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/ML TRAIN DATASETS/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of the variance
X_pca = pca.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# AdaBoost model
model = AdaBoostRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100],             # Number of boosting rounds
    'learning_rate': [0.01, 0.1],         # Learning rate
    'loss': ['linear', 'square']          # Loss function options
}

print(f"Training and hyperparameter tuning for AdaBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # 5-fold CV
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best AdaBoost model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    rmse_mean = -cv_scores_rmse.mean()   # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Cross-validation performance
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train)

print(f"Cross-validation after tuning for AdaBoost:")
print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}\n")

# Evaluate the model on the test data
y_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

# Cross-validation performance on the test set
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for AdaBoost...
Best AdaBoost model: {'learning_rate': 0.1, 'loss': 'square', 'n_estimators': 100}
Cross-validation after tuning for AdaBoost:
CV Mean RMSE (after tuning): 0.7292156034271668, CV RMSE Std: 0.037032436282969136
CV Mean R² (after tuning): 0.6951365083185609, CV R² Std: 0.014633281954732093

Test RMSE: 0.8923391890529824
Test R²: 0.6872200618928292
Test CV Mean RMSE: 0.8966253084403478, Test CV RMSE Std: 0.17073283300014006
Test CV Mean R²: 0.6431405607693669, Test CV R² Std: 0.05518343747168119



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/ML TRAIN DATASETS/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of the variance
X_pca = pca.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Gradient Boosting model
model = GradientBoostingRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],           # Number of boosting rounds
    'learning_rate': [0.05, 0.1],         # Step size shrinkage
    'max_depth': [3, 5],                  # Maximum depth of a tree
    'subsample': [0.8, 1.0]               # Fraction of samples used for fitting the trees
}

print(f"Training and hyperparameter tuning for Gradient Boosting...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # 5-fold CV
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best Gradient Boosting model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    rmse_mean = -cv_scores_rmse.mean()   # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Cross-validation performance
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train)

print(f"Cross-validation after tuning for Gradient Boosting:")
print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}\n")

# Evaluate the model on the test data
y_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

# Cross-validation performance on the test set
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for Gradient Boosting...
Best Gradient Boosting model: {'learning_rate': 0.05, 'max_depth': 5, 'n_estimators': 200, 'subsample': 1.0}
Cross-validation after tuning for Gradient Boosting:
CV Mean RMSE (after tuning): 0.25015986428846315, CV RMSE Std: 0.03405469028426656
CV Mean R² (after tuning): 0.8956431106552933, CV R² Std: 0.012372533749280636

Test RMSE: 0.5361511980236687
Test R²: 0.8870845114157249
Test CV Mean RMSE: 0.668335105072102, Test CV RMSE Std: 0.16382534477374877
Test CV Mean R²: 0.7338897462147209, Test CV R² Std: 0.056960383952345595



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/ML TRAIN DATASETS/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of the variance
X_pca = pca.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Random Forest model
model = RandomForestRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],           # Number of trees
    'max_depth': [None, 10, 20],          # Maximum depth of each tree
    'min_samples_split': [2, 5],          # Minimum number of samples required to split an internal node
    'min_samples_leaf': [1, 2],           # Minimum number of samples required to be at a leaf node
    'bootstrap': [True, False]            # Whether bootstrap samples are used when building trees
}

print(f"Training and hyperparameter tuning for Random Forest...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # 5-fold CV
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best Random Forest model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    rmse_mean = -cv_scores_rmse.mean()   # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Cross-validation performance
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train)

print(f"Cross-validation after tuning for Random Forest:")
print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}\n")

# Evaluate the model on the test data
y_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

# Cross-validation performance on the test set
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for Random Forest...
Best Random Forest model: {'bootstrap': True, 'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Cross-validation after tuning for Random Forest:
CV Mean RMSE (after tuning): 0.33486117360006595, CV RMSE Std: 0.037105468809639934
CV Mean R² (after tuning): 0.8602086618624869, CV R² Std: 0.013490170627949931

Test RMSE: 0.5622013612642355
Test R²: 0.8758454208199241
Test CV Mean RMSE: 0.8086493971518987, Test CV RMSE Std: 0.16689611682845654
Test CV Mean R²: 0.6784404121509457, Test CV R² Std: 0.05392955352349438



In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_excel(r"C:/Users/dhanu/OneDrive/Desktop/machine learning/ML TRAIN DATASETS/gpt2 embeddings.xlsx")

# Drop any irrelevant columns, such as text or index columns==
data = data.drop(columns=['Equation', 'GPT2_Embedding'], errors='ignore')

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of the variance
X_pca = pca.fit_transform(X)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# SVR model
model = SVR()

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],           # Regularization parameter
    'epsilon': [0.1, 0.2, 0.5],  # Epsilon in the epsilon-SVR model
    'kernel': ['linear', 'rbf']  # Kernel type to be used
}

print(f"Training and hyperparameter tuning for SVR...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # 5-fold CV
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best SVR model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y):
    cv_scores_rmse = cross_val_score(model, X, y, cv=5, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=5, scoring='r2')

    rmse_mean = -cv_scores_rmse.mean()   # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Cross-validation performance
rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train)

print(f"Cross-validation after tuning for SVR:")
print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}\n")

# Evaluate the model on the test data
y_pred = best_model.predict(X_test)
test_rmse = np.sqrt(mean_squared_error(y_test, y_pred))
test_r2 = r2_score(y_test, y_pred)

# Cross-validation performance on the test set 
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for SVR...
Best SVR model: {'C': 10, 'epsilon': 0.1, 'kernel': 'rbf'}
Cross-validation after tuning for SVR:
CV Mean RMSE (after tuning): 0.1898483429089733, CV RMSE Std: 0.021740942479126044
CV Mean R² (after tuning): 0.9207026233798714, CV R² Std: 0.008302080046288355

Test RMSE: 0.43865476817161697
Test R²: 0.9244169060486128
Test CV Mean RMSE: 0.4870366854198326, Test CV RMSE Std: 0.13866470743768772
Test CV Mean R²: 0.8040143245511251, Test CV R² Std: 0.05617175036449698

