In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import cross_val_score, GridSearchCV, train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Load the dataset
data = pd.read_excel(r"C:\Users\dhanu\OneDrive\Desktop\machine learning\ML TRAIN DATASETS\train_t5_embeddings.xlsx")

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of the variance
X_pca = pca.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# List of regression models up to Decision Tree
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge Regression', Ridge()),
    ('Lasso Regression', Lasso()),
    ('KNN', KNeighborsRegressor()),
    ('Decision Tree', DecisionTreeRegressor())
]

# Function to calculate and return performance metrics
def evaluate_model(model, X, y, cv_folds=5):
    # Cross-validation with fewer folds (default 5)
    cv_scores_rmse = cross_val_score(model, X, y, cv=cv_folds, scoring='neg_mean_squared_error')
    cv_scores_r2 = cross_val_score(model, X, y, cv=cv_folds, scoring='r2')

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Hyperparameter tuning using GridSearchCV for the selected models (reduced hyperparameter space)
param_grids = {
    'Linear Regression': {},  # No hyperparameters for linear regression
    'Ridge Regression': {'alpha': [1, 10]},  # Reduced hyperparameter space
    'Lasso Regression': {'alpha': [0.1, 1]},  # Reduced hyperparameter space
    'KNN': {'n_neighbors': [5], 'weights': ['uniform']},  # Reduced search space
    'Decision Tree': {'max_depth': [None, 5], 'min_samples_split': [2, 5]}  # Reduced search space
}

# Perform hyperparameter tuning and evaluation for each model
for name, model in models:
    print(f"Training and hyperparameter tuning for {name}...")
    
    param_grid = param_grids.get(name, {})
    
    # Step 2: Hyperparameter tuning using GridSearchCV
    if param_grid:
        grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)
        grid_search.fit(X_train, y_train)
        best_model = grid_search.best_estimator_
        print(f"Best {name} model: {grid_search.best_params_}")
    else:
        best_model = model.fit(X_train, y_train)

    # Step 3: Cross-validation after tuning
    print(f"Cross-validation after tuning for {name}:")
    rmse_mean, rmse_std, r2_mean, r2_std = evaluate_model(best_model, X_train, y_train, cv_folds=5)
    print(f"CV Mean RMSE (after tuning): {rmse_mean}, CV RMSE Std: {rmse_std}")
    print(f"CV Mean R² (after tuning): {r2_mean}, CV R² Std: {r2_std}\n")

    # Step 4: Testing on the test set with the best model
    y_test_pred = best_model.predict(X_test)
    test_rmse = np.sqrt(mean_squared_error(y_test, y_test_pred))
    test_r2 = r2_score(y_test, y_test_pred)

    # Test CV scores
    test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test, cv_folds=5)

    print(f"Test RMSE: {test_rmse}")
    print(f"Test R²: {test_r2}")
    print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
    print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for Linear Regression...
Cross-validation after tuning for Linear Regression:
CV Mean RMSE (after tuning): 0.37487434763210814, CV RMSE Std: 0.11737285408923398
CV Mean R² (after tuning): 0.843647007689988, CV R² Std: 0.04694554352039047

Test RMSE: 0.5548310434227557
Test R²: 0.8790793532803853
Test CV Mean RMSE: 1.3441109852712294, Test CV RMSE Std: 0.311815702622124
Test CV Mean R²: 0.4619267767116873, Test CV R² Std: 0.11816723570212984

Training and hyperparameter tuning for Ridge Regression...
Best Ridge Regression model: {'alpha': 10}
Cross-validation after tuning for Ridge Regression:
CV Mean RMSE (after tuning): 0.3671726827763718, CV RMSE Std: 0.10839536206873733
CV Mean R² (after tuning): 0.8468276554043591, CV R² Std: 0.043317639966460826

Test RMSE: 0.5538478785808704
Test R²: 0.87950751820526
Test CV Mean RMSE: 0.7574514133702778, Test CV RMSE Std: 0.16620775857211892
Test CV Mean R²: 0.6958754006511854, Test CV R² Std: 0.066904536178569

In [3]:
import pandas as pd
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from xgboost import XGBRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data = pd.read_excel(r"C:\Users\dhanu\OneDrive\Desktop\machine learning\ML TRAIN DATASETS\train_t5_embeddings.xlsx")

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of the variance
X_pca = pca.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# XGBoost model
model = XGBRegressor(random_state=42, n_jobs=-1)

# Reduced hyperparameter tuning grid (fewer combinations)
param_grid = {
    'n_estimators': [100, 150],               # Fewer estimators
    'learning_rate': [0.01, 0.1],             # Limited learning rates
    'max_depth': [3, 5],                      # Shallower trees
    'subsample': [0.7, 0.8],                  # Slightly lower subsample values
    'colsample_bytree': [0.8, 1.0]            # Adjusted column sampling
}

# Hyperparameter tuning and evaluation with fewer cross-validation folds
print(f"Training and hyperparameter tuning for XGBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced CV folds
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best XGBoost model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y, cv_folds=5):
    # Cross-validation with fewer folds (default 5)
    cv_scores_rmse = cross_val_score(model, X, y, cv=cv_folds, scoring='neg_mean_squared_error')  # Reduced folds
    cv_scores_r2 = cross_val_score(model, X, y, cv=cv_folds, scoring='r2')  # Reduced folds

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Step 1: Cross-validation after tuning (on training set)
print(f"Cross-validation after tuning for XGBoost:")
train_rmse_mean, train_rmse_std, train_r2_mean, train_r2_std = evaluate_model(best_model, X_train, y_train, cv_folds=5)
print(f"CV Mean RMSE (after tuning): {train_rmse_mean}, CV RMSE Std: {train_rmse_std}")
print(f"CV Mean R² (after tuning): {train_r2_mean}, CV R² Std: {train_r2_std}\n")

# Step 2: Testing on the test set
y_test_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE
test_r2 = r2_score(y_test, y_test_pred)

# Test CV scores
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test, cv_folds=5)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for XGBoost...
Best XGBoost model: {'colsample_bytree': 1.0, 'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 150, 'subsample': 0.7}
Cross-validation after tuning for XGBoost:
CV Mean RMSE (after tuning): 0.30264910238362186, CV RMSE Std: 0.07346926240677541
CV Mean R² (after tuning): 0.8739823341369629, CV R² Std: 0.02821056692247991





Test RMSE: 0.5069311578811253
Test R²: 0.8990568518638611
Test CV Mean RMSE: 0.8180029381265006, Test CV RMSE Std: 0.11758197709668125
Test CV Mean R²: 0.675088107585907, Test CV R² Std: 0.02447463625002074



In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import AdaBoostRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data = pd.read_excel(r"C:\Users\dhanu\OneDrive\Desktop\machine learning\ML TRAIN DATASETS\train_t5_embeddings.xlsx")

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of variance
X_pca = pca.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# AdaBoost model
model = AdaBoostRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100],  # Reduced options for faster execution
    'learning_rate': [0.01, 0.1],  # Reduced learning rate options
    'loss': ['linear', 'square']  # Reduced loss functions for faster testing
}

# Hyperparameter tuning and evaluation
print(f"Training and hyperparameter tuning for AdaBoost...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced cv folds to speed up
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best AdaBoost model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y, cv_folds=5):
    # Cross-validation with 5 folds (default 5)
    cv_scores_rmse = cross_val_score(model, X, y, cv=cv_folds, scoring='neg_mean_squared_error')  # Reduced folds
    cv_scores_r2 = cross_val_score(model, X, y, cv=cv_folds, scoring='r2')  # Reduced folds

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Step 1: Cross-validation after tuning (on training set)
print(f"Cross-validation after tuning for AdaBoost:")
train_rmse_mean, train_rmse_std, train_r2_mean, train_r2_std = evaluate_model(best_model, X_train, y_train, cv_folds=5)
print(f"CV Mean RMSE (after tuning): {train_rmse_mean}, CV RMSE Std: {train_rmse_std}")
print(f"CV Mean R² (after tuning): {train_r2_mean}, CV R² Std: {train_r2_std}\n")

# Step 2: Testing on the test set
y_test_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE
test_r2 = r2_score(y_test, y_test_pred)

# Test CV scores
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test, cv_folds=5)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for AdaBoost...


  _data = np.array(data, dtype=dtype, copy=copy,


Best AdaBoost model: {'learning_rate': 0.1, 'loss': 'square', 'n_estimators': 100}
Cross-validation after tuning for AdaBoost:
CV Mean RMSE (after tuning): 0.9332836827304088, CV RMSE Std: 0.06543115382323982
CV Mean R² (after tuning): 0.6101715510278675, CV R² Std: 0.02223883896145461





Test RMSE: 1.015017723202473
Test R²: 0.5953065199531684
Test CV Mean RMSE: 0.930766641580392, Test CV RMSE Std: 0.11734322898608636
Test CV Mean R²: 0.6291314952795777, Test CV R² Std: 0.029968676539522374



In [5]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data = pd.read_excel(r"C:\Users\dhanu\OneDrive\Desktop\machine learning\ML TRAIN DATASETS\train_t5_embeddings.xlsx")

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of variance
X_pca = pca.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Gradient Boosting model
model = GradientBoostingRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [50, 100],  # Reduced options for faster execution
    'learning_rate': [0.01, 0.1],  # Reduced learning rate options
    'max_depth': [3, 5],  # Reduced depth for faster training
    'subsample': [0.7, 0.8]  # Reduced subsample options
}

# Hyperparameter tuning and evaluation
print(f"Training and hyperparameter tuning for Gradient Boosting...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced cv folds to speed up
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best Gradient Boosting model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y, cv_folds=5):
    # Cross-validation with 5 folds (default 5)
    cv_scores_rmse = cross_val_score(model, X, y, cv=cv_folds, scoring='neg_mean_squared_error')  # Reduced folds
    cv_scores_r2 = cross_val_score(model, X, y, cv=cv_folds, scoring='r2')  # Reduced folds

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Step 1: Cross-validation after tuning (on training set)
print(f"Cross-validation after tuning for Gradient Boosting:")
train_rmse_mean, train_rmse_std, train_r2_mean, train_r2_std = evaluate_model(best_model, X_train, y_train, cv_folds=5)
print(f"CV Mean RMSE (after tuning): {train_rmse_mean}, CV RMSE Std: {train_rmse_std}")
print(f"CV Mean R² (after tuning): {train_r2_mean}, CV R² Std: {train_r2_std}\n")

# Step 2: Testing on the test set
y_test_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE
test_r2 = r2_score(y_test, y_test_pred)

# Test CV scores
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test, cv_folds=5)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for Gradient Boosting...
Best Gradient Boosting model: {'learning_rate': 0.1, 'max_depth': 5, 'n_estimators': 100, 'subsample': 0.8}
Cross-validation after tuning for Gradient Boosting:
CV Mean RMSE (after tuning): 0.31425095011185067, CV RMSE Std: 0.0774208829864673
CV Mean R² (after tuning): 0.8691863319167373, CV R² Std: 0.029663850790337676





Test RMSE: 0.5073878232713636
Test R²: 0.8988748829208911
Test CV Mean RMSE: 0.951590126592565, Test CV RMSE Std: 0.17648237777062908
Test CV Mean R²: 0.6187794594053193, Test CV R² Std: 0.06942117873365039



In [6]:
import pandas as pd
import numpy as np
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data = pd.read_excel(r"C:\Users\dhanu\OneDrive\Desktop\machine learning\ML TRAIN DATASETS\train_t5_embeddings.xlsx")

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Preserve 95% of variance
X_pca = pca.fit_transform(X)

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# Random Forest model
model = RandomForestRegressor(random_state=42)

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'n_estimators': [100, 200],  # Reduced estimators for faster execution
    'max_depth': [None, 10, 20],  # Reduced depth options
    'min_samples_split': [2, 5],  # Limited to lower values for faster computation
    'min_samples_leaf': [1, 2],   # Limited leaf size options
    'bootstrap': [True]           # No need to test False for bootstrap
}

# Hyperparameter tuning and evaluation
print(f"Training and hyperparameter tuning for Random Forest...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced cv folds to speed up
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best Random Forest model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y, cv_folds=5):
    # Cross-validation with 5 folds (default 5)
    cv_scores_rmse = cross_val_score(model, X, y, cv=cv_folds, scoring='neg_mean_squared_error')  # Reduced folds
    cv_scores_r2 = cross_val_score(model, X, y, cv=cv_folds, scoring='r2')  # Reduced folds

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Step 1: Cross-validation after tuning (on training set)
print(f"Cross-validation after tuning for Random Forest:")
train_rmse_mean, train_rmse_std, train_r2_mean, train_r2_std = evaluate_model(best_model, X_train, y_train, cv_folds=5)
print(f"CV Mean RMSE (after tuning): {train_rmse_mean}, CV RMSE Std: {train_rmse_std}")
print(f"CV Mean R² (after tuning): {train_r2_mean}, CV R² Std: {train_r2_std}\n")

# Step 2: Testing on the test set
y_test_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE
test_r2 = r2_score(y_test, y_test_pred)

# Test CV scores
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test, cv_folds=5)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Training and hyperparameter tuning for Random Forest...
Best Random Forest model: {'bootstrap': True, 'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Cross-validation after tuning for Random Forest:
CV Mean RMSE (after tuning): 0.43973313810891695, CV RMSE Std: 0.08149588294507212
CV Mean R² (after tuning): 0.8167652011797897, CV R² Std: 0.03036305273527818





Test RMSE: 0.6677125071949344
Test R²: 0.8248710357921665
Test CV Mean RMSE: 1.0298020901898732, Test CV RMSE Std: 0.15767461895020266
Test CV Mean R²: 0.5909487222130052, Test CV R² Std: 0.03743159712114883



In [7]:
import pandas as pd
from sklearn.svm import SVR
from sklearn.model_selection import GridSearchCV, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.metrics import mean_squared_error, r2_score

# Load the dataset
data = pd.read_excel(r"C:\Users\dhanu\OneDrive\Desktop\machine learning\ML TRAIN DATASETS\train_t5_embeddings.xlsx")

# Features and target variable
X = data.iloc[:, :-1]  # All columns except the last one
y = data['output']     # Target variable

# Standardize the data
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Apply PCA for dimensionality reduction
pca = PCA(n_components=0.95)  # Retain 95% of variance
X_pca = pca.fit_transform(X_scaled)
print(f"Number of components selected by PCA: {pca.n_components_}")

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_pca, y, test_size=0.2, random_state=42)

# SVR model
model = SVR()

# Hyperparameter tuning using GridSearchCV
param_grid = {
    'C': [0.1, 1, 10],  # Reduced values for faster training
    'kernel': ['linear', 'rbf'],  # Focus on common kernels
    'gamma': ['scale'],  # Common gamma values
}

# Hyperparameter tuning and evaluation
print(f"Training and hyperparameter tuning for SVR...")
grid_search = GridSearchCV(model, param_grid, cv=5, scoring='neg_mean_squared_error', n_jobs=-1)  # Reduced cv folds to speed up
grid_search.fit(X_train, y_train)
best_model = grid_search.best_estimator_
print(f"Best SVR model: {grid_search.best_params_}")

# Function to calculate and return performance metrics
def evaluate_model(model, X, y, cv_folds=5):
    # Cross-validation with 5 folds (default 5)
    cv_scores_rmse = cross_val_score(model, X, y, cv=cv_folds, scoring='neg_mean_squared_error')  # Reduced folds
    cv_scores_r2 = cross_val_score(model, X, y, cv=cv_folds, scoring='r2')  # Reduced folds

    # Compute mean and standard deviation of CV scores
    rmse_mean = -cv_scores_rmse.mean()  # Convert negative RMSE to positive
    rmse_std = cv_scores_rmse.std()
    r2_mean = cv_scores_r2.mean()
    r2_std = cv_scores_r2.std()

    return rmse_mean, rmse_std, r2_mean, r2_std

# Step 1: Cross-validation after tuning (on training set)
print(f"Cross-validation after tuning for SVR:")
train_rmse_mean, train_rmse_std, train_r2_mean, train_r2_std = evaluate_model(best_model, X_train, y_train, cv_folds=5)
print(f"CV Mean RMSE (after tuning): {train_rmse_mean}, CV RMSE Std: {train_rmse_std}")
print(f"CV Mean R² (after tuning): {train_r2_mean}, CV R² Std: {train_r2_std}\n")

# Step 2: Testing on the test set
y_test_pred = best_model.predict(X_test)
test_rmse = mean_squared_error(y_test, y_test_pred, squared=False)  # RMSE
test_r2 = r2_score(y_test, y_test_pred)

# Test CV scores
test_cv_rmse_mean, test_cv_rmse_std, test_cv_r2_mean, test_cv_r2_std = evaluate_model(best_model, X_test, y_test, cv_folds=5)

print(f"Test RMSE: {test_rmse}")
print(f"Test R²: {test_r2}")
print(f"Test CV Mean RMSE: {test_cv_rmse_mean}, Test CV RMSE Std: {test_cv_rmse_std}")
print(f"Test CV Mean R²: {test_cv_r2_mean}, Test CV R² Std: {test_cv_r2_std}\n")


Number of components selected by PCA: 214
Training and hyperparameter tuning for SVR...
Best SVR model: {'C': 10, 'gamma': 'scale', 'kernel': 'rbf'}
Cross-validation after tuning for SVR:
CV Mean RMSE (after tuning): 0.16413393490872874, CV RMSE Std: 0.03617572079374697
CV Mean R² (after tuning): 0.9314636635239119, CV R² Std: 0.014444859261490901





Test RMSE: 0.3664838170652731
Test R²: 0.9472419711762071
Test CV Mean RMSE: 0.4009073041761133, Test CV RMSE Std: 0.07830549772424744
Test CV Mean R²: 0.8392390758780021, Test CV R² Std: 0.03144014144900435

