In [39]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from scipy.stats import pearsonr, spearmanr, kendalltau
import warnings
# Silence every warning category for the rest of the session.
# NOTE(review): this also hides warnings emitted while models are being fitted
# further down (e.g. from the hyperparameter searches) — consider narrowing the filter.
warnings.filterwarnings('ignore')

# Remove pandas' display limits so printed frames show all rows and all columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [40]:
class RegressionModelTrainer:
    """
    A comprehensive class for training multiple regression models on scaled feature datasets
    with hyperparameter tuning and performance evaluation.
    """
    
    def __init__(self, features_base_path, labels_path, models_save_path):
        """
        Initialize the trainer with paths and configuration.

        Construction has side effects: one sub-directory per configured model
        is created under ``models_save_path`` and the labels CSV is read
        (``self.labels_df`` is ``None`` when that read fails).

        Args:
            features_base_path: Base path to scaled features directory
            labels_path: Path to the labels CSV file
            models_save_path: Base path to save trained models
        """
        self.features_base_path = features_base_path
        self.labels_path = labels_path
        self.models_save_path = models_save_path

        # Feature files are looked up as
        # <features_base_path>/<scaling_method>/<dataset_type>_<scaling_method>.csv
        self.scaling_methods = ['log', 'log1p', 'minmax', 'power', 'quantile', 'robust', 'standard']
        self.dataset_types = ['all_features', 'by_component', 's_only']
        # Column names expected in the labels CSV; one model is trained per label.
        self.labels = ['TSV', 'B', 'SR', 'S', 'U', 'O']

        # Each entry maps a model name to its estimator class ('model') and the
        # hyperparameter search space ('params') sampled during tuning.
        self.models = {
            'linear_regression': {
                'model': LinearRegression,
                'params': {}  # Linear regression has no hyperparameters to tune
            },
            'svr': {
                'model': SVR,
                'params': {
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                    'kernel': ['rbf', 'poly', 'sigmoid'],
                    'epsilon': [0.01, 0.1, 0.2]
                }
            },
            'random_forest': {
                'model': RandomForestRegressor,
                'params': {
                    'n_estimators': [50, 100, 200, 300],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    # 'auto' (all features, for a regressor) was deprecated in
                    # scikit-learn 1.1 and removed in 1.3, where it makes every
                    # sampled candidate using it fail. 1.0 requests the same
                    # "use all features" behaviour on old and new versions.
                    'max_features': [1.0, 'sqrt', 'log2']
                }
            },
            'gbm': {
                'model': GradientBoostingRegressor,
                'params': {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'subsample': [0.8, 0.9, 1.0]
                }
            }
        }

        # Create directories for saving models
        self._create_model_directories()

        # Load labels (None if the file could not be read)
        self.labels_df = self._load_labels()

        # One dict per trained model is appended here by train_all_models()
        self.results = []
    
    def _create_model_directories(self):
        """Create directories for saving trained models."""
        for model_name in self.models.keys():
            model_dir = os.path.join(self.models_save_path, model_name)
            os.makedirs(model_dir, exist_ok=True)
    
    def _load_labels(self):
        """Load the labels CSV file."""
        try:
            labels_df = pd.read_csv(self.labels_path)
            print(f"Labels loaded successfully. Shape: {labels_df.shape}\n\n")
            return labels_df
        except Exception as e:
            print(f"Error loading labels: {e}\n\n")
            return None
    
    def _load_features(self, scaling_method, dataset_type):
        """
        Load features for a specific scaling method and dataset type.
        
        Args:
            scaling_method: The scaling method (e.g., 'log', 'minmax')
            dataset_type: The dataset type (e.g., 'all_features', 'by_component')
        
        Returns:
            pandas.DataFrame: Loaded features
        """
        filename = f"{dataset_type}_{scaling_method}.csv"
        filepath = os.path.join(self.features_base_path, scaling_method, filename)
        
        try:
            features_df = pd.read_csv(filepath)
            features_df = features_df.drop('videoname', axis=1)
            print(f"Features loaded: {filename}, Shape: {features_df.shape}\n\n")
            return features_df
        except Exception as e:
            print(f"Error loading {filename}: {e}\n\n")
            return None
    
    def _calculate_metrics(self, y_true, y_pred):
        """
        Calculate evaluation metrics: PLCC, SRCC, KRCC, RMSE.
        
        Args:
            y_true: True values
            y_pred: Predicted values
            
        Returns:
            dict: Dictionary containing all metrics
        """
        # Remove NaN values
        mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean = y_true[mask]
        y_pred_clean = y_pred[mask]
        
        if len(y_true_clean) == 0:
            return {'PLCC': np.nan, 'SRCC': np.nan, 'KRCC': np.nan, 'RMSE': np.nan}
        
        # PLCC (Pearson Linear Correlation Coefficient)
        plcc, _ = pearsonr(y_true_clean, y_pred_clean)
        
        # SRCC (Spearman Rank Correlation Coefficient)
        srcc, _ = spearmanr(y_true_clean, y_pred_clean)
        
        # KRCC (Kendall Rank Correlation Coefficient)
        krcc, _ = kendalltau(y_true_clean, y_pred_clean)
        
        # RMSE (Root Mean Square Error)
        rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
        
        return {
            'PLCC': plcc,
            'SRCC': srcc,
            'KRCC': krcc,
            'RMSE': rmse
        }
    
    def _train_model(self, model_name, X_train, y_train, X_test, y_test):
        """
        Train a single model with hyperparameter tuning.
        
        Args:
            model_name: Name of the model to train
            X_train, y_train: Training data
            X_test, y_test: Testing data
            
        Returns:
            tuple: (trained_model, metrics_dict)
        """
        model_config = self.models[model_name]
        model_class = model_config['model']
        param_grid = model_config['params']
        
        print(f"    Training {model_name}...")
        
        if model_name == 'linear_regression':
            # No hyperparameter tuning needed for linear regression
            model = model_class()
            model.fit(X_train, y_train)
            best_model = model
        else:
            # Use RandomizedSearchCV for faster hyperparameter tuning
            if model_name == 'svr':
                model = model_class()
            else:
                model = model_class(random_state=42)
            
            # Adjust search strategy based on model complexity
            if model_name in ['svr', 'gbm']:
                search_cv = RandomizedSearchCV(
                    model, param_grid, n_iter=20, cv=3, 
                    scoring='neg_mean_squared_error', random_state=42, n_jobs=-1
                )
            else:
                search_cv = RandomizedSearchCV(
                    model, param_grid, n_iter=30, cv=3,
                    scoring='neg_mean_squared_error', random_state=42, n_jobs=-1
                )
            
            search_cv.fit(X_train, y_train)
            best_model = search_cv.best_estimator_
            print(f"      Best parameters: {search_cv.best_params_}")
        print("\n")
        # Make predictions
        y_pred = best_model.predict(X_test)
        
        # Calculate metrics
        metrics = self._calculate_metrics(y_test, y_pred)
        
        return best_model, metrics
    
    def _save_model(self, model, model_name, dataset_name, label_name):
        """
        Save a trained model to disk.
        
        Args:
            model: Trained model object
            model_name: Name of the model type
            dataset_name: Name of the dataset
            label_name: Name of the target label
        """
        model_dir = os.path.join(self.models_save_path, model_name)
        filename = f"{dataset_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        
        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model, f)
            print(f"      Model saved: {filepath}")
        except Exception as e:
            print(f"      Error saving model {filepath}: {e}")
        print("\n")
    
    def train_all_models(self, test_size=0.2, random_state=42):
        """
        Train all models on all datasets for all labels.
        
        Args:
            test_size: Proportion of data to use for testing
            random_state: Random seed for reproducibility
        """
        print("Starting comprehensive model training...")
        print(f"Total combinations: {len(self.scaling_methods)} scaling methods × "
              f"{len(self.dataset_types)} dataset types × {len(self.labels)} labels × "
              f"{len(self.models)} models = {len(self.scaling_methods) * len(self.dataset_types) * len(self.labels) * len(self.models)} models")
        
        total_models = 0
        successful_models = 0
        
        for scaling_method in self.scaling_methods:
            for dataset_type in self.dataset_types:
                dataset_name = f"{dataset_type}_{scaling_method}"
                print(f"\nProcessing dataset: {dataset_name}")
                
                # Load features
                features_df = self._load_features(scaling_method, dataset_type)
                if features_df is None:
                    print(f"  Skipping {dataset_name} due to loading error")
                    continue
                
                for label in self.labels:
                    print(f"  Target label: {label}")
                    
                    # Check if label exists in labels dataframe
                    if label not in self.labels_df.columns:
                        print(f"    Label {label} not found in labels file")
                        continue
                    
                    # Prepare data
                    y = self.labels_df[label].values
                    X = features_df.values
                    
                    # Check for matching dimensions
                    if len(X) != len(y):
                        print(f"    Dimension mismatch: Features={len(X)}, Labels={len(y)}")
                        continue
                    
                    # Remove samples with NaN values
                    mask = ~(np.isnan(y) | np.isnan(X).any(axis=1))
                    X_clean = X[mask]
                    y_clean = y[mask]
                    
                    if len(X_clean) == 0:
                        print(f"    No valid samples after cleaning")
                        continue
                    
                    # Split data
                    X_train, X_test, y_train, y_test = train_test_split(
                        X_clean, y_clean, test_size=test_size, random_state=random_state
                    )
                    
                    # Train each model
                    for model_name in self.models.keys():
                        total_models += 1
                        
                        try:
                            model, metrics = self._train_model(
                                model_name, X_train, y_train, X_test, y_test
                            )
                            
                            # Save model
                            self._save_model(model, model_name, dataset_name, label)
                            
                            # Store results
                            result = {
                                'scaling_method': scaling_method,
                                'dataset_type': dataset_type,
                                'dataset_name': dataset_name,
                                'label': label,
                                'model': model_name,
                                'train_samples': len(X_train),
                                'test_samples': len(X_test),
                                **metrics
                            }
                            self.results.append(result)
                            
                            successful_models += 1
                            
                            print(f"      ✓ {model_name} - PLCC: {metrics['PLCC']:.4f}, "
                                  f"SRCC: {metrics['SRCC']:.4f}, KRCC: {metrics['KRCC']:.4f}, "
                                  f"RMSE: {metrics['RMSE']:.4f}")
                            
                        except Exception as e:
                            print(f"      ✗ {model_name} failed: {e}")
        
        print(f"\nTraining completed!")
        print(f"Successfully trained: {successful_models}/{total_models} models\n\n")
        
        # Save results to CSV
        self.save_results()
    
    def save_results(self, filename="training_results.csv"):
        """Save all training results to a CSV file."""
        if self.results:
            results_df = pd.DataFrame(self.results)
            filepath = os.path.join(self.models_save_path, filename)
            results_df.to_csv(filepath, index=False)
            print(f"\nResults saved to: {filepath}")
            
            # Display summary statistics
            self.display_results_summary(results_df)
        else:
            print("No results to save.")

        print("\n")
    
    def display_results_summary(self, results_df):
        """Display summary statistics of the training results."""
        print("\n" + "="*80)
        print("TRAINING RESULTS SUMMARY")
        print("="*80)
        
        # Best models by metric
        metrics = ['PLCC', 'SRCC', 'KRCC', 'RMSE']
        
        for metric in metrics:
            print(f"\nBest {metric}:")
            if metric == 'RMSE':
                best_result = results_df.loc[results_df[metric].idxmin()]
                print(f"  {best_result['model']} on {best_result['dataset_name']} "
                      f"for {best_result['label']}: {best_result[metric]:.4f}")
            else:
                best_result = results_df.loc[results_df[metric].idxmax()]
                print(f"  {best_result['model']} on {best_result['dataset_name']} "
                      f"for {best_result['label']}: {best_result[metric]:.4f}")
        
        # Average performance by model
        print(f"\nAverage Performance by Model:")
        model_avg = results_df.groupby('model')[metrics].mean()
        for model in model_avg.index:
            print(f"  {model}:")
            for metric in metrics:
                print(f"    {metric}: {model_avg.loc[model, metric]:.4f}")
        
        # Average performance by dataset
        print(f"\nTop 5 Dataset Configurations (by PLCC):")
        dataset_avg = results_df.groupby(['dataset_name', 'label'])['PLCC'].mean().sort_values(ascending=False).head()
        for (dataset, label), plcc in dataset_avg.items():
            print(f"  {dataset} → {label}: {plcc:.4f}")
        print("\n")

In [41]:
class RegressionModelTrainer:
    """
    A comprehensive class for training multiple regression models on scaled feature datasets
    with hyperparameter tuning and performance evaluation.
    """
    
    def __init__(self, features_base_path, labels_path, models_save_path):
        """
        Initialize the trainer with paths and configuration.

        Construction has side effects: one sub-directory per configured model
        is created under ``models_save_path`` and the labels CSV is read
        (``self.labels_df`` is ``None`` when that read fails).

        Args:
            features_base_path: Base path to scaled features directory
            labels_path: Path to the labels CSV file
            models_save_path: Base path to save trained models
        """
        self.features_base_path = features_base_path
        self.labels_path = labels_path
        self.models_save_path = models_save_path

        # Feature files are looked up as
        # <features_base_path>/<scaling_method>/<dataset_type>_<scaling_method>.csv
        self.scaling_methods = ['log', 'log1p', 'minmax', 'power', 'quantile', 'robust', 'standard']
        self.dataset_types = ['all_features', 'by_component', 's_only']
        # Column names expected in the labels CSV; one model is trained per label.
        self.labels = ['TSV', 'B', 'SR', 'S', 'U', 'O']

        # Each entry maps a model name to its estimator class ('model') and the
        # hyperparameter search space ('params') sampled during tuning.
        self.models = {
            'linear_regression': {
                'model': LinearRegression,
                'params': {}  # Linear regression has no hyperparameters to tune
            },
            'ridge_regression': {
                'model': Ridge,
                'params': {
                    'alpha': [0.01, 0.1, 1, 10, 100, 1000],
                    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga'],
                    'max_iter': [1000, 2000, 3000]
                }
            },
            'svr': {
                'model': SVR,
                'params': {
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                    'kernel': ['rbf', 'poly', 'sigmoid'],
                    'epsilon': [0.01, 0.1, 0.2]
                }
            },
            'random_forest': {
                'model': RandomForestRegressor,
                'params': {
                    'n_estimators': [50, 100, 200, 300],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    # 'auto' (all features, for a regressor) was deprecated in
                    # scikit-learn 1.1 and removed in 1.3, where it makes every
                    # sampled candidate using it fail. 1.0 requests the same
                    # "use all features" behaviour on old and new versions.
                    'max_features': [1.0, 'sqrt', 'log2']
                }
            },
            'gbm': {
                'model': GradientBoostingRegressor,
                'params': {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'subsample': [0.8, 0.9, 1.0]
                }
            },
            'mlp': {
                'model': MLPRegressor,
                'params': {
                    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100), (100, 50, 25)],
                    'activation': ['relu', 'tanh', 'logistic'],
                    'solver': ['adam', 'lbfgs', 'sgd'],
                    'alpha': [0.0001, 0.001, 0.01, 0.1],
                    'learning_rate': ['constant', 'invscaling', 'adaptive'],
                    'max_iter': [500, 1000, 2000]
                }
            }
        }

        # Create directories for saving models
        self._create_model_directories()

        # Load labels (None if the file could not be read)
        self.labels_df = self._load_labels()

        # One dict per evaluated model is appended here by train_all_models()
        self.results = []
    
    def _create_model_directories(self):
        """Create directories for saving trained models."""
        for model_name in self.models.keys():
            model_dir = os.path.join(self.models_save_path, model_name)
            os.makedirs(model_dir, exist_ok=True)
    
    def _load_labels(self):
        """Load the labels CSV file."""
        try:
            labels_df = pd.read_csv(self.labels_path)
            print(f"Labels loaded successfully. Shape: {labels_df.shape}\n")
            return labels_df
        except Exception as e:
            print(f"Error loading labels: {e}\n")
            return None
    
    def _load_features(self, scaling_method, dataset_type):
        """
        Load features for a specific scaling method and dataset type.
        
        Args:
            scaling_method: The scaling method (e.g., 'log', 'minmax')
            dataset_type: The dataset type (e.g., 'all_features', 'by_component')
        
        Returns:
            pandas.DataFrame: Loaded features
        """
        filename = f"{dataset_type}_{scaling_method}.csv"
        filepath = os.path.join(self.features_base_path, scaling_method, filename)
        
        try:
            features_df = pd.read_csv(filepath)
            features_df = features_df.drop('videoname', axis=1)
            print(f"    Features loaded: {filename}, Shape: {features_df.shape}")
            return features_df
        except Exception as e:
            print(f"    Error loading {filename}: {e}")
            return None
    
    def _model_exists(self, model_name, dataset_name, label_name):
        """
        Check if a trained model already exists.
        
        Args:
            model_name: Name of the model type
            dataset_name: Name of the dataset
            label_name: Name of the target label
            
        Returns:
            bool: True if model file exists, False otherwise
        """
        model_dir = os.path.join(self.models_save_path, model_name)
        filename = f"{dataset_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        return os.path.exists(filepath)
    
    def _load_existing_model(self, model_name, dataset_name, label_name):
        """
        Load an existing trained model from disk.
        
        Args:
            model_name: Name of the model type
            dataset_name: Name of the dataset
            label_name: Name of the target label
            
        Returns:
            model: Loaded model object or None if loading fails
        """
        model_dir = os.path.join(self.models_save_path, model_name)
        filename = f"{dataset_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        
        try:
            with open(filepath, 'rb') as f:
                model = pickle.load(f)
            print(f"      Loaded existing model: {filepath}")
            return model
        except Exception as e:
            print(f"      Error loading existing model {filepath}: {e}")
            return None
    
    def _calculate_metrics(self, y_true, y_pred):
        """
        Calculate evaluation metrics: PLCC, SRCC, KRCC, RMSE.
        
        Args:
            y_true: True values
            y_pred: Predicted values
            
        Returns:
            dict: Dictionary containing all metrics
        """
        # Remove NaN values
        mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean = y_true[mask]
        y_pred_clean = y_pred[mask]
        
        if len(y_true_clean) == 0:
            return {'PLCC': np.nan, 'SRCC': np.nan, 'KRCC': np.nan, 'RMSE': np.nan}
        
        # PLCC (Pearson Linear Correlation Coefficient)
        plcc, _ = pearsonr(y_true_clean, y_pred_clean)
        
        # SRCC (Spearman Rank Correlation Coefficient)
        srcc, _ = spearmanr(y_true_clean, y_pred_clean)
        
        # KRCC (Kendall Rank Correlation Coefficient)
        krcc, _ = kendalltau(y_true_clean, y_pred_clean)
        
        # RMSE (Root Mean Square Error)
        rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
        
        return {
            'PLCC': plcc,
            'SRCC': srcc,
            'KRCC': krcc,
            'RMSE': rmse
        }
    
    def _train_model(self, model_name, X_train, y_train, X_test, y_test):
        """
        Train a single model with hyperparameter tuning.
        
        Args:
            model_name: Name of the model to train
            X_train, y_train: Training data
            X_test, y_test: Testing data
            
        Returns:
            tuple: (trained_model, metrics_dict)
        """
        model_config = self.models[model_name]
        model_class = model_config['model']
        param_grid = model_config['params']
        
        print(f"    Training {model_name}...")
        
        if model_name in ['linear_regression']:
            # No hyperparameter tuning needed for simple models
            model = model_class()
            model.fit(X_train, y_train)
            best_model = model
        else:
            # Use RandomizedSearchCV for other models
            if model_name == 'svr':
                model = model_class()
            elif model_name == 'mlp':
                model = model_class(random_state=42, early_stopping=True, validation_fraction=0.1)
            else:
                model = model_class(random_state=42)
            
            # Adjust search strategy based on model complexity
            if model_name in ['svr', 'gbm', 'mlp']:
                search_cv = RandomizedSearchCV(
                    model, param_grid, n_iter=20, cv=3, 
                    scoring='neg_mean_squared_error', random_state=42, n_jobs=-1
                )
            else:
                search_cv = RandomizedSearchCV(
                    model, param_grid, n_iter=30, cv=3,
                    scoring='neg_mean_squared_error', random_state=42, n_jobs=-1
                )
            
            search_cv.fit(X_train, y_train)
            best_model = search_cv.best_estimator_
            print(f"      Best parameters: {search_cv.best_params_}")
        
        # Make predictions
        y_pred = best_model.predict(X_test)
        
        # Calculate metrics
        metrics = self._calculate_metrics(y_test, y_pred)
        
        return best_model, metrics
    
    def _save_model(self, model, model_name, dataset_name, label_name):
        """
        Save a trained model to disk.
        
        Args:
            model: Trained model object
            model_name: Name of the model type
            dataset_name: Name of the dataset
            label_name: Name of the target label
        """
        model_dir = os.path.join(self.models_save_path, model_name)
        filename = f"{dataset_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        
        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model, f)
            print(f"      Model saved: {filepath}")
        except Exception as e:
            print(f"      Error saving model {filepath}: {e}")
    
    def train_all_models(self, test_size=0.2, random_state=42, force_retrain=False):
        """
        Train all models on all datasets for all labels.
        
        Args:
            test_size: Proportion of data to use for testing
            random_state: Random seed for reproducibility
            force_retrain: If True, retrain even if model exists
        """
        print("=" * 80)
        print("STARTING COMPREHENSIVE MODEL TRAINING")
        print("=" * 80)
        print(f"Total combinations: {len(self.scaling_methods)} scaling methods × "
              f"{len(self.dataset_types)} dataset types × {len(self.labels)} labels × "
              f"{len(self.models)} models = {len(self.scaling_methods) * len(self.dataset_types) * len(self.labels) * len(self.models)} models\n")
        
        total_models = 0
        successful_models = 0
        skipped_models = 0
        
        for scaling_method in self.scaling_methods:
            print(f"🔄 Processing scaling method: {scaling_method.upper()}")
            print("-" * 60)
            
            for dataset_type in self.dataset_types:
                dataset_name = f"{dataset_type}_{scaling_method}"
                print(f"\n📊 Dataset: {dataset_name}")
                
                # Load features
                features_df = self._load_features(scaling_method, dataset_type)
                if features_df is None:
                    print(f"    ⚠️  Skipping {dataset_name} due to loading error\n")
                    continue
                
                for label in self.labels:
                    print(f"\n  🎯 Target label: {label}")
                    
                    # Check if label exists in labels dataframe
                    if label not in self.labels_df.columns:
                        print(f"    ❌ Label {label} not found in labels file")
                        continue
                    
                    # Prepare data
                    y = self.labels_df[label].values
                    X = features_df.values
                    
                    # Check for matching dimensions
                    if len(X) != len(y):
                        print(f"    ❌ Dimension mismatch: Features={len(X)}, Labels={len(y)}")
                        continue
                    
                    # Remove samples with NaN values
                    mask = ~(np.isnan(y) | np.isnan(X).any(axis=1))
                    X_clean = X[mask]
                    y_clean = y[mask]
                    
                    if len(X_clean) == 0:
                        print(f"    ❌ No valid samples after cleaning")
                        continue
                    
                    # Split data
                    X_train, X_test, y_train, y_test = train_test_split(
                        X_clean, y_clean, test_size=test_size, random_state=random_state
                    )
                    
                    print(f"    📈 Training samples: {len(X_train)}, Test samples: {len(X_test)}")
                    
                    # Train each model
                    for model_name in self.models.keys():
                        total_models += 1
                        print(f"\n    🤖 Model: {model_name}")
                        
                        # Check if model already exists
                        if not force_retrain and self._model_exists(model_name, dataset_name, label):
                            print(f"      ⏭️  Model already exists, loading...")
                            
                            # Load existing model and evaluate on test set
                            try:
                                existing_model = self._load_existing_model(model_name, dataset_name, label)
                                if existing_model is not None:
                                    # Make predictions with existing model
                                    y_pred = existing_model.predict(X_test)
                                    
                                    # Calculate metrics
                                    metrics = self._calculate_metrics(y_test, y_pred)
                                    
                                    # Store results
                                    result = {
                                        'scaling_method': scaling_method,
                                        'dataset_type': dataset_type,
                                        'dataset_name': dataset_name,
                                        'label': label,
                                        'model': model_name,
                                        'train_samples': len(X_train),
                                        'test_samples': len(X_test),
                                        **metrics
                                    }
                                    self.results.append(result)
                                    
                                    print(f"      ✅ Performance - PLCC: {metrics['PLCC']:.4f}, "
                                          f"SRCC: {metrics['SRCC']:.4f}, KRCC: {metrics['KRCC']:.4f}, "
                                          f"RMSE: {metrics['RMSE']:.4f}")
                                    
                                    successful_models += 1
                                    skipped_models += 1
                                else:
                                    print(f"      ⚠️  Failed to load existing model, will retrain")
                                    # Continue to training section below
                            except Exception as e:
                                print(f"      ⚠️  Error with existing model: {e}, will retrain")
                                # Continue to training section below
                            else:
                                continue  # Skip to next model if existing model was successfully loaded
                        
                        # Train new model
                        try:
                            model, metrics = self._train_model(
                                model_name, X_train, y_train, X_test, y_test
                            )
                            
                            # Save model
                            self._save_model(model, model_name, dataset_name, label)
                            
                            # Store results
                            result = {
                                'scaling_method': scaling_method,
                                'dataset_type': dataset_type,
                                'dataset_name': dataset_name,
                                'label': label,
                                'model': model_name,
                                'train_samples': len(X_train),
                                'test_samples': len(X_test),
                                **metrics
                            }
                            self.results.append(result)
                            
                            successful_models += 1
                            
                            print(f"      ✅ Performance - PLCC: {metrics['PLCC']:.4f}, "
                                  f"SRCC: {metrics['SRCC']:.4f}, KRCC: {metrics['KRCC']:.4f}, "
                                  f"RMSE: {metrics['RMSE']:.4f}")
                            
                        except Exception as e:
                            print(f"      ❌ Training failed: {e}")
            
            print(f"\n{'='*60}")
            print(f"Completed scaling method: {scaling_method.upper()}")
            print(f"{'='*60}\n")
        
        print("=" * 80)
        print("TRAINING COMPLETED!")
        print("=" * 80)
        print(f"✅ Successfully trained/loaded: {successful_models}/{total_models} models")
        print(f"⏭️  Skipped (already existed): {skipped_models}/{total_models} models")
        print(f"🆕 Newly trained: {successful_models - skipped_models}/{total_models} models")
        print("=" * 80)
        
        # Save results to CSV
        self.save_results()
    
    def save_results(self, filename="training_results.csv"):
        """
        Write the accumulated training results to a CSV file and print a summary.

        The rows in ``self.results`` are converted to a DataFrame and written
        (without the index) to ``os.path.join(self.models_save_path, filename)``.
        The destination directory is created when it does not exist yet, so a
        fresh ``models_save_path`` no longer makes the export fail.

        Args:
            filename: Name of the CSV file, relative to ``self.models_save_path``.

        Returns:
            None. When ``self.results`` is empty nothing is written and a
            notice is printed instead.
        """
        # Guard clause: nothing collected, nothing to write.
        if not self.results:
            print("❌ No results to save.")
            print("\n")
            return

        results_df = pd.DataFrame(self.results)
        filepath = os.path.join(self.models_save_path, filename)

        # to_csv does not create missing folders; do it here. dirname() is empty
        # when both the base path and the filename carry no directory part, and
        # os.makedirs('') would raise, hence the check.
        target_dir = os.path.dirname(filepath)
        if target_dir:
            os.makedirs(target_dir, exist_ok=True)

        results_df.to_csv(filepath, index=False)
        print(f"\n📊 Results saved to: {filepath}")

        # Display summary statistics
        self.display_results_summary(results_df)

        print("\n")
    
    def display_results_summary(self, results_df):
        """
        Print a human-readable summary of the training results.

        Three sections are printed:
          1. the best row for each of PLCC, SRCC, KRCC (highest) and RMSE (lowest),
          2. the mean of every metric grouped by model,
          3. the five (dataset_name, label) pairs with the highest mean PLCC.

        Args:
            results_df: DataFrame holding one row per evaluated model with the
                columns 'model', 'dataset_name', 'label', 'PLCC', 'SRCC',
                'KRCC' and 'RMSE'.

        Returns:
            None. An empty frame produces a short notice instead of a summary,
            and a metric whose values are all NaN is skipped in the "best
            model" section rather than raising.
        """
        print("\n" + "="*80)
        print("🏆 TRAINING RESULTS SUMMARY")
        print("="*80)

        # An empty frame has no metric columns to index, so bail out early.
        if results_df.empty:
            print("\n⚠️  No results available to summarize.")
            print("\n" + "="*80)
            return

        # Best models by metric
        metrics = ['PLCC', 'SRCC', 'KRCC', 'RMSE']

        print("\n🥇 BEST PERFORMING MODELS BY METRIC:")
        print("-" * 50)
        for metric in metrics:
            scores = results_df[metric]

            # idxmax/idxmin ignore NaN entries but cannot pick a winner when
            # every entry is NaN (e.g. correlations of constant predictions).
            if not scores.notna().any():
                print(f"\n  ⚠️  No valid {metric} values to rank.")
                continue

            if metric == 'RMSE':
                # RMSE is an error measure: the smallest value wins.
                best_index = scores.idxmin()
                heading = f"\n  📉 Best {metric} (Lower is better):"
            else:
                # Correlation coefficients: the largest value wins.
                best_index = scores.idxmax()
                heading = f"\n  📈 Best {metric} (Higher is better):"

            best_result = results_df.loc[best_index]
            print(heading)
            print(f"     🎯 {best_result['model']} on {best_result['dataset_name']}")
            print(f"     📊 Label: {best_result['label']}")
            print(f"     🎖️  Score: {best_result[metric]:.4f}")

        # Average performance by model
        print(f"\n\n📊 AVERAGE PERFORMANCE BY MODEL:")
        print("-" * 50)
        model_avg = results_df.groupby('model')[metrics].mean()
        for model in model_avg.index:
            print(f"\n  🤖 {model.upper()}:")
            for metric in metrics:
                print(f"     {metric}: {model_avg.loc[model, metric]:.4f}")

        # Average performance by dataset
        print(f"\n\n🏅 TOP 5 DATASET CONFIGURATIONS (by PLCC):")
        print("-" * 50)
        dataset_avg = (
            results_df.groupby(['dataset_name', 'label'])['PLCC']
            .mean()
            .sort_values(ascending=False)
            .head()
        )
        for i, ((dataset, label), plcc) in enumerate(dataset_avg.items(), 1):
            print(f"  #{i} 📊 {dataset} → {label}: {plcc:.4f}")

        print("\n" + "="*80)

In [44]:
def main(
    features_base_path="../preprocessing/scaling/scaled-features/",
    labels_path="../../dataset/cleaned/cleaned-mos.csv",
    models_save_path="./",
    test_size=0.2,
    random_state=42,
):
    """
    Build a RegressionModelTrainer and run the full training pipeline.

    Every argument defaults to the value this notebook was originally run
    with, so calling ``main()`` without arguments behaves as before; pass
    different values to point the run at another dataset or output folder.

    Args:
        features_base_path: Base path to the scaled features directory.
        labels_path: Path to the labels CSV file.
        models_save_path: Base path under which trained models are saved.
        test_size: Forwarded to ``train_all_models`` as ``test_size``.
        random_state: Forwarded to ``train_all_models`` as ``random_state``.

    Returns:
        None.
    """
    # Initialize the trainer
    trainer = RegressionModelTrainer(
        features_base_path=features_base_path,
        labels_path=labels_path,
        models_save_path=models_save_path,
    )

    # Train all models
    trainer.train_all_models(test_size=test_size, random_state=random_state)

    print("\nAll models have been trained and saved!")

# Entry-point guard: run the pipeline when this code is executed directly
# (the recorded cell output below shows it does run inside the notebook),
# but not when the definitions are imported from another module.
if __name__ == "__main__":
    main()

Labels loaded successfully. Shape: (1000, 7)

STARTING COMPREHENSIVE MODEL TRAINING
Total combinations: 7 scaling methods × 3 dataset types × 6 labels × 6 models = 756 models

🔄 Processing scaling method: LOG
------------------------------------------------------------

📊 Dataset: all_features_log
    Features loaded: all_features_log.csv, Shape: (1000, 1152)

  🎯 Target label: TSV
    📈 Training samples: 800, Test samples: 200

    🤖 Model: linear_regression
      ⏭️  Model already exists, loading...
      Loaded existing model: ./linear_regression\all_features_log_TSV.pkl
      ✅ Performance - PLCC: 0.5913, SRCC: 0.5889, KRCC: 0.4100, RMSE: 0.7316

    🤖 Model: ridge_regression
      ⏭️  Model already exists, loading...
      Loaded existing model: ./ridge_regression\all_features_log_TSV.pkl
      ✅ Performance - PLCC: 0.8526, SRCC: 0.8307, KRCC: 0.6476, RMSE: 0.3487

    🤖 Model: svr
      ⏭️  Model already exists, loading...
      Loaded existing model: ./svr\all_features_log_TSV.p

In [28]:
# Load the saved metrics and order them by label (descending), then by PLCC
# (descending) within each label, in a single chained expression.
results = (
    pd.read_csv('./training_results.csv')
    .sort_values(by=['label', 'PLCC'], ascending=[False, False])
)
results

Unnamed: 0,scaling_method,dataset_type,dataset_name,label,model,train_samples,test_samples,PLCC,SRCC,KRCC,RMSE
377,robust,all_features,all_features_robust,U,svr,800,200,0.85279,0.851481,0.656104,0.416279
401,robust,by_component,by_component_robust,U,svr,800,200,0.85279,0.851481,0.656104,0.416279
449,standard,all_features,all_features_standard,U,svr,800,200,0.851002,0.850417,0.655199,0.422978
473,standard,by_component,by_component_standard,U,svr,800,200,0.851002,0.850417,0.655199,0.422978
233,power,all_features,all_features_power,U,svr,800,200,0.8508,0.851958,0.656205,0.421699
257,power,by_component,by_component_power,U,svr,800,200,0.8508,0.851958,0.656205,0.421699
305,quantile,all_features,all_features_quantile,U,svr,800,200,0.847024,0.854956,0.659726,0.422064
329,quantile,by_component,by_component_quantile,U,svr,800,200,0.847024,0.854956,0.659726,0.422064
161,minmax,all_features,all_features_minmax,U,svr,800,200,0.817773,0.822204,0.62854,0.449713
185,minmax,by_component,by_component_minmax,U,svr,800,200,0.817773,0.822204,0.62854,0.449713
