In [2]:
pip install pandas numpy scikit-learn scipy

Collecting pandas
  Downloading pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl.metadata (91 kB)
Collecting numpy
  Downloading numpy-2.3.2-cp313-cp313-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting scikit-learn
  Downloading scikit_learn-1.7.1-cp313-cp313-macosx_12_0_arm64.whl.metadata (11 kB)
Collecting scipy
  Downloading scipy-1.16.1-cp313-cp313-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Downloading joblib-1.5.1-py3-none-any.whl.metadata (5.6 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading pandas-2.3.1-cp313-cp313-macosx_11_0_arm64.whl (10.7 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.7/10.7 MB[0m [31m3.0 MB/s[0m  

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.svm import SVR
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from scipy.stats import pearsonr, spearmanr, kendalltau
import warnings
# Suppress every Python warning raised in this process from here on.
warnings.filterwarnings('ignore')

# Remove pandas' display limits so printed DataFrames show all rows and columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

class RegressionModelTrainer:
    """
    Train SVR and MLP regression models on the SVD feature dataset.

    For every target label the trainer splits the data into train/test sets,
    tunes each configured model with RandomizedSearchCV, evaluates the best
    estimator on the held-out split (PLCC, SRCC, KRCC, RMSE), pickles it, and
    finally writes all scores to a CSV summary.

    NOTE(review): features and labels are paired purely by row position (the
    'videoname' column is dropped, not joined on) -- confirm that both CSV
    files list the samples in the same order.
    """

    def __init__(self, features_path, labels_path, models_save_path):
        """
        Initialize the trainer with paths and configuration.

        Creates the model output directories and loads the labels file as a
        side effect. If the labels cannot be loaded, ``self.labels_df`` is
        ``None`` and ``train_all_models`` will refuse to run.

        Args:
            features_path: Path to the modified_svd_features.csv file
            labels_path: Path to the labels CSV file
            models_save_path: Base path to save trained models and results
        """
        self.features_path = features_path
        self.labels_path = labels_path
        self.models_save_path = models_save_path

        # Target columns expected in the labels CSV; one model per label.
        self.labels = ['TSV', 'B', 'SR', 'S', 'U', 'O']

        # Estimator classes and the hyperparameter search space for each.
        self.models = {
            'svr': {
                'model': SVR,
                'params': {
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1],
                    'kernel': ['rbf', 'poly', 'sigmoid'],
                    'epsilon': [0.01, 0.1, 0.2]
                }
            },
            'mlp': {
                'model': MLPRegressor,
                'params': {
                    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100)],
                    'activation': ['relu', 'tanh', 'logistic'],
                    'solver': ['adam', 'sgd'],
                    'alpha': [0.0001, 0.001, 0.01],
                    'learning_rate': ['constant', 'adaptive'],
                    'max_iter': [500, 1000]
                }
            }
        }

        # Sub-directory (under models_save_path) used for each model family.
        self.model_save_dirs = {
            'svr': 'svr-2',
            'mlp': 'mlp-2'
        }

        self._create_model_directories()
        self.labels_df = self._load_labels()
        self.results = []

    def _create_model_directories(self):
        """Create specified directories for saving trained models."""
        print("Creating model directories...")
        for dir_name in self.model_save_dirs.values():
            model_dir = os.path.join(self.models_save_path, dir_name)
            os.makedirs(model_dir, exist_ok=True)
        print(f"Directories will be saved in: {os.path.abspath(self.models_save_path)}\n")

    def _load_labels(self):
        """
        Load the labels CSV file.

        Returns:
            The labels DataFrame, or None if the file could not be read
            (the error is printed rather than raised).
        """
        try:
            labels_df = pd.read_csv(self.labels_path)
            print(f"Labels loaded successfully from {self.labels_path}. Shape: {labels_df.shape}\n")
            return labels_df
        except Exception as e:
            print(f"Error loading labels: {e}\n")
            return None

    def _load_features(self):
        """
        Load features from the specified SVD features CSV file.

        The 'videoname' column, when present, is dropped so that only the
        remaining columns are used as model inputs.

        Returns:
            The features DataFrame, or None if the file could not be read
            (the error is printed rather than raised).
        """
        try:
            features_df = pd.read_csv(self.features_path)
            if 'videoname' in features_df.columns:
                features_df = features_df.drop('videoname', axis=1)
            print(f"Features loaded from: {self.features_path}, Shape: {features_df.shape}\n")
            return features_df
        except Exception as e:
            print(f"Error loading features from {self.features_path}: {e}\n")
            return None

    def _calculate_metrics(self, y_true, y_pred):
        """
        Calculate evaluation metrics: PLCC, SRCC, KRCC, RMSE.

        Pairs in which either value is NaN are ignored.

        Args:
            y_true: Ground-truth values (array-like of numbers).
            y_pred: Predicted values, same length as ``y_true``.

        Returns:
            Dict with keys 'PLCC', 'SRCC', 'KRCC', 'RMSE'. All values are NaN
            when no valid pair remains; the three correlations are NaN when
            fewer than two valid pairs remain (they are undefined there).
        """
        # Accept lists / integer arrays as well as float arrays.
        y_true = np.asarray(y_true, dtype=float)
        y_pred = np.asarray(y_pred, dtype=float)

        mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean, y_pred_clean = y_true[mask], y_pred[mask]

        metrics = {'PLCC': np.nan, 'SRCC': np.nan, 'KRCC': np.nan, 'RMSE': np.nan}
        if y_true_clean.size == 0:
            return metrics

        metrics['RMSE'] = np.sqrt(np.mean((y_true_clean - y_pred_clean) ** 2))

        # pearsonr raises on fewer than two samples, so only compute the
        # correlations when they are defined.
        if y_true_clean.size >= 2:
            metrics['PLCC'], _ = pearsonr(y_true_clean, y_pred_clean)
            metrics['SRCC'], _ = spearmanr(y_true_clean, y_pred_clean)
            metrics['KRCC'], _ = kendalltau(y_true_clean, y_pred_clean)

        return metrics

    def _train_model(self, model_name, X_train, y_train, X_test, y_test, n_iter=20, cv=3):
        """
        Train a single model with hyperparameter tuning using RandomizedSearchCV.

        Args:
            model_name: Key into ``self.models`` ('svr' or 'mlp').
            X_train, y_train: Training features and targets.
            X_test, y_test: Held-out features and targets used for evaluation.
            n_iter: Number of parameter settings sampled by the search.
            cv: Number of cross-validation folds used by the search.

        Returns:
            Tuple ``(best_model, metrics)`` where ``metrics`` is the dict
            produced by ``_calculate_metrics`` on the test split.
        """
        model_config = self.models[model_name]
        model_class = model_config['model']
        param_grid = model_config['params']

        print(f"    Training {model_name}...")

        if model_name == 'svr':
            model = model_class()
        else:  # MLP
            model = model_class(random_state=42, early_stopping=True, validation_fraction=0.1)

        search_cv = RandomizedSearchCV(
            model, param_grid, n_iter=n_iter, cv=cv,
            scoring='neg_mean_squared_error', random_state=42, n_jobs=-1
        )

        search_cv.fit(X_train, y_train)
        best_model = search_cv.best_estimator_
        print(f"      Best parameters: {search_cv.best_params_}")

        y_pred = best_model.predict(X_test)
        metrics = self._calculate_metrics(y_test, y_pred)

        return best_model, metrics

    def _save_model(self, model, model_name, dataset_name, label_name):
        """
        Save a trained model to its specific directory.

        The file is written as ``<dataset_name>_<label_name>.pkl`` inside the
        directory configured for ``model_name``. Failures are printed, not raised.
        """
        model_dir_name = self.model_save_dirs[model_name]
        model_dir = os.path.join(self.models_save_path, model_dir_name)

        filename = f"{dataset_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)

        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model, f)
            print(f"      Model saved: {filepath}")
        except Exception as e:
            print(f"      Error saving model {filepath}: {e}")

    def train_all_models(self, test_size=0.2, random_state=42):
        """
        Train all configured models on the SVD dataset for all specified labels.

        Labels that are missing, non-numeric, mismatched in length, empty after
        NaN removal, or too small to split are skipped with a message. A failure
        while training one model does not stop the remaining models.

        Args:
            test_size: Fraction of samples held out for evaluation.
            random_state: Seed for the train/test split.

        Returns:
            None. Results are appended to ``self.results`` and written to disk
            through ``save_results``.
        """
        print("=" * 80)
        print("STARTING SVD MODEL TRAINING (SVR & MLP)")
        print("=" * 80)

        # _load_labels returns None on failure; without this guard the loop
        # below would crash on ``self.labels_df.columns``.
        if self.labels_df is None:
            print("Halting training due to label loading error.")
            return

        features_df = self._load_features()
        if features_df is None:
            print("Halting training due to feature loading error.")
            return

        dataset_name = "modified_svd_features"

        # The feature matrix and its NaN rows are the same for every label,
        # so compute them once instead of once per label.
        try:
            X = features_df.to_numpy(dtype=float)
        except (TypeError, ValueError) as e:
            print(f"Halting training: features are not numeric: {e}")
            return
        feature_nan_rows = np.isnan(X).any(axis=1)

        for label in self.labels:
            print(f"\n  🎯 Target label: {label}")

            if label not in self.labels_df.columns:
                print(f"    ❌ Label {label} not found in labels file. Skipping.")
                continue

            try:
                y = self.labels_df[label].to_numpy(dtype=float)
            except (TypeError, ValueError) as e:
                print(f"    ❌ Label {label} is not numeric: {e}. Skipping.")
                continue

            if len(X) != len(y):
                print(f"    ❌ Dimension mismatch: Features={len(X)}, Labels={len(y)}. Skipping.")
                continue

            mask = ~(np.isnan(y) | feature_nan_rows)
            X_clean, y_clean = X[mask], y[mask]

            if len(X_clean) == 0:
                print("    ❌ No valid samples after cleaning. Skipping.")
                continue

            try:
                X_train, X_test, y_train, y_test = train_test_split(
                    X_clean, y_clean, test_size=test_size, random_state=random_state
                )
            except ValueError as e:
                # Raised when too few samples remain to form both splits.
                print(f"    ❌ Cannot split data for label {label}: {e}. Skipping.")
                continue

            print(f"    📈 Training samples: {len(X_train)}, Test samples: {len(X_test)}")

            for model_name in self.models:
                print(f"\n    🤖 Model: {model_name}")
                try:
                    model, metrics = self._train_model(
                        model_name, X_train, y_train, X_test, y_test
                    )

                    self._save_model(model, model_name, dataset_name, label)

                    result = {
                        'dataset_name': dataset_name,
                        'label': label,
                        'model': model_name,
                        'train_samples': len(X_train),
                        'test_samples': len(X_test),
                        **metrics
                    }
                    self.results.append(result)

                    print(f"      ✅ Performance - PLCC: {metrics['PLCC']:.4f}, "
                          f"SRCC: {metrics['SRCC']:.4f}, RMSE: {metrics['RMSE']:.4f}")

                except Exception as e:
                    # Best-effort: report and move on to the next model.
                    print(f"      ❌ Training failed for {model_name}: {e}")

        print("\n" + "=" * 80)
        print("TRAINING COMPLETED!")
        print("=" * 80)

        self.save_results()

    def save_results(self, filename="svd_training_results.csv"):
        """
        Save all training results to a new CSV file.

        Args:
            filename: Name of the CSV file created under ``models_save_path``.
        """
        if self.results:
            results_df = pd.DataFrame(self.results)
            filepath = os.path.join(self.models_save_path, filename)
            results_df.to_csv(filepath, index=False)
            print(f"\n📊 Results saved to: {filepath}")
            self.display_results_summary(results_df)
        else:
            print("❌ No results to save.")
        print("\n")

    def display_results_summary(self, results_df):
        """
        Display summary statistics of the training results.

        Prints the best row per metric (lowest RMSE, highest correlations) and
        the per-model average of every metric. A metric with no valid (non-NaN)
        score is reported as unavailable instead of raising.

        Args:
            results_df: DataFrame with 'model', 'label' and the metric columns
                'PLCC', 'SRCC', 'KRCC', 'RMSE'.
        """
        print("\n" + "="*80)
        print("🏆 SVD TRAINING RESULTS SUMMARY 🏆")
        print("="*80)

        metrics = ['PLCC', 'SRCC', 'KRCC', 'RMSE']

        print("\n🥇 BEST PERFORMING MODELS BY METRIC:")
        print("-" * 50)
        for metric in metrics:
            lower_is_better = metric == 'RMSE'
            if lower_is_better:
                print(f"\n  📉 Best {metric} (Lower is better):")
            else:
                print(f"\n  📈 Best {metric} (Higher is better):")

            # idxmin/idxmax are undefined on an all-NaN column, so rank only
            # the rows that actually have a score.
            scores = results_df[metric].dropna()
            if scores.empty:
                print("      N/A (no valid scores)")
                continue

            best_idx = scores.idxmin() if lower_is_better else scores.idxmax()
            best_result = results_df.loc[best_idx]
            print(f"      🎯 {best_result['model']} for label '{best_result['label']}'")
            print(f"      🎖️ Score: {best_result[metric]:.4f}")

        print("\n\n📊 AVERAGE PERFORMANCE BY MODEL:")
        print("-" * 50)
        model_avg = results_df.groupby('model')[metrics].mean()
        for model in model_avg.index:
            print(f"\n  🤖 {model.upper()}:")
            for metric in metrics:
                print(f"      {metric}: {model_avg.loc[model, metric]:.4f}")

        print("\n" + "="*80)

def main():
    """
    Run the SVD regression training pipeline with the project's default paths.

    All paths are relative to the 'regressors' folder the script is run from:
    the features CSV lives one level up under 'features/cleaned', the labels
    CSV two levels up under 'dataset/cleaned', and models/results are written
    to the current directory.
    """
    trainer = RegressionModelTrainer(
        features_path='../features/cleaned/modified_svd_features.csv',
        labels_path="../../dataset/cleaned/cleaned-mos.csv",
        models_save_path=".",
    )

    # 80/20 train/test split with a fixed seed.
    trainer.train_all_models(test_size=0.2, random_state=42)

    print("\n✅ All SVD models have been trained and results have been saved locally!")

# Start the training pipeline only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
