In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import os
import pickle
from sklearn.linear_model import Ridge
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
from sklearn.neural_network import MLPRegressor
from scipy.stats import pearsonr, spearmanr, kendalltau
from scipy.optimize import curve_fit
from itertools import combinations
import xgboost as xgb
from catboost import CatBoostRegressor
import warnings
# Silence every Python warning for the rest of the session.
# NOTE(review): this also hides library deprecation / failed-fit warnings,
# which makes problems in the hyperparameter search harder to notice.
warnings.filterwarnings('ignore')

# Lift pandas' display limits so frames are shown with all rows and columns.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)

In [2]:
class CSVRegressionModelTrainer:
    
    def __init__(self, csv_file_path, labels_csv_path, models_save_path):
        self.csv_file_path = csv_file_path
        self.labels_csv_path = labels_csv_path
        self.models_save_path = models_save_path
        
        # Define feature ranges based on your dataset structure
        self.level1_features = {
            'U1': [f'U1_{i}' for i in range(1, 257)],  # U1_1 to U1_256
            'S1': [f'S1_{i}' for i in range(1, 257)],  # S1_1 to S1_256
            'V1': [f'V1_{i}' for i in range(1, 257)]   # V1_1 to V1_256
        }
        
        self.level2_features = {
            'U2': [f'U2_{i}' for i in range(1, 129)],  # U2_1 to U2_128
            'S2': [f'S2_{i}' for i in range(1, 129)],  # S2_1 to S2_128
            'V2': [f'V2_{i}' for i in range(1, 129)]   # V2_1 to V2_128
        }
        
        # Flatten all features
        self.all_level1_features = []
        for feature_group in self.level1_features.values():
            self.all_level1_features.extend(feature_group)
            
        self.all_level2_features = []
        for feature_group in self.level2_features.values():
            self.all_level2_features.extend(feature_group)
            
        self.all_features = self.all_level1_features + self.all_level2_features
        
        # Target labels
        self.labels = ['TSV', 'B', 'SR', 'S', 'U', 'O']
        
        # Generate feature combinations
        self.feature_combinations = self._generate_feature_combinations()
        
        # Initialize models
        self.models = self._initialize_models()
        
        # Create directories and load data
        self._create_model_directories()
        self.data_df, self.labels_df, self.merged_df = self._load_csv_data()
        self.results = []
    
    def _generate_feature_combinations(self):
        """Generate all possible feature combinations"""
        combinations_dict = {}
        
        # Level 1 combinations (all possible combinations of U1, S1, V1)
        level1_component_names = ['U1', 'S1', 'V1']
        
        for r in range(1, len(level1_component_names) + 1):
            for combo in combinations(level1_component_names, r):
                features_list = []
                for component in combo:
                    features_list.extend(self.level1_features[component])
                
                combo_name = f"level1_{'_'.join(combo)}"
                combinations_dict[combo_name] = features_list
        
        # Level 2 combinations (all possible combinations of U2, S2, V2)
        level2_component_names = ['U2', 'S2', 'V2']
        
        for r in range(1, len(level2_component_names) + 1):
            for combo in combinations(level2_component_names, r):
                features_list = []
                for component in combo:
                    features_list.extend(self.level2_features[component])
                
                combo_name = f"level2_{'_'.join(combo)}"
                combinations_dict[combo_name] = features_list
        
        # Combined Level 1 & 2 combinations
        level1_level2_combinations = [
            (['U1'], ['U2']),
            (['S1'], ['S2']),
            (['V1'], ['V2']),
            (['U1', 'S1'], ['U2', 'S2']),
            (['U1', 'V1'], ['U2', 'V2']),
            (['S1', 'V1'], ['S2', 'V2']),
            (['U1', 'S1', 'V1'], ['U2', 'S2', 'V2'])
        ]
        
        for level1_combo, level2_combo in level1_level2_combinations:
            features_list = []
            
            # Add level 1 features
            for component in level1_combo:
                features_list.extend(self.level1_features[component])
            
            # Add level 2 features
            for component in level2_combo:
                features_list.extend(self.level2_features[component])
            
            combo_name = f"combined_{'_'.join(level1_combo + level2_combo)}"
            combinations_dict[combo_name] = features_list
        
        print(f"Generated {len(combinations_dict)} feature combinations")
        
        level1_count = sum(1 for name in combinations_dict.keys() if name.startswith('level1_'))
        level2_count = sum(1 for name in combinations_dict.keys() if name.startswith('level2_'))
        combined_count = sum(1 for name in combinations_dict.keys() if name.startswith('combined_'))
        
        print(f"Level 1 combinations: {level1_count}")
        print(f"Level 2 combinations: {level2_count}")
        print(f"Combined L1+L2 combinations: {combined_count}")
        
        return combinations_dict
    
    def _initialize_models(self):
        """Initialize all regression models with hyperparameter grids"""
        models = {
            'mlp_regressor': {
                'model': MLPRegressor,
                'params': {
                    'hidden_layer_sizes': [(50,), (100,), (50, 50), (100, 50), (100, 100), (100, 50, 25)],
                    'activation': ['relu', 'tanh'],
                    'solver': ['adam', 'lbfgs'],
                    'alpha': [0.0001, 0.001, 0.01, 0.1],
                    'learning_rate': ['constant', 'adaptive'],
                    'max_iter': [1000, 2000]
                }
            },
            'ridge_regressor': {
                'model': Ridge,
                'params': {
                    'alpha': [0.01, 0.1, 1, 10, 100, 1000],
                    'solver': ['auto', 'svd', 'cholesky', 'lsqr', 'sag', 'saga'],
                    'max_iter': [1000, 2000, 3000]
                }
            },
            'decision_tree_regressor': {
                'model': DecisionTreeRegressor,
                'params': {
                    'max_depth': [None, 5, 10, 15, 20, 25],
                    'min_samples_split': [2, 5, 10, 20],
                    'min_samples_leaf': [1, 2, 4, 8],
                    'max_features': ['auto', 'sqrt', 'log2', None],
                    'criterion': ['squared_error', 'friedman_mse', 'absolute_error']
                }
            },
            'random_forest_regressor': {
                'model': RandomForestRegressor,
                'params': {
                    'n_estimators': [50, 100, 200, 300],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['auto', 'sqrt', 'log2']
                }
            },
            'extra_trees_regressor': {
                'model': ExtraTreesRegressor,
                'params': {
                    'n_estimators': [50, 100, 200, 300],
                    'max_depth': [None, 10, 20, 30],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'max_features': ['auto', 'sqrt', 'log2']
                }
            },
            'gradient_boosting_regressor': {
                'model': GradientBoostingRegressor,
                'params': {
                    'n_estimators': [50, 100, 200],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'max_depth': [3, 5, 7],
                    'min_samples_split': [2, 5, 10],
                    'min_samples_leaf': [1, 2, 4],
                    'subsample': [0.8, 0.9, 1.0]
                }
            },
            'adaboost_regressor': {
                'model': AdaBoostRegressor,
                'params': {
                    'n_estimators': [50, 100, 200, 300],
                    'learning_rate': [0.01, 0.1, 0.5, 1.0, 2.0],
                    'loss': ['linear', 'square', 'exponential']
                }
            },
            'svr_regressor': {
                'model': SVR,
                'params': {
                    'kernel': ['linear', 'rbf'],
                    'C': [0.1, 1, 10, 100],
                    'gamma': ['scale', 'auto'],
                    'epsilon': [0.01, 0.1]
                }
            },
            'xgboost_regressor': {
                'model': xgb.XGBRegressor,
                'params': {
                    'n_estimators': [50, 100, 200],
                    'max_depth': [3, 5, 7, 9],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'subsample': [0.8, 0.9, 1.0],
                    'colsample_bytree': [0.8, 0.9, 1.0],
                    'reg_alpha': [0, 0.1, 1],
                    'reg_lambda': [1, 1.5, 2]
                }
            },
            'catboost_regressor': {
                'model': CatBoostRegressor,
                'params': {
                    'iterations': [50, 100, 200],
                    'depth': [4, 6, 8, 10],
                    'learning_rate': [0.01, 0.1, 0.2],
                    'l2_leaf_reg': [1, 3, 5, 7, 9],
                    'bootstrap_type': ['Bayesian', 'Bernoulli', 'MVS']
                }
            }
        }
        
        return models
    
    def _logistic_func(self, x, beta1, beta2, beta3, beta4):
        """Four-parameter logistic function for score mapping"""
        return beta1 + (beta2 - beta1) / (1 + np.exp(-(x - beta3) / np.abs(beta4)))
    
    def _fit_logistic_regression(self, y_pred, y_true):
        """Fit logistic regression to map predictions to ground truth"""
        try:
            # Initial parameter guesses
            beta1_init = np.min(y_true)
            beta2_init = np.max(y_true)
            beta3_init = np.mean(y_pred)
            beta4_init = np.std(y_pred)
            
            initial_guess = [beta1_init, beta2_init, beta3_init, beta4_init]
            
            # Fit the logistic function
            params, _ = curve_fit(
                self._logistic_func, 
                y_pred, 
                y_true, 
                p0=initial_guess,
                maxfev=5000,
                bounds=([np.min(y_true)-1, np.min(y_true)-1, -np.inf, -np.inf], 
                       [np.max(y_true)+1, np.max(y_true)+1, np.inf, np.inf])
            )
            
            # Apply the fitted function to predictions
            y_pred_fitted = self._logistic_func(y_pred, *params)
            
            return y_pred_fitted, params
            
        except Exception as e:
            print(f"        Warning: Logistic fitting failed ({e}), using original predictions")
            return y_pred, None
    
    def _create_model_directories(self):
        """Create directories for saving models"""
        model_names = list(self.models.keys())
        for model_name in model_names:
            model_dir = os.path.join(self.models_save_path, model_name)
            os.makedirs(model_dir, exist_ok=True)
    
    def _load_csv_data(self):
        """Load and validate CSV data from both features and labels files"""
        try:
            # Load features CSV
            data_df = pd.read_csv(self.csv_file_path)
            print(f"Features CSV loaded successfully. Shape: {data_df.shape}")
            print(f"Total columns in features: {len(data_df.columns)}")
            
            # Load labels CSV
            labels_df = pd.read_csv(self.labels_csv_path)
            print(f"Labels CSV loaded successfully. Shape: {labels_df.shape}")
            print(f"Total columns in labels: {len(labels_df.columns)}")
            
            # Check for required columns in features CSV
            required_feature_columns = ['videoname'] + self.all_features
            missing_feature_columns = [col for col in required_feature_columns if col not in data_df.columns]
            
            if missing_feature_columns:
                print(f"Warning: Missing {len(missing_feature_columns)} feature columns from expected features")
                if len(missing_feature_columns) <= 20:
                    print(f"Examples: {missing_feature_columns[:20]}")
                else:
                    print(f"First 20 missing feature columns: {missing_feature_columns[:20]}...")
            else:
                print("All expected feature columns found in features dataset")
            
            # Check for required columns in labels CSV
            required_label_columns = ['videoname'] + self.labels
            missing_label_columns = [col for col in required_label_columns if col not in labels_df.columns]
            
            if missing_label_columns:
                print(f"Warning: Missing {len(missing_label_columns)} label columns from expected labels")
                print(f"Missing label columns: {missing_label_columns}")
            else:
                print("All expected label columns found in labels dataset")
            
            # Check if videoname column exists in both datasets
            if 'videoname' not in data_df.columns:
                print("Error: 'videoname' column not found in features CSV")
                return None, None, None
            
            if 'videoname' not in labels_df.columns:
                print("Error: 'videoname' column not found in labels CSV")
                return None, None, None
            
            # Merge datasets on videoname
            print("Merging features and labels datasets...")
            merged_df = pd.merge(data_df, labels_df, on='videoname', how='inner')
            print(f"Merged dataset shape: {merged_df.shape}")
            print(f"Successfully merged {len(merged_df)} samples")
            
            # Check how many samples were lost during merge
            original_features_samples = len(data_df)
            original_labels_samples = len(labels_df)
            merged_samples = len(merged_df)
            
            print(f"Merge statistics:")
            print(f"  - Original features samples: {original_features_samples}")
            print(f"  - Original labels samples: {original_labels_samples}")
            print(f"  - Merged samples: {merged_samples}")
            print(f"  - Samples lost: {max(original_features_samples, original_labels_samples) - merged_samples}")
            
            # Display basic statistics
            print(f"\nMerged dataset info:")
            print(f"  - Rows: {len(merged_df)}")
            print(f"  - Features available: {sum(1 for col in merged_df.columns if col in self.all_features)}/{len(self.all_features)}")
            print(f"  - Labels available: {sum(1 for col in merged_df.columns if col in self.labels)}/{len(self.labels)}")
            
            # Check for duplicate videonames
            duplicate_features = data_df['videoname'].duplicated().sum()
            duplicate_labels = labels_df['videoname'].duplicated().sum()
            
            if duplicate_features > 0:
                print(f"Warning: {duplicate_features} duplicate videonames found in features CSV")
            if duplicate_labels > 0:
                print(f"Warning: {duplicate_labels} duplicate videonames found in labels CSV")
            
            return data_df, labels_df, merged_df
            
        except Exception as e:
            print(f"Error loading CSV files: {e}")
            return None, None, None
    
    def _model_exists(self, model_name, feature_combo_name, label_name):
        """Check if a trained model already exists"""
        model_dir = os.path.join(self.models_save_path, model_name)
        filename = f"{feature_combo_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        return os.path.exists(filepath)
    
    def _load_existing_model(self, model_name, feature_combo_name, label_name):
        """Load an existing trained model"""
        model_dir = os.path.join(self.models_save_path, model_name)
        filename = f"{feature_combo_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        
        try:
            with open(filepath, 'rb') as f:
                model_data = pickle.load(f)
            print(f"      Loaded existing model: {filename}")
            return model_data
        except Exception as e:
            print(f"      Error loading existing model {filename}: {e}")
            return None
    
    def _calculate_metrics(self, y_true, y_pred):
        """Calculate performance metrics with logistic regression fitting for PLCC"""
        # Remove NaN values
        mask = ~(np.isnan(y_true) | np.isnan(y_pred))
        y_true_clean = y_true[mask]
        y_pred_clean = y_pred[mask]
        
        if len(y_true_clean) == 0:
            return {'PLCC': np.nan, 'SRCC': np.nan, 'KRCC': np.nan, 'RMSE': np.nan}
        
        # Apply logistic regression fitting for PLCC calculation
        y_pred_fitted, logistic_params = self._fit_logistic_regression(y_pred_clean, y_true_clean)
        
        # Calculate PLCC with fitted predictions (this becomes the main PLCC)
        if logistic_params is not None:
            plcc, _ = pearsonr(y_true_clean, y_pred_fitted)
        else:
            plcc, _ = pearsonr(y_true_clean, y_pred_clean)
        
        # Calculate other metrics with original predictions (standard approach)
        srcc, _ = spearmanr(y_true_clean, y_pred_clean)
        krcc, _ = kendalltau(y_true_clean, y_pred_clean)
        rmse = np.sqrt(mean_squared_error(y_true_clean, y_pred_clean))
        
        return {
            'PLCC': plcc,
            'SRCC': srcc,
            'KRCC': krcc,
            'RMSE': rmse,
            'logistic_params': logistic_params
        }
    
    def _train_model(self, model_name, X_train, y_train, X_test, y_test):
        """Train a single model with hyperparameter tuning"""
        model_config = self.models[model_name]
        model_class = model_config['model']
        param_grid = model_config['params']
        
        print(f"      Training {model_name}...")
        
        # Initialize model with appropriate parameters
        if model_name == 'mlp_regressor':
            model = model_class(random_state=42, early_stopping=True, validation_fraction=0.1)

        elif model_name == 'xgboost_regressor':
            # --- MODIFIED FOR CPU ---
            # The original GPU-specific line is commented out below to prevent errors on Mac.
            # model = model_class(random_state=42, objective='reg:squarederror', tree_method="gpu_hist", predictor="gpu_predictor")
            
            # This line uses the default CPU trainer.
            model = model_class(random_state=42, objective='reg:squarederror')

        elif model_name == 'catboost_regressor':
            # --- MODIFIED FOR CPU ---
            # The original GPU-specific line is commented out below to prevent errors on Mac.
            # model = model_class(random_state=42, verbose=False, task_type="GPU")

            # This line uses the default CPU trainer.
            model = model_class(random_state=42, verbose=False)
            
        elif model_name == 'svr_regressor':
            model = model_class()
        else:
            model = model_class(random_state=42)
        
        # Perform hyperparameter search
        print(f"      Performing hyperparameter search...")
        n_iter = 15 if model_name in ['xgboost_regressor', 'catboost_regressor', 'mlp_regressor', 'svr_regressor'] else 20
        
        search_cv = RandomizedSearchCV(
            model, param_grid, n_iter=n_iter, cv=3, 
            scoring='neg_mean_squared_error', random_state=42, n_jobs=-1
        )
        
        search_cv.fit(X_train, y_train)
        best_model = search_cv.best_estimator_
        
        # Evaluate model
        print(f"      Evaluating model performance...")
        y_pred = best_model.predict(X_test)
        metrics = self._calculate_metrics(y_test, y_pred)
        
        # Store model and logistic parameters together
        model_data = {
            'model': best_model,
            'logistic_params': metrics.get('logistic_params', None)
        }
        
        return model_data, metrics
    
    def _save_model(self, model_data, model_name, feature_combo_name, label_name):
        """Save trained model and logistic parameters to disk"""
        model_dir = os.path.join(self.models_save_path, model_name)
        os.makedirs(model_dir, exist_ok=True)
        
        filename = f"{feature_combo_name}_{label_name}.pkl"
        filepath = os.path.join(model_dir, filename)
        
        try:
            with open(filepath, 'wb') as f:
                pickle.dump(model_data, f)
            print(f"      Model saved: {filename}")
            return True
        except Exception as e:
            print(f"      Error saving model {filename}: {e}")
            return False
    
    def _save_intermediate_results(self):
        """Save intermediate results during training"""
        if self.results:
            results_df = pd.DataFrame(self.results)
            timestamp = pd.Timestamp.now().strftime("%Y%m%d_%H%M%S")
            filename = f"intermediate_results_{timestamp}.csv"
            filepath = os.path.join(self.models_save_path, filename)
            
            try:
                results_df.to_csv(filepath, index=False)
                print(f"      Intermediate results saved: {filename}")
            except Exception as e:
                print(f"      Error saving intermediate results: {e}")
    
    def train_all_models(self, test_size=0.2, random_state=42, force_retrain=False):
        """Train all models on all feature combinations and labels"""
        if self.merged_df is None:
            print("No merged data available. Cannot proceed with training.")
            return
        
        print("=" * 100)
        print("STARTING COMPREHENSIVE MODEL TRAINING (with Logistic Regression Fitting for PLCC)")
        print("=" * 100)
        
        # Display training overview
        level1_count = sum(1 for name in self.feature_combinations.keys() if name.startswith('level1_'))
        level2_count = sum(1 for name in self.feature_combinations.keys() if name.startswith('level2_'))
        combined_count = sum(1 for name in self.feature_combinations.keys() if name.startswith('combined_'))
        
        print(f"Feature combinations breakdown:")
        print(f"  - Level 1: {level1_count}")
        print(f"  - Level 2: {level2_count}")
        print(f"  - Combined: {combined_count}")
        print(f"  - Total: {len(self.feature_combinations)}")
        print(f"\nTotal models to train: {len(self.feature_combinations)} × {len(self.labels)} × {len(self.models)} = "
              f"{len(self.feature_combinations) * len(self.labels) * len(self.models)} models\n")
        
        total_models = 0
        successful_models = 0
        skipped_models = 0
        
        # Train models for each feature combination
        for combo_idx, (feature_combo_name, feature_list) in enumerate(self.feature_combinations.items(), 1):
            print(f"Processing feature combination {combo_idx}/{len(self.feature_combinations)}: {feature_combo_name}")
            print(f"   Features: {len(feature_list)} features")
            print("-" * 80)
            
            # Check if all features exist in the merged dataset
            missing_features = [f for f in feature_list if f not in self.merged_df.columns]
            if missing_features:
                print(f"    Missing features: {len(missing_features)} features not found. Skipping this combination.\n")
                if len(missing_features) <= 10:
                    print(f"       Examples: {missing_features[:10]}")
                continue
            
            # Extract features
            X = self.merged_df[feature_list].values
            
            # Train models for each label
            for label in self.labels:
                print(f"\n  Target label: {label}")
                
                if label not in self.merged_df.columns:
                    print(f"    Label {label} not found in merged dataset")
                    continue
                
                y = self.merged_df[label].values
                
                # Validate dimensions
                if len(X) != len(y):
                    print(f"    Dimension mismatch: Features={len(X)}, Labels={len(y)}")
                    continue
                
                # Clean data (remove NaN values)
                mask = ~(np.isnan(y) | np.isnan(X).any(axis=1))
                X_clean = X[mask]
                y_clean = y[mask]
                
                if len(X_clean) == 0:
                    print(f"    No valid samples after cleaning")
                    continue
                
                # Split data
                X_train, X_test, y_train, y_test = train_test_split(
                    X_clean, y_clean, test_size=test_size, random_state=random_state
                )
                
                print(f"    Training samples: {len(X_train)}, Test samples: {len(X_test)}")
                
                # Train each model
                for model_name in self.models.keys():
                    total_models += 1
                    print(f"\n    Model: {model_name}")
                    
                    # Check if model already exists
                    if not force_retrain and self._model_exists(model_name, feature_combo_name, label):
                        print(f"      Model already exists, loading...")
                        
                        try:
                            existing_model_data = self._load_existing_model(model_name, feature_combo_name, label)
                            if existing_model_data is not None:
                                # Handle both old format (just model) and new format (dict with model and logistic_params)
                                if isinstance(existing_model_data, dict) and 'model' in existing_model_data:
                                    existing_model = existing_model_data['model']
                                    existing_logistic_params = existing_model_data.get('logistic_params', None)
                                else:
                                    existing_model = existing_model_data
                                    existing_logistic_params = None
                                
                                # Evaluate existing model
                                y_pred = existing_model.predict(X_test)
                                metrics = self._calculate_metrics(y_test, y_pred)
                                
                                # Store results
                                result = {
                                    'feature_combination': feature_combo_name,
                                    'features': ', '.join(feature_list[:5]) + '...' if len(feature_list) > 5 else ', '.join(feature_list),
                                    'feature_count': len(feature_list),
                                    'label': label,
                                    'model': model_name,
                                    'train_samples': len(X_train),
                                    'test_samples': len(X_test),
                                    'PLCC': metrics['PLCC'],
                                    'SRCC': metrics['SRCC'],
                                    'KRCC': metrics['KRCC'],
                                    'RMSE': metrics['RMSE']
                                }
                                self.results.append(result)
                                
                                print(f"      Performance - PLCC: {metrics['PLCC']:.4f}, "
                                      f"SRCC: {metrics['SRCC']:.4f}, KRCC: {metrics['KRCC']:.4f}, "
                                      f"RMSE: {metrics['RMSE']:.4f}")
                                
                                successful_models += 1
                                skipped_models += 1
                                continue
                        except Exception as e:
                            print(f"      Error with existing model: {e}, will retrain")
                    
                    # Train new model
                    try:
                        model_data, metrics = self._train_model(
                            model_name, X_train, y_train, X_test, y_test
                        )
                        
                        # Save model
                        print(f"      Saving model...")
                        self._save_model(model_data, model_name, feature_combo_name, label)
                        
                        # Store results
                        result = {
                            'feature_combination': feature_combo_name,
                            'features': ', '.join(feature_list[:5]) + '...' if len(feature_list) > 5 else ', '.join(feature_list),
                            'feature_count': len(feature_list),
                            'label': label,
                            'model': model_name,
                            'train_samples': len(X_train),
                            'test_samples': len(X_test),
                            'PLCC': metrics['PLCC'],
                            'SRCC': metrics['SRCC'],
                            'KRCC': metrics['KRCC'],
                            'RMSE': metrics['RMSE']
                        }
                        self.results.append(result)
                        
                        successful_models += 1
                        
                        print(f"      Performance - PLCC: {metrics['PLCC']:.4f}, "
                              f"SRCC: {metrics['SRCC']:.4f}, KRCC: {metrics['KRCC']:.4f}, "
                              f"RMSE: {metrics['RMSE']:.4f}")
                        
                        # Save intermediate results every 50 models
                        if len(self.results) % 50 == 0:
                            self._save_intermediate_results()
                        
                    except Exception as e:
                        print(f"      Training failed: {e}")
            
            print(f"\n{'='*80}")
            print(f"Completed feature combination: {feature_combo_name}")
            print(f"{'='*80}\n")
        
        # Training summary
        print("=" * 100)
        print("TRAINING COMPLETED!")
        print("=" * 100)
        print(f"Successfully trained/loaded: {successful_models}/{total_models} models")
        print(f"Skipped (already existed): {skipped_models}/{total_models} models")
        print(f"Newly trained: {successful_models - skipped_models}/{total_models} models")
        print("=" * 100)
        
        # Save final results
        self.save_results()
    
    def save_results(self, filename="csv_training_results.csv"):
        """Save training results to CSV file"""
        if self.results:
            results_df = pd.DataFrame(self.results)
            filepath = os.path.join(self.models_save_path, filename)
            results_df.to_csv(filepath, index=False)
            print(f"\nResults saved to: {filepath}")
            
            # Display summary
            self.display_results_summary(results_df)
        else:
            print("No results to save.")

        print("\nTraining process completed successfully!")
    
    def display_results_summary(self, results_df):
        """Print a multi-section console report for a results table.

        Sections, in order: the single best row per metric, the mean of every
        metric per model, the ten feature combinations with the highest mean
        PLCC, and the best-PLCC row for each label listed in ``self.labels``.

        Args:
            results_df: DataFrame holding the columns ``model``,
                ``feature_combination``, ``label``, ``PLCC``, ``SRCC``,
                ``KRCC`` and ``RMSE``.
        """
        metrics = ['PLCC', 'SRCC', 'KRCC', 'RMSE']
        heavy_rule = "="*100
        light_rule = "-" * 60

        print("\n" + heavy_rule)
        print("TRAINING RESULTS SUMMARY")
        print(heavy_rule)

        # Section 1: one winning row per metric. RMSE is the only metric where
        # the smallest value wins.
        print("\nBEST PERFORMING MODELS BY METRIC:")
        print(light_rule)
        for metric in metrics:
            scores = results_df[metric]
            if metric != 'RMSE':
                winner = results_df.loc[scores.idxmax()]
                print(f"\n  Best {metric} (Higher is better):")
            else:
                winner = results_df.loc[scores.idxmin()]
                print(f"\n  Best {metric} (Lower is better):")

            print(f"     Model: {winner['model']}")
            print(f"     Features: {winner['feature_combination']}")
            print(f"     Label: {winner['label']}")
            print(f"     Score: {winner[metric]:.4f}")

        # Section 2: metric means per model.
        print(f"\n\nAVERAGE PERFORMANCE BY MODEL:")
        print(light_rule)
        per_model_means = results_df.groupby('model')[metrics].mean()
        for model_name, means in per_model_means.iterrows():
            print(f"\n  {model_name.upper().replace('_', ' ')}:")
            for metric in metrics:
                print(f"     {metric}: {means[metric]:.4f}")

        # Section 3: feature combinations ranked by mean PLCC.
        print(f"\n\nTOP 10 FEATURE COMBINATIONS (by PLCC):")
        print(light_rule)
        mean_plcc = results_df.groupby(['feature_combination'])['PLCC'].mean()
        top_combos = mean_plcc.sort_values(ascending=False).head(10)
        for position, (combo, plcc) in enumerate(zip(top_combos.index, top_combos.values), start=1):
            print(f"  #{position} {combo}: {plcc:.4f}")

        # Section 4: best-PLCC row per label; labels without rows are skipped.
        print(f"\n\nBEST PERFORMANCE BY LABEL (using PLCC):")
        print(light_rule)
        for label in self.labels:
            rows_for_label = results_df[results_df['label'] == label]
            if rows_for_label.empty:
                continue
            top = rows_for_label.loc[rows_for_label['PLCC'].idxmax()]
            print(f"\n  {label}:")
            print(f"     Best Model: {top['model']}")
            print(f"     Features: {top['feature_combination']}")
            print(f"     PLCC: {top['PLCC']:.4f}")
            print(f"     SRCC: {top['SRCC']:.4f}")
            print(f"     KRCC: {top['KRCC']:.4f}")
            print(f"     RMSE: {top['RMSE']:.4f}")

        print("\n" + heavy_rule)
    
    def load_model_for_prediction(self, model_name, feature_combo_name, label_name):
        """Fetch a stored model together with its logistic-fit parameters.

        Args:
            model_name: Name of the model to look up.
            feature_combo_name: Name of the feature combination it was trained on.
            label_name: Name of the target label it was trained for.

        Returns:
            A ``(model, logistic_params)`` tuple. ``logistic_params`` is None
            when the stored object carries none, and both entries are None
            when ``_load_existing_model`` returns None.
        """
        stored = self._load_existing_model(model_name, feature_combo_name, label_name)
        if stored is None:
            return None, None

        # New-format entries are dicts wrapping the model (plus optional
        # logistic parameters); anything else is treated as the bare model.
        is_bundle = isinstance(stored, dict) and 'model' in stored
        if not is_bundle:
            return stored, None
        return stored['model'], stored.get('logistic_params', None)
    
    def predict_with_model(self, model_name, feature_combo_name, label_name, X_new, apply_logistic_fit=True):
        """Predict with a stored model, optionally mapping the output through the logistic fit.

        Args:
            model_name: Name of the model to load.
            feature_combo_name: Name of the feature combination it was trained on.
            label_name: Name of the target label it was trained for.
            X_new: Input passed unchanged to the model's ``predict``.
            apply_logistic_fit: When true and logistic parameters were stored
                with the model, pass the predictions through ``_logistic_func``.

        Returns:
            The (possibly logistic-mapped) predictions, or None when no model
            could be loaded. If the logistic mapping raises, a warning is
            printed and the unmapped predictions are returned.
        """
        model, logistic_params = self.load_model_for_prediction(model_name, feature_combo_name, label_name)
        if model is None:
            print(f"Model not found: {model_name}_{feature_combo_name}_{label_name}")
            return None

        raw_predictions = model.predict(X_new)

        # Skip the mapping unless it was requested and parameters exist.
        if not (apply_logistic_fit and logistic_params is not None):
            return raw_predictions

        try:
            return self._logistic_func(raw_predictions, *logistic_params)
        except Exception as e:
            print(f"Warning: Could not apply logistic fitting ({e}), returning original predictions")
            return raw_predictions
    
    def get_model_summary(self):
        """List the pickled models found on disk under ``self.models_save_path``.

        For every key of ``self.models`` the sub-directory of the same name is
        scanned for ``*.pkl`` files. Each file stem is split at its last
        underscore: the part before it is reported as the feature combination
        and the part after it as the label. Stems without an underscore are
        ignored.

        Returns:
            DataFrame with the columns ``model``, ``feature_combination``,
            ``label`` and ``file_path``. The columns are present even when no
            model file exists, and rows within a model directory are ordered
            by file name.
        """
        columns = ['model', 'feature_combination', 'label', 'file_path']
        summary = []
        for model_name in self.models.keys():
            model_dir = os.path.join(self.models_save_path, model_name)
            # isdir (not exists) so a stray file with the model's name cannot
            # reach os.listdir and raise.
            if not os.path.isdir(model_dir):
                continue

            # os.listdir returns entries in arbitrary order; sort for a
            # reproducible summary.
            for file in sorted(os.listdir(model_dir)):
                # Strip only the trailing extension; str.replace would also
                # remove any '.pkl' occurring inside the name.
                stem, extension = os.path.splitext(file)
                if extension != '.pkl':
                    continue

                feature_combo, separator, label = stem.rpartition('_')
                if not separator:
                    continue

                summary.append({
                    'model': model_name,
                    'feature_combination': feature_combo,
                    'label': label,
                    'file_path': os.path.join(model_dir, file)
                })

        # Passing the columns keeps the schema stable for an empty result.
        return pd.DataFrame(summary, columns=columns)

    def export_best_models_summary(self, top_n=10, filename="best_models_summary.csv"):
        """Write a CSV ranking the top models for every label and metric.

        For each label in ``self.labels`` that has result rows, and for each
        of PLCC, SRCC, KRCC and RMSE, the ``top_n`` best rows are recorded
        with their rank (largest first, except RMSE where smallest is first).

        Args:
            top_n: Number of rows kept per (label, metric) pair.
            filename: Name of the CSV created inside ``self.models_save_path``.

        Returns:
            The exported DataFrame, or None when ``self.results`` is empty.
        """
        if not self.results:
            print("No results available to export.")
            return

        metrics = ['PLCC', 'SRCC', 'KRCC', 'RMSE']
        all_results = pd.DataFrame(self.results)
        records = []

        for label in self.labels:
            per_label = all_results[all_results['label'] == label]
            if per_label.empty:
                continue

            for metric in metrics:
                # RMSE is the only metric where a smaller value ranks higher.
                select_top = per_label.nsmallest if metric == 'RMSE' else per_label.nlargest
                ranked = select_top(top_n, metric)

                records.extend(
                    {
                        'label': label,
                        'metric': metric,
                        'rank': rank,
                        'model': row['model'],
                        'feature_combination': row['feature_combination'],
                        'score': row[metric],
                        'plcc': row['PLCC'],
                        'srcc': row['SRCC'],
                        'krcc': row['KRCC'],
                        'rmse': row['RMSE'],
                        'feature_count': row['feature_count']
                    }
                    for rank, (_, row) in enumerate(ranked.iterrows(), start=1)
                )

        best_models_df = pd.DataFrame(records)
        filepath = os.path.join(self.models_save_path, filename)
        best_models_df.to_csv(filepath, index=False)
        print(f"Best models summary exported to: {filepath}")

        return best_models_df

In [3]:
# Modified usage function with separate labels CSV
def run_training(csv_file_path, labels_csv_path, models_save_path, test_size=0.2, force_retrain=False):
    """
    Run the whole training workflow from separate feature and label CSV files.

    Args:
        csv_file_path: Path to CSV file with features data
        labels_csv_path: Path to CSV file with labels data
        models_save_path: Directory to save/load models
        test_size: Fraction for test set (default: 0.2)
        force_retrain: Whether to retrain existing models (default: False)

    Returns:
        The CSVRegressionModelTrainer used for the run, or None when either
        input file does not exist or the trainer's ``merged_df`` is None.
    """
    print("CSV Regression Model Trainer")
    print("=" * 50)

    # Stop before constructing the trainer if an input file is missing; the
    # features file is checked first.
    input_checks = (
        (csv_file_path, f"Features CSV file not found: {csv_file_path}"),
        (labels_csv_path, f"Labels CSV file not found: {labels_csv_path}"),
    )
    for path, not_found_message in input_checks:
        if not os.path.exists(path):
            print(not_found_message)
            return None

    # Echo the run configuration.
    for setting_line in (
        f"Features CSV file: {csv_file_path}",
        f"Labels CSV file: {labels_csv_path}",
        f"Models directory: {models_save_path}",
        f"Test size: {test_size}",
        f"Force retrain: {force_retrain}",
    ):
        print(setting_line)

    trainer = CSVRegressionModelTrainer(csv_file_path, labels_csv_path, models_save_path)
    if trainer.merged_df is None:
        print("Failed to load and merge data. Exiting.")
        return None

    print("\nStarting model training...")
    trainer.train_all_models(test_size=test_size, random_state=42, force_retrain=force_retrain)

    # The ranking export needs in-memory results from this run.
    if trainer.results:
        print("\nExporting best models summary...")
        trainer.export_best_models_summary(top_n=10)

    # Report what is stored on disk.
    print("\nGetting model summary...")
    model_summary = trainer.get_model_summary()
    if model_summary.empty:
        print("No models found.")
    else:
        print(f"Total trained models: {len(model_summary)}")
        print("\nModel distribution:")
        print(model_summary.groupby('model').size().to_string())

        print("\nLabel distribution:")
        print(model_summary.groupby('label').size().to_string())

    for closing_line in (
        "\nAll processes completed successfully!",
        "\nNext steps:",
        "1. Check the results CSV file for performance analysis",
        "2. Use the best_models_summary.csv to identify top performers",
        "3. Load specific models using trainer.load_model_for_prediction() for predictions",
    ):
        print(closing_line)

    return trainer

In [4]:
# Example run: train every model against the power-scaled feature set
if __name__ == "__main__":
    # Adjust these three locations for your environment
    features_csv_path = "../preprocessing/scaling/scaled-features/power/all_features_power.csv"
    labels_csv_path = "../../dataset/cleaned/cleaned-mos.csv"  # CSV holding the labels
    models_path = "./trained_models"

    # Existing models are reused (force_retrain=False); 20% of rows are held out
    trainer = run_training(
        features_csv_path,
        labels_csv_path,
        models_path,
        test_size=0.2,
        force_retrain=False,
    )

CSV Regression Model Trainer
Features CSV file: ../preprocessing/scaling/scaled-features/power/all_features_power.csv
Labels CSV file: ../../dataset/cleaned/cleaned-mos.csv
Models directory: ./trained_models
Test size: 0.2
Force retrain: False
Generated 21 feature combinations
Level 1 combinations: 7
Level 2 combinations: 7
Combined L1+L2 combinations: 7
Features CSV loaded successfully. Shape: (1000, 1153)
Total columns in features: 1153
Labels CSV loaded successfully. Shape: (1000, 7)
Total columns in labels: 7
All expected feature columns found in features dataset
All expected label columns found in labels dataset
Merging features and labels datasets...
Merged dataset shape: (1000, 1159)
Successfully merged 1000 samples
Merge statistics:
  - Original features samples: 1000
  - Original labels samples: 1000
  - Merged samples: 1000
  - Samples lost: 0

Merged dataset info:
  - Rows: 1000
  - Features available: 1152/1152
  - Labels available: 6/6

Starting model training...
STARTING 

In [6]:
import pandas as pd
import numpy as np

# Input: the results CSV written by the training run.
RESULTS_CSV_PATH = './trained_models/csv_training_results.csv'
# Output: the best-PLCC row per (model, label) pair produced by this cell.
HIGHEST_PLCC_CSV_PATH = 'highest_plcc_by_model_label.csv'

# Define custom label order
label_order = ['TSV', 'B', 'SR', 'S', 'U', 'O']


def select_highest_plcc_records(results, preferred_label_order):
    """Return the row with the highest PLCC for every (model, label) pair.

    Args:
        results: DataFrame with at least the columns ``model``, ``label``
            and ``PLCC``. It is not modified.
        preferred_label_order: Labels in the order they should sort.

    Returns:
        A new DataFrame sorted by model and then label. ``label`` is an
        ordered categorical: the preferred labels come first, followed by any
        other labels present in the data, so no label is turned into NaN.
        Rows whose PLCC is missing or non-numeric are excluded.
    """
    scored = results.assign(PLCC=pd.to_numeric(results['PLCC'], errors='coerce'))
    # A group whose PLCC values are all NaN has no maximum; dropping those rows
    # first keeps idxmax from yielding a NaN key that .loc cannot look up.
    scored = scored.dropna(subset=['PLCC'])

    winners = scored.loc[scored.groupby(['model', 'label'])['PLCC'].idxmax()].copy()

    # Labels outside the preferred order are appended instead of silently
    # becoming NaN when the column is converted to a categorical.
    unexpected = sorted(set(winners['label']) - set(preferred_label_order), key=str)
    winners['label'] = pd.Categorical(
        winners['label'],
        categories=list(preferred_label_order) + unexpected,
        ordered=True
    )

    return winners.sort_values(['model', 'label'])


if __name__ == "__main__":
    # Read the training results
    df = pd.read_csv(RESULTS_CSV_PATH)

    # Clean column names by stripping whitespace
    df.columns = df.columns.str.strip()

    # Coerce PLCC to numeric; unparsable entries become NaN and are ignored below
    df['PLCC'] = pd.to_numeric(df['PLCC'], errors='coerce')

    # Best record per model/label pair, sorted by model and custom label order
    highest_plcc_records = select_highest_plcc_records(df, label_order)

    # Display results
    print("Highest PLCC records for each model-label combination:")
    print("=" * 60)

    for _, row in highest_plcc_records.iterrows():
        print(f"Model: {row['model']}")
        print(f"Label: {row['label']}")
        print(f"PLCC: {row['PLCC']:.6f}")
        print(f"SRCC: {row['SRCC']:.6f}")
        print(f"KRCC: {row['KRCC']:.6f}")
        print(f"RMSE: {row['RMSE']:.6f}")
        print("-" * 40)

    # Alternative: Create a summary table
    print("\nSummary Table:")
    print("=" * 60)
    summary_df = highest_plcc_records[['model', 'label', 'PLCC', 'SRCC', 'KRCC', 'RMSE']].copy()
    summary_df = summary_df.round(6)
    print(summary_df.to_string(index=False))

    # Save results to a new CSV file
    highest_plcc_records.to_csv(HIGHEST_PLCC_CSV_PATH, index=False)
    print(f"\nResults saved to '{HIGHEST_PLCC_CSV_PATH}'")

    # Optional: Show statistics
    print("\nStatistics:")
    print("=" * 60)
    print(f"Total unique model-label combinations: {len(highest_plcc_records)}")
    if highest_plcc_records.empty:
        # idxmax raises on an empty Series, so there is no overall best to report
        print("No rows with a valid PLCC were found.")
    else:
        best_overall = highest_plcc_records.loc[highest_plcc_records['PLCC'].idxmax()]
        print(f"Best overall PLCC: {best_overall['PLCC']:.6f}")
        print(f"Model with best PLCC: {best_overall['model']}")
        print(f"Label with best PLCC: {best_overall['label']}")

    # Group by model to see best performing model overall
    model_best = highest_plcc_records.groupby('model')['PLCC'].max().sort_values(ascending=False)
    print(f"\nBest PLCC by model:")
    for model, plcc in model_best.items():
        print(f"{model}: {plcc:.6f}")

    # Group by label to see best performing label overall (preserve custom order).
    # observed=False is explicit so the result does not depend on the pandas
    # version's default for categorical group keys.
    label_best = (
        highest_plcc_records.groupby('label', observed=False)['PLCC']
        .max()
        .reindex(list(highest_plcc_records['label'].cat.categories))
    )
    print(f"\nBest PLCC by label (custom order):")
    for label, plcc in label_best.items():
        if pd.notna(plcc):  # Only show labels that exist in the data
            print(f"{label}: {plcc:.6f}")

Highest PLCC records for each model-label combination:
Model: adaboost_regressor
Label: TSV
PLCC: 0.822619
SRCC: 0.792037
KRCC: 0.603843
RMSE: 0.381403
----------------------------------------
Model: adaboost_regressor
Label: B
PLCC: 0.703890
SRCC: 0.676006
KRCC: 0.496454
RMSE: 0.588031
----------------------------------------
Model: adaboost_regressor
Label: SR
PLCC: 0.685484
SRCC: 0.676024
KRCC: 0.493392
RMSE: 0.502546
----------------------------------------
Model: adaboost_regressor
Label: S
PLCC: 0.808740
SRCC: 0.802919
KRCC: 0.599210
RMSE: 0.465072
----------------------------------------
Model: adaboost_regressor
Label: U
PLCC: 0.755805
SRCC: 0.748282
KRCC: 0.550948
RMSE: 0.517329
----------------------------------------
Model: adaboost_regressor
Label: O
PLCC: 0.810655
SRCC: 0.804691
KRCC: 0.616674
RMSE: 0.642197
----------------------------------------
Model: catboost_regressor
Label: TSV
PLCC: 0.835119
SRCC: 0.792897
KRCC: 0.608285
RMSE: 0.366126
-----------------------------