In [9]:
# Import necessary libraries
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler, PowerTransformer, QuantileTransformer, Normalizer
from scipy import stats
import warnings
warnings.filterwarnings('ignore')

In [36]:
class SVDFeatureScaler:
    """
    Scale the SVD feature columns of a CSV dataset with several strategies.

    Feature columns are recognised by name prefix (``U1_``, ``S1_``, ``V1_``,
    ``U2_``, ``S2_``, ``V2_``); every other column is carried through
    untouched. Each successful scaling run stores a full copy of the frame in
    ``self.scaled_datasets`` under the key ``"<strategy>_<scaler_type>"``.
    """

    # Column-name prefixes that identify the feature groups.
    COMPONENT_PREFIXES = ('U1', 'S1', 'V1', 'U2', 'S2', 'V2')

    # Scaler names understood by _apply_scaler.
    SCALER_TYPES = ('standard', 'minmax', 'robust', 'quantile', 'power', 'log', 'log1p')

    # Offset that keeps the shifted minimum strictly positive before np.log.
    LOG_EPSILON = 1e-8

    def __init__(self, filepath):
        """
        Load the dataset at ``filepath`` and index its feature columns.

        If loading fails, ``self.df`` stays None and ``self.feature_groups``
        is None; the public methods then print a hint and do nothing.
        """
        self.filepath = filepath
        self.df = None
        self.scaled_datasets = {}

        # Load data from CSV
        self.load_data()
        self.feature_groups = self.get_feature_groups()

    def load_data(self):
        """
        Read the CSV at ``self.filepath`` into ``self.df``.

        Returns:
        bool - True on success, False if reading raised (the error is printed).
        """
        try:
            self.df = pd.read_csv(self.filepath, skipinitialspace=True)
            print(f"Dataset loaded successfully. Shape: {self.df.shape}")
            return True
        except Exception as e:
            # Best-effort: report and leave self.df as None so callers can check.
            print(f"Error loading data: {e}")
            return False

    def get_feature_groups(self):
        """
        Group the feature columns by their component prefix.

        Returns:
        dict - maps each prefix in COMPONENT_PREFIXES to the list of columns
               named ``<prefix>_...`` (possibly empty), or None if no data is
               loaded.
        """
        if self.df is None:
            print("Please load data first")
            return

        feature_groups = {
            prefix: [col for col in self.df.columns if col.startswith(f"{prefix}_")]
            for prefix in self.COMPONENT_PREFIXES
        }

        # Print feature group sizes
        for group, features in feature_groups.items():
            print(f"{group}: {len(features)} features")

        return feature_groups

    def analyze_feature_distributions(self, sample_size=5):
        """
        Print summary statistics for the first ``sample_size`` columns of
        every non-empty feature group (mean/std/min/max ranges, zero and
        negative counts, average skewness).
        """
        if self.df is None:
            print("Please load data first")
            return

        print("\n=== Feature Distribution Analysis ===")

        for group_name, features in self.feature_groups.items():
            if not features:
                continue

            # Only the leading columns of the group are inspected.
            sample_features = features[:sample_size]
            data = self.df[sample_features]

            means, stds = data.mean(), data.std()
            mins, maxs = data.min(), data.max()

            print(f"\n{group_name} Features (sample of {len(sample_features)}):")
            print(f"  Mean range: {means.min():.6f} to {means.max():.6f}")
            print(f"  Std range: {stds.min():.6f} to {stds.max():.6f}")
            print(f"  Min range: {mins.min():.6f} to {mins.max():.6f}")
            print(f"  Max range: {maxs.min():.6f} to {maxs.max():.6f}")

            # Zeros/negatives decide whether the log scalers need a shift.
            zero_count = (data == 0).sum().sum()
            negative_count = (data < 0).sum().sum()
            print(f"  Zero values: {zero_count}")
            print(f"  Negative values: {negative_count}")

            # Per-column skewness, averaged over the sampled columns.
            skewness = data.apply(stats.skew).mean()
            print(f"  Average skewness: {skewness:.3f}")
        print("\n")

    def apply_scaling_all_features(self, scaler_type='standard'):
        """
        Scale all U, S, V feature columns in a single scaler call.

        Parameters:
        scaler_type: str - 'standard', 'minmax', 'robust', 'quantile', 'power', 'log', 'log1p'

        Returns:
        DataFrame - scaled copy of the data, also stored as
                    ``all_features_<scaler_type>``; None if no data/features
                    are available or scaling failed (nothing is stored then).
        """
        if self.df is None:
            print("Please load data first")
            return None

        # Flatten the groups into one column list.
        all_features = [
            col for features in self.feature_groups.values() for col in features
        ]
        if not all_features:
            print("No features found")
            return None

        X_scaled = self._apply_scaler(self.df[all_features].copy(), scaler_type)
        if X_scaled is None:
            return None

        scaled_df = self.df.copy()
        scaled_df[all_features] = X_scaled
        self.scaled_datasets[f"all_features_{scaler_type}"] = scaled_df

        print(f"Applied {scaler_type} scaling to all features")
        return scaled_df

    def apply_scaling_by_component(self, scaler_type='standard'):
        """
        Scale each component (U1, S1, V1, U2, S2, V2) with its own scaler call.

        Parameters:
        scaler_type: str - 'standard', 'minmax', 'robust', 'quantile', 'power', 'log', 'log1p'

        Returns:
        DataFrame - scaled copy of the data, also stored as
                    ``by_component_<scaler_type>``; None if no data/features
                    are available or scaling failed for any group (nothing is
                    stored then, so a partially scaled frame never leaks out).
        """
        if self.df is None:
            print("Please load data first")
            return None

        dataset_name = f"by_component_{scaler_type}"
        scaled_df = self.df.copy()
        scaled_any = False

        for group_name, features in self.feature_groups.items():
            if not features:
                continue

            X_scaled = self._apply_scaler(self.df[features].copy(), scaler_type)
            if X_scaled is None:
                # Abort instead of registering a half-scaled dataset.
                print(f"Skipping {dataset_name}: scaling failed for {group_name}")
                return None

            scaled_df[features] = X_scaled
            scaled_any = True
            print(f"Applied {scaler_type} scaling to {group_name} ({len(features)} features)")

        if not scaled_any:
            print("No features found")
            return None

        self.scaled_datasets[dataset_name] = scaled_df
        return scaled_df

    def apply_scaling_s_only(self, scaler_type='standard'):
        """
        Scale only the S features (singular values); S1 and S2 columns are
        passed to the scaler together, all other columns are left as loaded.

        Parameters:
        scaler_type: str - 'standard', 'minmax', 'robust', 'quantile', 'power', 'log', 'log1p'

        Returns:
        DataFrame - scaled copy of the data, also stored as
                    ``s_only_<scaler_type>``; None if no data/S features are
                    available or scaling failed (nothing is stored then).
        """
        if self.df is None:
            print("Please load data first")
            return None

        # Only scale S features
        s_features = self.feature_groups['S1'] + self.feature_groups['S2']
        if not s_features:
            print("No S features found")
            return None

        X_scaled = self._apply_scaler(self.df[s_features].copy(), scaler_type)
        if X_scaled is None:
            # Do not register an unscaled frame under a "scaled" name.
            return None

        scaled_df = self.df.copy()
        scaled_df[s_features] = X_scaled
        print(f"Applied {scaler_type} scaling to S features only ({len(s_features)} features)")

        self.scaled_datasets[f"s_only_{scaler_type}"] = scaled_df
        return scaled_df

    def _shift_into_log_domain(self, X, strictly_positive):
        """
        Shift ``X`` by its global minimum only when the log transform needs it.

        With ``strictly_positive=True`` (np.log) data whose minimum is <= 0 is
        moved so the minimum becomes LOG_EPSILON; otherwise (np.log1p) data
        with a negative minimum is moved so the minimum becomes 0. Data that
        is already inside the domain is returned unchanged.
        """
        min_val = X.min().min()
        if strictly_positive:
            if min_val <= 0:
                return X - min_val + self.LOG_EPSILON
        elif min_val < 0:
            return X - min_val
        return X

    def _apply_scaler(self, X, scaler_type):
        """
        Apply the scaler named ``scaler_type`` to the DataFrame ``X``.

        Returns:
        ndarray for the scikit-learn scalers, DataFrame for 'log'/'log1p',
        or None if the type is unknown or the scaler raised (a message is
        printed in both cases).
        """
        # Factories are lambdas so a scaler class is only looked up when used.
        sklearn_factories = {
            'standard': lambda: StandardScaler(),
            'minmax': lambda: MinMaxScaler(),
            'robust': lambda: RobustScaler(),
            # Seeded so any internal subsampling is reproducible.
            'quantile': lambda: QuantileTransformer(output_distribution='normal',
                                                    random_state=0),
            'power': lambda: PowerTransformer(method='yeo-johnson'),
        }

        try:
            if scaler_type in sklearn_factories:
                return sklearn_factories[scaler_type]().fit_transform(X)

            if scaler_type == 'log':
                return np.log(self._shift_into_log_domain(X, strictly_positive=True))

            if scaler_type == 'log1p':
                return np.log1p(self._shift_into_log_domain(X, strictly_positive=False))

            print(f"Unknown scaler type: {scaler_type}")
            return None

        except Exception as e:
            print(f"Error applying {scaler_type} scaling: {e}")
            return None

    def run_comprehensive_scaling(self, scalers=None):
        """
        Run all three scaling strategies for every scaler type.

        Parameters:
        scalers: iterable of str or None - scaler types to run; defaults to
                 every entry of SCALER_TYPES.
        """
        if scalers is None:
            scalers = self.SCALER_TYPES

        print("=== Running Comprehensive Scaling Analysis ===")

        for scaler in scalers:
            print(f"\n--- Applying {scaler.upper()} Scaling ---")

            # All features together
            self.apply_scaling_all_features(scaler)

            # By component
            self.apply_scaling_by_component(scaler)

            # S features only
            self.apply_scaling_s_only(scaler)

        print(f"\nCompleted! Generated {len(self.scaled_datasets)} scaled datasets")
        print("Available datasets:", list(self.scaled_datasets.keys()), "\n\n")

    @staticmethod
    def _print_summary_stats(label, data):
        """
        Print mean/std (averaged over columns, with their spread across
        columns) and the global value range of ``data`` under ``label``.
        """
        col_means, col_stds = data.mean(), data.std()
        print(f"\n{label}:")
        print(f"  Mean: {col_means.mean():.6f} ± {col_means.std():.6f}")
        print(f"  Std: {col_stds.mean():.6f} ± {col_stds.std():.6f}")
        print(f"  Range: [{data.min().min():.6f}, {data.max().max():.6f}]")

    def compare_scaling_effects(self, feature_sample_size=3):
        """
        Print summary statistics of the original data and of every stored
        scaled dataset, using the first ``feature_sample_size`` columns of
        each feature group.
        """
        if len(self.scaled_datasets) == 0:
            print("No scaled datasets available. Run scaling first.")
            return

        # Sample features from each group for comparison
        sample_features = [
            col
            for features in self.feature_groups.values()
            for col in features[:feature_sample_size]
        ]

        print("=== Scaling Effects Comparison ===")

        self._print_summary_stats("Original Data", self.df[sample_features])
        for dataset_name, scaled_df in self.scaled_datasets.items():
            self._print_summary_stats(dataset_name, scaled_df[sample_features])

        print("\n\n")

    def get_scaled_dataset(self, dataset_name):
        """
        Return the stored dataset named ``dataset_name``, or None if absent.
        """
        return self.scaled_datasets.get(dataset_name, None)

    def save_scaled_dataset(self, dataset_name, filepath):
        """
        Write the stored dataset ``dataset_name`` to ``filepath`` as CSV
        (without the index). Missing parent directories are created when
        ``filepath`` is a path; an unknown dataset name only prints a message.
        """
        if dataset_name not in self.scaled_datasets:
            print(f"Dataset {dataset_name} not found")
            return

        # to_csv does not create directories; file-like targets are left alone.
        if isinstance(filepath, (str, os.PathLike)):
            parent_dir = os.path.dirname(os.fspath(filepath))
            if parent_dir:
                os.makedirs(parent_dir, exist_ok=True)

        self.scaled_datasets[dataset_name].to_csv(filepath, index=False)
        print(f"Saved {dataset_name} to {filepath}")

    def save_all_datasets(self, outputDir):
        """
        Save every stored dataset to ``<outputDir>/<scaler_type>/<name>.csv``,
        where the scaler type is the last ``_``-separated part of the name.
        """
        print("=== Saving Scaled Datasets ===")
        for key in self.scaled_datasets:
            # Computed outside the f-string: reusing the f-string's own quote
            # character inside it is a SyntaxError before Python 3.12.
            scaler_name = key.split('_')[-1]
            self.save_scaled_dataset(key, f"{outputDir}/{scaler_name}/{key}.csv")


In [38]:
def main(input_file='./../../features/cleaned/cleaned-svd-features.csv',
         output_dir="./scaled-features"):
    """
    Run the full scaling pipeline: load, analyze, scale, compare, save.

    Parameters:
    input_file: str - path of the CSV with the cleaned SVD features
    output_dir: str - directory the scaled datasets are written under
    """
    # Initialize the scaler (this also loads the CSV)
    scaler = SVDFeatureScaler(input_file)

    # The loader has already printed the error; every later stage would only
    # repeat "Please load data first", so stop here.
    if scaler.df is None:
        return

    # Analyze distributions to understand the data
    scaler.analyze_feature_distributions()

    # Run comprehensive scaling
    scaler.run_comprehensive_scaling()

    # Compare scaling effects
    scaler.compare_scaling_effects()

    # Save the datasets
    scaler.save_all_datasets(output_dir)


if __name__ == "__main__":
    main()

Dataset loaded successfully. Shape: (1000, 1153)
U1: 256 features
S1: 256 features
V1: 256 features
U2: 128 features
S2: 128 features
V2: 128 features

=== Feature Distribution Analysis ===

U1 Features (sample of 5):
  Mean range: 0.162150 to 0.317315
  Std range: 0.008142 to 0.022730
  Min range: 0.144225 to 0.262484
  Max range: 0.201038 to 0.410429
  Zero values: 0
  Negative values: 0
  Average skewness: 0.821

S1 Features (sample of 5):
  Mean range: 893.854290 to 14065.820617
  Std range: 192.640028 to 865.363380
  Min range: 415.141994 to 9420.811306
  Max range: 1642.627389 to 16710.755937
  Zero values: 0
  Negative values: 0
  Average skewness: 0.362

V1 Features (sample of 5):
  Mean range: 0.173614 to 0.346338
  Std range: 0.010251 to 0.026565
  Min range: 0.148395 to 0.266828
  Max range: 0.220958 to 0.446164
  Zero values: 0
  Negative values: 0
  Average skewness: 0.371

U2 Features (sample of 5):
  Mean range: 0.228297 to 0.417702
  Std range: 0.011417 to 0.029647
  Mi