In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

sys.path.append("../")

from RGS import FastRandomizedGreedySelection, RandomizedGreedySelection

In [2]:
def clean_datasets(data):
    """
    Clean datasets by removing rows with missing values and duplicate rows.

    For every dataset a short report is printed: the original shape, the
    number of missing cells and duplicate rows before cleaning, the final
    shape, how many rows each cleaning step removed, and the number of
    missing cells / duplicate rows that remain afterwards.

    Parameters:
    -----------
    data : dict
        Dictionary containing pandas DataFrames for each dataset

    Returns:
    --------
    dict
        Dictionary containing cleaned DataFrames, stored under the same
        keys (and in the same order) as `data`. The input DataFrames are
        not modified.
    """
    cleaned_data = {}

    for label, df in data.items():
        print(f"\nCleaning dataset: {label}")
        print(f"Original shape: {df.shape}")

        # Total number of missing cells (not rows) in the raw frame
        missing_before = df.isnull().sum().sum()
        print(f"Missing values before: {missing_before}")

        # Duplicates are counted on the raw frame, i.e. before any row is dropped
        duplicates_before = df.duplicated().sum()
        print(f"Duplicate rows before: {duplicates_before}")

        # Step 1: drop incomplete rows. The intermediate frame is kept so the
        # rows lost here are not confused with rows lost to de-duplication.
        df_no_missing = df.dropna()
        rows_removed_missing = len(df) - len(df_no_missing)

        # Step 2: drop exact duplicate rows among the complete rows
        df_cleaned = df_no_missing.drop_duplicates()
        rows_removed_duplicates = len(df_no_missing) - len(df_cleaned)

        # Final checks
        missing_after = df_cleaned.isnull().sum().sum()
        duplicates_after = df_cleaned.duplicated().sum()

        print(f"Final shape: {df_cleaned.shape}")
        print(f"Rows removed due to missing values: {rows_removed_missing}")
        print(f"Rows removed due to duplicates: {rows_removed_duplicates}")
        print(f"Missing values after: {missing_after}")
        print(f"Duplicate rows after: {duplicates_after}")

        cleaned_data[label] = df_cleaned

    return cleaned_data

In [3]:
def create_train_test_splits(Xs, ys, test_size=0.2, random_state=42):
    """
    Partition every dataset into a shuffled training part and a test part.

    Parameters:
    -----------
    Xs : dict
        Dictionary containing feature matrices for each dataset
    ys : dict
        Dictionary containing target variables for each dataset; must
        provide an entry for every key of `Xs`
    test_size : float, default=0.2
        Proportion of the dataset to include in the test split
    random_state : int, default=42
        Random state for reproducibility

    Returns:
    --------
    dict
        For each dataset label, a dictionary with the keys 'X_train',
        'X_test', 'y_train', 'y_test', 'train_size' and 'test_size'
        (the last two are the row counts of the two parts)
    """
    splits = {}

    for label, features in Xs.items():
        target = ys[label]

        # A target with fewer than 10 distinct values is treated as a set of
        # class labels and the split is stratified on it; any other target
        # is split without stratification.
        n_distinct = len(np.unique(target))
        strata = target if n_distinct < 10 else None

        parts = train_test_split(
            features, target,
            test_size=test_size,
            random_state=random_state,
            shuffle=True,
            stratify=strata,
        )
        X_train, X_test, y_train, y_test = parts

        splits[label] = dict(
            X_train=X_train,
            X_test=X_test,
            y_train=y_train,
            y_test=y_test,
            train_size=len(X_train),
            test_size=len(X_test),
        )

    return splits

In [4]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

def train_regularized_models(splits, cv=5):
    """
    Fit cross-validated Lasso, Ridge and ElasticNet regressors on each dataset.

    For every dataset the features are standardised with a scaler fitted on
    the training part only. Each model's hyper-parameters are then chosen by
    a grid search scored on negative mean squared error, and the best
    estimator found is evaluated on both the training and the test part.
    Progress, metrics and (for Lasso / ElasticNet) the non-zero coefficients
    are printed along the way.

    Parameters:
    -----------
    splits : dict
        Dictionary mapping each dataset name to a dictionary with the keys
        'X_train', 'X_test', 'y_train' and 'y_test'
    cv : int, default=5
        Number of cross-validation folds

    Returns:
    --------
    dict
        results[dataset_name][model_name] is a dictionary with the keys
        'model' (best estimator), 'predictions' (with 'train' and 'test'
        arrays) and 'metrics' (train/test RMSE and R², best parameters and
        the grid search's cv_results_)
    """
    # One (name, unfitted estimator, hyper-parameter grid) triple per model
    # family; the same triples are reused for every dataset.
    model_specs = [
        (
            'Lasso',
            Lasso(random_state=42, max_iter=10000),
            {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
        ),
        (
            'Ridge',
            Ridge(random_state=42),
            {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
        ),
        (
            'ElasticNet',
            ElasticNet(random_state=42, max_iter=10000),
            {
                'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
                'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9],
            },
        ),
    ]

    def rmse(y_true, y_pred):
        # Root of the mean squared error
        return np.sqrt(mean_squared_error(y_true, y_pred))

    results = {}

    for dataset_name, split in splits.items():
        print(f"\nProcessing dataset: {dataset_name}")
        per_model = {}
        results[dataset_name] = per_model

        X_train, X_test = split['X_train'], split['X_test']
        y_train, y_test = split['y_train'], split['y_test']
        n_features = X_train.shape[1]

        # Standardise using training statistics only, then apply to the test part
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        for model_name, estimator, grid in model_specs:
            print(f"\nTraining {model_name}...")

            # Exhaustive search over the grid, run on all available cores
            search = GridSearchCV(
                estimator,
                grid,
                cv=cv,
                scoring='neg_mean_squared_error',
                n_jobs=-1,
            )
            search.fit(X_train_scaled, y_train)
            best_model = search.best_estimator_

            train_pred = best_model.predict(X_train_scaled)
            test_pred = best_model.predict(X_test_scaled)

            metrics = dict(
                train_rmse=rmse(y_train, train_pred),
                test_rmse=rmse(y_test, test_pred),
                train_r2=r2_score(y_train, train_pred),
                test_r2=r2_score(y_test, test_pred),
                best_params=search.best_params_,
                cv_results=search.cv_results_,
            )

            per_model[model_name] = {
                'model': best_model,
                'predictions': {'train': train_pred, 'test': test_pred},
                'metrics': metrics,
            }

            print(f"Best parameters: {metrics['best_params']}")
            print(f"Train RMSE: {metrics['train_rmse']:.4f}")
            print(f"Test RMSE: {metrics['test_rmse']:.4f}")
            print(f"Train R²: {metrics['train_r2']:.4f}")
            print(f"Test R²: {metrics['test_r2']:.4f}")

            # Only the two models listed here get a coefficient report
            if model_name in ('Lasso', 'ElasticNet'):
                coef_table = pd.DataFrame({
                    'Feature': [f"Feature_{i}" for i in range(n_features)],
                    'Coefficient': best_model.coef_,
                })
                nonzero = coef_table.loc[coef_table['Coefficient'] != 0]
                print("\nNon-zero coefficients:")
                # Largest magnitude first
                print(nonzero.sort_values(by='Coefficient', key=abs, ascending=False))

    return results

In [5]:
from sklearn.model_selection import cross_val_score

def tune_rgs_parameters(X_train, y_train, X_test, y_test, cv=5):
    """
    Tune both 'm' and 'k_max' parameters for FastRandomizedGreedySelection using cross-validation.

    Every (k_max, m) pair from two small grids is scored by the mean
    cross-validated R² on the training data (pairs with m < k_max are
    skipped). A final model is then fitted on the full training data with
    the best pair and scored once on the test data.

    Parameters:
    -----------
    X_train : array-like
        Training features; must expose `.shape[1]` (the number of features)
    y_train : array-like
        Training target
    X_test : array-like
        Test features
    y_test : array-like
        Test target
    cv : int, default=5
        Number of cross-validation folds

    Returns:
    --------
    dict
        Dictionary with the keys 'cv_results' (one record per evaluated
        pair, including pairs whose score is not finite), 'best_m',
        'best_k_max', 'best_cv_score', 'test_score', 'best_model' and
        'selected_features'

    Raises:
    -------
    ValueError
        If no evaluated (k_max, m) pair produced a finite mean CV score
    """
    n_features = X_train.shape[1]

    # Up to 10 evenly spaced integer candidates per parameter. np.unique
    # removes the repeats that integer truncation produces when n_features
    # is small, and int(...) turns numpy integers into built-in ints.
    # NOTE(review): presumably the estimator requires built-in ints for
    # k_max and m (the original code asserted this) - confirm against RGS.
    m_values = [int(m) for m in np.unique(np.linspace(1, n_features, 10, dtype=int))]
    k_max_values = [int(k) for k in np.unique(np.linspace(2, n_features, 10, dtype=int))]

    # Store results
    cv_results = []

    print("Tuning parameters...")
    print(f"Testing m values: {m_values}")
    print(f"Testing k_max values: {k_max_values}")

    # Grid search over both parameters
    for k_max in k_max_values:
        for m in m_values:
            # Skip invalid combinations where m < k_max
            if m < k_max:
                continue

            rgs = FastRandomizedGreedySelection(k_max=k_max, m=m)
            scores = cross_val_score(rgs, X_train, y_train, cv=cv, scoring='r2')

            cv_results.append({
                'm': m,
                'k_max': k_max,
                'mean_cv_score': np.mean(scores),
                'std_cv_score': np.std(scores)
            })

    # Find best parameters. A mean score can be NaN (cross_val_score reports
    # NaN for a fold whose fit failed under its default error_score), and
    # max() over NaN keys is order-dependent, so only finite scores compete.
    scored_results = [r for r in cv_results if np.isfinite(r['mean_cv_score'])]
    if not scored_results:
        raise ValueError(
            "No (k_max, m) combination produced a finite cross-validation score"
        )
    best_result = max(scored_results, key=lambda r: r['mean_cv_score'])
    best_m = best_result['m']
    best_k_max = best_result['k_max']

    print(f"\nBest parameters:")
    print(f"k_max = {best_k_max}")
    print(f"m = {best_m}")
    print(f"Best CV R²: {best_result['mean_cv_score']:.4f}")

    # Train final model with best parameters
    best_model = FastRandomizedGreedySelection(k_max=best_k_max, m=best_m)
    best_model.fit(X_train, y_train)

    # Evaluate on test set
    test_score = best_model.score(X_test, y_test)
    print(f"\nTest set R² with best parameters: {test_score:.4f}")

    # Selected features
    # NOTE(review): the outputs saved in this notebook show
    # "Selected features: [0]" for every dataset. That is what this
    # expression yields when `coef_ != 0` collapses to a single boolean
    # instead of an element-wise mask (e.g. if coef_ is not a 1-D numpy
    # array) - confirm the type of `coef_` exposed by RGS before relying on
    # 'selected_features'.
    selected_features = np.where(best_model.coef_ != 0)[0]
    print(f"\nSelected features: {selected_features}")
    print(f"Number of selected features: {len(selected_features)}")

    return {
        'cv_results': cv_results,
        'best_m': best_m,
        'best_k_max': best_k_max,
        'best_cv_score': best_result['mean_cv_score'],
        'test_score': test_score,
        'best_model': best_model,
        'selected_features': selected_features
    }

In [6]:
from sklearn.metrics import mean_squared_error
import pandas as pd

def create_comparison_table(regularized_results, rgs_results, splits):
    """
    Build a dataset-by-model table of test-set mean squared errors.

    Parameters:
    -----------
    regularized_results : dict
        Per dataset, a dictionary with 'Lasso', 'Ridge' and 'ElasticNet'
        entries, each holding its test predictions under
        ['predictions']['test']
    rgs_results : dict
        Per dataset, a dictionary whose 'best_model' entry is a fitted
        model exposing `predict`
    splits : dict
        Per dataset, a dictionary providing 'X_test' and 'y_test'

    Returns:
    --------
    pandas.DataFrame
        One row per dataset and one column per model ('Lasso', 'Ridge',
        'ElasticNet', 'RGS'), holding the test MSE rounded to 4 decimals
    """
    rows = []

    for dataset_name, split in splits.items():
        reg_models = regularized_results[dataset_name]
        y_test = split['y_test']

        # The regularized models contribute their stored test predictions
        rows.extend(
            {
                'Dataset': dataset_name,
                'Model': model_name,
                'Test MSE': mean_squared_error(
                    y_test, reg_models[model_name]['predictions']['test']
                ),
            }
            for model_name in ('Lasso', 'Ridge', 'ElasticNet')
        )

        # The RGS model predicts here, from the test features as stored in the split
        rgs_model = rgs_results[dataset_name]['best_model']
        rgs_pred = rgs_model.predict(split['X_test'])
        rows.append({
            'Dataset': dataset_name,
            'Model': 'RGS',
            'Test MSE': mean_squared_error(y_test, rgs_pred),
        })

    # Long records -> wide table (datasets as rows, models as columns)
    long_table = pd.DataFrame(rows)
    wide_table = long_table.pivot(index='Dataset', columns='Model', values='Test MSE')

    # Round values for readability
    return wide_table.round(4)

In [7]:
# Order in which the datasets are split, tuned and reported downstream
labels = ['Auto Pricing', 'Bodyfat', 'Satellite Image', 'Pharynx', 'PW', 'CPU', 'House', 'MeatFat']

# Read each tab-separated file; the pairs are listed in loading order, which
# is also the order in which the cleaning report is printed.
data = {
    label: pd.read_csv(path, sep='\t')
    for label, path in (
        ('Auto Pricing', '../real_data/207_autoPrice.tsv'),
        ('Satellite Image', '../real_data/294_satellite_image.tsv'),
        ('Bodyfat', '../real_data/560_bodyfat.tsv'),
        ('Pharynx', '../real_data/1196_BNG_pharynx.tsv'),
        ('PW', '../real_data/229_pwLinear.tsv'),
        ('CPU', '../real_data/197_cpu_act.tsv'),
        ('House', '../real_data/574_house_16H.tsv'),
        ('MeatFat', '../real_data/505_tecator.tsv'),
    )
}

# Drop rows with missing values and duplicate rows
cleaned_data = clean_datasets(data)

# Feature matrix = every column except 'target'; target vector = the 'target' column
Xs = {
    label: cleaned_data[label].drop(columns='target').values
    for label in labels
}
ys = {
    label: cleaned_data[label]['target'].values
    for label in labels
}


Cleaning dataset: Auto Pricing
Original shape: (159, 16)
Missing values before: 0
Duplicate rows before: 0
Final shape: (159, 16)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: Satellite Image
Original shape: (6435, 37)
Missing values before: 0
Duplicate rows before: 0
Final shape: (6435, 37)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: Bodyfat
Original shape: (252, 15)
Missing values before: 0
Duplicate rows before: 0
Final shape: (252, 15)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: Pharynx
Original shape: (1000000, 11)
Missing values before: 0
Duplicate rows before: 0
Final shape: (1000000, 11)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: PW
Original shape: (200, 11)
Missing values before: 0
Duplicate rows before: 0
Final shape: (200, 11)
Rows rem

In [8]:
# Build the train/test partitions for every dataset (function defaults apply)
splits = create_train_test_splits(Xs, ys)

# Report the size and shape of each partition
for label, split in splits.items():
    print(f"\nDataset: {label}")
    print(f"Training samples: {split['train_size']}")
    print(f"Testing samples: {split['test_size']}")
    print(f"X_train shape: {split['X_train'].shape}")
    print(f"X_test shape: {split['X_test'].shape}")


Dataset: Auto Pricing
Training samples: 127
Testing samples: 32
X_train shape: (127, 15)
X_test shape: (32, 15)

Dataset: Bodyfat
Training samples: 201
Testing samples: 51
X_train shape: (201, 14)
X_test shape: (51, 14)

Dataset: Satellite Image
Training samples: 5148
Testing samples: 1287
X_train shape: (5148, 36)
X_test shape: (1287, 36)

Dataset: Pharynx
Training samples: 800000
Testing samples: 200000
X_train shape: (800000, 10)
X_test shape: (200000, 10)

Dataset: PW
Training samples: 160
Testing samples: 40
X_train shape: (160, 10)
X_test shape: (40, 10)

Dataset: CPU
Training samples: 6553
Testing samples: 1639
X_train shape: (6553, 21)
X_test shape: (1639, 21)

Dataset: House
Training samples: 18226
Testing samples: 4557
X_train shape: (18226, 16)
X_test shape: (4557, 16)

Dataset: MeatFat
Training samples: 176
Testing samples: 44
X_train shape: (176, 124)
X_test shape: (44, 124)


In [9]:
# Tune the RGS parameters on every dataset, keeping one result dict per dataset
tuning_results = {}
for dataset_name, split in splits.items():
    print(f"\nProcessing dataset: {dataset_name}")
    X_train, X_test = split['X_train'], split['X_test']
    y_train, y_test = split['y_train'], split['y_test']
    # Not passed to the tuner (it reads the feature count from X_train itself)
    n_features = X_train.shape[1]
    tuning_results[dataset_name] = tune_rgs_parameters(X_train, y_train, X_test, y_test)


Processing dataset: Auto Pricing
Tuning parameters...
Testing m values: [1, 2, 4, 5, 7, 8, 10, 11, 13, 15]
Testing k_max values: [2, 3, 4, 6, 7, 9, 10, 12, 13, 15]

Best parameters:
k_max = 4
m = 4
Best CV R²: 0.8019

Test set R² with best parameters: 0.6765

Selected features: [0]
Number of selected features: 1

Processing dataset: Bodyfat
Tuning parameters...
Testing m values: [1, 2, 3, 5, 6, 8, 9, 11, 12, 14]
Testing k_max values: [2, 3, 4, 6, 7, 8, 10, 11, 12, 14]

Best parameters:
k_max = 6
m = 8
Best CV R²: 0.9681

Test set R² with best parameters: 0.9909

Selected features: [0]
Number of selected features: 1

Processing dataset: Satellite Image
Tuning parameters...
Testing m values: [1, 4, 8, 12, 16, 20, 24, 28, 32, 36]
Testing k_max values: [2, 5, 9, 13, 17, 20, 24, 28, 32, 36]

Best parameters:
k_max = 13
m = 16
Best CV R²: 0.7019

Test set R² with best parameters: 0.6889

Selected features: [0]
Number of selected features: 1

Processing dataset: Pharynx
Tuning parameters...


In [10]:
# Train and evaluate the Lasso, Ridge and ElasticNet baselines on every dataset
# (grid search with the function's default of 5 CV folds). `results` is keyed
# by dataset name, then by model name.
results = train_regularized_models(splits)


Processing dataset: Auto Pricing

Training Lasso...
Best parameters: {'alpha': 100}
Train RMSE: 2318.7822
Test RMSE: 2611.5338
Train R²: 0.8597
Test R²: 0.6167

Non-zero coefficients:
       Feature  Coefficient
6    Feature_6  2054.346455
7    Feature_7  1979.916989
4    Feature_4  1634.166353
1    Feature_1   552.224975
2    Feature_2   507.191328
3    Feature_3  -369.997229
10  Feature_10   235.325631
9    Feature_9  -201.703018
8    Feature_8   -62.216401
14  Feature_14   -56.215267
12  Feature_12    21.025979

Training Ridge...
Best parameters: {'alpha': 10}
Train RMSE: 2287.2029
Test RMSE: 2533.5819
Train R²: 0.8635
Test R²: 0.6393

Training ElasticNet...
Best parameters: {'alpha': 1, 'l1_ratio': 0.9}
Train RMSE: 2305.4277
Test RMSE: 2502.0830
Train R²: 0.8613
Test R²: 0.6482

Non-zero coefficients:
       Feature  Coefficient
7    Feature_7  1686.614829
6    Feature_6  1506.836812
4    Feature_4  1399.350717
2    Feature_2   940.708667
10  Feature_10   618.752533
1    Feature_1

Best parameters: {'alpha': 10}
Train RMSE: 45539.6317
Test RMSE: 44974.0903
Train R²: 0.2615
Test R²: 0.2588

Non-zero coefficients:
       Feature   Coefficient
5    Feature_5 -40554.427371
7    Feature_7  34374.956484
6    Feature_6 -25401.302666
9    Feature_9  15494.137719
4    Feature_4 -10124.229677
13  Feature_13  -9436.530945
2    Feature_2   9315.902786
11  Feature_11  -9063.021253
8    Feature_8  -7001.988187
14  Feature_14  -5758.068640
3    Feature_3   5081.523706
12  Feature_12  -4889.923386
1    Feature_1  -3474.208783
15  Feature_15  -2633.593492
0    Feature_0   2546.083625
10  Feature_10    -49.115521

Training Ridge...
Best parameters: {'alpha': 10}
Train RMSE: 45539.9237
Test RMSE: 44969.8905
Train R²: 0.2615
Test R²: 0.2589

Training ElasticNet...
Best parameters: {'alpha': 0.001, 'l1_ratio': 0.1}
Train RMSE: 45541.2733
Test RMSE: 44966.3930
Train R²: 0.2615
Test R²: 0.2590

Non-zero coefficients:
       Feature   Coefficient
5    Feature_5 -38633.042333
7    Featur

In [11]:
# Assemble the dataset-by-model table of test MSE values
comparison_table = create_comparison_table(results, tuning_results, splits)

# Show the table under its heading
print("\nTest MSE Comparison:", comparison_table, sep="\n")

# For each dataset (row), the model (column) with the lowest test MSE
best_models = comparison_table.idxmin(axis="columns")
print("\nBest performing model for each dataset:", best_models, sep="\n")


Test MSE Comparison:
Model              ElasticNet         Lasso           RGS         Ridge
Dataset                                                                
Auto Pricing     6.260419e+06  6.820109e+06  5.756188e+06  6.419037e+06
Bodyfat          3.966000e-01  2.641000e-01  4.246000e-01  4.305000e-01
CPU              7.882120e+01  7.882630e+01  7.879250e+01  7.878400e+01
House            2.021976e+09  2.022669e+09  2.022925e+09  2.022291e+09
MeatFat          5.982000e-01  5.678000e-01  5.843000e-01  8.511000e-01
PW               3.372000e+00  3.372000e+00  3.323100e+00  3.372500e+00
Pharynx          1.000831e+05  1.000825e+05  1.000835e+05  1.000832e+05
Satellite Image  1.526600e+00  1.525100e+00  1.524000e+00  1.526800e+00

Best performing model for each dataset:
Dataset
Auto Pricing              RGS
Bodyfat                 Lasso
CPU                     Ridge
House              ElasticNet
MeatFat                 Lasso
PW                        RGS
Pharynx                 Lasso