In [1]:
import sys

import numpy as np
import pandas as pd
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler

# Make the parent directory importable so the local RGS package resolves.
sys.path.append("../")

from RGS import FastRandomizedGreedySelection, RandomizedGreedySelection

In [None]:
def create_train_test_splits(Xs, ys, test_size=0.2, random_state=42, max_stratify_classes=10):
    """
    Create train/test splits for multiple datasets stored in dictionaries.

    Parameters:
    -----------
    Xs : dict
        Dictionary containing feature matrices for each dataset
    ys : dict
        Dictionary containing target variables for each dataset; must have an
        entry for every key of ``Xs``
    test_size : float, default=0.2
        Proportion of the dataset to include in the test split
    random_state : int, default=42
        Random state for reproducibility
    max_stratify_classes : int, default=10
        A target is treated as categorical, and the split is stratified on it,
        only when it has fewer than this many distinct values

    Returns:
    --------
    dict
        Dictionary containing X_train, X_test, y_train, y_test (plus the
        'train_size' and 'test_size' sample counts) for each dataset

    Raises:
    -------
    KeyError
        If a key of ``Xs`` is missing from ``ys``
    """
    splits = {}

    for label, X in Xs.items():
        y = ys[label]

        # Heuristic: few distinct target values => classification-like target.
        classes, counts = np.unique(y, return_counts=True)
        # train_test_split raises ValueError when a stratification class has a
        # single member, so fall back to a plain shuffled split in that case
        # instead of aborting the whole batch of datasets.
        can_stratify = (
            0 < len(classes) < max_stratify_classes
            and counts.min() >= 2
        )

        # Create train/test split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y,
            test_size=test_size,
            random_state=random_state,
            shuffle=True,
            stratify=y if can_stratify else None
        )

        # Store splits in dictionary
        splits[label] = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test,
            'train_size': len(X_train),
            'test_size': len(X_test)
        }

    return splits

In [None]:
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score
import pandas as pd

def train_regularized_models(splits, cv=5):
    """
    Train and evaluate Lasso, Ridge, and ElasticNet models on multiple datasets.

    For every dataset the features are standardized with a StandardScaler fit
    on the training split, each model's hyper-parameters are selected with
    GridSearchCV (scored by negative mean squared error), and the best
    estimator is evaluated on the training and test splits. Progress and
    metrics are printed while running.

    Parameters:
    -----------
    splits : dict
        Dictionary containing train/test splits for each dataset; every value
        must provide the keys 'X_train', 'X_test', 'y_train' and 'y_test'
    cv : int, default=5
        Number of cross-validation folds

    Returns:
    --------
    dict
        ``results[dataset_name][model_name]`` is a dictionary with the keys
        'model' (best estimator, fit on *standardized* features),
        'scaler' (the fitted StandardScaler that must be applied to raw
        features before calling ``model.predict``),
        'predictions' ({'train': ..., 'test': ...}) and
        'metrics' (train/test RMSE and R², 'best_params', 'cv_results')
    """
    results = {}

    # Define parameter grids for each model
    param_grids = {
        'Lasso': {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
        'Ridge': {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
        'ElasticNet': {
            'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
        }
    }

    # Initialize models (used only as templates handed to GridSearchCV)
    models = {
        'Lasso': Lasso(random_state=42, max_iter=10000),
        'Ridge': Ridge(random_state=42),
        'ElasticNet': ElasticNet(random_state=42, max_iter=10000)
    }

    for dataset_name, split in splits.items():
        print(f"\nProcessing dataset: {dataset_name}")
        results[dataset_name] = {}

        # Get train/test data
        X_train = split['X_train']
        X_test = split['X_test']
        y_train = split['y_train']
        y_test = split['y_test']

        # Scale features using statistics from the training split only.
        # NOTE(review): the scaler is fit on the full training split before
        # cross-validation, so validation folds share scaling statistics with
        # their training folds; wrap scaler + model in a Pipeline if strictly
        # leak-free CV scores are needed.
        scaler = StandardScaler()
        X_train_scaled = scaler.fit_transform(X_train)
        X_test_scaled = scaler.transform(X_test)

        # Train and evaluate each model
        for model_name, model in models.items():
            print(f"\nTraining {model_name}...")

            # Perform grid search with cross-validation
            grid_search = GridSearchCV(
                model,
                param_grids[model_name],
                cv=cv,
                scoring='neg_mean_squared_error',
                n_jobs=-1
            )
            grid_search.fit(X_train_scaled, y_train)

            # Get best model
            best_model = grid_search.best_estimator_

            # Make predictions
            y_pred_train = best_model.predict(X_train_scaled)
            y_pred_test = best_model.predict(X_test_scaled)

            # Calculate metrics
            metrics = {
                'train_rmse': np.sqrt(mean_squared_error(y_train, y_pred_train)),
                'test_rmse': np.sqrt(mean_squared_error(y_test, y_pred_test)),
                'train_r2': r2_score(y_train, y_pred_train),
                'test_r2': r2_score(y_test, y_pred_test),
                'best_params': grid_search.best_params_,
                'cv_results': grid_search.cv_results_
            }

            # Store results. The fitted scaler is kept next to the model because
            # the model only accepts features transformed by that scaler.
            results[dataset_name][model_name] = {
                'model': best_model,
                'scaler': scaler,
                'predictions': {
                    'train': y_pred_train,
                    'test': y_pred_test
                },
                'metrics': metrics
            }

            # Print results
            print(f"Best parameters: {metrics['best_params']}")
            print(f"Train RMSE: {metrics['train_rmse']:.4f}")
            print(f"Test RMSE: {metrics['test_rmse']:.4f}")
            print(f"Train R²: {metrics['train_r2']:.4f}")
            print(f"Test R²: {metrics['test_r2']:.4f}")

            # Print feature importance for Lasso and ElasticNet
            if model_name in ['Lasso', 'ElasticNet']:
                feature_importance = pd.DataFrame({
                    'Feature': [f"Feature_{i}" for i in range(X_train.shape[1])],
                    'Coefficient': best_model.coef_
                })
                feature_importance = feature_importance[feature_importance['Coefficient'] != 0]
                print("\nNon-zero coefficients:")
                print(feature_importance.sort_values(by='Coefficient', key=abs, ascending=False))

    return results

In [None]:
def tune_rgs_m_parameter(X_train, y_train, X_test, y_test, k_max=15, cv=5):
    """
    Tune the 'm' parameter for FastRandomizedGreedySelection using cross-validation.

    Up to ten evenly spaced integer candidates between 1 and the number of
    features are scored with ``cross_val_score`` (R²) on the training data.
    The candidate with the highest mean score is refit on the full training
    set and scored once on the test set. Progress is printed while running.

    Requires ``cross_val_score`` from ``sklearn.model_selection`` to be
    imported in the notebook's import cell.

    Parameters:
    -----------
    X_train, y_train : array-like
        Training data; ``X_train`` must expose ``shape[1]`` (feature count)
    X_test, y_test : array-like
        Held-out data used only for the final score
    k_max : int, default=15
        Passed through to FastRandomizedGreedySelection unchanged
    cv : int, default=5
        Number of cross-validation folds

    Returns:
    --------
    dict
        'cv_results' (list of {'m', 'mean_cv_score', 'std_cv_score'}),
        'best_m', 'best_cv_score', 'test_score' (value of
        ``best_model.score(X_test, y_test)``), 'best_model' and
        'selected_features' (indices of non-zero coefficients)
    """
    n_features = X_train.shape[1]

    # np.unique drops the duplicates that integer truncation produces when
    # there are fewer than 10 features. int() turns numpy integers into
    # built-in ints, so every candidate is guaranteed to be a Python int
    # (no runtime assertion needed).
    m_values = [int(m) for m in np.unique(np.linspace(1, n_features, 10, dtype=int))]

    # Store results
    cv_results = []

    print("Tuning 'm' parameter...")
    print(f"Testing values: {m_values}")

    # Evaluate each m value using cross-validation
    for m in m_values:
        rgs = FastRandomizedGreedySelection(k_max=k_max, m=m)
        scores = cross_val_score(rgs, X_train, y_train, cv=cv, scoring='r2')
        mean_score = np.mean(scores)
        std_score = np.std(scores)

        cv_results.append({
            'm': m,
            'mean_cv_score': mean_score,
            'std_cv_score': std_score
        })

        print(f"m={m} (type={type(m)}): CV R² = {mean_score:.4f} (+/- {std_score:.4f})")

    # Find best m value (ties resolve to the smallest m, the first one seen)
    best_result = max(cv_results, key=lambda x: x['mean_cv_score'])
    best_m = best_result['m']

    print(f"\nBest m value: {best_m} (type={type(best_m)})")
    print(f"Best CV R²: {best_result['mean_cv_score']:.4f}")

    # Train final model with best m value
    best_model = FastRandomizedGreedySelection(k_max=k_max, m=best_m)
    best_model.fit(X_train, y_train)

    # Evaluate on test set
    test_score = best_model.score(X_test, y_test)
    print(f"\nTest set R² with best m={best_m}: {test_score:.4f}")

    # Selected features are those with a non-zero coefficient
    selected_features = np.where(best_model.coef_ != 0)[0]
    print(f"\nSelected features: {selected_features}")
    print(f"Number of selected features: {len(selected_features)}")

    return {
        'cv_results': cv_results,
        'best_m': best_m,
        'best_cv_score': best_result['mean_cv_score'],
        'test_score': test_score,
        'best_model': best_model,
        'selected_features': selected_features
    }

In [None]:
from sklearn.metrics import mean_squared_error
import pandas as pd

def create_comparison_table(regularized_results, rgs_results, splits):
    """
    Build a Dataset x Model table of test-set MSE for every method.

    Regularized models are scored from their stored test predictions; the RGS
    model is scored by predicting on the dataset's 'X_test'. Values are
    rounded to 4 decimals.
    """
    rows = []

    for name, split in splits.items():
        fitted = regularized_results[name]
        truth = split['y_test']

        # One row per regularized model, scored from cached predictions
        rows.extend(
            {
                'Dataset': name,
                'Model': kind,
                'Test MSE': mean_squared_error(truth, fitted[kind]['predictions']['test'])
            }
            for kind in ('Lasso', 'Ridge', 'ElasticNet')
        )

        # RGS keeps no cached predictions, so predict on the test features here
        rgs_estimator = rgs_results[name]['best_model']
        rows.append({
            'Dataset': name,
            'Model': 'RGS',
            'Test MSE': mean_squared_error(truth, rgs_estimator.predict(split['X_test']))
        })

    # Long -> wide (datasets as rows, models as columns), rounded for readability
    return (
        pd.DataFrame(rows)
        .pivot(index='Dataset', columns='Model', values='Test MSE')
        .round(4)
    )

In [None]:
# Datasets that take part in the experiments below
labels = ['Auto Pricing', 'Bodyfat']

# Load data: every file is a tab-separated table with a 'target' column
dataset_paths = {
    'Auto Pricing': '../real_data/207_autoPrice.tsv',
    'Satellite Image': '../real_data/294_satellite_image.tsv',
    'Political': '../real_data/201_pol.tsv',
    'Bodyfat': '../real_data/560_bodyfat.tsv',
}
data = {name: pd.read_csv(path, sep='\t') for name, path in dataset_paths.items()}

# Split each selected frame into a feature matrix and a target vector
Xs = {label: data[label].drop(columns='target').values for label in labels}
ys = {label: data[label]['target'].values for label in labels}

In [None]:
# Split every dataset into train and test parts
splits = create_train_test_splits(Xs, ys)

# Report the size and shape of each split
for label, parts in splits.items():
    report = [
        f"\nDataset: {label}",
        f"Training samples: {parts['train_size']}",
        f"Testing samples: {parts['test_size']}",
        f"X_train shape: {parts['X_train'].shape}",
        f"X_test shape: {parts['X_test'].shape}",
    ]
    print("\n".join(report))

In [None]:
# Tune the RGS 'm' parameter on every dataset
tuning_results = {}
for dataset_name in splits:
    print(f"\nProcessing dataset: {dataset_name}")
    dataset_split = splits[dataset_name]
    X_train, X_test = dataset_split['X_train'], dataset_split['X_test']
    y_train, y_test = dataset_split['y_train'], dataset_split['y_test']
    n_features = X_train.shape[1]
    # k_max is set to one third of the feature count (integer division)
    tuning_results[dataset_name] = tune_rgs_m_parameter(
        X_train, y_train, X_test, y_test, k_max=n_features // 3
    )

In [None]:
# Fit Lasso, Ridge and ElasticNet on every dataset and collect their metrics
results = train_regularized_models(splits=splits)

In [6]:
# Build the Dataset x Model table of test MSE
comparison_table = create_comparison_table(results, tuning_results, splits)

# Show the table
print("\nTest MSE Comparison:", comparison_table, sep="\n")

# The winner per dataset is the column holding the smallest MSE in its row
best_models = comparison_table.idxmin(axis="columns")
print("\nBest performing model for each dataset:", best_models, sep="\n")

0.012463106871794572


Unnamed: 0,rgs1,rgs2,rgs3
0,457.946755,457.946755,457.946755
1,0.958522,1.128858,1.191185
2,0.292462,0.272517,0.406167
3,0.125505,0.151693,0.186742
4,0.072272,0.064924,0.077382
5,0.041882,0.041343,0.047635
6,0.024652,0.022743,0.031133
7,0.016293,0.013587,0.021527
8,0.010693,0.009187,0.011942
9,0.007404,0.007755,0.00814
