In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV, LinearRegression
from sklearn.metrics import mean_squared_error

sys.path.append("../")

# Import RGS components
from rgs.src.rgs.core.rgs import RGSCV
from rgs.src.rgs.mse import create_mse_scorer

In [2]:
def clean_datasets(data):
    """Return a new dict of frames with incomplete and repeated rows removed.

    Parameters
    ----------
    data : dict
        Maps a dataset label to a pandas DataFrame.

    Returns
    -------
    dict
        Same labels, in the same order, each mapped to a copy of the frame
        after ``dropna()`` followed by ``drop_duplicates()``. The input
        frames are not modified.

    A one-line "before -> after" shape report is printed per dataset.
    """
    result = {}

    for name in data:
        frame = data[name]
        # First half of the log line is emitted before the work is done.
        print(f"Cleaning {name}: {frame.shape} -> ", end="")
        without_missing = frame.dropna()
        tidy = without_missing.drop_duplicates()
        print(f"{tidy.shape}")
        result[name] = tidy

    return result

def evaluate_methods_cv_only(Xs, ys, cv=10):
    """Fit LassoCV, RidgeCV and ElasticNetCV on every dataset and tabulate R².

    Parameters
    ----------
    Xs : dict
        Maps dataset name -> feature array.
    ys : dict
        Maps dataset name -> target array (same keys as ``Xs``).
    cv : int, default 10
        Forwarded as the ``cv`` argument of each estimator.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Dataset'`` with one column per method name.

    Notes
    -----
    Features are standardized with a ``StandardScaler`` fitted on the full
    dataset. The value stored under ``'CV_R2'`` is ``model.score`` evaluated
    on the same standardized data the model was just fitted on, so it is an
    in-sample score rather than a held-out one.
    NOTE(review): confirm this is the intended quantity for the comparison.
    """
    enet_l1_grid = [0.1, 0.3, 0.5, 0.7, 0.9]
    # The estimator objects are created once and re-fitted for each dataset.
    estimators = [
        ('LassoCV', LassoCV(cv=cv, random_state=42, max_iter=10000)),
        ('RidgeCV', RidgeCV(cv=cv)),
        ('ElasticNetCV', ElasticNetCV(cv=cv, random_state=42, max_iter=10000,
                                      l1_ratio=enet_l1_grid)),
    ]

    rows = []
    for name in Xs:
        print(f"Evaluating baselines on {name}...")
        features = Xs[name]
        target = ys[name]

        features_std = StandardScaler().fit_transform(features)

        for label, estimator in estimators:
            estimator.fit(features_std, target)
            r2 = estimator.score(features_std, target)
            rows.append({'Dataset': name, 'Method': label, 'CV_R2': r2})

    long_table = pd.DataFrame(rows)
    return long_table.pivot(index='Dataset', columns='Method', values='CV_R2')

def evaluate_rgs_cv(Xs, ys, cv=10):
    """Fit an RGSCV model on every dataset and record its score and selections.

    For each dataset the features are standardized with ``StandardScaler``,
    a noise level is estimated from the residuals of an ordinary
    least-squares fit on the standardized features, and an ``RGSCV`` model
    is fitted with ``k`` capped at ``min(n_features, 25)`` and a grid of
    ``m`` values spread between 1 and ``n_features``.

    Parameters
    ----------
    Xs : dict
        Maps dataset name -> 2-D feature array (must expose ``.shape``).
    ys : dict
        Maps dataset name -> target array (same keys as ``Xs``).
    cv : int, default 10
        Forwarded to ``RGSCV`` as its ``cv`` argument.

    Returns
    -------
    list of dict
        One record per dataset with keys ``'Dataset'``, ``'Method'``
        (always ``'RGS'``), ``'CV_R2'``, ``'Best_k'`` and ``'Best_m'``.

    Notes
    -----
    ``'CV_R2'`` is ``rgscv.score(X_scaled, y)``, i.e. it is evaluated on the
    same data the model was fitted on.
    NOTE(review): what ``RGSCV.score`` returns is defined outside this file;
    presumably R² -- confirm.
    """

    results = []
    for dataset_name in Xs.keys():
        print(f"Evaluating RGS on {dataset_name}...")
        X, y = Xs[dataset_name], ys[dataset_name]
        n_samples, n_features = X.shape

        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)

        # Setup RGS parameters
        max_k = min(n_features, 25)
        # Ten evenly spaced points between 1 and n_features, rounded to ints.
        # With fewer than ten features several points round to the same value,
        # so duplicates are dropped; the generated values are non-decreasing,
        # hence sorting the set keeps the original order. For >= 10 features
        # the grid is unchanged.
        m_grid = sorted({round(1 + i * (n_features - 1) / 9) for i in range(10)})

        # Estimate sigma from data: standard deviation (np.std default, ddof=0)
        # of the in-sample residuals of a plain linear regression.
        temp_model = LinearRegression().fit(X_scaled, y)
        residuals = y - temp_model.predict(X_scaled)
        estimated_sigma = np.std(residuals)

        # Use the same scorer as simulation
        make_k_scorer = create_mse_scorer(
            sigma=estimated_sigma,
            n=n_samples,
            p=n_features
        )

        rgscv = RGSCV(
            k_max=max_k,
            m_grid=m_grid,
            n_estimators=500,
            n_resample_iter=7,
            method='fs',
            cv=cv,
            scoring=make_k_scorer,
            random_state=42
        )

        rgscv.fit(X_scaled, y)

        results.append({
            'Dataset': dataset_name,
            'Method': 'RGS',
            'CV_R2': rgscv.score(X_scaled, y),
            'Best_k': rgscv.k_,
            'Best_m': rgscv.m_
        })

    return results

def create_cv_comparison_table(baseline_results, rgs_results):
    """Append an 'RGS' column to the baseline score table and round to 3 decimals.

    Parameters
    ----------
    baseline_results : pandas.DataFrame
        Score table indexed by dataset name; it is not modified.
    rgs_results : list of dict
        Records carrying at least the ``'Dataset'`` and ``'CV_R2'`` keys.

    Returns
    -------
    pandas.DataFrame
        A new frame with the baseline columns plus ``'RGS'``; RGS scores are
        aligned to the baseline rows by dataset name.
    """
    rgs_scores = pd.DataFrame(rgs_results).set_index('Dataset')['CV_R2']
    # assign() works on a copy, so the caller's table stays untouched.
    return baseline_results.assign(RGS=rgs_scores).round(3)

def create_parameter_table(rgs_results):
    """Tabulate the ``Best_k`` / ``Best_m`` values chosen for each dataset.

    Parameters
    ----------
    rgs_results : list of dict
        Records carrying at least ``'Dataset'``, ``'Best_k'`` and ``'Best_m'``.

    Returns
    -------
    pandas.DataFrame
        Indexed by ``'Dataset'`` with columns ``'Best_k'`` and ``'Best_m'``,
        in the order of the input records.
    """
    by_dataset = pd.DataFrame(rgs_results).set_index('Dataset')
    return by_dataset[['Best_k', 'Best_m']]

In [3]:

# --- 1. Load the raw TSV files ---------------------------------------------
print("Loading datasets...")
labels = ['Auto Pricing', 'Bodyfat', 'Sunspots', 'PW', 'CPU', 'House', 'MeatFat']

# The insertion order below fixes the order of the "Cleaning ..." log lines.
# The Pharynx set ('../real_data/1196_BNG_pharynx.tsv') is currently disabled.
data = {
    'Auto Pricing': pd.read_csv('../real_data/207_autoPrice.tsv', sep='\t'),
    'Sunspots': pd.read_csv('../real_data/695_chatfield_4.tsv', sep='\t'),
    'Bodyfat': pd.read_csv('../real_data/560_bodyfat.tsv', sep='\t'),
    'PW': pd.read_csv('../real_data/229_pwLinear.tsv', sep='\t'),
    'CPU': pd.read_csv('../real_data/197_cpu_act.tsv', sep='\t'),
    'House': pd.read_csv('../real_data/574_house_16H.tsv', sep='\t'),
    'MeatFat': pd.read_csv('../real_data/505_tecator.tsv', sep='\t'),
}

# --- 2. Drop incomplete / duplicated rows ----------------------------------
print("\nCleaning datasets...")
cleaned_data = clean_datasets(data)

# --- 3. Split every frame into features and the 'target' column -------------
Xs = {label: cleaned_data[label].drop(columns='target').values for label in labels}
ys = {label: cleaned_data[label]['target'].values for label in labels}

# --- 4. Baseline estimators --------------------------------------------------
print("\n" + "="*60, "EVALUATING BASELINE METHODS", "="*60, sep="\n")
baseline_results = evaluate_methods_cv_only(Xs, ys)

# --- 5. RGS ------------------------------------------------------------------
print("\n" + "="*60, "EVALUATING RGS", "="*60, sep="\n")
rgs_results = evaluate_rgs_cv(Xs, ys)

# --- 6. Report ---------------------------------------------------------------
print("\n" + "="*60, "RESULTS SUMMARY", "="*60, sep="\n")

cv_comparison_table = create_cv_comparison_table(baseline_results, rgs_results)
parameter_table = create_parameter_table(rgs_results)

print("\nCV R² Comparison Table:", cv_comparison_table, sep="\n")

# Winner per dataset. The table is already rounded to 3 decimals, so equal
# rounded scores resolve to the first column holding the maximum.
best_methods = cv_comparison_table.idxmax(axis=1)
print("\nBest Method per Dataset:")
for dataset, method in best_methods.items():
    score = cv_comparison_table.at[dataset, method]
    print(f"{dataset}: {method} ({score:.3f})")

print("\nRGS Parameter Selection Table:", parameter_table, sep="\n")

# Per-method aggregates over datasets, plus how often each method came first.
print("\nSummary Statistics:")
win_counts = [
    int((best_methods == column).sum()) for column in cv_comparison_table.columns
]
summary_stats = pd.DataFrame({
    'Mean_R2': cv_comparison_table.mean(),
    'Std_R2': cv_comparison_table.std(),
    'Min_R2': cv_comparison_table.min(),
    'Max_R2': cv_comparison_table.max(),
    'Wins': win_counts,
}).round(3)
print(summary_stats)

Loading datasets...

Cleaning datasets...
Cleaning Auto Pricing: (159, 16) -> (159, 16)
Cleaning Sunspots: (235, 13) -> (235, 13)
Cleaning Bodyfat: (252, 15) -> (252, 15)
Cleaning PW: (200, 11) -> (200, 11)
Cleaning CPU: (8192, 22) -> (8192, 22)
Cleaning House: (22784, 17) -> (22783, 17)
Cleaning MeatFat: (240, 125) -> (220, 125)

EVALUATING BASELINE METHODS
Evaluating baselines on Auto Pricing...
Evaluating baselines on Bodyfat...
Evaluating baselines on Sunspots...
Evaluating baselines on PW...
Evaluating baselines on CPU...
Evaluating baselines on House...
Evaluating baselines on MeatFat...

EVALUATING RGS
Evaluating RGS on Auto Pricing...
Evaluating RGS on Bodyfat...
Evaluating RGS on Sunspots...
Evaluating RGS on PW...
Evaluating RGS on CPU...
Evaluating RGS on House...
Evaluating RGS on MeatFat...





RESULTS SUMMARY

CV R² Comparison Table:
Method        ElasticNetCV  LassoCV  RidgeCV    RGS
Dataset                                            
Auto Pricing         0.816    0.826    0.847  0.826
Bodyfat              0.978    0.978    0.978  0.977
CPU                  0.729    0.729    0.729  0.729
House                0.142    0.261    0.261  0.261
MeatFat              0.998    0.998    0.998  0.998
PW                   0.784    0.785    0.783  0.784
Sunspots             0.887    0.888    0.885  0.880

Best Method per Dataset:
Auto Pricing: RidgeCV (0.847)
Bodyfat: ElasticNetCV (0.978)
CPU: ElasticNetCV (0.729)
House: LassoCV (0.261)
MeatFat: ElasticNetCV (0.998)
PW: LassoCV (0.785)
Sunspots: LassoCV (0.888)

RGS Parameter Selection Table:
              Best_k  Best_m
Dataset                     
Auto Pricing       2       7
Bodyfat            1      13
Sunspots           2       6
PW                 9       2
CPU               20      10
House             15       6
MeatFat        