In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNetCV
from tqdm.auto import tqdm
import time
from datetime import datetime

sys.path.append("../")

from RGS import FastRandomizedGreedySelection, RandomizedGreedySelection
from data_plotting import *
from RGS_experimental import FastRandomizedGreedySelectionCV

In [2]:
def clean_datasets(data):
    """
    Clean datasets by removing missing values and duplicate rows.

    For every dataset a short report is printed: the original shape, the
    number of missing cells and duplicate rows found, the final shape, and
    how many rows each of the two cleaning steps removed.

    Parameters:
    -----------
    data : dict
        Dictionary containing pandas DataFrames for each dataset

    Returns:
    --------
    dict
        Dictionary containing cleaned DataFrames (same keys as `data`)
    """
    cleaned_data = {}

    for label, df in data.items():
        print(f"\nCleaning dataset: {label}")
        print(f"Original shape: {df.shape}")

        # Total number of missing cells (not rows) in the raw frame
        missing_before = df.isnull().sum().sum()
        print(f"Missing values before: {missing_before}")

        # Rows that exactly repeat an earlier row
        duplicates_before = df.duplicated().sum()
        print(f"Duplicate rows before: {duplicates_before}")

        # Keep the intermediate frame so each step's row loss can be
        # reported separately instead of lumping both into one number.
        df_no_missing = df.dropna()
        df_cleaned = df_no_missing.drop_duplicates()

        rows_removed_missing = len(df) - len(df_no_missing)
        rows_removed_duplicates = len(df_no_missing) - len(df_cleaned)

        # Final checks
        missing_after = df_cleaned.isnull().sum().sum()
        duplicates_after = df_cleaned.duplicated().sum()

        print(f"Final shape: {df_cleaned.shape}")
        print(f"Rows removed due to missing values: {rows_removed_missing}")
        print(f"Rows removed due to duplicates: {rows_removed_duplicates}")
        print(f"Missing values after: {missing_after}")
        print(f"Duplicate rows after: {duplicates_after}")

        cleaned_data[label] = df_cleaned

    return cleaned_data

In [3]:
import pandas as pd
import numpy as np

def generate_synthetic_data(df, n_row_duplicates=2, n_col_duplicates=2, random_seed=42):
    """
    Generate synthetic data by repeating base rows and adding permuted columns.

    Every original column (including 'target') is repeated row-wise, so each
    original row appears `n_row_duplicates + 1` times in a row. For every
    non-'target' column, `n_col_duplicates` extra columns named
    `<col>_perm_<i>` are appended; each one holds the same values as the
    repeated column but in an independent random order.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame
    n_row_duplicates : int, default=2
        Number of times to repeat the original rows
    n_col_duplicates : int, default=2
        Number of permuted column sets to create
    random_seed : int, optional
        Seed for the local random generator. The process-wide NumPy RNG is
        not touched.

    Returns:
    --------
    pandas.DataFrame
        DataFrame with repeated base rows and permuted additional columns
    """
    # A dedicated Generator makes the permutations reproducible without
    # reseeding NumPy's global legacy RNG behind the caller's back.
    rng = np.random.default_rng(random_seed)

    n_copies = n_row_duplicates + 1
    total_rows = len(df) * n_copies

    # Base columns: each original row repeated consecutively.
    data_dict = {col: np.repeat(df[col].values, n_copies) for col in df.columns}

    # Only feature columns get permuted copies; 'target' is left out.
    cols_to_permute = [col for col in df.columns if col != 'target']

    # Value pool for each permuted column (same multiset as the base column).
    tiled_values = {
        col: np.tile(df[col].values, n_copies)
        for col in cols_to_permute
    }

    # Draw one permutation per new column, on demand, in (set, column) order.
    # This consumes the generator in the same order as drawing them all up
    # front, without holding every index array in memory at once.
    for i in range(n_col_duplicates):
        for col in cols_to_permute:
            new_col = f"{col}_perm_{i+1}"
            data_dict[new_col] = tiled_values[col][rng.permutation(total_rows)]

    return pd.DataFrame(data_dict)

In [4]:
def run_synthetic_experiments(data_dict, n_iter=5, test_size=0.2, n_row_duplicates=2,
                            n_col_duplicates=2, random_seed=None):
    """
    Run experiments with synthetic data, tracking performance across iterations.
    All models use cross-validation for parameter selection.

    For every iteration and dataset, a synthetic frame is generated with
    `generate_synthetic_data`, split into train/test, the features are
    standardized and the target is centered (both using training statistics
    only), and each model is fitted and scored. The collected rows are also
    written to a timestamped CSV file in the current working directory.

    Parameters:
    -----------
    data_dict : dict
        Maps dataset name to a pandas DataFrame that has a 'target' column
    n_iter : int, default=5
        Number of repetitions per dataset
    test_size : float, default=0.2
        Forwarded to `train_test_split`
    n_row_duplicates : int, default=2
        Forwarded to `generate_synthetic_data`
    n_col_duplicates : int, default=2
        Forwarded to `generate_synthetic_data`; also used to derive the
        largest subset size tried by FastRGS-CV
    random_seed : int, optional
        Base seed; iteration `i` uses `random_seed + i` for data generation,
        the train/test split and FastRGS-CV. `None` leaves those unseeded.
        A seed of 0 is honoured.

    Returns:
    --------
    pandas.DataFrame
        One row per (iteration, dataset, model) that fitted without error,
        with train/test RMSE and R^2, the selected hyperparameters, the
        number of selected features and the per-model processing time.
    """
    results = []

    # Initialize CV models with parameter ranges and no intercept
    # (the data are centered below, so no intercept is fitted).
    models = {
        'LassoCV': LassoCV(
            alphas=np.logspace(-4, 2, 100),
            cv=10,
            n_jobs=-1,
            max_iter=10000,
            random_state=42,
            fit_intercept=False
        ),
        'RidgeCV': RidgeCV(
            alphas=np.logspace(-4, 2, 100),
            cv=10,
            fit_intercept=False
        ),
        'ElasticNetCV': ElasticNetCV(
            l1_ratio=np.linspace(0.1, 0.9, 9),
            alphas=np.logspace(-4, 2, 100),
            cv=10,
            n_jobs=-1,
            max_iter=10000,
            random_state=42,
            fit_intercept=False
        )
    }

    # Main iteration loop with progress bar
    with tqdm(total=n_iter * len(data_dict), desc="Running experiments") as pbar:
        for iteration in range(n_iter):
            # Compare against None explicitly: a truthiness test would
            # silently turn random_seed=0 into an unseeded run.
            iter_seed = random_seed + iteration if random_seed is not None else None

            # Process each dataset
            for dataset_name, original_df in data_dict.items():
                # Generate synthetic data
                synthetic_df = generate_synthetic_data(
                    original_df,
                    n_row_duplicates=n_row_duplicates,
                    n_col_duplicates=n_col_duplicates,
                    random_seed=iter_seed
                )

                # Split features and target
                X = synthetic_df.drop('target', axis=1)
                y = synthetic_df['target']

                n_features = X.shape[1]

                # Create FastRGS-CV model with appropriate parameters:
                # up to 10 distinct m values spread over [1, n_features], and
                # k capped at the number of original (non-permuted) features.
                m_values = [int(m) for m in np.unique(np.linspace(1, n_features, 10, dtype=int))]
                max_k = int(n_features / (n_col_duplicates + 1))

                models['FastRGS-CV'] = FastRandomizedGreedySelectionCV(
                    k_max=max_k,
                    m_grid=m_values,
                    n_replications=1000,
                    n_resample_iter=7,
                    random_state=iter_seed,
                    cv=10
                )

                # Train-test split
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, random_state=iter_seed
                )

                # Center and scale features (statistics from the training split only)
                feature_scaler = StandardScaler(with_std=True, with_mean=True)
                X_train_scaled = feature_scaler.fit_transform(X_train)
                X_test_scaled = feature_scaler.transform(X_test)

                # Center target (not scaled) with the training mean
                y_train_mean = np.mean(y_train)
                y_train_centered = y_train - y_train_mean

                # Train and evaluate each model
                for model_name, model in models.items():
                    try:
                        # Time each model on its own; a clock shared across
                        # the dataset would charge later models for the work
                        # done by earlier ones.
                        model_start = time.perf_counter()

                        # Fit model on centered data
                        best_model = model.fit(X_train_scaled, y_train_centered)

                        # Get predictions and transform back
                        y_pred_train = best_model.predict(X_train_scaled) + y_train_mean
                        y_pred_test = best_model.predict(X_test_scaled) + y_train_mean

                        processing_time = time.perf_counter() - model_start

                        # Calculate metrics
                        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
                        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
                        train_r2 = r2_score(y_train, y_pred_train)
                        test_r2 = r2_score(y_test, y_pred_test)

                        # Get best parameters based on model type
                        if model_name == 'FastRGS-CV':
                            best_params = {'k': best_model.k_, 'm': best_model.m_}
                            n_selected_features = best_model.k_
                        elif model_name == 'LassoCV':
                            best_params = {'alpha': best_model.alpha_}
                            n_selected_features = int(np.count_nonzero(best_model.coef_))
                        elif model_name == 'RidgeCV':
                            best_params = {'alpha': best_model.alpha_}
                            n_selected_features = n_features  # Ridge uses all features
                        elif model_name == 'ElasticNetCV':
                            best_params = {
                                'alpha': best_model.alpha_,
                                'l1_ratio': best_model.l1_ratio_
                            }
                            n_selected_features = int(np.count_nonzero(best_model.coef_))

                        # Store results
                        results.append({
                            'iteration': iteration,
                            'dataset': dataset_name,
                            'model': model_name,
                            'train_rmse': train_rmse,
                            'test_rmse': test_rmse,
                            'train_r2': train_r2,
                            'test_r2': test_r2,
                            'best_params': str(best_params),
                            'n_features': n_features,
                            'n_samples': len(X),
                            'n_selected_features': n_selected_features,
                            'processing_time': processing_time
                        })

                    except Exception as e:
                        # Best effort: report the failure and move on so one
                        # bad model/dataset pair does not abort the whole run.
                        print(f"Error with {model_name} on {dataset_name}: {str(e)}")
                        continue

                pbar.update(1)

    # Convert results to DataFrame
    results_df = pd.DataFrame(results)

    # Save results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'synthetic_experiments_results_{timestamp}.csv'
    results_df.to_csv(filename, index=False)

    return results_df

In [5]:
import pandas as pd
import numpy as np

def create_comparison_tables(results_df):
    """
    Create comparison tables from simulation results.

    Besides returning the tables, every table is written as one sheet of a
    timestamped Excel workbook (`comparison_tables_<timestamp>.xlsx`) in the
    current working directory.

    Parameters:
    -----------
    results_df : pandas.DataFrame
        DataFrame containing simulation results. The code reads the columns
        'dataset', 'model', 'test_rmse', 'test_r2', 'train_rmse', 'train_r2',
        'processing_time', 'best_params', 'n_features' and, when present,
        'n_selected_features'.

    Returns:
    --------
    dict
        Dictionary containing different comparison tables, keyed by
        'average_performance', 'best_models', 'feature_selection' (only when
        'n_selected_features' is available), 'processing_time',
        'model_rankings' and 'best_parameters'
    """
    tables = {}
    group_keys = ['dataset', 'model']

    # 1. Average Performance Table (across all iterations)
    perf_metrics = ['test_rmse', 'test_r2', 'train_rmse', 'train_r2']
    avg_performance = results_df.groupby(group_keys)[perf_metrics].agg(
        ['mean', 'std']
    ).round(4)

    # Flatten the (metric, statistic) column pairs into "<metric>_<stat>" names
    avg_performance.columns = [
        f"{metric}_{stat}" for metric, stat in avg_performance.columns
    ]
    tables['average_performance'] = avg_performance.reset_index()

    # 2. Best Model Per Dataset (highest mean test R^2)
    best_models = (results_df.groupby(group_keys)['test_r2']
                  .mean()
                  .reset_index()
                  .sort_values('test_r2', ascending=False)
                  .groupby('dataset').first()
                  .reset_index())
    tables['best_models'] = best_models

    # 3. Feature Selection Summary (for FastRGS)
    if 'n_selected_features' in results_df.columns:
        # Match by prefix so that variants such as 'FastRGS-CV' are included;
        # an exact match on 'FastRGS' left this table empty.
        is_fast_rgs = results_df['model'].astype(str).str.startswith('FastRGS')
        feature_selection = (results_df[is_fast_rgs]
                           .groupby('dataset')
                           .agg({
                               'n_selected_features': ['mean', 'std'],
                               'n_features': 'first'
                           })
                           .round(2))
        feature_selection.columns = ['avg_selected', 'std_selected', 'total_features']
        tables['feature_selection'] = feature_selection.reset_index()

    # 4. Processing Time Comparison
    time_comparison = (results_df.groupby(group_keys)['processing_time']
                      .agg(['mean', 'std'])
                      .round(2)
                      .reset_index())
    tables['processing_time'] = time_comparison

    # 5. Model Rankings (rank 1 = highest mean test R^2 within a dataset)
    rankings = (results_df.groupby(group_keys)['test_r2']
               .mean()
               .reset_index()
               .pivot(index='dataset', columns='model', values='test_r2')
               .rank(axis=1, ascending=False)
               .round(2))
    tables['model_rankings'] = rankings.reset_index()

    # 6. Best Parameters Summary (most frequent parameter string per group)
    def _most_common(values):
        # mode() is empty when the group holds only missing values
        modes = values.mode()
        return modes.iloc[0] if len(modes) > 0 else None

    best_params = (results_df.groupby(group_keys)
                  .agg({'best_params': _most_common})
                  .reset_index())
    tables['best_parameters'] = best_params

    # Save all tables to Excel with multiple sheets
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    excel_filename = f'comparison_tables_{timestamp}.xlsx'

    with pd.ExcelWriter(excel_filename) as writer:
        for table_name, table in tables.items():
            table.to_excel(writer, sheet_name=table_name, index=False)

    return tables

# Example usage:
# tables = create_comparison_tables(results_df)
# 
# # Access individual tables
# print("Average Performance:")
# print(tables['average_performance'])
# 
# print("\nBest Models per Dataset:")
# print(tables['best_models'])

In [6]:
labels = ['Auto Pricing', 'Sunspots', 'Bodyfat', 'PW', 'MeatFat']

# Read each tab-separated file into a DataFrame, keyed by its display label.
# Datasets that are currently switched off stay as commented-out entries.
data = {
    'Auto Pricing': pd.read_csv('../real_data/207_autoPrice.tsv', sep='\t'),
    'Sunspots': pd.read_csv('../real_data/695_chatfield_4.tsv', sep='\t'),
    'Bodyfat': pd.read_csv('../real_data/560_bodyfat.tsv', sep='\t'),
    # 'Pharynx': pd.read_csv('../real_data/1196_BNG_pharynx.tsv', sep='\t'),
    'PW': pd.read_csv('../real_data/229_pwLinear.tsv', sep='\t'),
    # 'CPU': pd.read_csv('../real_data/197_cpu_act.tsv', sep='\t'),
    # 'House': pd.read_csv('../real_data/574_house_16H.tsv', sep='\t'),
    'MeatFat': pd.read_csv('../real_data/505_tecator.tsv', sep='\t'),
}

# Drop rows with missing values and exact duplicate rows from every dataset
cleaned_data = clean_datasets(data)


Cleaning dataset: Auto Pricing
Original shape: (159, 16)
Missing values before: 0
Duplicate rows before: 0
Final shape: (159, 16)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: Sunspots
Original shape: (235, 13)
Missing values before: 0
Duplicate rows before: 0
Final shape: (235, 13)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: Bodyfat
Original shape: (252, 15)
Missing values before: 0
Duplicate rows before: 0
Final shape: (252, 15)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: PW
Original shape: (200, 11)
Missing values before: 0
Duplicate rows before: 0
Final shape: (200, 11)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: MeatFat
Original shape: (240, 125)
Missing values before: 0
Duplicate rows before: 20
Final shape: (220, 125)
Rows removed due to mi

In [7]:
# Five seeded repetitions per dataset. Rows are not repeated (0 extra copies),
# and every feature column gets 70 independently shuffled copies appended.
results = run_synthetic_experiments(
    cleaned_data,
    random_seed=42,
    n_iter=5,
    n_col_duplicates=70,
    n_row_duplicates=0,
)

Running experiments:   0%|          | 0/25 [00:00<?, ?it/s]

In [8]:
# Build the summary tables from the experiment results.
# Each table is looked up by key, e.g. tables['average_performance'].
tables = create_comparison_tables(results_df=results)



In [9]:
# Heading (preceded by a blank line) followed by the best-model table
print("\nBest Models per Dataset:", tables['best_models'], sep="\n")


Best Models per Dataset:
        dataset       model   test_r2
0  Auto Pricing     LassoCV  0.591074
1       Bodyfat     LassoCV  0.987216
2       MeatFat     LassoCV  0.977867
3            PW  FastRGS-CV  0.671391
4      Sunspots     LassoCV  0.879909
