In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

sys.path.append("../")

from RGS import FastRandomizedGreedySelection, RandomizedGreedySelection
from data_plotting import *

In [2]:
def clean_datasets(data):
    """
    Clean datasets by removing rows with missing values and duplicate rows.

    For every dataset a short report is printed: the original shape, how many
    missing cells and duplicate rows it contained, how many rows each cleaning
    step removed, and the state of the frame after cleaning. The input frames
    are not modified; `dropna` / `drop_duplicates` return new frames.

    Parameters:
    -----------
    data : dict
        Dictionary containing pandas DataFrames for each dataset

    Returns:
    --------
    dict
        Dictionary containing cleaned DataFrames, keyed like `data`
    """
    cleaned_data = {}

    for label, df in data.items():
        print(f"\nCleaning dataset: {label}")
        print(f"Original shape: {df.shape}")

        # Total number of missing cells (not rows) in the raw frame
        missing_before = df.isnull().sum().sum()
        print(f"Missing values before: {missing_before}")

        # Number of rows that repeat an earlier row in the raw frame
        duplicates_before = df.duplicated().sum()
        print(f"Duplicate rows before: {duplicates_before}")

        # Keep the intermediate frame so each step's row loss can be reported
        # separately; previously the "missing values" line also counted the
        # rows dropped as duplicates.
        df_without_missing = df.dropna()
        df_cleaned = df_without_missing.drop_duplicates()

        rows_removed_missing = len(df) - len(df_without_missing)
        rows_removed_duplicates = len(df_without_missing) - len(df_cleaned)

        # Final checks
        missing_after = df_cleaned.isnull().sum().sum()
        duplicates_after = df_cleaned.duplicated().sum()

        print(f"Final shape: {df_cleaned.shape}")
        print(f"Rows removed due to missing values: {rows_removed_missing}")
        print(f"Rows removed as duplicates: {rows_removed_duplicates}")
        print(f"Missing values after: {missing_after}")
        print(f"Duplicate rows after: {duplicates_after}")

        cleaned_data[label] = df_cleaned

    return cleaned_data

In [3]:
import pandas as pd
import numpy as np

def generate_synthetic_data(df, n_row_duplicates=2, n_col_duplicates=2, random_seed=42):
    """
    Build a synthetic frame from `df` by repeating its rows and appending
    shuffled copies of every column.

    The original columns are repeated element-wise (`np.repeat`), so each
    source row appears `n_row_duplicates + 1` times in a row. Every extra
    column `<col>_perm_<i>` holds the tiled values of `<col>` reordered by an
    independent random permutation.

    Parameters:
    -----------
    df : pandas.DataFrame
        Input DataFrame
    n_row_duplicates : int, default=2
        Number of additional copies of each original row
    n_col_duplicates : int, default=2
        Number of permuted column sets to append
    random_seed : int or None, default=42
        Seed for the permutation generator. When not None it is also used to
        seed NumPy's legacy global random state.

    Returns:
    --------
    pandas.DataFrame
        Original columns (rows repeated) followed by the permuted columns
    """
    # Side effect kept from the original contract: seed the legacy global RNG
    if random_seed is not None:
        np.random.seed(random_seed)

    copies = n_row_duplicates + 1
    row_count = len(df) * copies
    generator = np.random.default_rng(random_seed)

    # Base columns: each value repeated `copies` times consecutively
    columns = {name: np.repeat(df[name].values, copies) for name in df.columns}

    # Source values for the shuffled columns: the whole column laid end to end
    tiled = {name: np.tile(df[name].values, copies) for name in df.columns}

    # One fresh permutation per (set, column), drawn in set-major order
    for set_number in range(1, n_col_duplicates + 1):
        for name in df.columns:
            order = generator.permutation(row_count)
            columns[f"{name}_perm_{set_number}"] = tiled[name][order]

    return pd.DataFrame(columns)

In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import Lasso, Ridge, ElasticNet
from sklearn.model_selection import GridSearchCV
from tqdm.auto import tqdm
import time
from datetime import datetime
from RGS import FastRandomizedGreedySelection

def run_synthetic_experiments(data_dict, n_iter=5, test_size=0.2, n_row_duplicates=2, 
                            n_col_duplicates=2, random_seed=None):
    """
    Run experiments with synthetic data, tracking performance across iterations.

    For every iteration and every dataset, a synthetic frame is generated with
    `generate_synthetic_data`, split into train/test, standardized, and then
    Lasso, Ridge, ElasticNet and a grid of FastRGS configurations are fitted
    through `GridSearchCV` (5-fold, R^2 scoring). One result row is recorded
    per (iteration, dataset, model).

    Parameters:
    -----------
    data_dict : dict
        Mapping of dataset name to pandas DataFrame. Each frame must contain
        a column named 'target'; every other column is used as a feature.
    n_iter : int, default=5
        Number of repetitions over all datasets
    test_size : float, default=0.2
        Passed to `train_test_split`
    n_row_duplicates : int, default=2
        Passed to `generate_synthetic_data`
    n_col_duplicates : int, default=2
        Passed to `generate_synthetic_data`; also used to bound `k_max`
    random_seed : int or None, default=None
        Base seed. Iteration `i` uses `random_seed + i`; None leaves data
        generation and the split unseeded.

    Returns:
    --------
    pandas.DataFrame
        One row per successfully fitted model with train/test RMSE and R^2,
        the best grid-search parameters, feature counts and fit time.

    Side effects:
    -------------
    Writes the result frame to `synthetic_experiments_results_<timestamp>.csv`
    in the current working directory. A model that raises is reported with
    `print` and skipped; it produces no result row.
    """
    results = []
    
    # Set up base parameter grids for standard models
    base_param_grids = {
        'Lasso': {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
        'Ridge': {'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]},
        'ElasticNet': {
            'alpha': [0.0001, 0.001, 0.01, 0.1, 1, 10],
            'l1_ratio': [0.1, 0.3, 0.5, 0.7, 0.9]
        }
    }
    
    # Initialize base models
    models = {
        'Lasso': Lasso(random_state=42, max_iter=10000),
        'Ridge': Ridge(random_state=42),
        'ElasticNet': ElasticNet(random_state=42, max_iter=10000)
    }
    
    # Main iteration loop with progress bar
    with tqdm(total=n_iter * len(data_dict), desc="Running experiments") as pbar:
        for iteration in range(n_iter):
            # Compare against None explicitly so that random_seed=0 is
            # honoured as a seed instead of being treated as "unseeded".
            iter_seed = random_seed + iteration if random_seed is not None else None
            
            # Process each dataset
            for dataset_name, original_df in data_dict.items():
                # Generate synthetic data
                synthetic_df = generate_synthetic_data(
                    original_df, 
                    n_row_duplicates=n_row_duplicates,
                    n_col_duplicates=n_col_duplicates,
                    random_seed=iter_seed
                )
                
                # Split features and target.
                # NOTE(review): generate_synthetic_data permutes every column,
                # so 'target_perm_*' columns stay in X as (shuffled) features.
                X = synthetic_df.drop('target', axis=1) 
                y = synthetic_df['target']              
                
                n_features = X.shape[1]
                
                # Create dynamic parameter grid for FastRGS
                m_values = [int(m) for m in np.unique(np.linspace(1, n_features, 10, dtype=int))]
                # Calculate max k_max based on feature count divided by duplicates
                # (+1 because we have original columns too); integer division
                # avoids the float round-trip of int(a / b).
                max_k = n_features // (n_col_duplicates + 1)
                k_max_values = [int(k) for k in np.unique(np.linspace(2, max_k, 10, dtype=int))]
                k_max_values = [k for k in k_max_values if k >= 2]  # Ensure k_max is at least 2
                
                # Per-dataset copies so FastRGS entries do not leak between datasets
                current_models = models.copy()
                current_param_grids = base_param_grids.copy()

                # Create FastRGS models for each (k_max, m) combination
                for k_max in k_max_values:
                    for m in m_values:
                        if m >= k_max:  # m must be >= k_max
                            model_name = f'FastRGS_k{k_max}_m{m}'
                            current_models[model_name] = FastRandomizedGreedySelection(
                                k_max=k_max,
                                m=m,
                                n_resample_iter=7
                            )
                            # Hyper-parameters are fixed in the name, so the
                            # grid search only cross-validates and refits.
                            current_param_grids[model_name] = {}
                
                # Train-test split
                X_train, X_test, y_train, y_test = train_test_split(
                    X, y, test_size=test_size, random_state=iter_seed
                )
                
                # Scale features (fitted on the training split only)
                scaler = StandardScaler()
                X_train_scaled = scaler.fit_transform(X_train)
                X_test_scaled = scaler.transform(X_test)
                
                # Train and evaluate each model
                for model_name, model in current_models.items():
                    # Time each model on its own; a single timer per dataset
                    # made every later model look slower than the earlier ones.
                    model_start = time.time()
                    try:
                        # Grid search
                        grid_search = GridSearchCV(
                            model,
                            current_param_grids[model_name],
                            cv=5,
                            scoring='r2',
                            n_jobs=-1,
                            verbose=0
                        )
                        
                        grid_search.fit(X_train_scaled, y_train)
                        best_model = grid_search.best_estimator_
                        
                        # Get predictions
                        y_pred_train = best_model.predict(X_train_scaled)
                        y_pred_test = best_model.predict(X_test_scaled)
                        
                        # Calculate metrics
                        train_rmse = np.sqrt(mean_squared_error(y_train, y_pred_train))
                        test_rmse = np.sqrt(mean_squared_error(y_test, y_pred_test))
                        train_r2 = r2_score(y_train, y_pred_train)
                        test_r2 = r2_score(y_test, y_pred_test)
                        
                        # Get selected features for FastRGS. Model names are
                        # 'FastRGS_k<k>_m<m>', so match on the prefix; an exact
                        # comparison with 'FastRGS' never matched.
                        n_selected_features = None
                        if model_name.startswith('FastRGS'):
                            # NOTE(review): assumes the fitted estimator exposes
                            # `coef_`; left as None if it does not - confirm.
                            coef = getattr(best_model, 'coef_', None)
                            if coef is not None:
                                n_selected_features = int(np.count_nonzero(coef))
                        
                        # Store results
                        results.append({
                            'iteration': iteration,
                            'dataset': dataset_name,
                            'model': model_name,
                            'train_rmse': train_rmse,
                            'test_rmse': test_rmse,
                            'train_r2': train_r2,
                            'test_r2': test_r2,
                            'best_params': str(grid_search.best_params_),
                            'n_features': n_features,
                            'n_samples': len(X),
                            'n_selected_features': n_selected_features,
                            'processing_time': time.time() - model_start
                        })
                    except Exception as e:
                        # Best effort: report the failure and move on to the next model
                        print(f"Error with {model_name} on {dataset_name}: {str(e)}")
                        continue
                
                pbar.update(1)
    
    # Convert results to DataFrame
    results_df = pd.DataFrame(results)
    
    # Save results
    timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
    filename = f'synthetic_experiments_results_{timestamp}.csv'
    results_df.to_csv(filename, index=False)
    
    return results_df

In [5]:
import pandas as pd
import numpy as np

def create_comparison_tables(results_df):
    """
    Create comparison tables from simulation results.

    Parameters:
    -----------
    results_df : pandas.DataFrame
        DataFrame containing simulation results. Columns read here:
        'dataset', 'model', 'test_rmse', 'test_r2', 'train_rmse', 'train_r2',
        'processing_time', 'best_params' and, when present,
        'n_selected_features' together with 'n_features'.

    Returns:
    --------
    dict
        Dictionary containing different comparison tables:
        'average_performance', 'best_models', 'feature_selection' (only when
        'n_selected_features' is a column), 'processing_time',
        'model_rankings' and 'best_parameters'.

    Side effects:
    -------------
    Writes every table as a sheet of `comparison_tables_<timestamp>.xlsx` in
    the current working directory.
    """
    tables = {}
    
    # 1. Average Performance Table (across all iterations)
    perf_metrics = ['test_rmse', 'test_r2', 'train_rmse', 'train_r2']
    avg_performance = results_df.groupby(['dataset', 'model'])[perf_metrics].agg(
        ['mean', 'std']
    ).round(4)
    
    # Flatten the (metric, statistic) column pairs to '<metric>_<statistic>'
    avg_performance.columns = [
        f"{metric}_{stat}" for metric, stat in avg_performance.columns
    ]
    tables['average_performance'] = avg_performance.reset_index()
    
    # 2. Best Model Per Dataset (highest mean test R^2)
    best_models = (results_df.groupby(['dataset', 'model'])['test_r2']
                  .mean()
                  .reset_index()
                  .sort_values('test_r2', ascending=False)
                  .groupby('dataset').first()
                  .reset_index())
    tables['best_models'] = best_models
    
    # 3. Feature Selection Summary (for FastRGS)
    if 'n_selected_features' in results_df.columns:
        # FastRGS rows are named 'FastRGS_k<k>_m<m>', so select by prefix; an
        # exact match on 'FastRGS' selected nothing and left this table empty.
        is_fastrgs = results_df['model'].astype(str).str.startswith('FastRGS')
        fastrgs_rows = results_df.loc[
            is_fastrgs, ['dataset', 'model', 'n_selected_features', 'n_features']
        ].copy()
        # The column is object-typed when other models store None; coerce so
        # mean/std are well defined (unknown counts become NaN).
        fastrgs_rows['n_selected_features'] = pd.to_numeric(
            fastrgs_rows['n_selected_features'], errors='coerce'
        )
        # Summarize per configuration: averaging across different k/m
        # settings would mix models with different feature budgets.
        feature_selection = (fastrgs_rows
                           .groupby(['dataset', 'model'])
                           .agg(
                               avg_selected=('n_selected_features', 'mean'),
                               std_selected=('n_selected_features', 'std'),
                               total_features=('n_features', 'first')
                           )
                           .round(2))
        tables['feature_selection'] = feature_selection.reset_index()
    
    # 4. Processing Time Comparison
    time_comparison = (results_df.groupby(['dataset', 'model'])['processing_time']
                      .agg(['mean', 'std'])
                      .round(2)
                      .reset_index())
    tables['processing_time'] = time_comparison
    
    # 5. Model Rankings (rank 1 = highest mean test R^2 within a dataset)
    rankings = (results_df.groupby(['dataset', 'model'])['test_r2']
               .mean()
               .reset_index()
               .pivot(index='dataset', columns='model', values='test_r2')
               .rank(axis=1, ascending=False)
               .round(2))
    tables['model_rankings'] = rankings.reset_index()
    
    # 6. Best Parameters Summary (most frequent parameter string per group)
    def _most_common(values):
        # mode() drops NaN and can be empty; avoid IndexError in that case
        modes = values.mode()
        return modes.iloc[0] if not modes.empty else None

    best_params = (results_df.groupby(['dataset', 'model'])
                  .agg({'best_params': _most_common})
                  .reset_index())
    tables['best_parameters'] = best_params
    
    # Save all tables to Excel with multiple sheets
    timestamp = pd.Timestamp.now().strftime('%Y%m%d_%H%M%S')
    excel_filename = f'comparison_tables_{timestamp}.xlsx'
    
    with pd.ExcelWriter(excel_filename) as writer:
        for table_name, table in tables.items():
            table.to_excel(writer, sheet_name=table_name, index=False)
            
    return tables

# Example usage:
# tables = create_comparison_tables(results_df)
# 
# # Access individual tables
# print("Average Performance:")
# print(tables['average_performance'])
# 
# print("\nBest Models per Dataset:")
# print(tables['best_models'])

In [6]:
labels = ['Auto Pricing', 'Bodyfat', 'Sunspots', 'PW']

# Read each tab-separated dataset into a DataFrame keyed by its display label.
# Disabled datasets stay as commented entries so they can be switched back on.
data = {
    'Auto Pricing': pd.read_csv('../real_data/207_autoPrice.tsv', sep='\t'),
    'Sunspots': pd.read_csv('../real_data/695_chatfield_4.tsv', sep='\t'),
    'Bodyfat': pd.read_csv('../real_data/560_bodyfat.tsv', sep='\t'),
    # 'Pharynx': pd.read_csv('../real_data/1196_BNG_pharynx.tsv', sep='\t'),
    'PW': pd.read_csv('../real_data/229_pwLinear.tsv', sep='\t'),
    # 'CPU': pd.read_csv('../real_data/197_cpu_act.tsv', sep='\t'),
    # 'House': pd.read_csv('../real_data/574_house_16H.tsv', sep='\t'),
    # 'MeatFat': pd.read_csv('../real_data/505_tecator.tsv', sep='\t'),
}

# Drop rows with missing values and duplicate rows from every dataset
cleaned_data = clean_datasets(data)


Cleaning dataset: Auto Pricing
Original shape: (159, 16)
Missing values before: 0
Duplicate rows before: 0
Final shape: (159, 16)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: Sunspots
Original shape: (235, 13)
Missing values before: 0
Duplicate rows before: 0
Final shape: (235, 13)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: Bodyfat
Original shape: (252, 15)
Missing values before: 0
Duplicate rows before: 0
Final shape: (252, 15)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0

Cleaning dataset: PW
Original shape: (200, 11)
Missing values before: 0
Duplicate rows before: 0
Final shape: (200, 11)
Rows removed due to missing values: 0
Missing values after: 0
Duplicate rows after: 0


In [7]:
# Run the full benchmark on the cleaned datasets. Besides returning the
# result frame, the call also writes it to a timestamped CSV file.
results = run_synthetic_experiments(
    data_dict=cleaned_data,
    n_iter=5,  # Number of repetitions over all datasets
    n_row_duplicates=0,  # 0 extra copies: the synthetic data keeps the original row count
    n_col_duplicates=15,  # 15 permuted copies of every column are appended
    random_seed=42  # Base seed; iteration i runs with seed 42 + i
)

Running experiments:   0%|          | 0/20 [00:00<?, ?it/s]

In [10]:
# Aggregate the raw results into summary tables (returned as a dict of
# DataFrames); the call also saves them to a timestamped Excel workbook.
tables = create_comparison_tables(results)

# Access individual tables
# print("Average Performance:")
# print(tables['average_performance'])



In [11]:
# Heading and table on consecutive lines, emitted by a single print call
print("\nBest Models per Dataset:", tables['best_models'], sep="\n")


Best Models per Dataset:
        dataset            model   test_r2
0  Auto Pricing   FastRGS_k2_m57  0.748633
1       Bodyfat  FastRGS_k2_m212  0.987525
2            PW  FastRGS_k6_m175  0.736023
3      Sunspots   FastRGS_k2_m69  0.886556
