In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

sys.path.append("../")

from RGS import FastRandomizedGreedySelection, RandomizedGreedySelection

In [None]:
def clean_datasets(data):
    """
    Clean datasets by removing rows with missing values and duplicate rows.

    For every dataset a short report is printed (shape, missing-value and
    duplicate counts before and after cleaning, and how many rows each
    cleaning step removed).

    Parameters:
    -----------
    data : dict
        Dictionary mapping a dataset label to a pandas DataFrame

    Returns:
    --------
    dict
        Dictionary with the same labels mapping to the cleaned DataFrames.
        ``dropna`` / ``drop_duplicates`` return new frames, so the input
        DataFrames are left untouched.
    """
    cleaned_data = {}

    for label, df in data.items():
        print(f"\nCleaning dataset: {label}")
        print(f"Original shape: {df.shape}")

        # Total number of missing cells (not rows) in the raw frame
        missing_before = df.isnull().sum().sum()
        print(f"Missing values before: {missing_before}")

        # Number of rows that repeat an earlier row in the raw frame
        duplicates_before = df.duplicated().sum()
        print(f"Duplicate rows before: {duplicates_before}")

        # Step 1: drop rows containing any missing value
        df_no_missing = df.dropna()

        # Step 2: drop exact duplicate rows (first occurrence is kept)
        df_cleaned = df_no_missing.drop_duplicates()

        # Attribute removed rows to the step that removed them; the previous
        # report charged duplicate removals to "missing values" as well.
        rows_removed_missing = len(df) - len(df_no_missing)
        rows_removed_duplicates = len(df_no_missing) - len(df_cleaned)

        # Final checks
        missing_after = df_cleaned.isnull().sum().sum()
        duplicates_after = df_cleaned.duplicated().sum()

        print(f"Final shape: {df_cleaned.shape}")
        print(f"Rows removed due to missing values: {rows_removed_missing}")
        print(f"Rows removed due to duplicates: {rows_removed_duplicates}")
        print(f"Missing values after: {missing_after}")
        print(f"Duplicate rows after: {duplicates_after}")

        cleaned_data[label] = df_cleaned

    return cleaned_data

In [None]:
def generate_synthetic_observations(df, n_samples=5000, target_col='target', random_state=None):
    """
    Generate synthetic feature observations from a multivariate normal fitted
    to the mean vector and covariance matrix of the original features.

    Parameters:
    df: pandas DataFrame with original data (features plus the target column)
    n_samples: number of synthetic samples to generate
    target_col: name of the column to exclude from the features
    random_state: optional seed (or anything accepted by
        ``np.random.default_rng``). When None, the global ``np.random`` state
        is used, so an earlier ``np.random.seed(...)`` call still controls
        the draw.

    Returns:
    DataFrame with ``n_samples`` rows and the original feature column names
    (the target column is not synthesised)
    """
    # Separate features from target
    X = df.drop(target_col, axis=1)

    # Calculate mean vector and covariance matrix
    mean_vector = X.mean(axis=0).to_numpy()
    # np.cov returns a 0-d array for a single feature; multivariate_normal
    # needs a square 2-D matrix, so promote it to shape (1, 1).
    cov_matrix = np.atleast_2d(np.cov(X.to_numpy().T))

    # Pick the random source: legacy global state by default, or an isolated
    # generator when a seed is supplied.
    sampler = np.random if random_state is None else np.random.default_rng(random_state)

    # Generate random samples from multivariate normal distribution
    synthetic_samples = sampler.multivariate_normal(
        mean=mean_vector,
        cov=cov_matrix,
        size=n_samples
    )

    # Convert to DataFrame with original feature names
    return pd.DataFrame(synthetic_samples, columns=X.columns)

In [None]:
# Labels of the benchmark datasets used throughout the notebook
labels = ['Auto Pricing', 'Bodyfat', 'Sunspots', 'Pharynx', 'PW', 'CPU', 'House', 'MeatFat']

# Read each tab-separated file into a DataFrame keyed by its label
# (pairs are listed in the order the frames are inserted into `data`)
data = {
    label: pd.read_csv(path, sep='\t')
    for label, path in (
        ('Auto Pricing', '../real_data/207_autoPrice.tsv'),
        ('Sunspots', '../real_data/695_chatfield_4.tsv'),
        ('Bodyfat', '../real_data/560_bodyfat.tsv'),
        ('Pharynx', '../real_data/1196_BNG_pharynx.tsv'),
        ('PW', '../real_data/229_pwLinear.tsv'),
        ('CPU', '../real_data/197_cpu_act.tsv'),
        ('House', '../real_data/574_house_16H.tsv'),
        ('MeatFat', '../real_data/505_tecator.tsv'),
    )
}

# Drop rows with missing values and duplicate rows from every dataset
cleaned_data = clean_datasets(data)

In [None]:
# Generate synthetic data
# One synthetic feature table per dataset, drawn from a multivariate normal
# fitted to the cleaned features. The original assignment had no right-hand
# side (SyntaxError).
# NOTE(review): assumes every cleaned frame has a 'target' column and that the
# default n_samples is what was intended — confirm.
syn_data = {}

for label in labels:
    syn_data[label] = generate_synthetic_observations(cleaned_data[label])

In [None]:
# pandas / numpy are already imported in the first cell, so they are not
# re-imported here.

# Read TSV data
# TODO: 'your_file.tsv' is a placeholder path; point it at a real file before running.
df = pd.read_csv('your_file.tsv', sep='\t')

# Compute the covariance between columns. np.cov expects variables as rows,
# hence the transpose. The values are passed directly instead of being bound to
# `data`, which would overwrite the dict of loaded datasets from the loading cell.
covariance_matrix = np.cov(df.values.T)

# Print matrix dimensions
print(f"\nCovariance matrix shape: {covariance_matrix.shape}")

# Print the covariance matrix with variable names
print("\nCovariance matrix:")
print(pd.DataFrame(covariance_matrix, columns=df.columns, index=df.columns))

In [None]:
# Generate synthetic data
# Simulates a sparse linear regression problem and splits it into train/test.
# The global NumPy seed is fixed first, so the draws below are reproducible as
# long as their order is unchanged.
np.random.seed(42)
n, p = 500, 500 # 500 samples, 500 features

# Create base features
X = np.random.randn(n, p)

# Introduce correlations
# A = I + 0.2 * ones: each new column is the old column plus 0.2 times the sum
# of all columns, so every pair of features shares a common component.
A = np.eye(p) + 0.2 * np.ones((p, p))
X = X @ A

# Create true coefficients (some zero, some non-zero)
true_coef = np.zeros(p)
true_coef[:6] = [1.5, 0.8, 1.2, 0.5, 0.7, 0.05]  # Only first 6 features are relevant

# Create the target variable (linear model)
sigma = 1  # standard deviation of the additive Gaussian noise
y = X @ true_coef + sigma * np.random.normal(0, 1, n)  # Add some noise

# Split data into train and test sets
# test_size=0.2 holds out 20% of the rows; random_state fixes the shuffle.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [2]:
# Fit the project-local FastRandomizedGreedySelection on the training split,
# leaving n_resample_iter at its default.
# NOTE(review): the meaning of k_max and m is defined in the RGS module, not shown here.
rgs1 = FastRandomizedGreedySelection(k_max=15, m=200)
rgs1.fit(X_train, y_train)

In [3]:
# Same k_max and m as rgs1, but with n_resample_iter=1.
# NOTE(review): the effect of n_resample_iter is defined in the RGS module, not shown here.
rgs2 = FastRandomizedGreedySelection(k_max=15, m=200, n_resample_iter=1)
rgs2.fit(X_train, y_train)

In [4]:
# Same k_max and m as rgs1, but with n_resample_iter=2.
# NOTE(review): the effect of n_resample_iter is defined in the RGS module, not shown here.
rgs3 = FastRandomizedGreedySelection(k_max=15, m=200, n_resample_iter=2)
rgs3.fit(X_train, y_train)

In [6]:
# Lasso baseline. `lasso` was used below (and in a later cell) without being
# defined in any earlier cell, so a fresh kernel raised NameError.
# NOTE(review): the settings of the original (missing) Lasso cell are unknown;
# cv=5 is an assumption, so the printed number may differ from the saved output.
lasso = LassoCV(cv=5).fit(X_train, y_train)

# Noise-free test signal: errors are measured against X_test @ true_coef
# rather than against the noisy y_test.
signal_test = X_test @ true_coef

estimators = {"rgs1": rgs1,
              "rgs2": rgs2,
              "rgs3": rgs3}

# One column per estimator, one row per k in 0..15 (the estimators were built
# with k_max=15). mean_squared_error takes (y_true, y_pred) in that order.
results = {}
for name, estimator in estimators.items():
    results[name] = [mean_squared_error(signal_test, estimator.predict(X_test, k)) for k in range(0, 16)]
print(mean_squared_error(signal_test, lasso.predict(X_test)))
pd.DataFrame(results)

0.012463106871794572


Unnamed: 0,rgs1,rgs2,rgs3
0,457.946755,457.946755,457.946755
1,0.958522,1.128858,1.191185
2,0.292462,0.272517,0.406167
3,0.125505,0.151693,0.186742
4,0.072272,0.064924,0.077382
5,0.041882,0.041343,0.047635
6,0.024652,0.022743,0.031133
7,0.016293,0.013587,0.021527
8,0.010693,0.009187,0.011942
9,0.007404,0.007755,0.00814


In [7]:
# stability_scores = {}
# for name, estimator in estimators.items():
#     stability_scores[name] = estimator.fit_stability()
# pd.DataFrame(stability_scores)

In [None]:
# Feature indices ordered from largest to smallest absolute Lasso coefficient
np.flip(np.argsort(np.abs(lasso.coef_)))  # Feature 5 only shows up in 11th place

array([  0,   2,   1,   4,   3, 342, 205, 375, 318, 252,   5, 366, 244,
       444, 452, 338, 219, 391, 371, 326, 420, 272, 359,  77, 197, 107,
       424, 185, 489, 380, 291,  95, 163, 300, 405, 259, 279, 480, 422,
       157, 165, 167, 170, 156, 159, 158, 168, 169, 160, 161, 164, 166,
       162, 150, 155, 154, 130, 131, 132, 133, 134, 135, 136, 137, 138,
       139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 172, 149, 129,
       151, 152, 153, 171, 180, 173, 221, 208, 209, 210, 211, 212, 213,
       214, 215, 216, 217, 218, 220, 222, 206, 223, 224, 225, 226, 227,
       228, 229, 230, 231, 232, 233, 234, 207, 204, 174, 188, 175, 176,
       177, 178, 179, 127, 181, 182, 183, 184, 186, 187, 189, 203, 190,
       191, 192, 193, 194, 195, 196, 198, 199, 200, 201, 202, 128, 118,
       126,  48,  36,  37,  38,  39,  40,  41,  42,  43,  44,  45,  46,
        47,  49,  34,  50,  51,  52,  53,  54,  55,  56,  57,  58,  59,
        60,  61,  35,  33,  63,  18,   6,   7,   8,   9,  10,  1

In [16]:
# Feature indices ordered from largest to smallest absolute value of rgs1.coef_[5]
np.flip(np.argsort(np.abs(rgs1.coef_[5])))  # Feature 5 shows up in 6th place

array([  0,   2,   1,   4,   3,   5, 252, 241, 342, 452,  77, 423, 429,
       163, 375, 386, 170,  32, 434, 109,  53, 424, 116, 195, 418, 469,
       391,  60,  41, 498, 266, 179, 472, 444, 318, 185, 384, 439, 455,
       413,  55, 349, 405, 317, 487, 358, 420, 306, 387, 229, 347, 205,
       339, 265,  34, 194, 443, 181,  62,  25, 214, 218, 380, 432, 154,
       321, 126,  61,  63, 490, 260, 451, 106, 488,  85, 284,  95, 422,
       271, 426,  30, 366,  79, 122, 259, 201, 150, 192, 401, 332, 244,
       249,  84, 393,  59,  45, 335, 468, 153, 343, 107, 171, 354, 145,
       118, 296, 491, 368, 279, 234, 398, 378, 280, 143, 489, 136, 135,
       134, 130, 133, 137, 138, 139, 140, 132, 141, 142, 131, 407, 125,
       129, 114, 104, 105, 474, 108, 473, 110, 111, 112, 113, 115, 128,
       471, 117, 119, 120, 121, 470, 123, 124, 127, 144, 151, 146, 147,
       175, 176, 177, 178, 463, 180, 462, 182, 183, 184, 461, 186, 187,
       188, 189, 190, 191, 460, 193, 459, 458, 174, 173, 172, 15

In [None]:
# Feature indices ordered from largest to smallest absolute value of rgs1.coef_[10]
np.flip(np.argsort(np.abs(rgs1.coef_[10])))  # Feature 5 shows up in 8th place

array([  0,   2,   1,   4,   3, 342, 375,   5, 205, 249, 273, 452,  10,
       444, 252,  77, 109, 163, 391, 424, 318,  32, 355, 107, 185, 116,
       380, 241, 423, 429, 170, 146,  41, 405, 498, 418, 443, 259,  60,
       195,  44, 145, 422, 472, 439,  62, 386, 420, 434, 455, 384, 360,
       469, 387,  34,  95, 349, 359, 266, 179, 343, 244, 265, 490,  61,
        55, 378, 321, 413,  53, 317, 181, 194, 121,  25, 218, 426, 260,
       214, 366,  84, 306, 198, 143, 332, 118, 293,  85,  30, 487, 393,
       347, 371, 197, 489, 192, 229, 319, 451, 379, 154, 432,  45,  26,
       340,  82, 416, 280, 368, 171, 488, 494, 126, 219, 401, 326, 271,
        79, 106, 358,  59, 279, 363,  63, 296, 339, 284, 479, 122, 491,
       234, 335, 468, 459, 150, 161, 153, 201, 354, 398, 135, 130, 136,
       132, 131, 105, 108, 133, 134, 104, 119, 467, 114, 117, 120, 464,
       466, 137, 115, 123, 124, 110, 125, 113, 127, 112, 111, 128, 465,
       129, 461, 138, 177, 167, 168, 169, 172, 173, 174, 175, 17