In [1]:
import sys
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LassoCV
from sklearn.metrics import mean_squared_error

sys.path.append("../")

from RGS import FastRandomizedGreedySelection, RandomizedGreedySelection

# Generate synthetic data
np.random.seed(42)
n, p = 5000, 500  # 100 samples, 50 features

# Create base features
X = np.random.randn(n, p)

# Introduce correlations
X[:, 1] = 0 * X[:, 0] + 0.3 * np.random.randn(n)  # Correlated with feature 0
X[:, 3] = 0 * X[:, 2] + 0.4 * np.random.randn(n)  # Correlated with feature 2
X[:, 5] = 0 * X[:, 0] + 0 * X[:, 4] + 0.1 * np.random.randn(n)  # Correlated with features 0 and 4

# Create true coefficients (some zero, some non-zero)
true_coef = np.zeros(p)
true_coef[:6] = [1.5, -0.8, 1.2, -0.5, 0.7, -0.3]  # Only first 6 features are relevant

# Create the target variable (linear model)
y = X @ true_coef + np.random.normal(0, 1, n)  # Add some noise

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Compare running time

In [22]:
rfs = RandomizedGreedySelection(k=10, m=200) # Original implementation
rfs.fit(X, y) # 30.5s

In [5]:
rfs2 = FastRandomizedGreedySelection(k=10, m=200, tol=1) # Keep only feature sets appearing more than 1 time
rfs2.fit(X, y) # 2.2s

The problem with the above approach is that at each step, the total number of feature sets goes down. We also introduce a small amount of bias. We can overcome this using resampling.

In [None]:
rfs3 = FastRandomizedGreedySelection(k=10, m=200, tol=0, resample=True)
rfs3.fit(X, y) # 6.1s

In [None]:
rfs4 = FastRandomizedGreedySelection(k=10, m=200, tol=1, resample=True) # Both resample and filter
rfs4.fit(X, y) # 3.4s

In [None]:
lasso = LassoCV()
lasso.fit(X, y) # 0.5s

# Compare prediction accuracy

In [23]:
estimators = {"rfs" : rfs, 
              "rfs2" : rfs2,
              "rfs3" : rfs3,
              "rfs4" : rfs4,
              "lasso" : lasso
              }
results = {}
for name, estimator in estimators.items():
    results[name] = mean_squared_error(estimator.predict(X_test), y_test)
results

{'rfs': 1.02050835495605,
 'rfs2': 1.0196394200227814,
 'rfs3': 1.0200621735543476,
 'rfs4': 1.020219437471009,
 'lasso': 1.0409735104433}

# Compare feature ranking

Note that in our generative model, the relevant features are [0, 1, 2, 3, 4, 5]

In [None]:
np.abs(lasso.coef_).argsort()[::-1] # Feature 5 only shows up in ~170th place!

array([  0,   2,   4,   1,   3, 321, 439,   9, 230, 493, 176, 339, 385,
       224,  57, 143, 147, 494, 234, 434, 303, 248, 343, 204, 259, 462,
       177, 170, 179, 160, 161, 162, 163, 178, 164, 165, 166, 169, 167,
       175, 174, 168, 173, 172, 171, 158, 159, 150, 157, 156, 129, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 144,
       145, 146, 148, 149, 181, 151, 152, 153, 154, 155, 180, 189, 182,
       212, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 225, 226,
       227, 228, 229, 231, 232, 233, 235, 236, 237, 238, 239, 240, 241,
       213, 211, 183, 210, 184, 185, 186, 187, 188, 127, 190, 191, 192,
       193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 205, 206,
       207, 208, 209, 128, 121, 126,  65,  36,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,  53,  54,
        55,  56,  58,  59,  60,  61,  62,  63,  35,  34,  33,  18,   5,
         6,   7,   8,  10,  11,  12,  13,  14,  15,  16,  17,  1

In [None]:
np.abs(rfs2.coef_).argsort()[::-1] # Feature 5 shows up in 6th place

array([  0,   2,   1,   4,   3,   5, 439, 321,   9, 493, 385, 230, 176,
       339, 143, 209, 147, 105, 480, 224, 449, 248,  60,  57, 346, 153,
       204, 343, 472, 164, 159, 160, 178, 161, 162, 163, 177, 169, 165,
       166, 170, 175, 174, 167, 173, 172, 171, 168, 150, 158, 141, 130,
       131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 142, 157, 144,
       145, 146, 148, 149, 180, 151, 152, 154, 155, 156, 179, 190, 181,
       227, 214, 215, 216, 217, 218, 219, 220, 221, 222, 223, 225, 226,
       228, 182, 229, 231, 232, 233, 234, 235, 236, 237, 238, 239, 240,
       241, 213, 212, 211, 210, 183, 184, 185, 186, 187, 188, 189, 128,
       191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203,
       205, 206, 207, 208, 129, 120, 127,  49,  37,  38,  39,  40,  41,
        42,  43,  44,  45,  46,  47,  48,  50,  35,  51,  52,  53,  54,
        55,  56,  58,  59,  61,  62,  63,  64,  36,  34,  66,  19,   6,
         7,   8,  10,  11,  12,  13,  14,  15,  16,  17,  18,  2

# Compare total number of feature subsets at step k

In [24]:
n_subsets = {}
for name, estimator in estimators.items():
    if name not in ["lasso", "rfs"]:
        n_subsets[name] = [sum(estimator.trajectory[j].values()) for j in range(10)]
pd.DataFrame(n_subsets)

Unnamed: 0,rfs2,rfs3,rfs4
0,1000,1000,1000
1,1000,1000,1000
2,998,1000,998
3,986,1000,988
4,949,1000,983
5,883,1000,960
6,785,1000,956
7,671,1000,927
8,568,1000,914
9,466,1000,913
