## CONTEXTUAL FEATURE SELECTION WITH CONDITIONAL STOCHASTIC GATES

In [1]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np

# Load dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target

# Reduce to maximum 1000 rows
# X = X[:1000, :]
# y = y[:1000]

# Original column names
column_names = housing.feature_names

# Add noise columns
np.random.seed(42)  # For reproducibility

# Gaussian noise
gaussian_noise = np.random.normal(0, 1, size=X.shape[0])

# Uniform noise
uniform_noise = np.random.uniform(-1, 1, size=X.shape[0])

# Cosine function
cosine_values = np.cos(np.linspace(0, 10, X.shape[0]))

# Create a DataFrame from X
df = pd.DataFrame(X, columns=column_names)

# Add the noise columns to DataFrame
df['Gaussian_Noise'] = gaussian_noise
df['Uniform_Noise'] = uniform_noise
df['Cosine_Values'] = cosine_values

# Shuffle column locations
np.random.seed(42)  # Ensure reproducibility for column shuffling
shuffled_columns = np.random.permutation(df.columns)
df = df[shuffled_columns]

# Now, df is a DataFrame with shuffled columns and includes the noise features
# You can view the DataFrame as follows:
print(df.head())

# Convert target to a DataFrame and concatenate with features for a complete view
y_df = pd.DataFrame(y, columns=['Target'])
df_full = pd.concat([df, y_df], axis=1)

# If you wish to proceed with splitting and scaling:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset

# Splitting the data (assuming you want to keep DataFrame structure for X)
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# X_train_scaled and X_test_scaled are now DataFrames with scaled features and retained column names.
# convert them to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)

# Create TensorDataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)



   AveOccup  MedInc  Uniform_Noise  Cosine_Values  AveRooms  HouseAge  \
0  2.555556  8.3252       0.143745       1.000000  6.984127      41.0   
1  2.109842  8.3014      -0.515390       1.000000  6.238137      21.0   
2  2.802260  7.2574       0.677635       1.000000  8.288136      52.0   
3  2.547945  5.6431      -0.038881       0.999999  5.817352      52.0   
4  2.181467  3.8462       0.285882       0.999998  6.281853      52.0   

   Gaussian_Noise  Population  Longitude  AveBedrms  Latitude  
0        0.496714       322.0    -122.23   1.023810     37.88  
1       -0.138264      2401.0    -122.22   0.971880     37.86  
2        0.647689       496.0    -122.24   1.073446     37.85  
3        1.523030       558.0    -122.25   1.073059     37.85  
4       -0.234153       565.0    -122.25   1.081081     37.85  


In [2]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal, Bernoulli


##

 Define the Hypernetwork

The hypernetwork takes contextual variables as input and outputs parameters for the Bernoulli distributions of the stochastic gates.

In [7]:
class HyperNetwork(nn.Module):
    def __init__(self, context_dim, feature_dim):
        super(HyperNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(context_dim, 64),  # Adjust sizes as necessary
            nn.ReLU(),
            nn.Linear(64, feature_dim),
            nn.Sigmoid()  # Ensures output is between 0 and 1
        )

    def forward(self, context):
        return self.network(context)


Define the Prediction Network

The prediction network maps the selected features to the response variable. It takes both the original features and the stochastic gates as inputs.

In [8]:
class PredictionNetwork(nn.Module):
    def __init__(self, feature_dim):
        super(PredictionNetwork, self).__init__()
        self.network = nn.Sequential(
            nn.Linear(feature_dim, 64),  # Adjust sizes as necessary
            nn.ReLU(),
            nn.Linear(64, 1)  # Assuming a single continuous outcome
        )

    def forward(self, features):
        return self.network(features)


In [9]:
def stochastic_gates(bernoulli_params, device='cpu'):
    sigma = 1.0  # Hyperparameter for Gaussian noise
    epsilon = Normal(0, sigma).sample(bernoulli_params.shape).to(device)
    gates = torch.sigmoid(bernoulli_params + epsilon)  # Apply the sigmoid to approximate the Bernoulli distribution
    return gates


5. Training Loop


  return F.mse_loss(input, target, reduction=self.reduction)
  return F.mse_loss(input, target, reduction=self.reduction)


Epoch 1, Training Loss: 4.889840602874756, Validation Loss: 4.750686168670654
Epoch 11, Training Loss: 4.373039722442627, Validation Loss: 4.240394592285156
Epoch 21, Training Loss: 3.894871473312378, Validation Loss: 3.7698419094085693
Epoch 31, Training Loss: 3.453162908554077, Validation Loss: 3.3406975269317627
Epoch 41, Training Loss: 3.040623664855957, Validation Loss: 2.938717842102051
Epoch 51, Training Loss: 2.661078929901123, Validation Loss: 2.5778002738952637
Epoch 61, Training Loss: 2.3257458209991455, Validation Loss: 2.259857654571533
Epoch 71, Training Loss: 2.0462188720703125, Validation Loss: 1.9626799821853638
Epoch 81, Training Loss: 1.821929931640625, Validation Loss: 1.7591900825500488
Epoch 91, Training Loss: 1.6745803356170654, Validation Loss: 1.620639443397522


In [8]:

num_epochs = 100
for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    # Training step
    contexts = X_train_tensor[:, :context_dim]
    features = X_train_tensor[:, context_dim:]
    
    bernoulli_params = hypernetwork(contexts)
    gates = stochastic_gates(bernoulli_params)
    selected_features = features * gates
    predictions = prediction_network(selected_features)
    
    loss = loss_fn(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Validation step
    with torch.no_grad():
        contexts_val = X_test_tensor[:, :context_dim]
        features_val = X_test_tensor[:, context_dim:]
        
        bernoulli_params_val = hypernetwork(contexts_val)
        gates_val = stochastic_gates(bernoulli_params_val)
        selected_features_val = features_val * gates_val
        predictions_val = prediction_network(selected_features_val)
        
        val_loss = loss_fn(predictions_val, y_test_tensor)

    if epoch % 10 == 0:
        print(f'Epoch {epoch+1}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}')


Epoch 1, Training Loss: 1.5713249444961548, Validation Loss: 1.5317533016204834
Epoch 11, Training Loss: 1.5220485925674438, Validation Loss: 1.4840314388275146
Epoch 21, Training Loss: 1.4977829456329346, Validation Loss: 1.4694759845733643
Epoch 31, Training Loss: 1.482914686203003, Validation Loss: 1.4629521369934082
Epoch 41, Training Loss: 1.4752850532531738, Validation Loss: 1.4683964252471924
Epoch 51, Training Loss: 1.4687414169311523, Validation Loss: 1.4382028579711914
Epoch 61, Training Loss: 1.4637579917907715, Validation Loss: 1.4313669204711914
Epoch 71, Training Loss: 1.470191240310669, Validation Loss: 1.4284188747406006
Epoch 81, Training Loss: 1.4533123970031738, Validation Loss: 1.4246701002120972
Epoch 91, Training Loss: 1.4487234354019165, Validation Loss: 1.4211912155151367


To present the best feature combinations found by the model, we can analyze the gates (stochastic gates) produced by the hypernetwork for each feature across the validation set. The gates closest to 1 indicate features that are most relevant for the prediction according to the model. However, since the gates are produced in a stochastic manner, a straightforward approach is to use the mean of the Bernoulli parameters (π) output by the hypernetwork as an approximation of feature importance. This way, we can determine which features are consistently considered important across different contexts.

Below is the code to achieve this, including a modification to compute the average Bernoulli parameters across the validation set and then display the best feature combinations based on these averages:

In [12]:
list_contexts = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]  # List of context dimensions to try
for context_dim in list_contexts:
    # Assume first 3 features are "contextual" for demonstration purposes
    context_dim = 3
    feature_dim = X_train_tensor.shape[1] - context_dim

    # Update Hypernetwork and PredictionNetwork definitions if needed

    # Initialize models
    hypernetwork = HyperNetwork(context_dim, feature_dim)
    prediction_network = PredictionNetwork(feature_dim)

    # Optimizer and loss function
    optimizer = optim.Adam(list(hypernetwork.parameters()) + list(prediction_network.parameters()), lr=0.001)
    loss_fn = nn.MSELoss()

    hypernetwork = HyperNetwork(context_dim=1, feature_dim=X_train_tensor.shape[1])

    # Compute average Bernoulli parameters for the validation set
    with torch.no_grad():
        contexts_val = X_test_tensor[:, :context_dim]
        avg_bernoulli_params = hypernetwork(contexts_val).mean(dim=0)

    # Sort features based on average Bernoulli parameters
    sorted_features = torch.argsort(avg_bernoulli_params, descending=True)
    sorted_params = torch.sort(avg_bernoulli_params, descending=True).values

    # Print the sorted features and their corresponding Bernoulli parameters
    print("Feature Importance based on average Bernoulli parameters:")
    for i, (feature_index, importance) in enumerate(zip(sorted_features, sorted_params)):
        print(f"Feature {feature_index + context_dim} (Original Index): Importance {importance.item()}")

    # Optionally, select top N features to consider as "best" feature combinations
    N = 5  # Change N based on how many top features you want to consider
    top_features = sorted_features[:N]
    print(f"\nTop {N} Most Important Features (Adjusted Index): {top_features + context_dim}")


    # Initialize dictionary to store feature combinations and their best validation loss
    feature_combinations = {}

    num_epochs = 100
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        optimizer.zero_grad()
        
        # Training step
        contexts = X_train_tensor[:, :context_dim]
        features = X_train_tensor[:, context_dim:]
        
        bernoulli_params = hypernetwork(contexts)
        gates = stochastic_gates(bernoulli_params)
        selected_features = features * gates
        predictions = prediction_network(selected_features)
        
        loss = loss_fn(predictions, y_train_tensor)
        loss.backward()
        optimizer.step()

        # Validation step
        with torch.no_grad():
            contexts_val = X_test_tensor[:, :context_dim]
            features_val = X_test_tensor[:, context_dim:]
            
            bernoulli_params_val = hypernetwork(contexts_val)
            gates_val = stochastic_gates(bernoulli_params_val)
            selected_features_val = features_val * gates_val
            predictions_val = prediction_network(selected_features_val)
            
            val_loss = loss_fn(predictions_val, y_test_tensor)

        # Store feature combinations based on the top N important features for each epoch
        N = 5  # Change based on desired top N features
        avg_bernoulli_params = bernoulli_params_val.mean(dim=0)
        top_features_indices = torch.argsort(avg_bernoulli_params, descending=True)[:N]
        top_features_tuple = tuple(top_features_indices.cpu().numpy())
        
        if top_features_tuple in feature_combinations:
            if feature_combinations[top_features_tuple]['val_loss'] > val_loss.item():
                feature_combinations[top_features_tuple]['val_loss'] = val_loss.item()
        else:
            feature_combinations[top_features_tuple] = {'val_loss': val_loss.item(), 'avg_params': avg_bernoulli_params[top_features_indices].cpu().numpy()}

        if epoch % 10 == 0 or epoch == num_epochs - 1:
            print(f'Epoch {epoch+1}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

    # Display the best feature combination
    best_combination = min(feature_combinations.items(), key=lambda x: x[1]['val_loss'])
    print("\nBest feature combination (indices):", best_combination[0])
    print("Validation loss for best combination:", best_combination[1]['val_loss'])
    print("Average Bernoulli parameters for best combination:", best_combination[1]['avg_params'])

Feature Importance based on average Bernoulli parameters:
Feature 7 (Original Index): Importance 0.592074990272522
Feature 10 (Original Index): Importance 0.5558695793151855
Feature 5 (Original Index): Importance 0.5503217577934265
Feature 11 (Original Index): Importance 0.5348395109176636
Feature 1 (Original Index): Importance 0.5235576629638672
Feature 4 (Original Index): Importance 0.5093138813972473
Feature 9 (Original Index): Importance 0.45627906918525696
Feature 8 (Original Index): Importance 0.4559199810028076
Feature 2 (Original Index): Importance 0.4457564949989319
Feature 3 (Original Index): Importance 0.4329746961593628
Feature 6 (Original Index): Importance 0.38816526532173157

Top 5 Most Important Features (Adjusted Index): tensor([ 7, 10,  5, 11,  1])


NameError: name 'optimizer' is not defined

Epoch 1, Training Loss: 1.4472154378890991, Validation Loss: 1.4151694774627686
Epoch 11, Training Loss: 1.4414145946502686, Validation Loss: 1.4116743803024292
Epoch 21, Training Loss: 1.4412184953689575, Validation Loss: 1.4077492952346802
Epoch 31, Training Loss: 1.4331527948379517, Validation Loss: 1.4035084247589111
Epoch 41, Training Loss: 1.4280548095703125, Validation Loss: 1.4010308980941772
Epoch 51, Training Loss: 1.4253065586090088, Validation Loss: 1.3972856998443604
Epoch 61, Training Loss: 1.4205454587936401, Validation Loss: 1.3926745653152466
Epoch 71, Training Loss: 1.424072265625, Validation Loss: 1.3922308683395386
Epoch 81, Training Loss: 1.415163278579712, Validation Loss: 1.387839436531067
Epoch 91, Training Loss: 1.4128848314285278, Validation Loss: 1.3898916244506836
Epoch 100, Training Loss: 1.4109370708465576, Validation Loss: 1.38599693775177

Best feature combination (indices): (0, 5, 2, 7, 3)
Validation loss for best combination: 1.3809586763381958
Average

In [11]:
# Assuming the boston dataset and the feature names are loaded as shown previously
feature_names = np.array(boston.feature_names)

# Modify the feature names array to match the division between "contextual" and "explanatory" features
# For simplicity, we consider the first 'context_dim' features as contextual and the rest as explanatory here
# Adjust this as per your actual model's design
explanatory_feature_names = feature_names[context_dim:]

# Initialize dictionary to store feature combinations and their best validation loss
feature_combinations = {}

num_epochs = 100
best_val_loss = float('inf')

for epoch in range(num_epochs):
    optimizer.zero_grad()
    
    # Training step
    contexts = X_train_tensor[:, :context_dim]
    features = X_train_tensor[:, context_dim:]
    
    bernoulli_params = hypernetwork(contexts)
    gates = stochastic_gates(bernoulli_params)
    selected_features = features * gates
    predictions = prediction_network(selected_features)
    
    loss = loss_fn(predictions, y_train_tensor)
    loss.backward()
    optimizer.step()

    # Validation step
    with torch.no_grad():
        contexts_val = X_test_tensor[:, :context_dim]
        features_val = X_test_tensor[:, context_dim:]
        
        bernoulli_params_val = hypernetwork(contexts_val)
        gates_val = stochastic_gates(bernoulli_params_val)
        selected_features_val = features_val * gates_val
        predictions_val = prediction_network(selected_features_val)
        
        val_loss = loss_fn(predictions_val, y_test_tensor)

    # Store feature combinations based on the top N important features for each epoch
    N = 5  # Adjust based on the desired top N features
    avg_bernoulli_params = bernoulli_params_val.mean(dim=0)
    top_features_indices = torch.argsort(avg_bernoulli_params, descending=True)[:N]
    top_features_tuple = tuple(top_features_indices.cpu().numpy())
    
    feature_names_tuple = tuple(explanatory_feature_names[top_features_indices])
    
    if feature_names_tuple in feature_combinations:
        if feature_combinations[feature_names_tuple]['val_loss'] > val_loss.item():
            feature_combinations[feature_names_tuple]['val_loss'] = val_loss.item()
    else:
        feature_combinations[feature_names_tuple] = {'val_loss': val_loss.item(), 'avg_params': avg_bernoulli_params[top_features_indices].cpu().numpy()}

    if epoch % 10 == 0 or epoch == num_epochs - 1:
        print(f'Epoch {epoch+1}, Training Loss: {loss.item()}, Validation Loss: {val_loss.item()}')

# Display the best feature combination
best_combination = min(feature_combinations.items(), key=lambda x: x[1]['val_loss'])
print("\nBest feature combination (names):", best_combination[0])
print("Validation loss for best combination:", best_combination[1]['val_loss'])
print("Average Bernoulli parameters for best combination:", best_combination[1]['avg_params'])


NameError: name 'boston' is not defined