## CONTEXTUAL FEATURE SELECTION WITH CONDITIONAL STOCHASTIC GATES

In [11]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np

# Load dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target

# Reduce to maximum 1000 rows
# X = X[:1000, :]
# y = y[:1000]

# Original column names
column_names = housing.feature_names
feature_names = np.array(column_names)
# Add noise columns
np.random.seed(42)  # For reproducibility

# Gaussian noise
gaussian_noise = np.random.normal(0, 1, size=X.shape[0])

# Uniform noise
uniform_noise = np.random.uniform(-1, 1, size=X.shape[0])

# Cosine function
cosine_values = np.cos(np.linspace(0, 10, X.shape[0]))

# Create a DataFrame from X
df = pd.DataFrame(X, columns=column_names)
df=df.sample(frac=1,random_state=42).reset_index(drop=True)
# Add the noise columns to DataFrame
df['Gaussian_Noise'] = gaussian_noise
df['Uniform_Noise'] = uniform_noise
df['Cosine_Noise'] = cosine_values
df=df.sample(frac=1,random_state=42).reset_index(drop=True)

# Shuffle column locations
np.random.seed(42)  # Ensure reproducibility for column shuffling
shuffled_columns = np.random.permutation(df.columns)
df = df[shuffled_columns]

# Now, df is a DataFrame with shuffled columns and includes the noise features
# You can view the DataFrame as follows:
display(df.head())

# Convert target to a DataFrame and concatenate with features for a complete view
y_df = pd.DataFrame(y, columns=['Target'])
df_full = pd.concat([df, y_df], axis=1)

# If you wish to proceed with splitting and scaling:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset

# Splitting the data (assuming you want to keep DataFrame structure for X)
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# X_train_scaled and X_test_scaled are now DataFrames with scaled features and retained column names.
# convert them to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
y_train_tensor = y_train_tensor.view(-1, 1)
y_test_tensor = y_test_tensor.view(-1, 1)

# Create TensorDataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# new column names
feature_names = np.array(X_train_scaled.columns)
# feature_names

Unnamed: 0,AveOccup,MedInc,Uniform_Noise,Cosine_Noise,AveRooms,HouseAge,Gaussian_Noise,Population,Longitude,AveBedrms,Latitude
0,2.247299,4.5625,0.760274,-0.958842,4.845138,46.0,-0.706843,1872.0,-122.59,1.027611,37.97
1,3.26793,4.5,-0.134879,0.105413,5.262517,17.0,-1.869742,2415.0,-119.19,1.012179,34.23
2,3.445217,5.2174,0.78474,0.261862,7.306957,5.0,0.116566,3962.0,-117.21,1.078261,33.95
3,2.345733,2.3083,-0.577435,-0.877524,5.485777,20.0,1.198482,1072.0,-122.63,1.262582,38.96
4,2.496,6.0,0.833185,0.042673,5.442667,26.0,-0.133279,936.0,-117.24,0.781333,34.15


In [26]:
import torch
from torch import nn
from torch.optim import Adam
from torch.nn.init import xavier_uniform_

# Define the Hypernetwork
class HyperNet(nn.Module):
    def __init__(self, input_dim):
        super(HyperNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        xavier_uniform_(self.fc1.weight)  # Xavier initialization
        self.dropout1 = nn.Dropout(0.4)   # Dropout layer with 50% probability
        self.fc2 = nn.Linear(128, 64)
        xavier_uniform_(self.fc2.weight)  # Xavier initialization
        self.dropout2 = nn.Dropout(0.4)   # Another Dropout layer with 50% probability
        self.fc3 = nn.Linear(64, 32)
        xavier_uniform_(self.fc3.weight)  # Xavier initialization
        self.fc4 = nn.Linear(32, input_dim)
        xavier_uniform_(self.fc4.weight)  # Xavier initialization
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)  # Applying dropout after activation
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)  # Applying dropout after activation
        x = torch.relu(self.fc3(x))
        x = self.sigmoid(self.fc4(x))  # Sigmoid to ensure output is a probability
        return x

# Define the Prediction Network
class PredNet(nn.Module):
    def __init__(self, feature_dim):
        super(PredNet, self).__init__()
        self.fc1 = nn.Linear(feature_dim, 128)
        xavier_uniform_(self.fc1.weight)  # Xavier initialization
        self.dropout1 = nn.Dropout(0.5)   # Dropout layer with 50% probability
        self.fc2 = nn.Linear(128, 64)
        xavier_uniform_(self.fc2.weight)  # Xavier initialization
        self.dropout2 = nn.Dropout(0.5)   # Another Dropout layer with 50% probability
        self.fc3 = nn.Linear(64, 1) # final layer for regression

    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)  # Applying dropout after activation
        x = torch.relu(self.fc2(x))
        x = self.dropout2(x)  # Applying dropout after activation
        x = self.fc3(x)  # No activation function, suitable for regression
        return x
    
from torch.distributions.normal import Normal
from torch.autograd import Variable

# Define the custom loss function incorporating both MSE and regularization term
class CustomLoss(nn.Module):
    def __init__(self, lambda_reg, sigma):
        super(CustomLoss, self).__init__()
        self.mse_loss = nn.MSELoss()
        self.lambda_reg = lambda_reg
        self.sigma = sigma
        self.normal_dist = Normal(torch.tensor([0.0]), torch.tensor([sigma]))

    def forward(self, y_pred, y_true, selection_probs):
        # Calculate the MSE part
        mse = self.mse_loss(y_pred, y_true)
        
        # Calculate the regularization term
        # Use the CDF of the standard Gaussian distribution
        regularization_term = torch.sum(self.normal_dist.cdf(selection_probs))/ self.sigma
        
        # Combine the MSE and regularization term
        total_loss = mse + self.lambda_reg * regularization_term
        return total_loss

# Usage of CustomLoss
# Set lambda_reg and sigma based on your requirements
lambda_regu = 0.01  # Example value; needs to be tuned
sigma = 1.0  # Assuming noise standard deviation is 1

# Initialize the custom loss
custom_loss = CustomLoss(lambda_regu, sigma)

# During training, you would use this custom_loss function like so:
# custom_loss(y_pred, y_true, selection_probs)

# Define the training loop
def train(model, hypernet, optimizer, epochs, X_train, y_train, X_val, y_val):
    for epoch in range(epochs):
        model.train()
        hypernet.train()
        optimizer.zero_grad()
        # Forward pass through the hypernetwork to get selection probabilities
        selection_prob = hypernet(X_train)
        # Sample from the Bernoulli distribution to get the feature mask
        selection_mask = torch.bernoulli(selection_prob)
        print("##"*90)
        print(selection_prob)
        print("##"*90)
        print("##"*90)
        print(selection_mask)
        print("##"*90)
        # Apply mask to the input features
        selected_features = X_train * selection_mask
        # Make predictions with the masked features
        y_pred = model(selected_features)
        loss = custom_loss(y_pred, y_train, selection_mask) # TODO: Better loss function in progress
        # Backpropagation
        loss.backward() 
        optimizer.step()
        
        # Validation phase
        model.eval()
        hypernet.eval()
        with torch.no_grad():
            val_selection_prob = hypernet(X_val)
            val_selection_mask = torch.bernoulli(val_selection_prob)
            val_selected_features = X_val * val_selection_mask
            val_y_pred = model(val_selected_features)
            # val_loss = criterion(val_y_pred, y_val)
            val_loss = custom_loss(val_y_pred, y_val, val_selection_prob)
        
        print(f'Epoch {epoch+1}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')

# Initialize the hypernetwork, prediction network, loss criterion, and optimizer
hypernet = HyperNet(input_dim=X_train_tensor.shape[1])
model = PredNet(feature_dim=X_train_tensor.shape[1])
optimizer = Adam(list(hypernet.parameters()) + list(model.parameters()) , lr=0.01)

# Training
epochs = 100  # Adjust as necessary based on convergence and performance
train(model, hypernet, optimizer, epochs, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

####################################################################################################################################################################################
tensor([[0.4252, 0.5598, 0.4586,  ..., 0.6082, 0.4405, 0.5994],
        [0.5043, 0.4448, 0.5515,  ..., 0.4905, 0.4877, 0.5178],
        [0.4207, 0.4792, 0.4999,  ..., 0.5122, 0.4387, 0.4857],
        ...,
        [0.4273, 0.4794, 0.4464,  ..., 0.6425, 0.4298, 0.6029],
        [0.4002, 0.6063, 0.5224,  ..., 0.5475, 0.3892, 0.5517],
        [0.5236, 0.5496, 0.5005,  ..., 0.5129, 0.5045, 0.5714]],
       grad_fn=<SigmoidBackward0>)
####################################################################################################################################################################################
####################################################################################################################################################################################
tensor([[0., 1., 1.,  ..

# training

## Comparing with stg

In [None]:
import matplotlib.pyplot as plt

# Define a function to train and evaluate models using STG algorithm
def train_evaluate_stg(X_train, y_train, X_test, y_test, context_dim):
    # Instantiate HyperNetwork and PredictionNetwork for STG
    hypernetwork_stg = HyperNetwork(context_dim, X_train.shape[1] - context_dim)
    prediction_network_stg = PredictionNetwork(X_train.shape[1] - context_dim)

    # Train the model
    stg_losses_train, stg_losses_val = train_model(X_train, y_train, X_test, y_test, 
                                                    hypernetwork_stg, prediction_network_stg)
    return stg_losses_train, stg_losses_val

# Define a function to train and evaluate models using CSTG algorithm
def train_evaluate_cstg(X_train, y_train, X_test, y_test, context_dim):
    # Instantiate ConditionalStochasticGates model
    cstg_model = ConditionalStochasticGates(context_dim, X_train.shape[1] - context_dim)

    # Train the model
    cstg_losses_train, cstg_losses_val = train_model(X_train, y_train, X_test, y_test, 
                                                      cstg_model, prediction_network)
    return cstg_losses_train, cstg_losses_val

# Define a function to train the model and return training and validation losses
def train_model(X_train, y_train, X_test, y_test, hypernetwork, prediction_network):
    optimizer = optim.Adam(list(hypernetwork.parameters()) + list(prediction_network.parameters()), lr=0.001)
    loss_fn = nn.MSELoss()

    num_epochs = 100
    stg_losses_train, stg_losses_val = [], []

    for epoch in range(num_epochs):
        # Training step
        hypernetwork.train()
        prediction_network.train()
        optimizer.zero_grad()

        # Forward pass
        gates_train = hypernetwork(X_train[:, :context_dim])
        selected_features_train = X_train[:, context_dim:] * stochastic_gates(gates_train)
        predictions_train = prediction_network(selected_features_train)
        loss_train = loss_fn(predictions_train, y_train)

        # Backward pass
        loss_train.backward()
        optimizer.step()

        # Validation step
        with torch.no_grad():
            hypernetwork.eval()
            prediction_network.eval()

            gates_val = hypernetwork(X_test[:, :context_dim])
            selected_features_val = X_test[:, context_dim:] * stochastic_gates(gates_val)
            predictions_val = prediction_network(selected_features_val)
            loss_val = loss_fn(predictions_val, y_test)

        stg_losses_train.append(loss_train.item())
        stg_losses_val.append(loss_val.item())

    return stg_losses_train, stg_losses_val

# Define the context dimension
context_dim = 3  # Assuming the first 3 features are used as context

# Train and evaluate models using STG algorithm
stg_losses_train, stg_losses_val = train_evaluate_stg(X_train_tensor, y_train_tensor, 
                                                      X_test_tensor, y_test_tensor, context_dim)

# Train and evaluate models using CSTG algorithm
cstg_losses_train, cstg_losses_val = train_evaluate_cstg(X_train_tensor, y_train_tensor, 
                                                          X_test_tensor, y_test_tensor, context_dim)

# Plot training and validation losses for STG and CSTG algorithms
plt.figure(figsize=(10, 6))
plt.plot(range(1, num_epochs + 1), stg_losses_train, label='STG Train Loss', color='blue')
plt.plot(range(1, num_epochs + 1), stg_losses_val, label='STG Validation Loss', linestyle='--', color='blue')
plt.plot(range(1, num_epochs + 1), cstg_losses_train, label='CSTG Train Loss', color='orange')
plt.plot(range(1, num_epochs + 1), cstg_losses_val, label='CSTG Validation Loss', linestyle='--', color='orange')
plt.title('Training and Validation Losses')
plt.xlabel('Epochs')
plt.ylabel('Loss')
plt.legend()
plt.grid(True)
plt.show()

----

In [27]:

# # Assuming X_train_tensor and X_test_tensor are already defined and properly shaped
# input_dim = X_train_tensor.shape[1]  # Number of features in the input data
# feature_dim = input_dim  # The hypernetwork output and prediction input dimensions must match

# # Initialize the hypernetwork, prediction network, loss criterion, and optimizer
# hypernet = HyperNet(input_dim)
# model = PredNet(feature_dim)
# optimizer = Adam(list(hypernet.parameters()) + list(model.parameters()) , lr=0.01)

# # Training
# epochs = 2  # Adjust as necessary based on convergence and performance
# train(model, hypernet, criterion, optimizer, epochs, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)


In [28]:
# Making predictions with the trained models
def make_inference(model, hypernet, X_test):
    model.eval()
    hypernet.eval()
    
    with torch.no_grad():
        # Get the feature selection probabilities from the hypernetwork
        selection_prob = hypernet(X_test)
        # For inference use the expected values (probabilities) instead of sampling
        selected_features = X_test * selection_prob
        # Get the model predictions
        predictions = model(selected_features)
    return predictions, selection_prob

# Load the test data (replace this with your actual test data)
X_test = X_test_tensor  # Assuming X_test_tensor is your test data

# Make inference
predictions, feature_importance_probabilities = make_inference(model, hypernet, X_test)


# Print the conditional probability of each feature being significant
print("Conditional probabilities of feature significance given other features:")
for i, feature_name in enumerate(feature_names):
    print(f"{feature_name}: {feature_importance_probabilities[:, i].mean().item()}")

Conditional probabilities of feature significance given other features:
AveOccup: 0.44150832295417786
MedInc: 0.5098904967308044
Uniform_Noise: 0.474697470664978
Cosine_Noise: 0.4606289863586426
AveRooms: 0.47779104113578796
HouseAge: 0.461354523897171
Gaussian_Noise: 0.5596177577972412
Population: 0.407277375459671
Longitude: 0.5244668126106262
AveBedrms: 0.4461166560649872
Latitude: 0.5383712649345398


In [29]:
# Create a list of tuples (feature_name, probability)
feature_probabilities = [(feature_name, feature_importance_probabilities[:, i].mean().item()) for i, feature_name in enumerate(feature_names)]

# Sort the list by probability in descending order
sorted_feature_probabilities = sorted(feature_probabilities, key=lambda x: x[1], reverse=True)

# Print the sorted list
print("Conditional probabilities of feature significance given other features:")
for feature, probability in sorted_feature_probabilities:
    print(f"{feature}: {probability}")

Conditional probabilities of feature significance given other features:
Gaussian_Noise: 0.5596177577972412
Latitude: 0.5383712649345398
Longitude: 0.5244668126106262
MedInc: 0.5098904967308044
AveRooms: 0.47779104113578796
Uniform_Noise: 0.474697470664978
HouseAge: 0.461354523897171
Cosine_Noise: 0.4606289863586426
AveBedrms: 0.4461166560649872
AveOccup: 0.44150832295417786
Population: 0.407277375459671


In [5]:
# Calculate the mean feature importance probability across all examples
mean_feature_importance_probabilities = feature_importance_probabilities.mean(dim=0)

# Determine which features have a mean selection probability above the threshold
important_features_indices = mean_feature_importance_probabilities > threshold

print("Mean feature selection probabilities from the hypernetwork (higher means more important):")
print(mean_feature_importance_probabilities)

print("\nFeatures with mean selection probability higher than threshold:")
print(important_features_indices)
# Extract the names of the important features
important_feature_names = [name for i, name in enumerate(feature_names) if important_features_indices[i]]

print("\nImportant feature names:")
print(important_feature_names)


NameError: name 'threshold' is not defined

In [None]:
# ... (after the training loop)

# Retrieve the final parameters of HyperNet
hypernet_params = hypernet.state_dict()
print("Final parameters of HyperNet:")
for param_name, param in hypernet_params.items():
    print(f"{param_name}:\n{param}")

# Retrieve the final parameters of PredNet
prednet_params = model.state_dict()
print("\nFinal parameters of PredNet:")
for param_name, param in prednet_params.items():
    print(f"{param_name}:\n{param}")


Final parameters of HyperNet:
fc1.weight:
tensor([[ 0.1502, -0.1104, -0.0366,  ...,  0.1718,  0.0637, -0.0697],
        [-0.1804,  0.0159, -0.1112,  ...,  0.1196, -0.0227, -0.0770],
        [ 0.1346,  0.1256,  0.1248,  ..., -0.0737, -0.1655, -0.1717],
        ...,
        [-0.0565,  0.0891,  0.0982,  ..., -0.1358, -0.1359,  0.0080],
        [-0.0074, -0.0703,  0.0896,  ...,  0.2032, -0.2027, -0.1844],
        [-0.0535,  0.0381,  0.0786,  ..., -0.1920, -0.0853, -0.0579]])
fc1.bias:
tensor([ 0.1512,  0.2043,  0.1095, -0.1715, -0.0402, -0.1496, -0.2718,  0.2931,
         0.1333,  0.1599,  0.0045, -0.1857, -0.2328,  0.0870,  0.1531, -0.2914,
         0.0040,  0.2220, -0.0121, -0.1919,  0.0696, -0.0186,  0.0074,  0.0958,
        -0.0544,  0.2858, -0.1110, -0.0218,  0.0621, -0.1147, -0.2755,  0.1071,
         0.2763,  0.1478, -0.2083,  0.2526,  0.0627, -0.0947, -0.2117, -0.2135,
         0.1645, -0.2223,  0.1970, -0.1847, -0.0560, -0.2816, -0.2027, -0.0012,
         0.2529,  0.2172, -0.0118,

In [None]:
# raise error
raise ValueError("This is a custom error message. You can replace this with an actual error message.")

In [None]:
import torch
from torch import nn
from torch.optim import Adam
import numpy as np

from torch.nn.init import xavier_uniform_
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Normal, Bernoulli

class HyperNet(nn.Module):
    def __init__(self, input_dim):
        super(HyperNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        xavier_uniform_(self.fc1.weight)
        self.fc2 = nn.Linear(128, input_dim)  # The output should match the number of features
        xavier_uniform_(self.fc2.weight)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.sigmoid(self.fc2(x))
        return x

class PredNet(nn.Module):
    def __init__(self, feature_dim):
        super(PredNet, self).__init__()
        self.fc1 = nn.Linear(feature_dim, 128)
        xavier_uniform_(self.fc1.weight)
        self.fc2 = nn.Linear(128, 1)
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x

# Initialize models (assuming 11 input features)
input_dim = X_train_tensor.shape[1]  # This should be the number of features in your input
feature_dim = input_dim  # This should match the number of input features
hypernet = HyperNet(input_dim)
model = PredNet(feature_dim)

# Rest of the initialization and training code...
criterion = nn.MSELoss()
optimizer = Adam(list(model.parameters()) + list(hypernet.parameters()), lr=0.001)

hypernet.train()
optimizer.zero_grad()

# Forward pass through hypernetwork to get selection probabilities
selection_prob = hypernet(X_train)
# Sample from Bernoulli distribution to get feature mask
selection_mask = torch.bernoulli(selection_prob)
# Apply mask to input features
selected_features = X_train * selection_mask

# Make predictions with masked features
y_pred = model(selected_features)
loss = criterion(y_pred, y_train)

# Backpropagation
loss.backward()
optimizer.step()

# Validation phase
model.eval()
hypernet.eval()
# with torch.no_grad():
#     val_selection_prob = hypernet(X_val)
#     val_selection_mask = torch.bernoulli(val_selection_prob)
#     val_selected_features = X_val * val_selection_mask
#     val_y_pred = model(val_selected_features)
#     val_loss = criterion(val_y_pred, y_val)

# print(f'Epoch {epoch+1}, Loss: {loss.item()}, Val Loss: {val_loss.item()}')

# Model initialization


# Training
# epochs = 100 # Adjust as necessary
# train(model, hypernet, criterion, optimizer, epochs, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)


TypeError: linear(): argument 'input' (position 1) must be Tensor, not DataFrame

In [None]:

# # Define the Hypernetwork
# class HyperNet(nn.Module):
#     def __init__(self, input_dim, feature_dim):
#         super(HyperNet, self).__init__()
#         self.fc1 = nn.Linear(input_dim, 128) # Adjust the size as necessary
#         self.fc2 = nn.Linear(128, feature_dim)
#         self.sigmoid = nn.Sigmoid()
        
#     def forward(self, x):
#         x = torch.relu(self.fc1(x))
#         x = self.sigmoid(self.fc2(x))
#         return x


# class HyperNet(nn.Module):
#     def __init__(self, input_dim, feature_dim):
#         super(HyperNet, self).__init__()
#         self.fc1 = nn.Linear(input_dim, 128)
#         # Initialize weights using Xavier initialization
#         xavier_uniform_(self.fc1.weight)
#         self.fc2 = nn.Linear(128, feature_dim)
#         xavier_uniform_(self.fc2.weight)
#         self.sigmoid = nn.Sigmoid()
        
#     def forward(self, x):
#         x = torch.relu(self.fc1(x))
#         # Final layer with sigmoid to ensure outputs are between 0 and 1
#         x = self.sigmoid(self.fc2(x))
#         return x

class HyperNet(nn.Module):
    def __init__(self, input_dim, feature_dim):
        super(HyperNet, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        xavier_uniform_(self.fc1.weight)
        self.fc2 = nn.Linear(128, feature_dim)  # feature_dim should match the number of input features
        xavier_uniform_(self.fc2.weight)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = torch.relu(self.fc1(x))
        x = torch.relu(self.fc1(x))
        x = self.fc2(x)
        return x
    
# Define the Prediction Network
# class PredNet(nn.Module):
#     def __init__(self, feature_dim):
#         super(PredNet, self).__init__()
#         self.fc1 = nn.Linear(feature_dim, 128) # Adjust the size as necessary
#         self.fc2 = nn.Linear(128, 1) # Assuming a single output
        
#     def forward(self, x):
#         x = torch.relu(self.fc1(x))
#         x = torch.relu(self.fc1(x))
#         x = self.fc2(x)
#         return x

class PredNet(nn.Module):
    def __init__(self, feature_dim):
        super(PredNet, self).__init__()
        self.fc1 = nn.Linear(feature_dim, 128)  # feature_dim should match the number of input features
        self.fc2 = nn.Linear(128, 1)

# Initialize models (assuming 11 input features)
input_dim = 11
feature_dim = input_dim  # Make sure this matches the number of input features
hypernet = HyperNet(input_dim, feature_dim)
model = PredNet(feature_dim)
def train(model, hypernet, criterion, optimizer, epochs, X_train, y_train, X_val, y_val):
    for epoch in range(epochs):
        model.train()


RuntimeError: mat1 and mat2 shapes cannot be multiplied (16512x128 and 11x128)

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions.normal import Normal
from torch.distributions.bernoulli import Bernoulli


In [None]:
class Hypernetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(Hypernetwork, self).__init__()
        # Define the architecture of the hypernetwork.
        self.fc1 = nn.Linear(input_dim, 128) # Input layer
        self.fc2 = nn.Linear(128, output_dim) # Output layer mapping to Bernoulli parameters

    def forward(self, x):
        # Forward pass through the network
        x = F.relu(self.fc1(x)) # Activation function for non-linearity
        x = torch.sigmoid(self.fc2(x)) # Sigmoid to ensure output is in [0,1], representing probabilities
        return x


In [None]:
class PredictionNetwork(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(PredictionNetwork, self).__init__()
        # Define the architecture of the prediction network.
        self.fc1 = nn.Linear(input_dim, 128) # Input layer
        self.fc2 = nn.Linear(128, output_dim) # Output layer for the response variable

    def forward(self, x):
        # Forward pass through the network
        x = F.relu(self.fc1(x)) # Activation function for non-linearity
        x = self.fc2(x) # Output layer
        return x


In [None]:
def train(model, hypernet, criterion, optimizer, data_loader, epochs=10):
    for epoch in range(epochs):
        for x, z, y in data_loader: # Assuming x is the feature, z is the context, y is the target
            optimizer.zero_grad()
            
            # Generate probabilities from the Hypernetwork
            probs = hypernet(z)
            
            # Sample from Bernoulli to get the feature selection mask
            m = Bernoulli(probs)
            mask = m.sample()
            
            # Apply mask and predict
            x_masked = x * mask
            predictions = model(x_masked)
            
            # Compute loss and backpropagate
            loss = criterion(predictions, y)
            loss.backward()
            optimizer.step()
        
        print(f'Epoch {epoch+1}, Loss: {loss.item()}')
