## CONTEXTUAL FEATURE SELECTION WITH CONDITIONAL STOCHASTIC GATES

In [14]:
from sklearn.datasets import fetch_california_housing
import pandas as pd
import numpy as np

# Load dataset
housing = fetch_california_housing()
X, y = housing.data, housing.target

# Reduce to maximum 1000 rows
# X = X[:1000, :]
# y = y[:1000]

# Original column names
column_names = housing.feature_names
feature_names = np.array(column_names)
# Add noise columns
np.random.seed(42)  # For reproducibility

# Gaussian noise
gaussian_noise = np.random.normal(0, 1, size=X.shape[0])

# Uniform noise
uniform_noise = np.random.uniform(-1, 1, size=X.shape[0])

# Cosine function
cosine_values = np.cos(np.linspace(0, 10, X.shape[0]))

# Create a DataFrame from X
df = pd.DataFrame(X, columns=column_names)
df=df.sample(frac=1,random_state=42).reset_index(drop=True)
# Add the noise columns to DataFrame
df['Gaussian_Noise'] = gaussian_noise
df['Uniform_Noise'] = uniform_noise
df['Cosine_Noise'] = cosine_values
df=df.sample(frac=1,random_state=42).reset_index(drop=True)

# Shuffle column locations
np.random.seed(42)  # Ensure reproducibility for column shuffling
shuffled_columns = np.random.permutation(df.columns)
df = df[shuffled_columns]

# Now, df is a DataFrame with shuffled columns and includes the noise features
# You can view the DataFrame as follows:
display(df.head())

# Convert target to a DataFrame and concatenate with features for a complete view
y_df = pd.DataFrame(y, columns=['Target'])
df_full = pd.concat([df, y_df], axis=1)

# If you wish to proceed with splitting and scaling:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import torch
from torch.utils.data import TensorDataset

# Splitting the data (assuming you want to keep DataFrame structure for X)
X_train, X_test, y_train, y_test = train_test_split(df, y, test_size=0.2, random_state=42)

# Scaling features
scaler = StandardScaler()
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns)
X_test_scaled = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

# X_train_scaled and X_test_scaled are now DataFrames with scaled features and retained column names.
# convert them to PyTorch tensors
X_train_tensor = torch.tensor(X_train_scaled.values, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_scaled.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.float32)
y_train_tensor = y_train_tensor.view(-1, 1)
y_test_tensor = y_test_tensor.view(-1, 1)

# Create TensorDataset
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# new column names
feature_names = np.array(X_train_scaled.columns)
# feature_names

Unnamed: 0,AveOccup,MedInc,Uniform_Noise,Cosine_Noise,AveRooms,HouseAge,Gaussian_Noise,Population,Longitude,AveBedrms,Latitude
0,2.247299,4.5625,0.760274,-0.958842,4.845138,46.0,-0.706843,1872.0,-122.59,1.027611,37.97
1,3.26793,4.5,-0.134879,0.105413,5.262517,17.0,-1.869742,2415.0,-119.19,1.012179,34.23
2,3.445217,5.2174,0.78474,0.261862,7.306957,5.0,0.116566,3962.0,-117.21,1.078261,33.95
3,2.345733,2.3083,-0.577435,-0.877524,5.485777,20.0,1.198482,1072.0,-122.63,1.262582,38.96
4,2.496,6.0,0.833185,0.042673,5.442667,26.0,-0.133279,936.0,-117.24,0.781333,34.15


## Loss Func

In [15]:
import torch
import torch.nn as nn
from torch.distributions import Normal
# Let's define a function to calculate the custom loss
def custom_loss_function(model_output, y_true, mu_d_z, sigma, D, lambda_reg):
    # Calculate MSE loss
    mse_loss = nn.functional.mse_loss(model_output, y_true)
    # Calculate the regularization term
    normal_dist = Normal(torch.zeros_like(mu_d_z), torch.ones_like(mu_d_z) * sigma)
    regularization_term = torch.sum(normal_dist.cdf(mu_d_z)) / D  # normalize by D
    # Combine MSE loss with the regularization term
    total_loss = mse_loss + lambda_reg * regularization_term
    return total_loss
lambda_reg = 0.01
sigma = 1.0  # Given constant sigma

## Model

In [16]:
import torch
from torch import nn
from torch.optim import Adam
from torch.nn.init import xavier_uniform_
    
from torch.distributions.normal import Normal
from torch.autograd import Variable

# Define the Hypernetwork
class C_StochasticGates(nn.Module):
    def __init__(self, input_dim):
        super(C_StochasticGates, self).__init__()
        self.dropout1 = nn.Dropout(0.5)   # Dropout layer with 50% probability
        self.sigmoid = nn.Sigmoid()
        self.fc1 = nn.Linear(input_dim, 128)
        xavier_uniform_(self.fc1.weight)  # Xavier initialization
        self.fc2 = nn.Linear(128, 64)
        xavier_uniform_(self.fc2.weight)  # Xavier initialization
        self.fc3 = nn.Linear(64, 32)
        xavier_uniform_(self.fc3.weight)  # Xavier initialization
        self.fc4 = nn.Linear(32, input_dim)
        xavier_uniform_(self.fc4.weight)  # Xavier initialization
        self.fc5 = nn.Linear(64, 1) # final layer for regression
        xavier_uniform_(self.fc5.weight)  # Xavier initialization
        self.sigma = 1.0  # Given constant sigma
    def forward(self, x):
        """"" hϕ: selection of contextual probabilities """
        x_original = x# Copy the input tensor
        x = torch.relu(self.fc1(x))
        x = self.dropout1(x)  # Applying dropout after activation
        x = torch.relu(self.fc2(x))
        x = self.dropout1(x)  # Applying dropout after activation
        x = torch.relu(self.fc3(x))
        x = self.dropout1(x)  # Applying dropout after activation
        x = torch.relu(self.fc4(x))
        x = self.dropout1(x)
        
        # Assuming that self.fc4 outputs the logit z which is fed into the sigmoid to obtain mu_z
        mu_z = torch.sigmoid(x)  # Sigmoid to ensure output is a probability

        # stochastic_gates = torch.stack(stochastic_gates, dim=1)
        stochastic_gates = []
        for d in range(mu_z.size(1)):  # Loop over each dimension
            epsilon_d = torch.normal(0, self.sigma, size=(mu_z.size(0),), device=mu_z.device)
            sigma_d = torch.clamp(mu_z[:, d] + epsilon_d, min=0, max=1)
            stochastic_gates.append(sigma_d)

        stochastic_gates = torch.stack(stochastic_gates, dim=1)
        """" Fully connected network for regression parameters yˆ(k)"""
        selected_features = x_original * stochastic_gates
        x = torch.relu(self.fc1(selected_features))
        x = self.dropout1(x)  # Applying dropout after activation
        x = torch.relu(self.fc2(x))
        x = self.dropout1(x)  # Applying dropout after activation
        x = self.fc5(x)  # No activation function, suitable for regression
        return x,stochastic_gates
# c_stg = C_StochasticGates(input_dim=X_train_scaled.shape[1])
    

## Training

In [17]:
import torch
from torch.optim import Adam
import numpy as np

# Assuming your custom_loss_function is defined somewhere above

def mse_metric(preds, targets):
    return torch.mean((preds - targets) ** 2)

def rmse_metric(preds, targets):
    return torch.sqrt(mse_metric(preds, targets))


In [18]:
def train(model, optimizer, epochs, X_train, y_train, X_val, y_val):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        y_pred, stochastic_gates_pred = model(X_train)
        loss = custom_loss_function(y_pred, y_train, stochastic_gates_pred, sigma=1, D=X_train.shape[1], lambda_reg=0.05)
        loss.backward()
        optimizer.step()

        train_mse = mse_metric(y_pred, y_train).item()
        train_rmse = rmse_metric(y_pred, y_train).item()

        # Validation phase
        model.eval()
        with torch.no_grad():
            val_y_pred, val_stochastic_gates = model(X_val)
            val_loss = custom_loss_function(val_y_pred, y_val, val_stochastic_gates, sigma=1, D=X_val.shape[1], lambda_reg=0.05)
            val_mse = mse_metric(val_y_pred, y_val).item()
            val_rmse = rmse_metric(val_y_pred, y_val).item()
        #"Gates: {stochastic_gates_pred}\n 
        print(f'Epoch {epoch+1},\nTrain: \n  Loss: {loss.item()}, MSE: {train_mse}, RMSE: {train_rmse}, \nVal:\n   Loss: {val_loss.item()}, Val MSE: {val_mse}, Val RMSE: {val_rmse}')
    return stochastic_gates_pred

# Initialize the hypernetwork, prediction network, loss criterion, and optimizer
model = C_StochasticGates(input_dim=X_train_tensor.shape[1])
optimizer = Adam(list(model.parameters()) , lr=0.001)

# Training
epochs = 100  # Adjust as necessary based on convergence and performance
# gates = train(model,  optimizer, epochs, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

In [19]:
import torch

def r2_score(y_true, y_pred):
    """Compute R-squared score."""
    ss_res = torch.sum((y_true - y_pred) ** 2)
    ss_tot = torch.sum((y_true - torch.mean(y_true)) ** 2)
    r2 = 1 - ss_res / ss_tot
    return r2.item()

def train(model, optimizer, epochs, X_train, y_train, X_val, y_val):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        y_pred, stochastic_gates_pred = model(X_train)
        loss = custom_loss_function(y_pred, y_train, stochastic_gates_pred, sigma=1, D=X_train.shape[1], lambda_reg=0.05)
        loss.backward()
        optimizer.step()

        train_mse = mse_metric(y_pred, y_train).item()
        train_rmse = rmse_metric(y_pred, y_train).item()
        train_r2 = r2_score(y_train, y_pred)

        # Validation phase
        model.eval()
        with torch.no_grad():
            val_y_pred, val_stochastic_gates = model(X_val)
            val_loss = custom_loss_function(val_y_pred, y_val, val_stochastic_gates, sigma=1, D=X_val.shape[1], lambda_reg=0.05)
            val_mse = mse_metric(val_y_pred, y_val).item()
            val_rmse = rmse_metric(val_y_pred, y_val).item()
            val_r2 = r2_score(y_val, val_y_pred)

        print(f'Epoch {epoch+1},\n'
              f'Train: Loss: {loss.item()}, MSE: {train_mse}, RMSE: {train_rmse}, R^2: {train_r2}\n'
              f'Val: Loss: {val_loss.item()}, Val MSE: {val_mse}, Val RMSE: {val_rmse}, Val R^2: {val_r2}')

    return stochastic_gates_pred

# Note: Ensure custom_loss_function, mse_metric, rmse_metric, and C_StochasticGates are defined
# Also, ensure X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor are properly initialized

# Example initialization (for illustration purposes)
model = C_StochasticGates(input_dim=X_train_tensor.shape[1])
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Training
epochs = 100  # Adjust as necessary
gates = train(model, optimizer, epochs, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)


Epoch 1,
Train: Loss: 570.35791015625, MSE: 6.9034423828125, RMSE: 2.6274402141571045, R^2: -4.164239406585693
Val: Loss: 146.06898498535156, Val MSE: 5.910678863525391, Val RMSE: 2.4311888217926025, Val R^2: -3.5105600357055664
Epoch 2,
Train: Loss: 570.2364501953125, MSE: 6.448670864105225, RMSE: 2.5394232273101807, R^2: -3.8240394592285156
Val: Loss: 145.5257110595703, Val MSE: 5.504674434661865, Val RMSE: 2.3462042808532715, Val R^2: -3.2007298469543457
Epoch 3,
Train: Loss: 568.9860229492188, MSE: 5.948492527008057, RMSE: 2.438953161239624, R^2: -3.4498724937438965
Val: Loss: 144.679443359375, Val MSE: 5.118729114532471, Val RMSE: 2.262460947036743, Val R^2: -2.9062068462371826
Epoch 4,
Train: Loss: 568.32421875, MSE: 5.557180881500244, RMSE: 2.3573672771453857, R^2: -3.1571450233459473
Val: Loss: 144.61224365234375, Val MSE: 4.73947286605835, Val RMSE: 2.177032947540283, Val R^2: -2.616788864135742
Epoch 5,
Train: Loss: 568.0288696289062, MSE: 5.1136579513549805, RMSE: 2.26133990

## Comparing with stg

In [20]:
import pandas as pd
import torch
# from stg import STG
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
# split the data into features and target variable 

reg_url = 'https://raw.githubusercontent.com/FreeDataSets/DataPool/main/tracks_150000.csv' # this is the url for the dataset
reg_df = pd.read_csv(reg_url)#.sample(100000,random_state=42) # In order to reduce the size of the dataset, we are taking a random sample of 5000 rows from the dataset
reg_df.drop(['name', 'artists','id','release_date', 'artists_id','genre',], axis=1, inplace=True, errors='ignore') # Removing Categorical features with more then 10 unique values
reg_df = reg_df.sample(800, random_state=42)
# a preview of the dataframe
# reg_df.info() 
# display(reg_df.head())


In [21]:
Xreg = reg_df.drop('popularity', axis=1).values # features
yreg = reg_df['popularity'] # target variable


# split the data into train and test sets
Xreg_train, Xreg_test, yreg_train, yreg_test = train_test_split(Xreg, yreg, test_size=0.2, random_state=42)


scaler_reg = StandardScaler().fit(Xreg_train)

Xreg_train_scaled = scaler_reg.transform(Xreg_train)
Xreg_test_scaled = scaler_reg.transform(Xreg_test)
scaler_y = StandardScaler().fit(yreg_train.values.reshape(-1, 1))
yreg_train_scaled = scaler_y.transform(yreg_train.values.reshape(-1, 1))
yreg_test_scaled = scaler_y.transform(yreg_test.values.reshape(-1, 1))
# Define model parameters
args_cuda = torch.cuda.is_available()
device = torch.device("cuda" if args_cuda else "cpu")

print()

# X_train_scaled and X_test_scaled are now DataFrames with scaled features and retained column names.
# convert them to PyTorch tensors
X_train_tensor = torch.tensor(Xreg_train_scaled, dtype=torch.float32)
X_test_tensor = torch.tensor(Xreg_test_scaled, dtype=torch.float32)
y_train_tensor = torch.tensor(yreg_train_scaled, dtype=torch.float32)
y_test_tensor = torch.tensor(yreg_test_scaled, dtype=torch.float32)
y_train_tensor = y_train_tensor.view(-1, 1)
y_test_tensor = y_test_tensor.view(-1, 1)




In [22]:
def train(model, optimizer, epochs, X_train, y_train, X_val, y_val):
    for epoch in range(epochs):
        model.train()
        optimizer.zero_grad()

        y_pred, stochastic_gates_pred = model(X_train)
        loss = custom_loss_function(y_pred, y_train, stochastic_gates_pred, sigma=1, D=X_train.shape[1], lambda_reg=0.05)
        loss.backward()
        optimizer.step()

        train_mse = mse_metric(y_pred, y_train).item()
        train_rmse = rmse_metric(y_pred, y_train).item()

        # Validation phase
        model.eval()
        with torch.no_grad():
            val_y_pred, val_stochastic_gates = model(X_val)
            val_loss = custom_loss_function(val_y_pred, y_val, val_stochastic_gates, sigma=1, D=X_val.shape[1], lambda_reg=0.05)
            val_mse = mse_metric(val_y_pred, y_val).item()
            val_rmse = rmse_metric(val_y_pred, y_val).item()
        #"Gates: {stochastic_gates_pred}\n 
        print(f'Epoch {epoch+1},\nTrain: \n  Loss: {loss.item()}, MSE: {train_mse}, RMSE: {train_rmse}, \nVal:\n   Loss: {val_loss.item()}, Val MSE: {val_mse}, Val RMSE: {val_rmse}')
    return stochastic_gates_pred

# Initialize the hypernetwork, prediction network, loss criterion, and optimizer
model = C_StochasticGates(input_dim=X_train_tensor.shape[1])
optimizer = Adam(list(model.parameters()) , lr=0.001)
# c_stg = C_StochasticGates(input_dim=X_train_tensor.shape[1])

# Training
epochs = 100  # Adjust as necessary based on convergence and performance
gates = train(c_stg,  optimizer, epochs, X_train_tensor, y_train_tensor, X_test_tensor, y_test_tensor)

Epoch 1,
Train: 
  Loss: 23.503328323364258, MSE: 1.5982611179351807, RMSE: 1.264223575592041, 
Val:
   Loss: 6.6467061042785645, Val MSE: 1.256059169769287, Val RMSE: 1.1207404136657715
Epoch 2,
Train: 
  Loss: 23.501220703125, MSE: 1.616357445716858, RMSE: 1.2713605165481567, 
Val:
   Loss: 6.767306804656982, Val MSE: 1.2940722703933716, Val RMSE: 1.137573003768921
Epoch 3,
Train: 
  Loss: 23.533437728881836, MSE: 1.6092185974121094, RMSE: 1.2685497999191284, 
Val:
   Loss: 6.732451438903809, Val MSE: 1.3145971298217773, Val RMSE: 1.1465587615966797
Epoch 4,
Train: 
  Loss: 23.592121124267578, MSE: 1.6794666051864624, RMSE: 1.2959423065185547, 
Val:
   Loss: 6.7568769454956055, Val MSE: 1.3675134181976318, Val RMSE: 1.1694072484970093
Epoch 5,
Train: 
  Loss: 23.553632736206055, MSE: 1.6617352962493896, RMSE: 1.2890831232070923, 
Val:
   Loss: 6.754984378814697, Val MSE: 1.3274564743041992, Val RMSE: 1.1521530151367188
Epoch 6,
Train: 
  Loss: 23.503028869628906, MSE: 1.6658868789672

In [33]:
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,mean_absolute_percentage_error
import numpy as np

## TESTING THE MODEL
y_pred, _ = model(X_test_tensor)
y_pred_array = y_pred.detach().numpy()
spot_results_dict = {
        'R2 Score': round(r2_score(y_test_tensor.detach().numpy(), y_pred_array),3),
        'RMSE': round(mean_squared_error(y_test_tensor.detach().numpy(), y_pred_array, squared=False),3),
        'MAE': round(mean_absolute_error(y_test_tensor.detach().numpy(), y_pred_array),3),
        'MAPE': round(mean_absolute_percentage_error(y_test_tensor.detach().numpy(), y_pred_array),3),
        # 'gates_found': model.get_gates(mode='prob').astype(str),
    }

spot_results_dict = pd.DataFrame(spot_results_dict, index=[0])


In [34]:
spot_results_dict

Unnamed: 0,R2 Score,RMSE,MAE,MAPE
0,-0.405,1.203,0.966,2.783


In [23]:
# import matplotlib.pyplot as plt

# # Define a function to train and evaluate models using STG algorithm
# def train_evaluate_stg(X_train, y_train, X_test, y_test, context_dim):
#     # Instantiate HyperNetwork and PredictionNetwork for STG
#     hypernetwork_stg = HyperNetwork(context_dim, X_train.shape[1] - context_dim)
#     prediction_network_stg = PredictionNetwork(X_train.shape[1] - context_dim)

#     # Train the model
#     stg_losses_train, stg_losses_val = train_model(X_train, y_train, X_test, y_test, 
#                                                     hypernetwork_stg, prediction_network_stg)
#     return stg_losses_train, stg_losses_val

# # Define a function to train and evaluate models using CSTG algorithm
# def train_evaluate_cstg(X_train, y_train, X_test, y_test, context_dim):
#     # Instantiate ConditionalStochasticGates model
#     cstg_model = ConditionalStochasticGates(context_dim, X_train.shape[1] - context_dim)

#     # Train the model
#     cstg_losses_train, cstg_losses_val = train_model(X_train, y_train, X_test, y_test, 
#                                                       cstg_model, prediction_network)
#     return cstg_losses_train, cstg_losses_val

# # Define a function to train the model and return training and validation losses
# def train_model(X_train, y_train, X_test, y_test, hypernetwork, prediction_network):
#     optimizer = optim.Adam(list(hypernetwork.parameters()) + list(prediction_network.parameters()), lr=0.001)
#     loss_fn = nn.MSELoss()

#     num_epochs = 100
#     stg_losses_train, stg_losses_val = [], []

#     for epoch in range(num_epochs):
#         # Training step
#         hypernetwork.train()
#         prediction_network.train()
#         optimizer.zero_grad()

#         # Forward pass
#         gates_train = hypernetwork(X_train[:, :context_dim])
#         selected_features_train = X_train[:, context_dim:] * stochastic_gates(gates_train)
#         predictions_train = prediction_network(selected_features_train)
#         loss_train = loss_fn(predictions_train, y_train)

#         # Backward pass
#         loss_train.backward()
#         optimizer.step()

#         # Validation step
#         with torch.no_grad():
#             hypernetwork.eval()
#             prediction_network.eval()

#             gates_val = hypernetwork(X_test[:, :context_dim])
#             selected_features_val = X_test[:, context_dim:] * stochastic_gates(gates_val)
#             predictions_val = prediction_network(selected_features_val)
#             loss_val = loss_fn(predictions_val, y_test)

#         stg_losses_train.append(loss_train.item())
#         stg_losses_val.append(loss_val.item())

#     return stg_losses_train, stg_losses_val

# # Define the context dimension
# context_dim = 3  # Assuming the first 3 features are used as context

# # Train and evaluate models using STG algorithm
# stg_losses_train, stg_losses_val = train_evaluate_stg(X_train_tensor, y_train_tensor, 
#                                                       X_test_tensor, y_test_tensor, context_dim)

# # Train and evaluate models using CSTG algorithm
# cstg_losses_train, cstg_losses_val = train_evaluate_cstg(X_train_tensor, y_train_tensor, 
#                                                           X_test_tensor, y_test_tensor, context_dim)

# # Plot training and validation losses for STG and CSTG algorithms
# plt.figure(figsize=(10, 6))
# plt.plot(range(1, num_epochs + 1), stg_losses_train, label='STG Train Loss', color='blue')
# plt.plot(range(1, num_epochs + 1), stg_losses_val, label='STG Validation Loss', linestyle='--', color='blue')
# plt.plot(range(1, num_epochs + 1), cstg_losses_train, label='CSTG Train Loss', color='orange')
# plt.plot(range(1, num_epochs + 1), cstg_losses_val, label='CSTG Validation Loss', linestyle='--', color='orange')
# plt.title('Training and Validation Losses')
# plt.xlabel('Epochs')
# plt.ylabel('Loss')
# plt.legend()
# plt.grid(True)
# plt.show()