# Embed dataset:

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from datasets import load_dataset
import numpy as np
from tqdm import tqdm

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B-Instruct")
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B-Instruct")

# Move the model to GPU
model = model.to(device)

# Load the SST-2 dataset from the GLUE benchmark
dataset = load_dataset('glue', 'sst2', streaming=True)

Using device: cuda


In [3]:

# Define tokenization function (works on individual examples)
def tokenize_function(example):
    return tokenizer(example['sentence'], padding='max_length', truncation=True, max_length=128)

# Streaming dataset: iterate through the dataset manually
streamed_dataset = dataset['train']

# Initialize lists to store embeddings and labels
embeddings_list = []
labels_list = []

# Set a total number of examples to process if you want, or leave it for streaming
total_examples = None  # Set a limit for demonstration purposes (or None for the whole dataset)

# Iterate through the streamed dataset and tokenize + embed each example
for i, example in enumerate(tqdm(streamed_dataset)):
    
    # Tokenize the example
    tokenized_example = tokenize_function(example)
    
    # Convert tokenized inputs to PyTorch tensors
    input_ids = torch.tensor(tokenized_example['input_ids']).unsqueeze(0).to(device)  # Add batch dimension
    attention_mask = torch.tensor(tokenized_example['attention_mask']).unsqueeze(0).to(device)
    
    # Generate embeddings using the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]
        
        # Mean pooling with attention masking
        expanded_attention_mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * expanded_attention_mask, dim=1)
        sum_mask = torch.clamp(expanded_attention_mask.sum(dim=1), min=1e-9)
        embeddings = sum_embeddings / sum_mask
    
    # Store embeddings and labels
    embeddings_list.append(embeddings.cpu().numpy())
    labels_list.append(example['label'])

# Convert lists to numpy arrays
embeddings = np.vstack(embeddings_list)
labels = np.array(labels_list)

# Print the shape of the embeddings to confirm successful processing
print(f"Embeddings shape: {embeddings.shape}")
print(f"Labels shape: {labels.shape}")


  attn_output = torch.nn.functional.scaled_dot_product_attention(
Starting from v4.46, the `logits` model output will have the same type as the model (except at train time, where it will always be FP32)
67349it [24:19, 46.15it/s]


Embeddings shape: (67349, 2048)
Labels shape: (67349,)


# Ensemble training

In [4]:
# Define the MLP architecture
class MLPClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLPClassifier, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return self.softmax(x)

# Create an ensemble of MLPs
ensemble_size = 10  # Set ensemble size
input_size = embeddings.shape[1]
hidden_size = 128
output_size = 2  # For binary classification

ensemble = [MLPClassifier(input_size, hidden_size, output_size).to(device) for _ in range(ensemble_size)]

# Convert embeddings and labels to tensors
train_embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)
train_labels_tensor = torch.tensor(labels, dtype=torch.long).to(device)

# Training loop for MLP classifiers
def train_mlp(model, train_embeddings, train_labels, epochs=10, learning_rate=0.001, batch_size=32):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    dataset_size = train_embeddings.shape[0]
    
    for epoch in range(epochs):
        model.train()

        # Shuffle the data
        indices = torch.randperm(dataset_size)
        train_embeddings = train_embeddings[indices]
        train_labels = train_labels[indices]
        
        for i in range(0, dataset_size, batch_size):
            batch_embeddings = train_embeddings[i:i+batch_size]
            batch_labels = train_labels[i:i+batch_size]
            
            # Forward pass
            outputs = model(batch_embeddings)
            loss = criterion(outputs, batch_labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Train each MLP in the ensemble
for i, model in enumerate(ensemble):
    print(f'Training MLP {i+1}/{ensemble_size}')
    train_mlp(model, train_embeddings_tensor, train_labels_tensor)

# Function for ensemble prediction with uncertainty quantification
def ensemble_predict_with_uncertainty(ensemble, embeddings, batch_size=32):
    ensemble_predictions = []
    dataset_size = embeddings.shape[0]
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)

    for i in range(0, dataset_size, batch_size):
        batch_embeddings = embeddings_tensor[i:i+batch_size]
        
        # Collect predictions from each model in the ensemble
        batch_predictions = []
        for model in ensemble:
            model.eval()
            with torch.no_grad():
                outputs = model(batch_embeddings)
                batch_predictions.append(outputs.cpu().numpy())

        # Stack all predictions for the current batch and append to the ensemble predictions
        batch_predictions = np.stack(batch_predictions)
        ensemble_predictions.append(batch_predictions)

    # Stack all batch-level predictions across all samples
    ensemble_predictions = np.concatenate(ensemble_predictions, axis=1)

    # Average predictions across ensemble members
    averaged_predictions = np.mean(ensemble_predictions, axis=0)

    # Calculate predictive uncertainty (variance across ensemble predictions)
    predictive_variance = np.var(ensemble_predictions, axis=0)

    # Calculate predictive entropy (measures uncertainty)
    predictive_entropy = -np.sum(averaged_predictions * np.log(averaged_predictions + 1e-9), axis=1)

    # Final class predictions (based on averaged predictions)
    final_predictions = np.argmax(averaged_predictions, axis=1)

    return final_predictions, predictive_variance, predictive_entropy

# Example: Get ensemble predictions on the training set
ensemble_predictions, predictive_variance, predictive_entropy = ensemble_predict_with_uncertainty(ensemble, embeddings)

# Print predictive uncertainty for the first few samples
print(f'Predictive variance for first few samples: {predictive_variance[:5]}')
print(f'Predictive entropy for first few samples: {predictive_entropy[:5]}')


Training MLP 1/10
Epoch [1/10], Loss: 0.3690
Epoch [2/10], Loss: 0.3778
Epoch [3/10], Loss: 0.4081
Epoch [4/10], Loss: 0.4425
Epoch [5/10], Loss: 0.4217
Epoch [6/10], Loss: 0.4552
Epoch [7/10], Loss: 0.3827
Epoch [8/10], Loss: 0.4605
Epoch [9/10], Loss: 0.4865
Epoch [10/10], Loss: 0.3618
Training MLP 2/10
Epoch [1/10], Loss: 0.4621
Epoch [2/10], Loss: 0.4431
Epoch [3/10], Loss: 0.4952
Epoch [4/10], Loss: 0.4933
Epoch [5/10], Loss: 0.3539
Epoch [6/10], Loss: 0.3791
Epoch [7/10], Loss: 0.3964
Epoch [8/10], Loss: 0.5521
Epoch [9/10], Loss: 0.4234
Epoch [10/10], Loss: 0.3324
Training MLP 3/10
Epoch [1/10], Loss: 0.4155
Epoch [2/10], Loss: 0.5022
Epoch [3/10], Loss: 0.3344
Epoch [4/10], Loss: 0.3875
Epoch [5/10], Loss: 0.4528
Epoch [6/10], Loss: 0.4954
Epoch [7/10], Loss: 0.3693
Epoch [8/10], Loss: 0.3672
Epoch [9/10], Loss: 0.4187
Epoch [10/10], Loss: 0.4182
Training MLP 4/10
Epoch [1/10], Loss: 0.3852
Epoch [2/10], Loss: 0.4012
Epoch [3/10], Loss: 0.5005
Epoch [4/10], Loss: 0.4055
Epoch [

In [5]:
# Function for ensemble prediction with uncertainty quantification and accuracy calculation
def ensemble_predict_with_uncertainty(ensemble, embeddings, true_labels, batch_size=32):
    ensemble_predictions = []
    dataset_size = embeddings.shape[0]
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)

    for i in range(0, dataset_size, batch_size):
        batch_embeddings = embeddings_tensor[i:i+batch_size]
        
        # Collect predictions from each model in the ensemble
        batch_predictions = []
        for model in ensemble:
            model.eval()
            with torch.no_grad():
                outputs = model(batch_embeddings)
                batch_predictions.append(outputs.cpu().numpy())

        # Stack all predictions for the current batch and append to the ensemble predictions
        batch_predictions = np.stack(batch_predictions)
        ensemble_predictions.append(batch_predictions)

    # Stack all batch-level predictions across all samples
    ensemble_predictions = np.concatenate(ensemble_predictions, axis=1)

    # Average predictions across ensemble members
    averaged_predictions = np.mean(ensemble_predictions, axis=0)

    # Calculate predictive uncertainty (variance across ensemble predictions)
    predictive_variance = np.var(ensemble_predictions, axis=0)

    # Calculate predictive entropy (measures uncertainty)
    predictive_entropy = -np.sum(averaged_predictions * np.log(averaged_predictions + 1e-9), axis=1)

    # Final class predictions (based on averaged predictions)
    final_predictions = np.argmax(averaged_predictions, axis=1)

    # Calculate accuracy
    accuracy = np.mean(final_predictions == true_labels)
    
    return final_predictions, predictive_variance, predictive_entropy, accuracy

# Example: Get ensemble predictions on the training set and calculate accuracy
ensemble_predictions, predictive_variance, predictive_entropy, accuracy = ensemble_predict_with_uncertainty(ensemble, embeddings, labels)

# Print predictive uncertainty for the first few samples
print(f'Predictive variance for first few samples: {predictive_variance[:5]}')
print(f'Predictive entropy for first few samples: {predictive_entropy[:5]}')

# Print the accuracy
print(f'Ensemble Accuracy: {accuracy * 100:.2f}%')


Predictive variance for first few samples: [[6.2016567e-04 6.2016555e-04]
 [2.1361142e-05 2.1361266e-05]
 [2.8604026e-30 0.0000000e+00]
 [4.9819651e-16 1.4210855e-15]
 [7.9947205e-07 7.9947131e-07]]
Predictive entropy for first few samples: [7.4693501e-02 1.2246697e-02 1.1683351e-14 2.3039902e-07 4.8157596e-03]
Ensemble Accuracy: 93.62%
