# Embed dataset:

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from datasets import load_dataset
import numpy as np
from tqdm import tqdm

# Check for GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

# Load model directly
tokenizer = AutoTokenizer.from_pretrained("microsoft/Phi-3.5-mini-instruct", trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained("microsoft/Phi-3.5-mini-instruct", trust_remote_code=True)

# Move the model to GPU
model = model.to(device)

# Load the SST-2 dataset from the GLUE benchmark
dataset = load_dataset('glue', 'sst2', streaming=True)

Using device: cuda


tokenizer_config.json:   0%|          | 0.00/3.98k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/306 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/3.45k [00:00<?, ?B/s]

configuration_phi3.py:   0%|          | 0.00/11.2k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- configuration_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_phi3.py:   0%|          | 0.00/73.8k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/microsoft/Phi-3.5-mini-instruct:
- modeling_phi3.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


model.safetensors.index.json:   0%|          | 0.00/16.3k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/195 [00:00<?, ?B/s]

In [2]:

# Define tokenization function (works on individual examples)
def tokenize_function(example):
    return tokenizer(example['sentence'], padding='max_length', truncation=True, max_length=128)

# Streaming dataset: iterate through the dataset manually
streamed_dataset = dataset['train']

# Initialize lists to store embeddings and labels
embeddings_list = []
labels_list = []

# Iterate through the streamed dataset and tokenize + embed each example
for i, example in enumerate(tqdm(streamed_dataset)):
    
    # Tokenize the example
    tokenized_example = tokenize_function(example)
    
    # Convert tokenized inputs to PyTorch tensors
    input_ids = torch.tensor(tokenized_example['input_ids']).unsqueeze(0).to(device)  # Add batch dimension
    attention_mask = torch.tensor(tokenized_example['attention_mask']).unsqueeze(0).to(device)
    
    # Generate embeddings using the model
    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, output_hidden_states=True)
        last_hidden_state = outputs.hidden_states[-1]
        
        # Mean pooling with attention masking
        expanded_attention_mask = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * expanded_attention_mask, dim=1)
        sum_mask = torch.clamp(expanded_attention_mask.sum(dim=1), min=1e-9)
        embeddings = sum_embeddings / sum_mask
    
    # Store embeddings and labels
    embeddings_list.append(embeddings.cpu().numpy())
    labels_list.append(example['label'])

# Convert lists to numpy arrays
embeddings = np.vstack(embeddings_list)
labels = np.array(labels_list)

# Print the shape of the embeddings to confirm successful processing
print(f"Embeddings shape: {embeddings.shape}")
print(f"Labels shape: {labels.shape}")


0it [00:00, ?it/s]You are not running the flash-attention implementation, expect numerical differences.
67349it [56:12, 19.97it/s]


Embeddings shape: (67349, 3072)
Labels shape: (67349,)


# Ensemble training

In [3]:
# Define the MLP architecture
class MLPClassifier(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super(MLPClassifier, self).__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
        self.softmax = nn.Softmax(dim=1)
    
    def forward(self, x):
        x = self.layer1(x)
        x = self.relu(x)
        x = self.layer2(x)
        return self.softmax(x)

# Create an ensemble of MLPs
ensemble_size = 10  # Set ensemble size
input_size = embeddings.shape[1]
hidden_size = 128
output_size = 2  # For binary classification

ensemble = [MLPClassifier(input_size, hidden_size, output_size).to(device) for _ in range(ensemble_size)]

# Convert embeddings and labels to tensors
train_embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)
train_labels_tensor = torch.tensor(labels, dtype=torch.long).to(device)

# Training loop for MLP classifiers
def train_mlp(model, train_embeddings, train_labels, epochs=10, learning_rate=0.001, batch_size=32):
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    dataset_size = train_embeddings.shape[0]
    
    for epoch in range(epochs):
        model.train()

        # Shuffle the data
        indices = torch.randperm(dataset_size)
        train_embeddings = train_embeddings[indices]
        train_labels = train_labels[indices]
        
        for i in range(0, dataset_size, batch_size):
            batch_embeddings = train_embeddings[i:i+batch_size]
            batch_labels = train_labels[i:i+batch_size]
            
            # Forward pass
            outputs = model(batch_embeddings)
            loss = criterion(outputs, batch_labels)
            
            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        
        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Train each MLP in the ensemble
for i, model in enumerate(ensemble):
    print(f'Training MLP {i+1}/{ensemble_size}')
    train_mlp(model, train_embeddings_tensor, train_labels_tensor)

# Function for ensemble prediction with uncertainty quantification
def ensemble_predict_with_uncertainty(ensemble, embeddings, batch_size=32):
    ensemble_predictions = []
    dataset_size = embeddings.shape[0]
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)

    for i in range(0, dataset_size, batch_size):
        batch_embeddings = embeddings_tensor[i:i+batch_size]
        
        # Collect predictions from each model in the ensemble
        batch_predictions = []
        for model in ensemble:
            model.eval()
            with torch.no_grad():
                outputs = model(batch_embeddings)
                batch_predictions.append(outputs.cpu().numpy())

        # Stack all predictions for the current batch and append to the ensemble predictions
        batch_predictions = np.stack(batch_predictions)
        ensemble_predictions.append(batch_predictions)

    # Stack all batch-level predictions across all samples
    ensemble_predictions = np.concatenate(ensemble_predictions, axis=1)

    # Average predictions across ensemble members
    averaged_predictions = np.mean(ensemble_predictions, axis=0)

    # Calculate predictive uncertainty (variance across ensemble predictions)
    predictive_variance = np.var(ensemble_predictions, axis=0)

    # Calculate predictive entropy (measures uncertainty)
    predictive_entropy = -np.sum(averaged_predictions * np.log(averaged_predictions + 1e-9), axis=1)

    # Final class predictions (based on averaged predictions)
    final_predictions = np.argmax(averaged_predictions, axis=1)

    return final_predictions, predictive_variance, predictive_entropy

# Example: Get ensemble predictions on the training set
ensemble_predictions, predictive_variance, predictive_entropy = ensemble_predict_with_uncertainty(ensemble, embeddings)

# Print predictive uncertainty for the first few samples
print(f'Predictive variance for first few samples: {predictive_variance[:5]}')
print(f'Predictive entropy for first few samples: {predictive_entropy[:5]}')


Training MLP 1/10
Epoch [1/10], Loss: 0.5990
Epoch [2/10], Loss: 0.7895
Epoch [3/10], Loss: 0.7895
Epoch [4/10], Loss: 0.7418
Epoch [5/10], Loss: 0.7895
Epoch [6/10], Loss: 0.8371
Epoch [7/10], Loss: 0.6466
Epoch [8/10], Loss: 0.7418
Epoch [9/10], Loss: 0.7418
Epoch [10/10], Loss: 0.6942
Training MLP 2/10
Epoch [1/10], Loss: 0.6466
Epoch [2/10], Loss: 0.6942
Epoch [3/10], Loss: 0.6466
Epoch [4/10], Loss: 0.7895
Epoch [5/10], Loss: 0.5990
Epoch [6/10], Loss: 0.6466
Epoch [7/10], Loss: 0.7418
Epoch [8/10], Loss: 0.6942
Epoch [9/10], Loss: 0.5990
Epoch [10/10], Loss: 0.7895
Training MLP 3/10
Epoch [1/10], Loss: 0.6942
Epoch [2/10], Loss: 0.8371
Epoch [3/10], Loss: 0.7418
Epoch [4/10], Loss: 0.9799
Epoch [5/10], Loss: 0.7418
Epoch [6/10], Loss: 0.7895
Epoch [7/10], Loss: 0.8847
Epoch [8/10], Loss: 0.7418
Epoch [9/10], Loss: 0.9323
Epoch [10/10], Loss: 0.7418
Training MLP 4/10
Epoch [1/10], Loss: 0.8847
Epoch [2/10], Loss: 0.8371
Epoch [3/10], Loss: 0.9323
Epoch [4/10], Loss: 0.7418
Epoch [

In [4]:
# Function for ensemble prediction with uncertainty quantification and accuracy calculation
def ensemble_predict_with_uncertainty(ensemble, embeddings, true_labels, batch_size=32):
    ensemble_predictions = []
    dataset_size = embeddings.shape[0]
    embeddings_tensor = torch.tensor(embeddings, dtype=torch.float32).to(device)

    for i in range(0, dataset_size, batch_size):
        batch_embeddings = embeddings_tensor[i:i+batch_size]
        
        # Collect predictions from each model in the ensemble
        batch_predictions = []
        for model in ensemble:
            model.eval()
            with torch.no_grad():
                outputs = model(batch_embeddings)
                batch_predictions.append(outputs.cpu().numpy())

        # Stack all predictions for the current batch and append to the ensemble predictions
        batch_predictions = np.stack(batch_predictions)
        ensemble_predictions.append(batch_predictions)

    # Stack all batch-level predictions across all samples
    ensemble_predictions = np.concatenate(ensemble_predictions, axis=1)

    # Average predictions across ensemble members
    averaged_predictions = np.mean(ensemble_predictions, axis=0)

    # Calculate predictive uncertainty (variance across ensemble predictions)
    predictive_variance = np.var(ensemble_predictions, axis=0)

    # Calculate predictive entropy (measures uncertainty)
    predictive_entropy = -np.sum(averaged_predictions * np.log(averaged_predictions + 1e-9), axis=1)

    # Final class predictions (based on averaged predictions)
    final_predictions = np.argmax(averaged_predictions, axis=1)

    # Calculate accuracy
    accuracy = np.mean(final_predictions == true_labels)
    
    return final_predictions, predictive_variance, predictive_entropy, accuracy

# Example: Get ensemble predictions on the training set and calculate accuracy
ensemble_predictions, predictive_variance, predictive_entropy, accuracy = ensemble_predict_with_uncertainty(ensemble, embeddings, labels)

# Print predictive uncertainty for the first few samples
print(f'Predictive variance for first few samples: {predictive_variance[:5]}')
print(f'Predictive entropy for first few samples: {predictive_entropy[:5]}')

# Print the accuracy
print(f'Ensemble Accuracy: {accuracy * 100:.2f}%')


Predictive variance for first few samples: [[0.18087845 0.18087845]
 [0.20999369 0.20999369]
 [0.08999999 0.08999999]
 [0.08998967 0.08998967]
 [0.20995176 0.20995176]]
Predictive entropy for first few samples: [0.5895269  0.61086047 0.325083   0.32519665 0.6108351 ]
Ensemble Accuracy: 55.78%
