# Embed dataset:

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_directml
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
from datasets import load_dataset
import numpy as np

# Pick the compute device. DirectML is used here; the CUDA/CPU variant is kept for reference.
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device = torch_directml.device()
print(f'Using device: {device}')

# Download (or load from cache) the pretrained tokenizer
tokenizer = AutoTokenizer.from_pretrained("HuggingFaceTB/SmolLM-1.7B-Instruct")

# Load the causal language model and place it on the selected device in one step
model = AutoModelForCausalLM.from_pretrained("HuggingFaceTB/SmolLM-1.7B-Instruct").to(device)

# SST-2 configuration of the GLUE benchmark
dataset = load_dataset('glue', 'sst2')

Using device: privateuseone:0


tokenizer_config.json:   0%|          | 0.00/3.59k [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


vocab.json:   0%|          | 0.00/801k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.10M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/655 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/738 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.42G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

In [None]:
# Number of examples used for training and validation (adjust as needed)
N_train, N_val = 70, 30
N_total = N_train + N_val

# Keep only the first N_total examples of the training split
total_dataset = dataset['train'].select(range(N_total))

# Tokenization function
def tokenize_function(example):
    """Tokenize the 'sentence' field of `example`.

    Every sequence is padded to, and truncated at, 128 tokens so that all
    rows have the same length.
    """
    tokenizer_options = dict(padding='max_length', truncation=True, max_length=128)
    return tokenizer(example['sentence'], **tokenizer_options)

# Tokenize the subset
tokenized_total_dataset = total_dataset.map(tokenize_function, batched=True)

# Embedding function with explicit data types
def embed_sentences(batch):
    """Compute one mean-pooled embedding per example of a tokenized batch.

    Args:
        batch: mapping with 'input_ids' and 'attention_mask' entries.

    Returns:
        dict with a single key 'embeddings' holding a NumPy array with one
        pooled vector per example.
    """
    # Explicit dtypes: integer token ids, boolean attention mask
    ids = torch.tensor(batch['input_ids'], dtype=torch.long).to(device)
    mask = torch.tensor(batch['attention_mask'], dtype=torch.bool).to(device)

    with torch.no_grad():
        model_output = model(input_ids=ids, attention_mask=mask, output_hidden_states=True)
        final_layer = model_output.hidden_states[-1]

        # Masked mean over the sequence axis: positions with mask == 0 contribute nothing
        weights = mask.unsqueeze(-1).expand_as(final_layer).float()
        token_totals = (final_layer * weights).sum(dim=1)
        # Lower bound avoids a division by zero for a row whose mask is all zeros
        token_counts = weights.sum(dim=1).clamp(min=1e-9)
        pooled = token_totals / token_counts

    return {'embeddings': pooled.cpu().numpy()}


# Embed the tokenized subset.
# batch_size is set explicitly: with batched=True, Dataset.map defaults to 1000 examples
# per batch, which would push the whole subset through the language model in a single
# forward pass (with every layer's hidden states requested by embed_sentences), so
# memory use would grow with N_total. Small batches keep it bounded.
embedded_total_dataset = tokenized_total_dataset.map(embed_sentences, batched=True, batch_size=16)

# Pull embeddings and labels out of the dataset as NumPy arrays
embeddings = np.array(embedded_total_dataset['embeddings'])
labels = np.array(embedded_total_dataset['label'])

# The first N_train rows are the training set; the next N_val rows are the validation set
train_embeddings, validation_embeddings = embeddings[:N_train], embeddings[N_train:N_train + N_val]
train_labels, validation_labels = labels[:N_train], labels[N_train:N_train + N_val]

# Tensors on the compute device: float32 features and int64 class labels
train_embeddings_tensor = torch.tensor(train_embeddings, dtype=torch.float32).to(device)
validation_embeddings_tensor = torch.tensor(validation_embeddings, dtype=torch.float32).to(device)

train_labels_tensor = torch.tensor(train_labels, dtype=torch.long).to(device)
validation_labels_tensor = torch.tensor(validation_labels, dtype=torch.long).to(device)



Map:   0%|          | 0/100 [00:00<?, ? examples/s]

# Ensemble training

In [None]:
# Define the MLP architecture
class MLPClassifier(nn.Module):
    """Two-layer perceptron: Linear -> ReLU -> Linear -> Softmax.

    The forward pass returns class probabilities (each row sums to 1),
    not raw logits.

    Args:
        input_size: number of input features.
        hidden_size: width of the hidden layer.
        output_size: number of classes.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        self.relu = nn.ReLU()
        self.layer2 = nn.Linear(hidden_size, output_size)
        # Normalise across the class dimension of a (batch, classes) output
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        """Map a 2-D batch of feature rows to per-class probabilities."""
        hidden = self.relu(self.layer1(x))
        return self.softmax(self.layer2(hidden))

# Create an ensemble of MLPs
ensemble_size = 5
input_size = train_embeddings.shape[1]  # embedding dimensionality
hidden_size = 128
output_size = 2  # For binary classification

# Seed the generator before building the members so their random initial weights,
# and therefore the training curves and the final accuracy, are the same on every
# Restart & Run All. The members still differ from one another because each
# construction draws fresh values from the same generator.
torch.manual_seed(42)
ensemble = [MLPClassifier(input_size, hidden_size, output_size).to(device) for _ in range(ensemble_size)]

# Training loop
def train_mlp(model, train_embeddings, train_labels, epochs=10, learning_rate=0.001):
    """Train one classifier with full-batch Adam, printing the loss after every epoch.

    The model is expected to return class probabilities (softmax output), as
    MLPClassifier does. The loss is therefore the negative log-likelihood of
    those probabilities. nn.CrossEntropyLoss is not used because it applies
    log-softmax to its input itself; feeding it probabilities would apply
    softmax twice and flatten the gradients.

    Args:
        model: nn.Module mapping a batch of embeddings to per-class probabilities.
        train_embeddings: float tensor with one row of features per example.
        train_labels: long tensor of class indices, one per example.
        epochs: number of full-batch optimisation steps.
        learning_rate: Adam learning rate.
    """
    criterion = nn.NLLLoss()  # consumes log-probabilities
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)

    for epoch in range(epochs):
        model.train()
        probabilities = model(train_embeddings)
        # Clamp before the log so a probability that underflowed to exactly 0
        # cannot produce -inf and poison the gradients.
        log_probabilities = torch.log(probabilities.clamp_min(1e-12))
        loss = criterion(log_probabilities, train_labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

# Train each MLP in the ensemble.
# The loop variable is deliberately not named `model`: at notebook scope that name
# holds the pretrained language model that embed_sentences calls, and rebinding it
# to an MLP here would break any later re-run of the embedding cell.
for i, mlp in enumerate(ensemble):
    print(f'Training MLP {i+1}/{ensemble_size}')
    train_mlp(mlp, train_embeddings_tensor, train_labels_tensor)

# Function for ensemble prediction
def ensemble_predict(ensemble, embeddings):
    """Predict class indices by averaging the outputs of every ensemble member.

    Args:
        ensemble: iterable of trained modules, each mapping a batch of
            embeddings to per-class scores.
        embeddings: array-like of input features, one row per example.

    Returns:
        NumPy array holding, for each example, the index of the class with
        the highest averaged score.
    """
    inputs = torch.tensor(embeddings, dtype=torch.float32).to(device)

    member_outputs = []
    with torch.no_grad():
        for member in ensemble:
            member.eval()
            scores = member(inputs)
            member_outputs.append(scores.cpu().numpy())

    # Average across members (axis 0), then take the best class per example (axis 1)
    mean_scores = np.mean(member_outputs, axis=0)
    return np.argmax(mean_scores, axis=1)

# Predicted labels for the validation set from the averaged ensemble
ensemble_predictions = ensemble_predict(ensemble, validation_embeddings)

# Accuracy = fraction of validation examples whose prediction matches the true label
accuracy = (ensemble_predictions == validation_labels).mean()
print(f'Ensemble Accuracy: {accuracy * 100:.2f}%')