In [1]:
import torch
import json
from torch.utils.data import DataLoader, Dataset
from transformers import BertForSequenceClassification, BertTokenizer

In [5]:
# Define your dataset class
class PromptDataset(Dataset):
    """Dataset of labelled prompts tokenized for BERT sequence classification.

    The backing file is a JSON document that is indexed by integer position;
    each record is read through its ``'prompt'`` and ``'label'`` keys. Labels
    are strings and are mapped to class indices via ``LABEL_TO_ID``.

    Args:
        file_path: Path to the JSON file holding the records.
        tokenizer: Optional tokenizer exposing ``encode_plus``. When omitted,
            the ``'bert-base-uncased'`` ``BertTokenizer`` is loaded.
        max_length: Length every prompt is padded/truncated to.
    """

    # Class index assigned to each supported label string.
    LABEL_TO_ID = {
        "generation": 0,
        "completion": 1,
        "question-answer": 2,
    }

    def __init__(self, file_path, tokenizer=None, max_length=128):
        self.data = self.load_data(file_path)
        if tokenizer is None:
            tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of records loaded from the file."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the tokenized prompt and encoded label for record ``idx``.

        Returns:
            dict with ``'input_ids'`` and ``'attention_mask'`` (the tokenizer
            output with its leading batch axis removed) and ``'label'`` (a
            scalar tensor holding the class index).

        Raises:
            ValueError: If the record's label is not a key of ``LABEL_TO_ID``.
        """
        record = self.data[idx]
        prompt = record['prompt']
        label = record['label']

        # Fail loudly on an unknown label instead of hitting an unbound local.
        if label not in self.LABEL_TO_ID:
            raise ValueError(
                f"Unknown label {label!r} at index {idx}; "
                f"expected one of {sorted(self.LABEL_TO_ID)}"
            )
        label_encoded = self.LABEL_TO_ID[label]

        # Tokenize the prompt
        encoded_prompt = self.tokenizer.encode_plus(
            prompt,
            add_special_tokens=True,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # squeeze(0) drops only the batch axis added by return_tensors='pt';
        # a bare squeeze() would also drop the sequence axis if its size were 1.
        return {
            'input_ids': encoded_prompt['input_ids'].squeeze(0),
            'attention_mask': encoded_prompt['attention_mask'].squeeze(0),
            'label': torch.tensor(label_encoded)
        }

    def load_data(self, file_path):
        """Parse and return the JSON content of ``file_path``."""
        # Explicit encoding so parsing does not depend on the platform default.
        with open(file_path, 'r', encoding='utf-8') as file:
            data = json.load(file)
        return data

In [6]:
# Build the dataset from the prompts JSON file; the relative path is resolved
# against the notebook's current working directory.
dataset = PromptDataset('../data/prompts.json')

In [7]:
# Fine-tune bert-base-uncased as a 3-class prompt classifier and save it.

# Define the model architecture
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=3)

# Define your training parameters
batch_size = 32
num_epochs = 20
learning_rate = 2e-5

# Create a DataLoader for batching and shuffling the data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Set device: prefer CUDA, then Apple MPS, else CPU.
# The mps backend attribute is looked up defensively because older torch
# builds do not expose torch.backends.mps at all.
_mps_backend = getattr(torch.backends, 'mps', None)
if torch.cuda.is_available():
    device = torch.device('cuda')
elif _mps_backend is not None and _mps_backend.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Move the model to the device
model = model.to(device)

# Set the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
# NOTE: the loop below does not call `criterion`; the model returns its own
# loss when `labels` is passed to the forward call.
criterion = torch.nn.CrossEntropyLoss()

# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass (loss is computed inside the model from `labels`)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # Mean of the per-batch losses for this epoch
    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")

# Save the trained model
model.save_pretrained('../models/prompt_classifier')

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

Epoch 1/20 - Average Loss: 1.1280
Epoch 2/20 - Average Loss: 0.8706
Epoch 3/20 - Average Loss: 0.8441
Epoch 4/20 - Average Loss: 0.7217
Epoch 5/20 - Average Loss: 0.6683
Epoch 6/20 - Average Loss: 0.5854
Epoch 7/20 - Average Loss: 0.4398
Epoch 8/20 - Average Loss: 0.4632
Epoch 9/20 - Average Loss: 0.3556
Epoch 10/20 - Average Loss: 0.2946
Epoch 11/20 - Average Loss: 0.3020
Epoch 12/20 - Average Loss: 0.2267
Epoch 13/20 - Average Loss: 0.2160
Epoch 14/20 - Average Loss: 0.1726
Epoch 15/20 - Average Loss: 0.1532
Epoch 16/20 - Average Loss: 0.1358
Epoch 17/20 - Average Loss: 0.1173
Epoch 18/20 - Average Loss: 0.1129
Epoch 19/20 - Average Loss: 0.0949
Epoch 20/20 - Average Loss: 0.0936


In [8]:
##Testing
# Load the saved model
model = BertForSequenceClassification.from_pretrained('../models/prompt_classifier/')
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Make inference mode explicit so dropout is disabled regardless of how the
# model object was obtained.
model.eval()

# Example input
input_text = "Dear Darren, I am writing to express my interest in the software engineer position at Amazon. In my previous role as a software developer, I successfully completed several projects that required me to collaborate with cross-functional teams. I am confident in my ability to contribute to your organization's growth and deliver high-quality code. What type of projects did you work on as a software developer that involved collaboration with cross-functional teams?"

# Tokenize the input text (same settings as used for training)
encoded_input = tokenizer.encode_plus(
    input_text,
    add_special_tokens=True,
    truncation=True,
    padding='max_length',
    max_length=128,
    return_tensors='pt'
)

# Perform inference without building an autograd graph
input_ids = encoded_input['input_ids']
attention_mask = encoded_input['attention_mask']
with torch.no_grad():
    outputs = model(input_ids, attention_mask=attention_mask)

# Get the predicted label: argmax over the class axis, not the flattened tensor
predicted_label = outputs.logits.argmax(dim=-1).item()
print('Predicted Label:', predicted_label)

Predicted Label: 2
