In [2]:
import torch
import json
from torch.utils.data import Dataset, DataLoader
from transformers import AutoModelForCausalLM, AutoTokenizer

In [28]:
class PromptDataset(Dataset):
    def __init__(self, file_path):
        self.data = self.load_data(file_path)
        self.tokenizer = AutoTokenizer.from_pretrained('gpt2')

        # Add a new padding token if it doesn't exist
        if '[PAD]' not in self.tokenizer.get_vocab():
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        prompt = self.data[idx]['prompt']

        encoded_prompt = self.tokenizer.encode_plus(
            prompt,
            truncation=True,
            padding='max_length',
            max_length=128,
            return_tensors='pt'
        )

        return {
            'input_ids': encoded_prompt['input_ids'].squeeze(),
            'attention_mask': encoded_prompt['attention_mask'].squeeze(),
        }

    def load_data(self, file_path):
        with open(file_path, 'r') as file:
            data = json.load(file)
        return data

In [34]:
# Define your dataset and model
dataset = PromptDataset('../data/autocomplete.json')
model = AutoModelForCausalLM.from_pretrained('../models/autocompletion')

# Define your training parameters
batch_size = 32
num_epochs = 20
learning_rate = 2e-5

# Create a DataLoader for batching and shuffling the data
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Set device (GPU if available, else CPU)
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')

# Move the model to the device
model = model.to(device)

# Set the optimizer and loss function
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
criterion = torch.nn.CrossEntropyLoss()

# Training loop
# Training loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids, attention_mask=attention_mask, labels=input_ids)
        loss = outputs.loss
        logits = outputs.logits

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    average_loss = total_loss / len(dataloader)
    print(f"Epoch {epoch+1}/{num_epochs} - Average Loss: {average_loss:.4f}")


# Save the trained model
model.save_pretrained('../models/autocompletion')

Epoch 1/20 - Average Loss: 1.4178
Epoch 2/20 - Average Loss: 1.3504
Epoch 3/20 - Average Loss: 1.2795
Epoch 4/20 - Average Loss: 1.2374
Epoch 5/20 - Average Loss: 1.1359
Epoch 6/20 - Average Loss: 1.0853
Epoch 7/20 - Average Loss: 1.0490
Epoch 8/20 - Average Loss: 0.9810
Epoch 9/20 - Average Loss: 0.9713
Epoch 10/20 - Average Loss: 0.9317
Epoch 11/20 - Average Loss: 0.8866
Epoch 12/20 - Average Loss: 0.8361
Epoch 13/20 - Average Loss: 0.8249
Epoch 14/20 - Average Loss: 0.7961
Epoch 15/20 - Average Loss: 0.7500
Epoch 16/20 - Average Loss: 0.7022
Epoch 17/20 - Average Loss: 0.6953
Epoch 18/20 - Average Loss: 0.6886
Epoch 19/20 - Average Loss: 0.6200
Epoch 20/20 - Average Loss: 0.6357


In [36]:
model = AutoModelForCausalLM.from_pretrained('../models/autocompletion')
tokenizer = AutoTokenizer.from_pretrained('gpt2')

prompt = "Dear Hiring Manager, I am writing to apply for"

# Tokenize the input text
encoded_input = tokenizer.encode_plus(
    prompt,
    add_special_tokens=True,
    truncation=True,
    max_length=128,
    return_tensors='pt'
)


# Perform inference
input_ids = encoded_input['input_ids']
outputs = model(input_ids)
generated_ids = model.generate(input_ids, max_length=200, num_return_sequences=1, num_beams=5)

for generated_id in generated_ids:
    completion = tokenizer.decode(generated_id, skip_special_tokens=False)
    print("Completion:", completion)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Completion: Dear Hiring Manager, I am writing to apply for the software engineer position at your company. With a strong background in software development and a passion for creating innovative solutions, I believe I can contribute to your company's success. In my previous role, I developed and implemented complex software solutions that resulted in significant revenue growth. I am confident in my ability to contribute my skills and expertise to your company's success. Thank you for considering my application. Sincerely, [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name] [Your Name]
