!pip install transformers torch pandas numpy

In [2]:
print(1)

1


In [1]:
!pip install transformers torch pandas numpy



In [2]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import GPT2Tokenizer, GPT2LMHeadModel, AdamW, get_linear_schedule_with_warmup
import pandas as pd
import numpy as np



In [4]:
class TextDataset(Dataset):
    def __init__(self, tokenizer, file_path, block_size=512):
        # Read and load the dataset
        self.examples = []
        with open(file_path, encoding='utf-8') as f:
            text = f.read()
        
        # Tokenize the text and create blocks of data
        tokenized_text = tokenizer.convert_tokens_to_ids(tokenizer.tokenize(text))
        for i in range(0, len(tokenized_text) - block_size + 1, block_size):
            self.examples.append(tokenizer.build_inputs_with_special_tokens(tokenized_text[i:i + block_size]))

    def __len__(self):
        return len(self.examples)

    def __getitem__(self, i):
        return torch.tensor(self.examples[i], dtype=torch.long)


In [6]:
# Initialize the tokenizer and model
model_name = 'gpt2'
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

In [13]:
# Load the dataset and create DataLoader
dataset = TextDataset(tokenizer, "app/lyrics-data.csv")
data_loader = DataLoader(dataset, batch_size=8, shuffle=True)

In [15]:
# Set the device to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define training parameters
epochs = 3
learning_rate = 5e-5
optimizer = AdamW(model.parameters(), lr=learning_rate)
total_steps = len(data_loader) * epochs


In [16]:
# Create a scheduler to adjust learning rate during training
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=100, num_training_steps=total_steps)

# Start training the model
model.train()
for epoch in range(epochs):
    total_loss = 0
    for step, batch in enumerate(data_loader):
        # Move batch to the device
        batch = batch.to(device)
        
        # Reset gradients
        model.zero_grad()
        
        # Forward pass
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        total_loss += loss.item()
        
        # Backward pass to calculate gradients
        loss.backward()
        
        # Update weights and learning rate
        optimizer.step()
        scheduler.step()
        
        if step % 100 == 0:
            print(f"Epoch: {epoch + 1}, Step: {step}, Loss: {loss.item()}")
    
    print(f"Epoch {epoch + 1} completed. Total Loss: {total_loss / len(data_loader)}")


Epoch: 1, Step: 0, Loss: 2.871904134750366
Epoch 1 completed. Total Loss: 2.9050551551883506
Epoch: 2, Step: 0, Loss: 2.8329460620880127
Epoch 2 completed. Total Loss: 2.655882091845496
Epoch: 3, Step: 0, Loss: 2.204702138900757
Epoch 3 completed. Total Loss: 2.5295837087146307


In [17]:
# Save the fine-tuned model
model.save_pretrained("fine_tuned_gpt2")
tokenizer.save_pretrained("fine_tuned_gpt2")

# Text generation with the fine-tuned model
def generate_text(prompt, max_length=50):
    model.eval()
    generated = tokenizer.encode(prompt, return_tensors='pt').to(device)
    with torch.no_grad():
        output = model.generate(generated, max_length=max_length, num_return_sequences=1, pad_token_id=tokenizer.eos_token_id)
    return tokenizer.decode(output[0], skip_special_tokens=True)

# Example: Generate text using a prompt
prompt = "Once upon a time"
generated_text = generate_text(prompt)
print("Generated Text:\n", generated_text)

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text:
 Once upon a time, I was a child
And I was a child
And I was a child
And I was a child
And I was a child
And I was a child
And I was a child
And I was a
