In [2]:
!pip install transformers torch datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting 

In [4]:
from transformers import GPT2Tokenizer

# Load your lyrics.txt file
with open('lyrics.txt', 'r', encoding='utf-8') as f:
    lyrics = [line.strip() for line in f if line.strip()]

# Initialize tokenizer with special tokens
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({
    'bos_token': '<|startoftext|>',
    'eos_token': '<|endoftext|>',
    'pad_token': '<|pad|>'
})

# Tokenize all lyrics with special tokens
formatted_lyrics = [f"<|startoftext|>{lyric}<|endoftext|>" for lyric in lyrics]
tokenized_lyrics = tokenizer(formatted_lyrics, truncation=True, max_length=512, padding="max_length")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [5]:
import torch

class LyricsDataset(torch.utils.data.Dataset):
    def __init__(self, encodings):
        self.encodings = encodings

    def __getitem__(self, idx):
        item = {
            'input_ids': torch.tensor(self.encodings['input_ids'][idx]),
            'attention_mask': torch.tensor(self.encodings['attention_mask'][idx]),
            'labels': torch.tensor(self.encodings['input_ids'][idx])  # Crucial: labels for LM
        }
        return item

    def __len__(self):
        return len(self.encodings.input_ids)

dataset = LyricsDataset(tokenized_lyrics)

In [6]:
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer

model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))  # Handle new tokens
model.config.pad_token_id = tokenizer.pad_token_id  # Set pad token

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

The new embeddings will be initialized from a multivariate normal distribution that has old embeddings' mean and covariance. As described in this article: https://nlp.stanford.edu/~johnhew/vocab-expansion.html. To disable this, use `mean_resizing=False`


In [7]:
training_args = TrainingArguments(
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=2,
    save_steps=10_000,
    save_total_limit=2,
    report_to="none",  # Disable W&B if not needed
    logging_steps=100,
    #evaluation_strategy="no",
    learning_rate=5e-5,
    weight_decay=0.01,
    fp16=True,  # Enable if using GPU
)

In [8]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
    # Add this to handle data collation properly
    data_collator=lambda data: {
        'input_ids': torch.stack([item['input_ids'] for item in data]),
        'attention_mask': torch.stack([item['attention_mask'] for item in data]),
        'labels': torch.stack([item['labels'] for item in data])
    }
)

trainer.train()


`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,7.3296
200,0.1916
300,0.119
400,0.1124
500,0.1069
600,0.0979
700,0.0975
800,0.1004
900,0.0998
1000,0.101


TrainOutput(global_step=9288, training_loss=0.15945898680518558, metrics={'train_runtime': 1524.2697, 'train_samples_per_second': 12.185, 'train_steps_per_second': 6.093, 'total_flos': 4852976910336000.0, 'train_loss': 0.15945898680518558, 'epoch': 3.0})

In [9]:
model.save_pretrained("./lyrics_gpt2")
tokenizer.save_pretrained("./lyrics_gpt2")

# Generation example
from transformers import pipeline

lyrics_generator = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
    device=0 if torch.cuda.is_available() else -1
)

generated = lyrics_generator(
    "<|startoftext|>",
    max_length=100,
    num_return_sequences=3,
    temperature=0.7,
    top_k=50,
    top_p=0.95,
    repetition_penalty=1.2
)

for i, text in enumerate(generated):
    print(f"Generated Lyrics {i+1}:")
    print(text['generated_text'])
    print("\n---\n")

Device set to use cuda:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Generated Lyrics 1:
<|startoftext|>You can go to the mall and buy some clothes

---

Generated Lyrics 2:
<|startoftext|>I dont know if Im the first or not

---

Generated Lyrics 3:
<|startoftext|>So I dont really need a doctor to treat me

---

