# Fine-Tuning kkirchheim/german-gpt2-medium on Battle Rap Data
This notebook demonstrates how to fine-tune the [kkirchheim/german-gpt2-medium](https://huggingface.co/kkirchheim/german-gpt2-medium) model using your `battles.csv` file containing German battle rap lyrics.

In [None]:
# Install required libraries (uncomment if needed)
# !pip install transformers datasets accelerate

In [3]:
!pip install transformers datasets accelerate

Collecting transformers
  Downloading transformers-4.54.0-py3-none-any.whl.metadata (41 kB)
Collecting datasets
  Downloading datasets-4.0.0-py3-none-any.whl.metadata (19 kB)
Collecting accelerate
  Downloading accelerate-1.9.0-py3-none-any.whl.metadata (19 kB)
Collecting huggingface-hub<1.0,>=0.34.0 (from transformers)
  Downloading huggingface_hub-0.34.1-py3-none-any.whl.metadata (14 kB)
Collecting regex!=2019.12.17 (from transformers)
  Downloading regex-2024.11.6-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)
Collecting tokenizers<0.22,>=0.21 (from transformers)
  Downloading tokenizers-0.21.2-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.8 kB)
Collecting safetensors>=0.4.3 (from transformers)
  Downloading safetensors-0.5.3-cp38-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)
Collecting hf-xet<2.0.0,>=1.1.3 (from huggingface-hub<1.0,>=0.34.0->transformers)
  Downloading hf_xet-1.1.5-cp37-abi3-manylinux_2_17_

In [4]:
import pandas as pd
from datasets import Dataset

# Load the battle rap data (path is relative to the notebook's working directory)
raw_df = pd.read_csv('../battles.csv')

# For fine-tuning, we only need the text. Keep rows that actually have lyrics;
# the raw frame stays available under its own name so this cell can be re-run.
df = raw_df.dropna(subset=['lyrics'])

# preserve_index=False: once dropna has removed rows the pandas index is no longer
# a plain range, and it would otherwise be carried into the dataset as an extra
# '__index_level_0__' column.
dataset = Dataset.from_pandas(
    df[['lyrics']].rename(columns={'lyrics': 'text'}),
    preserve_index=False,
)

# Optionally, split into train/validation (here, all for training due to small size)
train_dataset = dataset

In [7]:
import torch
# Select the compute device: CUDA when torch can see a GPU, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"Using device: {device}")

Using device: cpu


In [5]:
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    DataCollatorForLanguageModeling,
    Trainer,
    TrainingArguments,
)

# Hugging Face Hub identifier of the pretrained checkpoint used as the starting point.
model_name = "kkirchheim/german-gpt2-medium"

# Load the causal language model and its matching tokenizer from that checkpoint.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

config.json:   0%|          | 0.00/981 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.43G [00:00<?, ?B/s]

In [None]:
# Tokenize the dataset

# GPT-2 style tokenizers usually ship without a padding token, and padding to
# max_length raises an error when none is defined. Fall back to the
# end-of-sequence token; this is a no-op if the tokenizer already has one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    """Tokenize a batch of examples.

    Args:
        examples: Batch mapping whose 'text' entry holds the raw lyrics strings.

    Returns:
        The tokenizer output for the batch, with each sequence truncated or
        padded to exactly 512 tokens.
    """
    return tokenizer(examples['text'], truncation=True, padding='max_length', max_length=512)


# batched=True passes many rows to tokenize_function per call.
tokenized_dataset = train_dataset.map(tokenize_function, batched=True)

In [None]:
# Set up training arguments
training_args = TrainingArguments(
    output_dir='./german-gpt2-battle-rap',
    per_device_train_batch_size=2,
    num_train_epochs=5,
    save_steps=10,          # write a checkpoint every 10 optimizer steps ...
    save_total_limit=2,     # ... but keep only the two most recent ones on disk
    logging_steps=5,
    # fp16 mixed precision needs a GPU; requesting it on a CPU-only machine makes
    # TrainingArguments raise, so enable it only when CUDA is available.
    fp16=torch.cuda.is_available(),
    report_to='none',       # no external experiment trackers
    push_to_hub=False
)

In [None]:
# Define Trainer

# The tokenized dataset has input_ids/attention_mask but no 'labels', so the model
# would not return a loss. With mlm=False this collator builds causal-LM labels from
# input_ids for every batch and excludes padding positions from the loss.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset,
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` keyword in recent
    # transformers releases; the tokenizer is saved alongside each checkpoint.
    processing_class=tokenizer
)

# Start training
trainer.train()

## Inference Example
After training, you can generate new battle rap lyrics like this:

In [None]:
prompt = "[Dein Battle-Rap-Start hier]"

# Training leaves the model in train mode; switch to eval so dropout is disabled
# while sampling.
model.eval()

# Move the encoded prompt to the model's device. After GPU training the model
# lives on CUDA, and CPU inputs would cause a device-mismatch error.
inputs = tokenizer(prompt, return_tensors='pt').to(model.device)

# Pass pad_token_id explicitly so generate() does not have to guess one.
pad_token_id = tokenizer.pad_token_id if tokenizer.pad_token_id is not None else tokenizer.eos_token_id
outputs = model.generate(
    **inputs,
    max_length=200,
    do_sample=True,
    top_k=50,
    top_p=0.95,
    pad_token_id=pad_token_id,
)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))