In [19]:
import pandas as pd
from datasets import Dataset
import ast
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import os
import csv
import torch
from peft import (
    LoraConfig, 
    get_peft_model, 
    PeftModel, 
    PeftConfig,
    prepare_model_for_kbit_training
)
from trl import SFTTrainer

In [2]:
df = pd.read_csv('../Additional Data/Combined_Songs_Artists.csv')

In [3]:
df["Playlist_Songs"] = df["Playlist_Songs"].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

## Training targets 4th Iteration, 
## Combined_Songs_Artists.csv
def format_example(row):
    # Use the song-artist strings as-is
    playlist_body = "\n".join(row["Playlist_Songs"])
    return (
        f"### Prompt: {row['Playlist_Name']}\n"
        f"### Description: {row['Playlist_Description']}\n"
        f"### Playlist:\n{playlist_body}"
    )

df['text'] = df.apply(format_example, axis=1)

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict({"text": df['text'].tolist()})

In [25]:
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token

model = GPT2LMHeadModel.from_pretrained("gpt2-medium",
                                       load_in_8bit=True)
model.resize_token_embeddings(len(tokenizer))  # In case we add special tokens

lora_config = LoraConfig(
    r=16,                            # ‚Üì rank of adaptation matrices (try 8‚Äë64)
    lora_alpha=32,                   # scaling (Œ±). Often 2√ór
    lora_dropout=0.05,               # dropout on LoRA weights
    bias="none",                     # keep original bias terms frozen
    task_type="CAUSAL_LM",           # GPT = causal LM
    target_modules=["c_attn", "c_proj"]  # which GPT‚Äë2 linear layers to adapt
)

model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Tokenize
def tokenize(batch):
    encodings = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)
    encodings["labels"] = encodings["input_ids"].copy()  # üî• Add this line
    return encodings
    
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

split_dataset = tokenized_dataset.train_test_split(test_size=0.1)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


trainable params: 4,325,376 || all params: 359,148,544 || trainable%: 1.2043


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 2289/2289 [00:07<00:00, 304.48 examples/s]


In [27]:
training_args = TrainingArguments(
    output_dir="./Models/lora_ckpts",
    eval_strategy="steps",
    save_strategy="steps",
    eval_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    num_train_epochs=10,
    per_device_train_batch_size=4,
    learning_rate=5e-4,          # ‚Üê LoRA usually likes a *higher* LR
    warmup_steps=200,
    fp16=torch.cuda.is_available(),
    logging_steps=50,
    save_total_limit=2,
)
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)
trainer.train()

  trainer = Trainer(
Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Step,Training Loss,Validation Loss
100,2.0763,2.300283
200,1.9089,2.306375
300,2.046,2.310531
400,2.1302,2.294348
500,2.0168,2.309999
600,1.9768,2.307914
700,2.0362,2.310506
800,1.9467,2.304048
900,1.9859,2.285598
1000,1.9902,2.304229




TrainOutput(global_step=5150, training_loss=1.8928715604022868, metrics={'train_runtime': 1795.2217, 'train_samples_per_second': 11.475, 'train_steps_per_second': 2.869, 'total_flos': 1.940495794176e+16, 'train_loss': 1.8928715604022868, 'epoch': 10.0})

In [None]:
model.save_pretrained("./Models/gpt2_lora")   # saves adapter_config.json + adapter_model.bin
tokenizer.save_pretrained("./Models/gpt2_lora")

In [None]:
peft_model = PeftModel.from_pretrained(
    GPT2LMHeadModel.from_pretrained(BASE_MODEL, device_map="auto", load_in_8bit=LOAD_8BIT),
    "./models/gpt2_playlist_lora"
)
peft_model.eval()

def generate_playlist(prompt, max_length=120):
    in_text = f"### Prompt: {prompt}\n### Playlist:\n"
    enc = tokenizer(in_text, return_tensors="pt", return_attention_mask=True).to("cuda")
    out = peft_model.generate(
        **enc,
        max_length=max_length,
        temperature=0.9,
        top_p=0.95,
        pad_token_id=tokenizer.eos_token_id
    )
    decoded = tokenizer.decode(out[0], skip_special_tokens=True)
    return decoded.split("### Playlist:\n")[-1].strip()

print(generate_playlist("High‚Äëenergy pop for workouts"))