# GPT Model w/ Descriptions
Combined_Songs_Artists.csv

GPT-2 (medium) fine-tuned on one text string per playlist, built by combining `Playlist_Name`, `Playlist_Description`, and the playlist's song–artist entries.

In [None]:
## Required Libraries
import pandas as pd
from datasets import Dataset
import ast
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import os
import csv
import optuna
import torch

## Load and Preprocess Dataset

In [None]:
# Raw playlist table; point the path at your own copy of the dataset if it lives elsewhere.
df = pd.read_csv(filepath_or_buffer='../Additional Data/Combined_Songs_Artists.csv')

In [None]:
# The CSV stores each song list as its literal text, so string cells are parsed
# back into Python objects; anything that is not a string is passed through untouched.
df["Playlist_Songs"] = df["Playlist_Songs"].map(
    lambda cell: cell if not isinstance(cell, str) else ast.literal_eval(cell)
)

# Convert one playlist row into a single training string
def format_example(row):
    """Build the training text for one playlist.

    The layout is::

        ### Prompt: <Playlist_Name>
        ### Description: <Playlist_Description>
        ### Playlist:
        <one song-artist entry per line>

    Args:
        row: Mapping-like object (e.g. a DataFrame row) with the keys
            ``Playlist_Name``, ``Playlist_Description`` and ``Playlist_Songs``.

    Returns:
        The formatted string. A missing description (``None`` / NaN) becomes an
        empty description instead of the literal text "nan", and a missing song
        list becomes an empty playlist body instead of raising ``TypeError``.
    """
    description = row["Playlist_Description"]
    # pandas represents empty CSV cells as float NaN; NaN is the only float != itself.
    if description is None or (isinstance(description, float) and description != description):
        description = ""

    songs = row["Playlist_Songs"]
    if isinstance(songs, str):
        # A bare string is a single entry; joining it directly would split it per character.
        songs = [songs]
    elif not isinstance(songs, (list, tuple)):
        # None / NaN / anything that is not a sequence of entries -> no songs.
        songs = []

    # Use the song-artist strings as-is (str() only guards against non-string entries).
    playlist_body = "\n".join(str(song) for song in songs)
    return (
        f"### Prompt: {row['Playlist_Name']}\n"
        f"### Description: {description}\n"
        f"### Playlist:\n{playlist_body}"
    )

# Build the training string for every playlist row and keep it in a new 'text' column
df['text'] = df.apply(format_example, axis=1)

# Wrap the formatted strings in a Hugging Face Dataset with a single "text" field
dataset = Dataset.from_dict(dict(text=df['text'].tolist()))

## Load and FineTune Model

In [None]:
# Weights and tokenizer both come from the "gpt2-medium" checkpoint
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")

# GPT-2 has no dedicated padding token, so the end-of-sequence token doubles as padding
tokenizer.pad_token = tokenizer.eos_token

# Keep the embedding matrix in step with the tokenizer's vocabulary size
# (only matters if special tokens get added)
model.resize_token_embeddings(len(tokenizer))

# Define a function to tokenize the dataset
def tokenize(batch):
    """Tokenize playlist texts and build causal-LM labels.

    Each text gets one end-of-sequence token appended so the model sees an
    explicit "playlist is finished" signal, then is padded/truncated to 512
    tokens. Labels mirror ``input_ids`` except at padded positions
    (``attention_mask == 0``), which are set to -100 so the loss ignores them.
    Without that mask, most of the loss would come from predicting padding,
    because the pad token is the EOS token.

    Args:
        batch: Mapping with a ``"text"`` entry holding either a list of strings
            (``Dataset.map(..., batched=True)``) or a single string.

    Returns:
        The tokenizer's encodings with an added ``"labels"`` entry.
    """
    def _mask_padding(ids, mask):
        # -100 is the ignore index of the cross-entropy loss used for the LM head.
        return [token if keep else -100 for token, keep in zip(ids, mask)]

    texts = batch["text"]
    is_single = isinstance(texts, str)
    if is_single:
        texts = texts + tokenizer.eos_token
    else:
        texts = [text + tokenizer.eos_token for text in texts]

    encodings = tokenizer(texts, padding="max_length", truncation=True, max_length=512)

    if is_single:
        encodings["labels"] = _mask_padding(encodings["input_ids"], encodings["attention_mask"])
    else:
        encodings["labels"] = [
            _mask_padding(ids, mask)
            for ids, mask in zip(encodings["input_ids"], encodings["attention_mask"])
        ]
    return encodings
    
# Tokenize the dataset
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

# Split the dataset into training and evaluation sets
# The dataset is split into 90% training and 10% evaluation.
# A fixed seed keeps the split identical across kernel restarts, so evaluation
# losses from different runs are measured on the same held-out playlists.
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
eval_dataset = split_dataset["test"]

Map: 100%|██████████| 2289/2289 [00:07<00:00, 310.51 examples/s]


In [None]:
# Training configuration, grouped by concern
training_args = TrainingArguments(
    # --- output locations ---
    # NOTE(review): checkpoints go under "./Models" while the final model is saved
    # under "./models" further down — confirm the differing case is intentional.
    output_dir="./Models/gpt2_Combined_Song_Artists_Eval_Checkpoints",
    overwrite_output_dir=True,
    logging_dir="./logs",
    logging_steps=50,
    # --- optimisation schedule ---
    num_train_epochs=20,
    per_device_train_batch_size=2,
    # Mixed precision only when a CUDA device is present
    fp16=torch.cuda.is_available(),
    # --- evaluation and checkpointing ---
    # Evaluate every 100 steps, checkpoint every 500 (a multiple of the eval interval),
    # keep at most two checkpoints, and reload the lowest-loss one when training ends.
    eval_strategy='steps',
    eval_steps=100,
    save_steps=500,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model='loss',
    greater_is_better=False,
)

# Wire the model, configuration, and both dataset splits into a Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)

# Run fine-tuning
trainer.train()

## Save the FineTuned Model and Tokenizer

In [12]:
# Persist the fine-tuned weights and the tokenizer side by side so that both
# can later be reloaded from the same directory.
model.save_pretrained(save_directory="./models/gpt2_Combined_Song_Artists")
tokenizer.save_pretrained(save_directory="./models/gpt2_Combined_Song_Artists")

('./models/gpt2_Combined_Song_Artists/tokenizer_config.json',
 './models/gpt2_Combined_Song_Artists/special_tokens_map.json',
 './models/gpt2_Combined_Song_Artists/vocab.json',
 './models/gpt2_Combined_Song_Artists/merges.txt',
 './models/gpt2_Combined_Song_Artists/added_tokens.json')

In [None]:
# Define a function to generate playlists
def generate_playlist(prompt, max_length=200, description=None):
    """Sample a playlist from the fine-tuned model for a playlist name.

    Args:
        prompt: Playlist name, inserted after ``### Prompt:``.
        max_length: Maximum total length in tokens (prompt included) passed to
            ``model.generate``.
        description: Optional playlist description. When given, a
            ``### Description:`` line is inserted between the prompt and the
            playlist marker, matching the layout of the training texts. When
            omitted, the prompt has no description section.

    Returns:
        The generated text following the first ``### Playlist:`` marker,
        stripped of surrounding whitespace. If the marker is absent from the
        decoded text, the whole decoded text is returned.
    """
    if description is None:
        input_text = f"### Prompt: {prompt}\n### Playlist:\n"
    else:
        input_text = (
            f"### Prompt: {prompt}\n"
            f"### Description: {description}\n"
            f"### Playlist:\n"
        )

    encoded = tokenizer(input_text, return_tensors="pt")
    # Follow the model to whatever device it lives on instead of assuming a GPU.
    device = model.device

    output = model.generate(
        input_ids=encoded["input_ids"].to(device),
        # Passed explicitly: the pad token is the EOS token, so the mask cannot
        # be inferred reliably from the ids.
        attention_mask=encoded["attention_mask"].to(device),
        max_length=max_length,
        temperature=0.9,
        top_p=0.95,
        do_sample=True,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )
    # Decode the generated output
    result = tokenizer.decode(output[0], skip_special_tokens=True)
    # Extract the playlist from the generated text
    sections = result.split("### Playlist:\n")
    if len(sections) < 2:
        return result.strip()
    return sections[1].strip()

# Smoke test: sample one playlist for an example playlist name and show it
prompt = "Hype Pregame"
print(generate_playlist(prompt=prompt))