# GPT Model w/o Descriptions

In [1]:
import pandas as pd
from datasets import Dataset
import ast
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from transformers import Trainer, TrainingArguments
import os
import csv

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
df2 = pd.read_csv('Playlist_data_with_lyrics.csv')
df = pd.read_csv('playlist_data.csv')

In [3]:
import pandas as pd
from datasets import Dataset
import ast

# Load and parse the dataset

# Convert stringified lists to actual lists
for col in ['Playlist_Songs', 'Playlist_Artists']:
    df[col] = df[col].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)

## Training Targets 1st Iteration, 3 epoch Loss ~ 3, 5 epoch Loss ~ 2.5
# Create prompt + output text for GPT-2 training
# def format_example(row):
#     song_lines = [f"{s} - {a}" for s, a in zip(row['Playlist_Songs'], row['Playlist_Artists'])]
#     return f"### Prompt: {row['Playlist_Name']}\n### Playlist:\n" + "\n".join(song_lines)

## Training Targets 2nd Iteration, 5 epoch Loss ~ 1.3
# def format_example(row):
#     lines = [f"[SONG] {s} [ARTIST] {a}" for s, a in zip(row["Playlist_Songs"], row["Playlist_Artists"])]
#     return f"### Prompt: {row['Playlist_Name']}\n### Playlist:\n" + "\n".join(lines)

## Training targets 3rd Iteration, 5 epoch Loss ~ 
def format_example(row):
    lines = [f"[SONG] {song} [ARTIST] {artist}" for song, artist in zip(row["Playlist_Songs"], row["Playlist_Artists"])]
    playlist_body = "\n".join(lines)
    
    # Include Playlist_Description in the prompt for training only
    return (
        f"### Prompt: {row['Playlist_Name']}\n"
        f"### Description: {row['Playlist_Description']}\n"
        f"### Playlist:\n{playlist_body}"
    )

df['text'] = df.apply(format_example, axis=1)

# Convert to Hugging Face Dataset
dataset = Dataset.from_dict({"text": df['text'].tolist()})

In [4]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel

tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")
tokenizer.pad_token = tokenizer.eos_token  # GPT-2 doesn't have a pad token

model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
model.resize_token_embeddings(len(tokenizer))  # In case we add special tokens

# Tokenize
def tokenize(batch):
    encodings = tokenizer(batch["text"], padding="max_length", truncation=True, max_length=512)
    encodings["labels"] = encodings["input_ids"].copy()  # ðŸ”¥ Add this line
    return encodings
    
tokenized_dataset = dataset.map(tokenize, batched=True)
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 2289/2289 [00:16<00:00, 139.65 examples/s]


In [5]:
from transformers import Trainer, TrainingArguments
import torch

training_args = TrainingArguments(
    output_dir="./gpt2_playlist_model_w_Descriptions",
    overwrite_output_dir=True,
    num_train_epochs=5,
    per_device_train_batch_size=2,
    save_steps=500,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    fp16=True if torch.cuda.is_available() else False
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset
)

trainer.train()

Detected kernel version 4.18.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
50,2.3561
100,1.9877
150,1.8785
200,1.8549
250,1.7707
300,1.7796
350,1.8072
400,1.713
450,1.676
500,1.6859


TrainOutput(global_step=5725, training_loss=1.3597300187156711, metrics={'train_runtime': 821.0293, 'train_samples_per_second': 13.94, 'train_steps_per_second': 6.973, 'total_flos': 1.062897944887296e+16, 'train_loss': 1.3597300187156711, 'epoch': 5.0})

In [6]:
def generate_playlist(prompt, max_length=200):
    input_text = f"### Prompt: {prompt}\n### Playlist:\n"
    input_ids = tokenizer.encode(input_text, return_tensors="pt")

    output = model.generate(
        input_ids = input_ids.to('cuda'),
        max_length=max_length,
        temperature=0.9,
        top_p=0.95,
        do_sample=True,
        num_return_sequences=1,
        pad_token_id=tokenizer.eos_token_id
    )

    result = tokenizer.decode(output[0], skip_special_tokens=True)
    return result.split("### Playlist:\n")[1].strip()

# Try it out!
prompt = "James broke his computer"
print(generate_playlist(prompt))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


[SONG] All The Things She Said [ARTIST] James Arthur
[SONG] Like I Do [ARTIST] James Arthur
[SONG] Stuck In The Moment [ARTIST] James Arthur
[SONG] I Think He Knows [ARTIST] James Arthur
[SONG] You Can't Stop The Beat [ARTIST] James Arthur
[SONG] I'm Okay [ARTIST] James Arthur
[SONG] In the Morning [ARTIST] James Arthur
[SONG] Love With You [ARTIST] James Arthur
[SONG] Hold On - Version Revisited [ARTIST] James Arthur
[SONG] This One's For You [ARTIST] James Arthur
[SONG] Like You [ARTIST] James Arthur
[SONG] This One's For Me [ARTIST] James Arthur
[SONG]


In [7]:
## Import Saved Model and use with generate_playlist function
from transformers import GPT2LMHeadModel, GPT2Tokenizer

model = GPT2LMHeadModel.from_pretrained("./gpt2_playlist_model_w_Descriptions/checkpoint-5725")
tokenizer = GPT2Tokenizer.from_pretrained("./gpt2_playlist_model_w_Descriptions/checkpoint-5725")

HFValidationError: Repo id must be in the form 'repo_name' or 'namespace/repo_name': './gpt2_playlist_model_w_Descriptions/checkpoint-5725'. Use `repo_type` argument if needed.

In [10]:
os.chdir("..")
os.getcwd()

'/sfs/gpfs/tardis/home/nuf8ms/Documents/MSDS/LLM/DS6051-Project'

In [6]:
os.path.join()

['checkpoint-5725', 'checkpoint-5500']

# GPT Model w/ Descriptions