In [23]:
import os
import pandas as pd
import numpy as np
import torch

# Load Dataset

In [8]:
# CSV with one row per song (Artist, Song, Lyrics columns).
LYRICS_CSV = 'artist_song_lyrics.csv'
data = pd.read_csv(LYRICS_CSV)

In [9]:
# Preview the first rows to sanity-check the loaded columns.
data.head()

Unnamed: 0,Artist,Song,Lyrics
0,Adele,Skyfall,"['This is the end', 'Hold your breath and coun..."
1,The Weeknd,Save Your Tears,"['Ooh', 'Na-na, yeah', '[Verse 1]', '[Verse 1]..."
2,Harry Styles,Watermelon Sugar,"[""Tastes like strawberries on a summer evenin'..."
3,Doja Cat,Say So,"['Day to night to morning, keep with me in the..."
4,Imagine Dragons,Thunder,"['Just a young gun with a quick fuse', 'I was ..."


# Data Exploration

In [10]:
# (rows, columns) of the loaded table.
data.shape

(99, 3)

In [11]:
# Per-column summary (count / unique / top / freq for these text columns).
data.describe()

Unnamed: 0,Artist,Song,Lyrics
count,99,99,99
unique,47,99,99
top,Adele,Skyfall,"['This is the end', 'Hold your breath and coun..."
freq,5,1,1


In [12]:
# Column dtypes, non-null counts and memory usage.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 99 entries, 0 to 98
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Artist  99 non-null     object
 1   Song    99 non-null     object
 2   Lyrics  99 non-null     object
dtypes: object(3)
memory usage: 2.4+ KB


# Data Processing

In [13]:
import string

# Translation table that deletes every ASCII punctuation character:
# each one maps to None, so it is removed outright rather than replaced by a space.
translator = str.maketrans(dict.fromkeys(string.punctuation))

In [14]:
def _clean_lyric(text):
    """Strip punctuation (via ``translator``) and lowercase one lyric cell.

    Non-string cells are returned unchanged.
    """
    if not isinstance(text, str):
        return text
    return text.translate(translator).lower()


# Normalise the whole Lyrics column in one pass.
data['Lyrics'] = data['Lyrics'].apply(_clean_lyric)

In [15]:
from collections import Counter
from collections import OrderedDict

# Show the five most frequent words in each song's lyrics.
# Iterating over the column values (instead of looking rows up by integer label)
# keeps this working even if the frame's index is not 0..n-1.
for lyrics in data['Lyrics']:
    # Skip non-text cells, matching the isinstance guards used by the cleaning cells.
    if not isinstance(lyrics, str):
        continue

    words = lyrics.split()  # whitespace tokenisation
    word_frequency = Counter(words)  # occurrences of each word in this song
    # Rank by descending frequency; alphabetical order breaks ties deterministically.
    word_frequency = sorted(word_frequency.items(), key=lambda item: (-item[1], item[0]))

    # A list of (word, frequency) tuples maps directly onto a two-column frame.
    df_dict = pd.DataFrame(word_frequency, columns=['word', 'frequency'])
    print(df_dict.head())

    word  frequency
0     it         18
1    the         17
2  stand         14
3     we         14
4   fall         13
      word  frequency
0        i         27
1      you         17
2     your         13
3      for         11
4  another         10
         word  frequency
0       sugar         27
1  watermelon         25
2        high         23
3           i         17
4        just          9
   word  frequency
0   you         37
1    it         26
2    to         24
3    me         15
4  keep         13
        word  frequency
0    thunder         73
1        the         26
2  lightning         10
3       feel          9
4       then          9
  word  frequency
0   im         22
1    i         17
2  the         16
3   in         15
4  you         15
   word  frequency
0  yeah         47
1  babe         32
2   the         29
3  that         27
4    on         20
   word  frequency
0     i         30
1   you         19
2   and         13
3  knew         13
4    me          9
    

In [16]:
# Top 5 words across the whole corpus

# Join every song's lyrics into one space-separated string, then tokenise on whitespace.
lyricsConcat = data['Lyrics'].str.cat(sep=' ')
words = lyricsConcat.split()


def _by_count_then_word(pair):
    """Sort key: highest count first, alphabetical order on ties."""
    token, count = pair
    return (-count, token)


# Count every word and rank the (word, count) pairs.
all_word_frequency = sorted(Counter(words).items(), key=_by_count_then_word)

# Tabulate the ranked pairs as a two-column frame.
df_all_dict = pd.DataFrame(all_word_frequency, columns=['word', 'frequency'])

df_all_dict.head()

Unnamed: 0,word,frequency
0,i,1667
1,you,1619
2,the,1208
3,it,705
4,to,704


### Removing words that are common across the entire dataset could strengthen the model if those words don't add value

### For now, I will remove the words 'verse', 'bridge', and 'chorus', since these words mark sections of the song and are not lyrics

In [17]:
import re

# Match the section labels only as whole words, so lyric words that merely contain
# them ("universe", "reverse", "bridges") are left intact. A plain str.replace would
# cut the substring out of those words.
# The optional pre/post prefix covers labels such as "prechorus" — presumably what a
# "[Pre-Chorus]" tag collapses to once punctuation is stripped; TODO confirm against the data.
section_label_pattern = re.compile(r'\b(?:pre|post)?(?:chorus|verse|bridge)\b')

# Non-string cells are passed through unchanged.
data['Lyrics'] = data['Lyrics'].apply(
    lambda x: section_label_pattern.sub('', x) if isinstance(x, str) else x
)

In [18]:
# Persist the cleaned Lyrics column (no index, no header) as the fine-tuning input file.
lyrics_column = data['Lyrics']
lyrics_column.to_csv('input.csv', sep='\t', header=False, index=False)

# Fine-tune the pre-trained model on my lyrics data

In [19]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config
from transformers import TextDataset, DataCollatorForLanguageModeling
from transformers import Trainer, TrainingArguments

In [26]:
# Fine-tune GPT-2 on the cleaned lyrics, then export the model and tokenizer.

# Two separate folders are used on purpose-looking lines below, so name them explicitly:
checkpoint_dir = "output"    # intermediate checkpoints written by Trainer
final_model_dir = "output2"  # final model + tokenizer written by save_pretrained

# Load pre-trained GPT-2 model and tokenizer
model_name = "gpt2"
model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# Load the lyrics file as fixed-length token blocks.
# NOTE(review): TextDataset is reported as deprecated in recent transformers
# releases — confirm against the installed version before upgrading.
train_dataset = TextDataset(
    tokenizer=tokenizer,
    file_path='input.csv',  # Replace with the path to your dataset
    block_size=128  # Adjust block size as needed
)

# Create a data collator for language modeling
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False  # For language modeling tasks, mlm should be set to False
)

# Make sure the export folder exists *before* training. Any OSError is allowed to
# propagate: failing here is far cheaper than discovering after every epoch has
# run that the model cannot be saved.
if os.path.isdir(final_model_dir):
    print("Folder already exists.")
else:
    os.makedirs(final_model_dir)

# Fine-tuning arguments
training_args = TrainingArguments(
    output_dir=checkpoint_dir,  # Trainer checkpoints only; the final model goes to final_model_dir
    overwrite_output_dir=True,
    num_train_epochs=10,  # Adjust as needed
    per_device_train_batch_size=4,  # Adjust batch size as needed
    save_total_limit=2,
    logging_steps=2,
)

# Create Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer side by side so they can be reloaded together
model.save_pretrained(final_model_dir)
tokenizer.save_pretrained(final_model_dir)



Folder already exists.


Step,Training Loss
2,4.757
4,4.5501
6,3.8101
8,4.1141
10,4.6039
12,4.05
14,4.2899
16,3.9289
18,4.6306
20,4.0629


('output2\\tokenizer_config.json',
 'output2\\special_tokens_map.json',
 'output2\\vocab.json',
 'output2\\merges.txt',
 'output2\\added_tokens.json')

# Testing The Model

In [27]:
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Define the directory where the fine-tuned model and tokenizer are saved
model_directory = "output2"

# Reload the fine-tuned model and tokenizer from disk.
try:
    model = GPT2LMHeadModel.from_pretrained(model_directory)
    tokenizer = GPT2Tokenizer.from_pretrained(model_directory)
except Exception as e:
    print("Error loading model and tokenizer:", e)
    # Re-raise instead of continuing: if loading fails, `model` / `tokenizer` may
    # still hold objects from an earlier cell (or not exist at all), and the cells
    # below would silently run against those.
    raise


In [30]:
def search_songs(query, data, model, tokenizer, max_new_tokens=20):
    """Generate a text continuation for ``query`` with the given language model.

    Despite the name, nothing is looked up in ``data``: the query is tokenised,
    handed to ``model.generate`` and the first returned sequence is decoded back
    to text.

    Args:
        query: Prompt text to feed to the model.
        data: Unused; kept so existing calls keep working.
        model: Object exposing ``generate(input_ids, attention_mask=...,
            pad_token_id=..., max_new_tokens=...)``.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        max_new_tokens: Maximum number of tokens to generate. Defaults to 20,
            the value that used to be hard-coded.

    Returns:
        The decoded text of the first generated sequence, with special tokens
        skipped.
    """
    # Encode the user query as a batch of one sequence
    input_ids = tokenizer.encode(query, return_tensors="pt")

    # Keep the inputs on the same device as the model (a model left on GPU by a
    # training run would otherwise fail with a device mismatch). Objects without
    # a `device` attribute leave the tensor where it is.
    input_ids = input_ids.to(getattr(model, "device", input_ids.device))

    # Single unpadded sequence, so every position is attended to
    attention_mask = torch.ones(input_ids.shape, dtype=torch.long, device=input_ids.device)

    # The EOS token id is passed as the pad token id for generation
    output = model.generate(
        input_ids,
        attention_mask=attention_mask,
        pad_token_id=tokenizer.eos_token_id,
        max_new_tokens=max_new_tokens,
    )

    # Decode and return the generated response
    decoded_response = tokenizer.decode(output[0], skip_special_tokens=True)
    return decoded_response

In [40]:
# Demo: pass a natural-language prompt to the loaded model and print what comes back.
user_query = "what are some songs about cars?"
result = search_songs(query=user_query, data=data, model=model, tokenizer=tokenizer)
print(result)

what are some songs about cars?   i dont know why i dont know why i dont know why i dont know why i dont
