In [1]:
# Installing necessary packages
!pip install nltk rouge



In [2]:
# Importing necessary packages and libraries
import os
import pandas as pd
import random
import torch
import nltk
from transformers import GPT2Tokenizer, GPT2LMHeadModel
from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction
from rouge import Rouge

# Handle SSL issue for NLTK punkt download
import ssl
try:
    _create_default_https_context = ssl._create_default_https_context
    ssl._create_default_https_context = ssl._create_unverified_context
    nltk.download('punkt')
finally:
    ssl._create_default_https_context = _create_default_https_context


  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/Ayush/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# # Mounting Google Drive in Google Colab
# from google.colab import drive
# drive.mount('/content/drive')

In [4]:
# Listing all available artists based on filenames in a directory
def list_available_artists(directory):
    files = os.listdir(directory)
    print(files)
    artists = [file.split('_')[-1].replace('.csv', '') for file in files if file.startswith('processed_LDA_lyrics_with_topic_')]
    return artists

# artists = list_available_artists('/Users/Ayush/MLProject/ML FINAL PROJECT')
# print(artists)

In [5]:
# Loading lyrics data for a specific artist and theme
def load_lyrics(artist_name, theme, directory):
    file_path = os.path.join(directory, f'processed_LDA_lyrics_with_topic_{artist_name}.csv')
    if not os.path.exists(file_path):
        return None
    df = pd.read_csv(file_path)
    return df[df['theme'] == theme]['processed_lyrics'].tolist()

In [6]:
# Loading themes for a given artist
def load_themes(artist_name, directory):
    file_path = os.path.join(directory, f'processed_LDA_lyrics_with_topic_{artist_name}.csv')
    if not os.path.exists(file_path):
        return None
    df = pd.read_csv(file_path)
    return df['theme'].unique()

In [7]:
# Generating lyrics in structured format
def generate_lyrics(model, tokenizer, lyrics, seed_text, max_length=100, device='cpu'):
    model.eval()
    context_line = random.choice(lyrics)  # Adding context from existing lyrics
    input_ids = tokenizer.encode(f"{seed_text} {context_line}", return_tensors='pt').to(device)
    sample_outputs = model.generate(
        input_ids,
        max_length=max_length + len(input_ids[0]),
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=50,
        top_p=0.90,
        temperature=0.8,
        no_repeat_ngram_size=2,
        repetition_penalty=1.5
    )
    # Decoding and formating the output
    generated_text = tokenizer.decode(sample_outputs[0], skip_special_tokens=True)
    # Limiting the output to 100 words
    words = generated_text.split()[:100]
    return ' '.join(words)

In [8]:
# Additional formatting to introduce rhyme and structure
def format_lyrics_improved(lyrics):
    lines = lyrics.split('. ')
    formatted_lyrics = "\n".join(line.capitalize() for line in lines if line)
    return formatted_lyrics

In [9]:
# Calculating BLEU score with smoothing.
def calculate_bleu(reference_words, generated_words):
    smoothie = SmoothingFunction().method4
    bleu_score = sentence_bleu(reference_words, generated_words, smoothing_function=smoothie)
    return bleu_score

In [10]:
# Defining and calling function to generate appropriate lyrics matching the esseance of artist and its theme provided by the user
def main():
    directory = '/Users/Ayush/MLProject/data'
    # Reading name of the artist from the available list
    artists = list_available_artists(directory)
    print("Available artists:", artists)
    artist_name = input("Enter an artist from the list: ")
    if artist_name not in artists:
        print("Artist not available. Please choose from the list.")
        return
    # Providing list of unique themes of the selected artist
    themes = load_themes(artist_name, directory)
    if themes is None:
        print("No themes available for the selected artist.")
        return
    # Reading theme provided by the user based on the list
    print("Available themes:", themes)
    theme = input("Select a theme from the list: ")
    if theme not in themes:
        print("Theme not available. Please choose from the list.")
        return
    # Reading seed text which act as starting part of the lyrics
    seed_text = input("Enter a seed phrase to start the lyrics: ")
    lyrics = load_lyrics(artist_name, theme, directory)
    if lyrics is None:
        print("No lyrics data found for the selected theme.")
        return
    tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
    model = GPT2LMHeadModel.from_pretrained('gpt2')
    device = 'cuda' if torch.cuda.is_available() else 'cpu'
    model.to(device)
    # Generating lyrics in proper format
    generated_lyrics = generate_lyrics(model, tokenizer, lyrics, seed_text, device=device)
    formatted_lyrics = format_lyrics_improved(generated_lyrics)
    print("Generated Lyrics:\n", formatted_lyrics)
    # Evaluating the lyrics using the BLEU and Rouge calculation
    reference_data = lyrics[int(len(lyrics) * 0.8):]
    generated_words = formatted_lyrics.split()
    reference_words = [ref.split() for ref in reference_data]
    bleu_score = calculate_bleu(reference_words, generated_words)
    print(f"BLEU Score: {bleu_score}")
    rouge = Rouge()
    try:
        rouge_scores = rouge.get_scores(' '.join(generated_words), ' '.join(reference_words[0]))
        print("ROUGE Scores:", rouge_scores)
    except IndexError:
        print("ROUGE Scores: Not enough reference data to evaluate ROUGE.")

In [11]:
# Calling main function to perform all the operations and generating appropriate lyrics
if __name__ == "__main__":
    main()

['processed_LDA_lyrics_with_topic_arctic monkeys.csv', 'processed_LDA_lyrics_with_topic_drake.csv', 'processed_LDA_lyrics_with_topic_post malone.csv', 'processed_LDA_lyrics_with_topic_halsey.csv', 'processed_LDA_lyrics_with_topic_imagine dragons.csv', 'processed_LDA_lyrics_with_topic_eminem.csv', 'processed_LDA_lyrics_with_topic_lady gaga.csv', 'processed_LDA_lyrics_with_topic_pink floyd.csv', 'processed_LDA_lyrics_with_topic_machine gun kelly.csv', 'processed_LDA_lyrics_with_topic_dj khaled.csv', 'processed_LDA_lyrics_with_topic_nirvana.csv', 'processed_LDA_lyrics_with_topic_travis scott.csv', 'processed_LDA_lyrics_with_topic_ariana grande.csv', 'processed_LDA_lyrics_with_topic_ed sheeran.csv', 'processed_LDA_lyrics_with_topic_maroon 5.csv', 'processed_LDA_lyrics_with_topic_justin bieber.csv', 'processed_LDA_lyrics_with_topic_taylor swift.csv', 'processed_LDA_lyrics_with_topic_cardi b.csv', 'processed_LDA_lyrics_with_topic_billie eilish.csv', 'processed_LDA_lyrics_with_topic_queen.csv

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Lyrics:
 Identity & recognition oliver set safe - eternity safe - nuh time safe - don't lie puffy l'z and mo-g – switch it up anthem yg - i wanna benz ft 
50 cent and nipsey hu$$le offset - first day out rich the kid - that bag young thug - hercules cashier fresh - before lil wayne - jumpman starlito - yaomingolajuwon a$ap rocky - wavybone ft 
Juicy j and ugk prince 85 set raf riley - summer ft 
Etta bond , avelino and dun d aaliyah - we need a resolution young thug - paradise michael
BLEU Score: 0.018717746813977667
ROUGE Scores: [{'rouge-1': {'r': 0.060836501901140684, 'p': 0.21621621621621623, 'f': 0.09495548618689971}, 'rouge-2': {'r': 0.0064516129032258064, 'p': 0.03260869565217391, 'f': 0.010771990060887282}, 'rouge-l': {'r': 0.049429657794676805, 'p': 0.17567567567567569, 'f': 0.07715133188422912}}]


In [12]:
from transformers import GPT2Model, GPT2Config
# Load pre-configured GPT-2
config = GPT2Config()
model = GPT2Model(config)
# Print the model's configuration
print(model.config)

GPT2Config {
  "_attn_implementation_autoset": true,
  "activation_function": "gelu_new",
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "transformers_version": "4.46.3",
  "use_cache": true,
  "vocab_size": 50257
}



## Hyperparameter Table

| Hyperparameter Name         | Value                            |
|-----------------------------|----------------------------------|
| Activation Function (Hidden Layer) | GeLU                       |
| Activation Function (Output Layer) | Linear                     |
| Weight Initializer          | Normal Distribution (std=0.02)  |
| Number of Hidden Layers     | 12                              |
| Neurons in Hidden Layers    | 768                             |
| Loss Function               | Categorical Cross-Entropy               |
| Optimizer                   | AdamW                            |
| Number of Epochs            | 3                               |
| Batch Size                  | 32                              |
| Learning Rate               | 5e-5                            |
| Evaluation Metric           | Bleu, Rouge                      |
| Dropout Rate                | 0.1                             ||