In [31]:
# Installing necessary packages
!pip install rouge sentence-transformers

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [32]:
# Importing Packages and Libraries required
import pandas as pd
import numpy as np
import os
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input, Embedding, Bidirectional
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.losses import CategoricalCrossentropy
from tensorflow.keras.callbacks import LearningRateScheduler
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from nltk.translate.bleu_score import sentence_bleu
from rouge import Rouge
from sentence_transformers import SentenceTransformer


In [33]:
# # Mounting Google Drive
# from google.colab import drive
# drive.mount('/content/drive')

In [34]:
# Function to load data and prepare embeddings
def load_data_and_prepare_embeddings(artist_name, data_dir='/Users/Ayush/MLProject/data'):
    """Load an artist's processed lyrics CSV and compute sentence embeddings for it.

    Args:
        artist_name: Artist name as it appears in the CSV file name
            (``processed_LDA_lyrics_with_topic_<artist_name>.csv``).
        data_dir: Directory that holds the per-artist CSV files. Defaults to the
            location originally hard-coded in this notebook; pass another
            directory to run the notebook on a different machine.

    Returns:
        ``(df, embeddings)`` where ``df`` is the DataFrame read from the CSV and
        ``embeddings`` is the output of ``SentenceTransformer.encode`` for the
        ``processed_lyrics`` column (one entry per row, in row order).
        ``(None, None)`` when the CSV file is not found.
    """
    file_path = os.path.join(data_dir, f'processed_LDA_lyrics_with_topic_{artist_name}.csv')
    # isfile (not exists) so that a directory with a matching name is not handed to read_csv.
    if not os.path.isfile(file_path):
        print("File does not exist.")
        return None, None

    df = pd.read_csv(file_path)
    # Empty CSV cells are read back as NaN (a float); the encoder needs strings,
    # so missing lyrics are encoded as empty text. The row count is kept so the
    # embeddings stay aligned with ``df``.
    lyrics = df['processed_lyrics'].fillna('').astype(str).tolist()
    model_sbert = SentenceTransformer('all-MiniLM-L6-v2')
    embeddings = model_sbert.encode(lyrics, show_progress_bar=True)
    return df, embeddings



In [35]:
# Preparing sequences from text
def prepare_sequences(texts, seq_length=50):
    """Turn raw texts into fixed-length training windows for next-word prediction.

    A fresh ``Tokenizer`` is fitted on ``texts``. Every run of ``seq_length``
    consecutive token ids within a text becomes one input row, and the token id
    that immediately follows that run becomes its target. Texts that have
    ``seq_length`` tokens or fewer contribute no rows.

    Returns:
        ``(X, Y, tokenizer, vocab_size)``: the input windows passed through
        ``pad_sequences``, the targets passed through ``to_categorical`` with
        ``vocab_size`` classes, the fitted tokenizer, and
        ``len(tokenizer.word_index) + 1``.
    """
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(texts)
    vocab_size = 1 + len(tokenizer.word_index)

    windows = []
    next_tokens = []
    for token_ids in tokenizer.texts_to_sequences(texts):
        # ``stop`` is the position of the target token; the window is the
        # ``seq_length`` tokens directly before it.
        for stop in range(seq_length, len(token_ids)):
            windows.append(token_ids[stop - seq_length:stop])
            next_tokens.append(token_ids[stop])

    X = pad_sequences(windows, maxlen=seq_length, padding='pre')
    Y = to_categorical(next_tokens, num_classes=vocab_size)
    return X, Y, tokenizer, vocab_size

In [36]:
# Creating model for text generation
def create_model(seq_length, vocab_size):
    """Build and compile the stacked bidirectional-LSTM next-word classifier.

    The network is, in order: an embedding of size 50 over ``vocab_size`` ids
    for inputs of length ``seq_length``; a bidirectional LSTM (100 units) that
    returns the full sequence; dropout of 0.2; a second bidirectional LSTM
    (100 units); a 100-unit ReLU dense layer; and a softmax over ``vocab_size``.

    Returns:
        The ``Sequential`` model, compiled with categorical cross-entropy loss,
        the ``'adam'`` optimizer and an accuracy metric.
    """
    layer_stack = (
        Embedding(vocab_size, 50, input_length=seq_length),
        Bidirectional(LSTM(100, return_sequences=True)),
        Dropout(0.2),
        Bidirectional(LSTM(100)),
        Dense(100, activation='relu'),
        Dense(vocab_size, activation='softmax'),
    )

    network = Sequential()
    for layer in layer_stack:
        network.add(layer)

    network.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return network

In [37]:
# Function to train the Bidirectional LSTM model and display its summary
def train_model(X, Y, epochs=5, batch_size=128):
    """Build a model sized to the data, print its summary and fit it.

    Args:
        X: 2-D array of input windows; ``X.shape[1]`` is used as the sequence length.
        Y: 2-D array of targets; ``Y.shape[1]`` is used as the vocabulary size.
        epochs: Number of training epochs passed to ``model.fit`` (default 5,
            the value previously hard-coded).
        batch_size: Batch size passed to ``model.fit`` (default 128, the value
            previously hard-coded).

    Returns:
        The fitted model, or ``None`` when ``X`` or ``Y`` contains no elements.
    """
    print(X.shape, Y.shape)
    if X.size == 0 or Y.size == 0:
        print("Empty training data: Cannot train model.")
        return None

    # The network dimensions are read off the arrays so callers do not have to
    # pass them separately.
    seq_length = X.shape[1]
    vocab_size = Y.shape[1]
    model = create_model(seq_length, vocab_size)
    model.summary()
    model.fit(X, Y, epochs=epochs, batch_size=batch_size)
    return model


In [38]:
# Function to generate lyrics
def generate_lyrics(model, tokenizer, seq_length, seed_text, num_words=100):
    """Greedily extend ``seed_text`` one word at a time with the trained model.

    On each step the running text is tokenized, cut/padded to ``seq_length``
    ids, and the word with the highest predicted probability is appended.

    Args:
        model: Object with ``predict(encoded, verbose=0)`` returning one row of
            per-word-index scores.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and
            ``word_index`` (and, when available, ``index_word``).
        seq_length: Input length the model was built for.
        seed_text: Starting phrase; it is kept as the start of the result.
        num_words: Maximum number of words to generate.

    Returns:
        The seed text followed by the generated words, separated by spaces.
        Generation stops early if the model predicts an index that has no word
        (e.g. the padding index 0).
    """
    # Reverse lookup built once instead of scanning word_index for every word.
    index_to_word = getattr(tokenizer, 'index_word', None) or {
        index: word for word, index in tokenizer.word_index.items()
    }

    result = [seed_text]
    for _ in range(num_words):
        encoded = tokenizer.texts_to_sequences([seed_text])[-1]
        encoded = pad_sequences([encoded], maxlen=seq_length, truncating='pre')
        scores = model.predict(encoded, verbose=0)
        # Take a plain int so the lookup does not compare against a numpy array.
        predicted_index = int(np.argmax(scores, axis=1)[0])
        word = index_to_word.get(predicted_index, '')
        if not word:
            # No word for this index. The input would not change, so greedy
            # decoding would predict the same index on every remaining step and
            # only append empty strings; stop here instead.
            break
        seed_text += ' ' + word
        result.append(word)
    return ' '.join(result)

In [39]:
# Function to generate lyrics similar in manner to that of the artist and theme provided by the user
def _match_choice(raw_choice, options):
    """Return the entry of ``options`` equal to ``raw_choice`` ignoring case and extra whitespace, else None."""
    def normalise(value):
        return ' '.join(str(value).split()).casefold()

    wanted = normalise(raw_choice)
    for option in options:
        if normalise(option) == wanted:
            return option
    return None


def _split_train_reference(texts, train_fraction=0.8):
    """Split ``texts`` into a leading training part and a trailing held-out reference part."""
    cut = int(len(texts) * train_fraction)
    if cut == 0:
        # Too few texts to hold any out: use everything for both roles.
        return texts, texts
    return texts[:cut], texts[cut:]


def user_interaction():
    """Interactive workflow: pick an artist and theme, train a model, generate and score lyrics.

    Steps:
        1. Prompt for an artist and load that artist's lyrics.
        2. Prompt for one of the themes found in the data.
        3. Train the model on the first 80% of the theme's lyrics.
        4. Prompt for a seed phrase (at most 5 words) and generate 100 words.
        5. Print BLEU against the held-out 20% and ROUGE against the first
           held-out lyric.

    Artist and theme answers are matched ignoring case and surrounding or
    repeated whitespace. Every failure path prints a message and returns.

    Returns:
        None.
    """
    seq_length = 30  # single source of truth for both training and generation

    artists = ['21 savage', 'artic monkeys', 'ariana grande', 'billie eilish', 'cardi b',
               'dj khaled', 'drake', 'ed sheeran', 'eminem', 'halsey', 'imagine dragons',
               'justin bieber', 'lady gaga', 'machine gun kelly', 'maroon 5', 'nirvana',
               'pink floyd', 'post malone', 'queen', 'taylor swift', 'the beatles', 'travis scott']
    print("Available artists:")
    for artist in artists:
        print(artist)
    artist_input = input("Enter an artist from the list: ").strip()
    # Fall back to the typed text so an artist that is not in the list is still
    # looked up on disk, as before.
    selected_artist = _match_choice(artist_input, artists) or artist_input

    # The embeddings returned alongside the frame are not used in this workflow.
    df, _ = load_data_and_prepare_embeddings(selected_artist)
    if df is None or df.empty:
        print("No data available for the selected artist or theme.")
        return

    print("Available themes:")
    themes = df['theme'].unique()
    for theme in themes:
        print(theme)
    selected_theme = _match_choice(input("Select a theme from the list: "), themes)

    theme_data = []
    if selected_theme is not None:
        # Rows with missing lyrics are dropped: the tokenizer only accepts strings.
        theme_data = (
            df.loc[df['theme'] == selected_theme, 'processed_lyrics']
            .dropna()
            .astype(str)
            .tolist()
        )
    if not theme_data:
        print("No lyrics data available for the selected theme.")
        return

    # Train only on the leading part so the trailing part used for scoring
    # below has not been seen by the model.
    train_texts, reference_data = _split_train_reference(theme_data)

    X, Y, tokenizer, _ = prepare_sequences(train_texts, seq_length=seq_length)
    if X.size == 0 or Y.size == 0:
        print("Insufficient data to train the model.")
        return

    model = train_model(X, Y)
    if model is None:
        print("Failed to train the model due to an error.")
        return

    print("Model trained successfully. Please enter a seed phrase to generate lyrics.")
    seed_text = input("Enter a seed phrase of up to 5 words to start the lyrics: ").strip()
    while len(seed_text.split()) > 5:
        print("Error: Seed phrase must be no more than 5 words.")
        seed_text = input("Enter a seed phrase of up to 5 words: ").strip()

    generated_lyrics = generate_lyrics(model, tokenizer, seq_length, seed_text, num_words=100)
    print("Generated Lyrics:\n", generated_lyrics)

    generated_words = generated_lyrics.split()
    reference_words = [words for words in (ref.split() for ref in reference_data) if words]
    if not generated_words or not reference_words:
        # The ROUGE scorer rejects an empty hypothesis or reference.
        print("Nothing to evaluate: generated text or reference lyrics are empty.")
        return

    bleu_score = sentence_bleu(reference_words, generated_words)
    print(f"BLEU Score: {bleu_score}")

    rouge = Rouge()
    # ROUGE is computed against the first reference lyric only.
    rouge_scores = rouge.get_scores(' '.join(generated_words), ' '.join(reference_words[0]))
    print("ROUGE Scores:", rouge_scores)

In [40]:
# Run the interactive workflow. This cell blocks on input() prompts for the
# artist, the theme and (after training) the seed phrase.
user_interaction()

Available artists:
21 savage
artic monkeys
ariana grande
billie eilish
cardi b
dj khaled
drake
ed sheeran
eminem
halsey
imagine dragons
justin bieber
lady gaga
machine gun kelly
maroon 5
nirvana
pink floyd
post malone
queen
taylor swift
the beatles
travis scott


Batches: 100%|██████████| 15/15 [00:03<00:00,  4.04it/s]


Available themes:
Emotional Connections
Identity & Recognition
Life & Aspirations
No lyrics data available for the selected theme.
