In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
import nltk
import re
import keras
import random
import io
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import LSTM, Dense, Dropout, Embedding, Bidirectional  # Import Embedding layer
from keras.optimizers import Adamax
from sklearn.metrics import mean_squared_error
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import warnings
warnings.filterwarnings("ignore")
from google.colab import drive

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


ModuleNotFoundError: No module named 'seaborn'

In [2]:
# Mount Google Drive
drive.mount('/content/drive')

# Load LSTM data
df_lstm = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DL_mp/data/lyrics-data.csv")
df_lstm.drop(['ALink','SName','SLink'],axis=1,inplace=True)
df_lstm = df_lstm[:700]

# Load RNN data
df_rnn = pd.read_csv("/content/drive/MyDrive/Colab Notebooks/DL_mp/data/Songs.csv")
df_rnn.drop(['Artist'],axis=1,inplace=True)

Mounted at /content/drive


In [3]:
# Define LSTM model
def lstm_model(df):
    # Preprocessing
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(df['Lyric'].astype(str).str.lower())
    total_words = len(tokenizer.word_index)+1
    tokenized_sentences = tokenizer.texts_to_sequences(df['Lyric'].astype(str))

    # Slash sequences into n gram sequence
    input_sequences = []
    for i in tokenized_sentences:
        for t in range(1, len(i)):
            n_gram_sequence = i[:t+1]
            input_sequences.append(n_gram_sequence)

    # Pre-padding
    max_sequence_len = max([len(x) for x in input_sequences])
    input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

    # Create predictors and label
    X, labels = input_sequences[:,:-1],input_sequences[:,-1]
    y = to_categorical(labels, num_classes=total_words)

    # Create model
    model = Sequential()
    model.add(Embedding(total_words, 40, input_length=max_sequence_len-1))
    model.add(Bidirectional(LSTM(250)))
    model.add(Dropout(0.1))
    model.add(Dense(total_words, activation='softmax'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

    # Train model
    history = model.fit(X, y, epochs=10, verbose=1)

    return model, tokenizer, max_sequence_len

In [4]:
# Define RNN model
def rnn_model(df):
    # Preprocessing
    Corpus = ''.join(df['Lyrics']).lower()
    to_remove = ['{', '}', '~', '©', 'à', 'á', 'ã', 'ä', 'ç', 'è', 'é', 'ê', 'ë', 'í', 'ñ', 'ó', 'ö', 'ü', 'ŏ',
                 'е', 'ا', 'س', 'ل', 'م', 'و', '\u2005', '\u200a', '\u200b', '–', '—', '‘', '’', '‚', '“', '”',
                 '…', '\u205f', '\ufeff', '!', '&', '(', ')', '*', '-',  '/', ]
    for symbol in to_remove:
        Corpus = Corpus.replace(symbol, " ")

    symb = sorted(list(set(Corpus)))
    mapping = dict((c, i) for i, c in enumerate(symb))
    reverse_mapping = dict((i, c) for i, c in enumerate(symb))

    # Splitting the Corpus
    length = 40
    features, targets = [], []
    for i in range(0, len(Corpus) - length, 1):
        feature = Corpus[i:i + length]
        target = Corpus[i + length]
        features.append([mapping[j] for j in feature])
        targets.append(mapping[target])

    X = (np.reshape(features, (len(features), length, 1)))/ float(len(symb))
    y = to_categorical(targets)

    # Create RNN model
    model = Sequential()
    model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
    model.add(Dense(y.shape[1], activation='softmax'))
    opt = Adamax(learning_rate=0.01)
    model.compile(loss='categorical_crossentropy', optimizer=opt)

    # Train RNN model
    history = model.fit(X, y, batch_size=128, epochs=10)

    return model, reverse_mapping

In [5]:
# Generate lyrics from LSTM model
def generate_lyrics_lstm(seed_text, model, tokenizer, max_sequence_len, length):
    completed_song = seed_text
    for _ in range(length):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len, padding='pre')
        predicted = model.predict(token_list, verbose=0)[0]
        predicted_index = np.argmax(predicted)
        output_word = ""
        for word, index in tokenizer.word_index.items():
            if index == predicted_index:
                output_word = word
                break
        seed_text += " " + output_word
        completed_song += " " + output_word
    return completed_song

In [6]:
# Generate lyrics from RNN model
def generate_lyrics_rnn(seed_text, model, reverse_mapping, length):
    generated= ""
    starter = seed_text
    seed = [mapping[char] for char in starter]
    generated += starter
    for i in range(length):
        seed = [mapping[char] for char in starter]
        x_pred = np.reshape(seed, (1, len(seed), 1))
        x_pred = x_pred/ float(len(symb))
        prediction = model.predict(x_pred, verbose=0)[0]
        prediction = np.asarray(prediction).astype('float64')
        prediction = np.log(prediction) / 1.0
        exp_preds = np.exp(prediction)
        prediction = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, prediction, 1)
        index = np.argmax(prediction)
        next_char = reverse_mapping[index]
        generated += next_char
        starter = starter[1:] + next_char
    return generated

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
# Define input text
input_text = "the sky is blue"

# Train LSTM model
lstm_model, lstm_tokenizer, lstm_max_sequence_len = lstm_model(df_lstm)

# Train RNN model
rnn_model, reverse_mapping = rnn_model(df_rnn)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [10]:
# Generate lyrics from LSTM model
generated_lyrics_lstm = generate_lyrics_lstm(input_text, lstm_model, lstm_tokenizer, lstm_max_sequence_len, 140)
print(generated_lyrics_lstm)

# Generate lyrics from RNN model
generated_lyrics_rnn = generate_lyrics_rnn(input_text, rnn_model, reverse_mapping, 140)
print(generated_lyrics_rnn)

ValueError: in user code:

    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2440, in predict_function  *
        return step_function(self, iterator)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2425, in step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2413, in run_step  **
        outputs = model.predict_step(data)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/training.py", line 2381, in predict_step
        return self(x, training=False)
    File "/usr/local/lib/python3.10/dist-packages/keras/src/utils/traceback_utils.py", line 70, in error_handler
        raise e.with_traceback(filtered_tb) from None
    File "/usr/local/lib/python3.10/dist-packages/keras/src/engine/input_spec.py", line 298, in assert_input_compatibility
        raise ValueError(

    ValueError: Input 0 of layer "sequential_1" is incompatible with the layer: expected shape=(None, 584), found shape=(None, 585)


In [None]:
# Calculate MSE
from sklearn.metrics import mean_squared_error
mse_score = mean_squared_error(generated_lyrics_lstm, generated_lyrics_rnn)
print("MSE Score:", mse_score)

In [None]:
# Plot graphical representation
plt.plot(generated_lyrics_lstm, label='LSTM')
plt.plot(generated_lyrics_rnn, label='RNN')
plt.xlabel('Time')
plt.ylabel('Lyrics')
plt.title('Comparison of LSTM and RNN Generated Lyrics')
plt.legend()
plt.show()