# Import Libraries

In [144]:
import numpy as np
import tensorflow.keras.utils as ku 
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, LSTM, Embedding, Bidirectional, Flatten, Reshape, Dropout

# Read Data

Read the data from the text file and split it into lines, so that each line of text becomes one entry of the corpus.

In [145]:
# Read the whole dataset into memory as a single string. The context manager
# closes the file handle as soon as the read finishes instead of leaving it
# open for the lifetime of the kernel.
with open('../input/lyrics-generation/lyrics_dataset.txt') as lyrics_file:
    data = lyrics_file.read()

In [146]:
# Normalise case first, then cut the text at newlines: one corpus entry per line.
lowercased_text = data.lower()
corpus = lowercased_text.split("\n")

# Data preprocessing

Remove duplicate lines so that each line appears only once in the corpus.

In [147]:
# Drop duplicate lines while keeping the first occurrence of each line in its
# original position. dict keys preserve insertion order, whereas iterating a
# set of strings yields an order that can change from one kernel session to
# the next, which would make everything derived from the corpus order
# non-reproducible.
corpus = list(dict.fromkeys(corpus))

Convert the text to numbers using a tokenizer, which gives each word an integer id.

In [148]:
# Build the vocabulary by fitting the tokenizer on every line of the corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)

# Vocabulary size is the number of known words plus one.
# NOTE(review): the extra slot presumably accounts for id 0, the value used
# for padding, with real word ids starting at 1 - confirm against Keras docs.
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)

In [149]:
# Create the training sequences from the tokenised lines.
# For every line, each prefix of at least two tokens becomes one sequence
# (tokens [0..1], [0..2], ..., whole line), so the last token of every prefix
# can later serve as the label for the tokens before it.
input_sequences = []
for line in corpus:
    token_list = tokenizer.texts_to_sequences([line])[0]

    # Lines with fewer than two tokens produce no iterations here and therefore
    # contribute no sequences.
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        # The append has to live inside this loop: outside of it only the last
        # prefix of each line would be stored, and a line with fewer than two
        # tokens would re-append the previous line's sequence.
        input_sequences.append(n_gram_sequence)

In [150]:
# Display the first sequence of token ids as a quick sanity check.
input_sequences[0]

In [151]:
# Display the vocabulary size (len(tokenizer.word_index) + 1).
total_words

Pad the sequences with leading zeros so that all of them are equal in length.

In [152]:
# The longest sequence sets the common length; shorter sequences are padded
# at the front ('pre') up to that length.
max_sequence_len = max(len(sequence) for sequence in input_sequences)
padded = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
input_sequences = np.array(padded)

The problem we are solving here is a supervised learning problem. So we will have to provide the model with some labels so that it can generalise the relation between the words used to predict and the predicted word.

So, for each input sequence, we will use its last word as the label and all the previous words as the input.

In [153]:
# Every column except the last is the model input; the last column is the label.
train = input_sequences[:, :-1]
labels = input_sequences[:, -1]

In [154]:
# Turn the integer labels into categorical vectors with one class per
# vocabulary entry, matching the categorical cross-entropy loss used below.
labels = ku.to_categorical(
    labels,
    num_classes=total_words,
)

# Model

In [155]:
# Next-word prediction network: embedding -> bidirectional LSTM -> dropout ->
# LSTM -> dense hidden layer -> softmax over the vocabulary.
model = Sequential()
# Each input row holds max_sequence_len - 1 token ids (the last column of the
# padded sequences was split off as the label).
model.add(Embedding(total_words, 50, input_length=max_sequence_len-1))
# Add an LSTM layer; return_sequences=True keeps the per-time-step outputs so
# that the second LSTM below receives a sequence.
model.add(Bidirectional(LSTM(150, return_sequences=True)))
# A dropout layer for regularisation
model.add(Dropout(0.2))
# Add another LSTM layer
model.add(LSTM(100))
# Integer division: the number of units must be an int, and `/` would produce
# a float in Python 3.
model.add(Dense(total_words // 2, activation='relu'))
# In the last layer, the shape should be equal to the total number of words present in our corpus
model.add(Dense(total_words, activation='softmax'))
# Pick a loss function and an optimizer; metrics is passed as a list, which is
# the documented form of the argument.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

In [156]:
# Display the shapes of the labels and the inputs before training.
labels.shape, train.shape

In [157]:
# Train on the full set of sequences for 100 epochs with progress output enabled.
model.fit(
    train,
    labels,
    epochs=100,
    verbose=1,
)

# Evaluate Model


In [160]:
# Generate text: starting from the seed, repeatedly predict the most likely
# next word and append it to the running text.
seed_text = "youssef"
next_words = 90

# Invert the vocabulary once so that a predicted id is resolved with a single
# dict lookup instead of scanning every word (and recomputing argmax for each
# of them) on every generation step.
index_to_word = {index: word for word, index in tokenizer.word_index.items()}

for _ in range(next_words):
    # Encode the text generated so far and pad it to the length the model expects.
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(token_list, verbose=0)

    # Greedy decoding: take the id with the highest predicted probability.
    predicted_index = int(np.argmax(predicted))
    # An id with no vocabulary entry maps to an empty string, as before.
    output_word = index_to_word.get(predicted_index, "")
    seed_text += " " + output_word
print(seed_text)