# English Poems Generator - NLP Project. 
The English Poems Generator is an NLP project that aims to generate English poems.

In [1]:
# Importing needed libraries
import pandas as pd
import numpy as np
import pickle

# Cleaning and Normalization
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dropout, Dense
from tensorflow.keras import regularizers
import random

In [2]:
from google.colab import drive
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


## Reading data
### "Poem Generation" dataset 
Its collection of poems designed specifically for training and developing generative models. This dataset provides a diverse range of poems encompassing different genres, themes, and styles.

In [3]:
en = open('/content/drive/MyDrive/Zubaida_poem.txt', encoding="utf8").read()

In [4]:
# Make the English poems dataset. 
poems_list = en.split("\n")
en_df = pd.DataFrame({'poem': poems_list})
en_df.head()

Unnamed: 0,poem
0,"Stay, I said"
1,to the cut flowers.
2,They bowed
3,their heads lower.
4,"Stay, I said to the spider,"


In [5]:
en_df.shape

(2478, 1)

## Preprocessing
Cleaning Text: The text data is preprocessed to remove irrelevant or unnecessary elements. This involves removing stopwords using the English stopwords from the NLTK corpus, removing special characters using regular expressions, and removing punctuation marks using regular expressions.

Normalizing Text: The text is further normalized to ensure consistency and ease of analysis. This includes lemmatizing the text using the WordNetLemmatizer from the NLTK library and converting the text to lowercase.

In [11]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [12]:
def remove_punctuations(text):
    """
    Removes all punctuation marks from a given text.

    Args:
        text (str): The text to remove punctuation from.

    Returns:
        The text with all punctuation marks removed.
    """
    return re.sub(r'[^\w\s]', '', text)

def remove_enstopwords(text):
    """
    Removes all stopwords from a given English text.

    Args:
        text (str): The text to remove stopwords from.

    Returns:
        The text with all stopwords removed.
    """
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)


def remove_special_chars(text):
    """
    Removes all non-ASCII characters from a given text.

    Args:
        text (str): The text to remove special characters from.

    Returns:
        The text with all non-ASCII characters removed.
    """
    return re.sub(r'[^\x00-\x7f]', r'', text)


In [13]:
en_df["cleaned_poem"] = en_df["poem"].apply(remove_enstopwords)
en_df["cleaned_poem"] = en_df["poem"].apply(remove_special_chars)
en_df["cleaned_poem"] = en_df["poem"].apply(remove_punctuations)

In [14]:

def standardize_text(text):    """
    Args:
    text (str): The input text to be standardized.

    Returns:
        str: The standardized text.
    """
    return text.lower()


In [16]:
en_df["normalized_poem"] = en_df["cleaned_poem"].apply(standardize_text)
en_df.head()

Unnamed: 0,poem,cleaned_poem,normalized_poem
0,"Stay, I said",Stay I said,stay i said
1,to the cut flowers.,to the cut flowers,to the cut flowers
2,They bowed,They bowed,they bowed
3,their heads lower.,their heads lower,their heads lower
4,"Stay, I said to the spider,",Stay I said to the spider,stay i said to the spider


In [17]:
en_df.shape

(2478, 3)

In [18]:
train_data, test_data = train_test_split(en_df, test_size=0.4, shuffle=False)

train_data["cleaned_poem"].to_csv("en_train_data.txt", index=False)
test_data["cleaned_poem"].to_csv("en_eval_data.txt", index=False)

en_df=train_data

In [19]:
en_df.shape

(1486, 3)

## Tokenization
The tokenization step involves breaking down the normalized text into individual tokens or words. This is achieved using the word_tokenize function from the NLTK library. The tokens are stored as a list of lists, where each sublist represents the tokenized poem.


In [20]:
en_df["tokens"] = en_df["normalized_poem"].apply(lambda x: nltk.word_tokenize(x.lower()))
en_df.head()

Unnamed: 0,poem,cleaned_poem,normalized_poem,tokens
0,"Stay, I said",Stay I said,stay i said,"[stay, i, said]"
1,to the cut flowers.,to the cut flowers,to the cut flowers,"[to, the, cut, flowers]"
2,They bowed,They bowed,they bowed,"[they, bowed]"
3,their heads lower.,their heads lower,their heads lower,"[their, heads, lower]"
4,"Stay, I said to the spider,",Stay I said to the spider,stay i said to the spider,"[stay, i, said, to, the, spider]"


## LSTM Model
The English Poems Generator employs a deep learning model to generate poems. The model architecture consists of multiple layers, including an Embedding layer, Bidirectional LSTM layers, Dropout layers for regularization, and Dense layers for output prediction. The model is compiled with the categorical_crossentropy loss function and the Adam optimizer. Model training is performed on the predictors (input sequences) and labels (output sequences) obtained from the tokenized and padded data.

In [None]:
# get the vocabulary size
unique_words = []
for poem in en_df["tokens"]:
    unique_words.extend(poem)

vocabulary = list(set(unique_words))
vocabulary_size = len(vocabulary)
vocabulary_size

3108

In [None]:
# Initialize the tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on your data
tokenizer.fit_on_texts(en_df["tokens"])

# Save the tokenizer using pickle
with open('en_tokenizer.pickle', 'wb') as f:
    pickle.dump(tokenizer, f)

In [None]:
# Converting the text into embeddings
input_sequences = []
for line in en_df["tokens"]:
	token_list = tokenizer.texts_to_sequences([line])[0]

	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1]
		input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = tf.keras.utils.to_categorical(label, num_classes=vocabulary_size+1)

In [None]:
model = Sequential()
model.add(Embedding(vocabulary_size+1, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dense((vocabulary_size+1)//2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(vocabulary_size+1, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 100)           310900    
                                                                 
 bidirectional (Bidirectiona  (None, 15, 512)          731136    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 15, 512)           0         
                                                                 
 lstm_1 (LSTM)               (None, 15, 128)           328192    
                                                                 
 dropout_1 (Dropout)         (None, 15, 128)           0         
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                        

In [None]:
history = model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [None]:
model.save("en_poem_generation_model.h5")

## Generating the poems
To generate new poems, a seed text is provided as input. The seed text serves as the starting point for the model to generate subsequent words or tokens. Using the trained model, the generator predicts the next word based on the context and patterns learned during training. The generation process continues for a specified number of words, gradually expanding the generated poem. The temperature parameter is used to control the randomness of the generated output.

In [None]:
seed_text = "Happiness is "
next_words = 25
output_text = seed_text

temperature = 0.6  # Adjust the temperature (higher values for more randomness, lower for more determinism)

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)[0]
    predicted_probs = np.log(predicted_probs) / temperature
    predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))
    predicted_index = np.random.choice(len(predicted_probs), size=1, p=predicted_probs)[0]
    output_word = tokenizer.index_word[predicted_index]
    seed_text += " " + output_word
    output_text += " " + output_word

print(output_text)


Happiness is  the star heard a love i looked for my the white is not he a heart like happy bow this skin is know and in


## GPT2 Model

In [22]:
!pip install gpt_2_simple

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting gpt_2_simple
  Downloading gpt_2_simple-0.8.1.tar.gz (26 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting toposort (from gpt_2_simple)
  Downloading toposort-1.10-py3-none-any.whl (8.5 kB)
Collecting numpy (from gpt_2_simple)
  Downloading numpy-1.23.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.1/17.1 MB[0m [31m76.2 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: gpt_2_simple
  Building wheel for gpt_2_simple (setup.py) ... [?25l[?25hdone
  Created wheel for gpt_2_simple: filename=gpt_2_simple-0.8.1-py3-none-any.whl size=24559 sha256=c8e7232f813b7397969028b3473d16a5fff67e65177c024333ea2de39d3dc96c
  Stored in directory: /root/.cache/pip/wheels/df/6a/fe/10d3223f78d1ac3e4c83bb4c5e2d28dfb1789c2fb4cc7ea8d0
Successfully built gpt_2_simple
Installi

In [23]:
import gpt_2_simple as gpt2

In [24]:
gpt2.download_gpt2(model_name="124M")

Fetching checkpoint: 1.05Mit [00:00, 636Mit/s]                                                      
Fetching encoder.json: 1.05Mit [00:00, 4.95Mit/s]
Fetching hparams.json: 1.05Mit [00:00, 801Mit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 498Mit [00:08, 59.7Mit/s]                                  
Fetching model.ckpt.index: 1.05Mit [00:00, 549Mit/s]                                                
Fetching model.ckpt.meta: 1.05Mit [00:00, 5.70Mit/s]
Fetching vocab.bpe: 1.05Mit [00:00, 5.65Mit/s]


### Finetune GPT-2

The next cell will start the actual finetuning of GPT-2. It creates a persistent TensorFlow session which stores the training config, then runs the training for the specified number of `steps`. (to have the finetuning run indefinitely, set `steps = -1`)

The model checkpoints will be saved in `/checkpoint/run1` by default. The checkpoints are saved every 500 steps (can be changed) and when the cell is stopped.

<br><br>
Parameters:

* sess: The TensorFlow session in which the fine-tuning will take place.

* dataset: The name or path of the dataset file used for fine-tuning. It should be a plain text file where each training example is on a separate line.

* model_name: The model architecture to use for fine-tuning. It refers to the GPT-2 model variant, such as '124M', '355M', etc. The number represents the number of parameters in millions.

* steps: The number of training steps (iterations) to perform during fine-tuning.

* restore_from: The checkpoint from which to restore the model weights. It can take the following values:
    * 'fresh': Initialize the model weights randomly (starts training from scratch).
    * 'latest': Resume training from the latest checkpoint (continues training from the last saved checkpoint).
    * 'specific': Restore from a specific checkpoint by providing the path or name of the checkpoint file.
* run_name: The name of the run or experiment. It is used to identify and save the checkpoints and training logs associated with this specific run.

* print_every: The frequency (in steps) at which to print the training progress and loss during fine-tuning.

* sample_every: The frequency (in steps) at which to generate sample outputs from the model during fine-tuning. This can be useful to monitor the model's progress and generate creative text samples.

* save_every: The frequency (in steps) at which to save the model checkpoint during fine-tuning. It determines how often the model's weights and optimizer state are saved for future use or evaluation.

In [25]:
train_data.shape

(1486, 4)

In [32]:
file_name = "/content/en_train_data.txt"

sess = gpt2.start_tf_sess()

In [33]:
gpt2.reset_session(sess)

<tensorflow.python.client.session.Session at 0x7f500c166cb0>

In [34]:
sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              dataset=file_name,
              model_name='124M',
              steps=100,
              restore_from='fresh',
              run_name='run2',
              print_every=10,
              sample_every=50,
              save_every=10
              )

Loading checkpoint models/124M/model.ckpt
Loading dataset...


100%|██████████| 1/1 [00:00<00:00,  7.84it/s]

dataset has 13199 tokens
Training...





[10 | 165.65] loss=3.95 avg=3.95
Saving checkpoint/run2/model-10
[20 | 325.61] loss=3.21 avg=3.58
Saving checkpoint/run2/model-20


Instructions for updating:
Use standard file APIs to delete files with this prefix.


[30 | 482.04] loss=2.40 avg=3.18
Saving checkpoint/run2/model-30
[40 | 632.09] loss=1.41 avg=2.73
Saving checkpoint/run2/model-40
[50 | 778.06] loss=0.78 avg=2.33
Saving checkpoint/run2/model-50
 my people, as the wind brings new life to your cabin
from the sea and the mountains and other such things
choices you make
my women or your farm or your cabin
choices you make
you make
to feed the dead and the dying
youll find out soon enough
when my lambs are out in the yard
harbouring for your barn
youll find me there
choices you make
to feed and clothe your flock
and to keep the poor well off
youll find me there
choices you make
to feed and clothe your family
and to keep the poor and health conscious
choices you make
To feed my hungry and to clothe my child
Ive got four small pheasants
choices do you like very well
To feed and clothe my baby baby
Ive got four smart and do you like very well
To feed and clothe my baby baby
Ive got four pretty and do you like very well
To feed and clothe your

In [36]:
gpt2.copy_checkpoint_to_gdrive(run_name='run2')

In [37]:
gpt2.copy_checkpoint_from_gdrive(run_name='run2')

In [38]:
gpt2.load_gpt2(sess, run_name='run2')

ValueError: ignored

* length: The length of the generated text in terms of tokens. It determines how many tokens the generated output will contain. Note that the actual length of the output may vary depending on the model's behavior.

* temperature: A parameter that controls the randomness of the generated text. Higher values (e.g., above 1.0) result in more random and diverse output, while lower values (e.g., below 1.0) make the output more focused and deterministic.

* prefix: A starting prompt or seed text from which the generation begins. The generated text will continue from the given prefix.

* nsamples: The number of independent samples to generate. Each sample is a separate generated output. Setting a higher value for nsamples will result in multiple generated texts.

* batch_size: The number of samples to generate in parallel. Specifying a higher batch_size can improve generation speed but requires more computational resources.

In [39]:
text = gpt2.generate(sess,
                     length=100,
                     run_name="run2",
                     temperature=0.7,
                     prefix="Happines is"
                     )

Happines is a poetical bent that I can only remark
On the flowers of the field they are as neat and they are as fair
As the fields in the morning and they are still
When the moons are shining bright and the dew is shining
On the sweet tones of the sea and the land of their being
Kissed the lilywhite and the white cot and all
The tiniest twirl of white smoke hung over the bonnet
Had the white foam over the


In [59]:
eval_data = pd.read_csv("/content/en_eval_data.txt")
eval_data

Unnamed: 0,cleaned_poem
0,Her voice it was chanting melodious
1,She left me scarce able to go
2,My heart it is soothed in solace
3,My CailÃn deas crÃºite na mbÃ³
4,With courtesy I did salute her
...,...
987,gunwale Islington and Isle of Wight Housewife
988,verdict and indict Finally which rhymes with
989,enough Though through plough or dough or
990,cough Hiccough has the sound of cup


In [70]:
generated_text = "Happines is a poetical bent that I can only remark On the flowers of the field they are as neat and they are as fairAs the fields in the morning and they are still When the moons are shining bright and the dew is shining On the sweet tones of the sea and the land of their being Kissed the lilywhite and the white cot and all The tiniest twirl of white smoke hung over the bonnet Had the white foam over the"

## Evaluation 
To evaluate the generated poems, a measure of perplexity is calculated. Perplexity is a common metric used to assess the quality and fluency of language models. It measures how well a model predicts the next word in a sequence. The lower the perplexity, the better the model's performance. The perplexity is calculated by comparing the predicted probabilities of the true labels in the test set and averaging the log-likelihoods. A lower perplexity indicates a higher level of coherence and fluency in the generated poems.

In [None]:
test_data = pd.read_csv("test_data.csv")

# Apply the same preprocessing steps as done for training data
test_data["cleaned_poem"] = test_data["poem"].apply(remove_enstopwords)
test_data["cleaned_poem"] = test_data["poem"].apply(remove_special_chars)
test_data["cleaned_poem"] = test_data["poem"].apply(remove_punctuations)
test_data["normalized_poem"] = test_data["cleaned_poem"].apply(normalize_text)

# Tokenize the test data
test_sequences = tokenizer.texts_to_sequences(test_data["normalized_poem"])
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_len - 1, padding='pre')
test_labels = tf.keras.utils.to_categorical(test_sequences[:, -1], num_classes=vocabulary_size+1)


In [None]:
# Use the model to predict probabilities for the test data
test_predictions = model.predict(test_sequences)

# Calculate the log-likelihoods of the true labels
true_label_indices = np.argmax(test_labels, axis=1)
log_likelihoods = np.log(test_predictions[np.arange(len(test_sequences)), true_label_indices])

# Calculate perplexity
perplexity = np.exp(-np.mean(log_likelihoods))
print("Perplexity:", perplexity)


Perplexity: 8834701.0


Some possible actions to improve the perplexity value include increasing the size and diversity of the training data, refining the model architecture, adjusting hyperparameters, or employing more advanced techniques such as transfer learning or fine-tuning.

In [None]:
train_data = pd.read_csv("train_data.csv")

# Apply the same preprocessing steps as done for training data
train_data["cleaned_poem"] = train_data["poem"].apply(remove_enstopwords)
train_data["cleaned_poem"] = train_data["poem"].apply(remove_special_chars)
train_data["cleaned_poem"] = train_data["poem"].apply(remove_punctuations)
train_data["normalized_poem"] = train_data["cleaned_poem"].apply(normalize_text)

# Tokenize the test data
train_sequences = tokenizer.texts_to_sequences(train_data["normalized_poem"])
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_len - 1, padding='pre')
train_labels = tf.keras.utils.to_categorical(train_sequences[:, -1], num_classes=vocabulary_size+1)

embeddings similarity

In [None]:
embedding_weights = model.get_layer('embedding').get_weights()[0]


In [None]:
generated_poems = []

for poem in output_text.split("\n"):
    generated_poem = []
    for word in poem.split():
        word_index = tokenizer.word_index.get(word)
        if word_index is not None:
            embedding_vector = embedding_weights[word_index]
            generated_poem.append(embedding_vector)
    generated_poems.append(generated_poem)


In [None]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = []

for generated_poem in generated_poems:
    poem_similarity_scores = []
    for original_poem in en_df["tokens"]:
        original_poem_embedding = []
        for word in original_poem:
            if word in tokenizer.word_index and tokenizer.word_index[word] in embedding_weights:
                word_index = tokenizer.word_index[word]
                original_poem_embedding.append(embedding_weights[word_index])
        if len(original_poem_embedding) > 0:
            original_poem_embedding = np.mean(original_poem_embedding, axis=0)
            generated_poem_embedding = np.mean(generated_poem, axis=0)
            original_poem_embedding = np.reshape(original_poem_embedding, (1, -1))
            generated_poem_embedding = np.reshape(generated_poem_embedding, (1, -1))
            similarity_score = cosine_similarity(original_poem_embedding, generated_poem_embedding)[0][0]
            poem_similarity_scores.append(similarity_score)
    if len(poem_similarity_scores) > 0:
        similarity_scores.append(np.mean(poem_similarity_scores))

if len(similarity_scores) > 0:
    average_similarity = np.nanmean(similarity_scores)
else:
    average_similarity = 0.0

print("Average Similarity:", average_similarity)


Average Similarity: 0.0


### Evaluation of the GPT Model 