# English Poems Generator - NLP Project. 
The English Poems Generator is an NLP project that aims to generate English poems.

In [1]:
# Importing needed libraries
import pandas as pd
import numpy as np
import pickle

# Cleaning and Normalization
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dropout, Dense
from tensorflow.keras import regularizers
import random

## Reading data
### "Poem Generation" dataset 
Its collection of poems designed specifically for training and developing generative models. This dataset provides a diverse range of poems encompassing different genres, themes, and styles.

In [96]:
en = open('../PoemsNLP/en/poem.txt', encoding="utf8").read()

In [97]:
# Make the English poems dataset. 
poems_list = en.split("\n")
en_df = pd.DataFrame({'poem': poems_list})
en_df.head()

Unnamed: 0,poem
0,"Stay, I said"
1,to the cut flowers.
2,They bowed
3,their heads lower.
4,"Stay, I said to the spider,"


In [98]:
en_df.shape

(2478, 1)

## Preprocessing
Cleaning Text: The text data is preprocessed to remove irrelevant or unnecessary elements. This involves removing stopwords using the English stopwords from the NLTK corpus, removing special characters using regular expressions, and removing punctuation marks using regular expressions.

Normalizing Text: The text is further normalized to ensure consistency and ease of analysis. This includes lemmatizing the text using the WordNetLemmatizer from the NLTK library and converting the text to lowercase.

In [99]:
def remove_punctuations(text):
    """
    Removes all punctuation marks from a given text.

    Args:
        text (str): The text to remove punctuation from.

    Returns:
        The text with all punctuation marks removed.
    """
    return re.sub(r'[^\w\s]', '', text)

def remove_enstopwords(text):
    """
    Removes all stopwords from a given English text.

    Args:
        text (str): The text to remove stopwords from.

    Returns:
        The text with all stopwords removed.
    """
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return " ".join(filtered_tokens)


def remove_special_chars(text):
    """
    Removes all non-ASCII characters from a given text.

    Args:
        text (str): The text to remove special characters from.

    Returns:
        The text with all non-ASCII characters removed.
    """
    return re.sub(r'[^\x00-\x7f]', r'', text)


In [100]:
en_df["cleaned_poem"] = en_df["poem"].apply(remove_enstopwords)
en_df["cleaned_poem"] = en_df["poem"].apply(remove_special_chars)
en_df["cleaned_poem"] = en_df["poem"].apply(remove_punctuations)

In [101]:
def lemmatize_text(text):
    """ 
    Args:
    text (str): The input text to be lemmatized.
    Returns:
        str: The lemmatized text.
    """
    lemmatizer = WordNetLemmatizer()
    tokens = word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return " ".join(lemmatized_tokens)

def standardize_text(text):
    """
    Args:
    text (str): The input text to be standardized.

    Returns:
        str: The standardized text.
    """
    return text.lower()

def normalize_text(text):
    """
    Normalize a text by lowering the chars and lemmatizing the text. 

    Args:
        text (str): The text to clean.

    Returns:
        The cleaned text.
    """
    text = lemmatize_text(text)
    text = standardize_text(text)
    return text

In [102]:
en_df["normalized_poem"] = en_df["cleaned_poem"].apply(normalize_text)
en_df.head()

Unnamed: 0,poem,cleaned_poem,normalized_poem
0,"Stay, I said",Stay I said,stay i said
1,to the cut flowers.,to the cut flowers,to the cut flower
2,They bowed,They bowed,they bowed
3,their heads lower.,their heads lower,their head lower
4,"Stay, I said to the spider,",Stay I said to the spider,stay i said to the spider


In [111]:
train_data, test_data = train_test_split(en_df, test_size=0.4, shuffle=False)

train_data["cleaned_poem"].to_csv("en_train_data.txt", index=False)
test_data["cleaned_poem"].to_csv("en_eval_data.txt", index=False)

en_df=train_data

## Tokenization
The tokenization step involves breaking down the normalized text into individual tokens or words. This is achieved using the word_tokenize function from the NLTK library. The tokens are stored as a list of lists, where each sublist represents the tokenized poem.


In [9]:
en_df["tokens"] = en_df["normalized_poem"].apply(lambda x: nltk.word_tokenize(x.lower()))
en_df.head()

Unnamed: 0,poem,cleaned_poem,normalized_poem,tokens
2464,"out, joust and scour, scourging. Ear, but earn",out joust and scour scourging Ear but earn,out joust and scour scourging ear but earn,"[out, joust, and, scour, scourging, ear, but, ..."
1561,In Mullingar that night I rested limbs so wear...,In Mullingar that night I rested limbs so wear...,in mullingar that night i rested limb so weary...,"[in, mullingar, that, night, i, rested, limb, ..."
2469,(think of Psyche!) Is a paling stout and spikey?,think of Psyche Is a paling stout and spikey,think of psyche is a paling stout and spikey,"[think, of, psyche, is, a, paling, stout, and,..."
1513,With CailÃ­n deas crÃºite na mbÃ³.,With CailÃn deas crÃºite na mbÃ³,with cailãn dea crãºite na mbã³,"[with, cailãn, dea, crãºite, na, mbã³]"
1133,Far renownd for learnin and piety;,Far renownd for learnin and piety,far renownd for learnin and piety,"[far, renownd, for, learnin, and, piety]"


## LSTM Model
The English Poems Generator employs a deep learning model to generate poems. The model architecture consists of multiple layers, including an Embedding layer, Bidirectional LSTM layers, Dropout layers for regularization, and Dense layers for output prediction. The model is compiled with the categorical_crossentropy loss function and the Adam optimizer. Model training is performed on the predictors (input sequences) and labels (output sequences) obtained from the tokenized and padded data.

In [10]:
# get the vocabulary size
unique_words = []
for poem in en_df["tokens"]:
    unique_words.extend(poem)

vocabulary = list(set(unique_words))
vocabulary_size = len(vocabulary)
vocabulary_size

3108

In [11]:
# Initialize the tokenizer
tokenizer = Tokenizer()

# Fit the tokenizer on your data
tokenizer.fit_on_texts(en_df["tokens"])

# Save the tokenizer using pickle
with open('en_tokenizer.pickle', 'wb') as f:
    pickle.dump(tokenizer, f)

In [12]:
# Converting the text into embeddings
input_sequences = []
for line in en_df["tokens"]:
	token_list = tokenizer.texts_to_sequences([line])[0]

	for i in range(1, len(token_list)):
		n_gram_sequence = token_list[:i+1]
		input_sequences.append(n_gram_sequence)

max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))
predictors, label = input_sequences[:, :-1], input_sequences[:, -1]
label = tf.keras.utils.to_categorical(label, num_classes=vocabulary_size+1)

In [13]:
model = Sequential()
model.add(Embedding(vocabulary_size+1, 100, input_length=max_sequence_len-1))
model.add(Bidirectional(LSTM(256, return_sequences=True)))
model.add(Dropout(0.5))
model.add(LSTM(128, return_sequences=True))
model.add(Dropout(0.5))
model.add(LSTM(64))
model.add(Dense((vocabulary_size+1)//2, activation='relu', kernel_regularizer=regularizers.l2(0.01)))
model.add(Dense(vocabulary_size+1, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 15, 100)           310900    
                                                                 
 bidirectional (Bidirectiona  (None, 15, 512)          731136    
 l)                                                              
                                                                 
 dropout (Dropout)           (None, 15, 512)           0         
                                                                 
 lstm_1 (LSTM)               (None, 15, 128)           328192    
                                                                 
 dropout_1 (Dropout)         (None, 15, 128)           0         
                                                                 
 lstm_2 (LSTM)               (None, 64)                49408     
                                                        

In [14]:
history = model.fit(predictors, label, epochs=10, verbose=1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [15]:
model.save("en_poem_generation_model.h5")

## Generating the poems
To generate new poems, a seed text is provided as input. The seed text serves as the starting point for the model to generate subsequent words or tokens. Using the trained model, the generator predicts the next word based on the context and patterns learned during training. The generation process continues for a specified number of words, gradually expanding the generated poem. The temperature parameter is used to control the randomness of the generated output.

In [16]:
seed_text = "Happiness is "
next_words = 25
output_text = seed_text

temperature = 0.6  # Adjust the temperature (higher values for more randomness, lower for more determinism)

for _ in range(next_words):
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
    predicted_probs = model.predict(token_list, verbose=0)[0]
    predicted_probs = np.log(predicted_probs) / temperature
    predicted_probs = np.exp(predicted_probs) / np.sum(np.exp(predicted_probs))
    predicted_index = np.random.choice(len(predicted_probs), size=1, p=predicted_probs)[0]
    output_word = tokenizer.index_word[predicted_index]
    seed_text += " " + output_word
    output_text += " " + output_word

print(output_text)


Happiness is  the star heard a love i looked for my the white is not he a heart like happy bow this skin is know and in


## GPT2 Model

In [108]:
import gpt_2_simple as gpt2

In [109]:
gpt2.download_gpt2(model_name="124M")

Fetching checkpoint: 1.05Mit [00:00, 95.3Mit/s]                                                     
Fetching encoder.json: 1.05Mit [00:01, 629kit/s]                                                    
Fetching hparams.json: 1.05Mit [00:00, 521Mit/s]                                                    
Fetching model.ckpt.data-00000-of-00001: 498Mit [02:12, 3.76Mit/s]                                  
Fetching model.ckpt.index: 1.05Mit [00:00, 418Mit/s]                                                
Fetching model.ckpt.meta: 1.05Mit [00:01, 960kit/s]                                                 
Fetching vocab.bpe: 1.05Mit [00:01, 970kit/s]                                                       


### Finetune GPT-2

The next cell will start the actual finetuning of GPT-2. It creates a persistent TensorFlow session which stores the training config, then runs the training for the specified number of `steps`. (to have the finetuning run indefinitely, set `steps = -1`)

The model checkpoints will be saved in `/checkpoint/run1` by default. The checkpoints are saved every 500 steps (can be changed) and when the cell is stopped.

<br><br>
Parameters:

* sess: The TensorFlow session in which the fine-tuning will take place.

* dataset: The name or path of the dataset file used for fine-tuning. It should be a plain text file where each training example is on a separate line.

* model_name: The model architecture to use for fine-tuning. It refers to the GPT-2 model variant, such as '124M', '355M', etc. The number represents the number of parameters in millions.

* steps: The number of training steps (iterations) to perform during fine-tuning.

* restore_from: The checkpoint from which to restore the model weights. It can take the following values:
    * 'fresh': Initialize the model weights randomly (starts training from scratch).
    * 'latest': Resume training from the latest checkpoint (continues training from the last saved checkpoint).
    * 'specific': Restore from a specific checkpoint by providing the path or name of the checkpoint file.
* run_name: The name of the run or experiment. It is used to identify and save the checkpoints and training logs associated with this specific run.

* print_every: The frequency (in steps) at which to print the training progress and loss during fine-tuning.

* sample_every: The frequency (in steps) at which to generate sample outputs from the model during fine-tuning. This can be useful to monitor the model's progress and generate creative text samples.

* save_every: The frequency (in steps) at which to save the model checkpoint during fine-tuning. It determines how often the model's weights and optimizer state are saved for future use or evaluation.

In [112]:
train_data.shape

(951, 3)

In [119]:
file_name = "en_train_data.txt"

sess = gpt2.start_tf_sess()

In [134]:
gpt2.reset_session(sess)

<tensorflow.python.client.session.Session at 0x1bb8a1505b0>

In [135]:
sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              dataset=file_name,
              model_name='124M',
              steps=2000,
              restore_from='fresh',
              run_name='en_run2',  
              print_every=50,
              sample_every=200,
              save_every=500
             )


Loading checkpoint models\124M\model.ckpt
INFO:tensorflow:Restoring parameters from models\124M\model.ckpt
Loading dataset...


100%|██████████| 1/1 [00:00<00:00, 985.50it/s]

dataset has 9136 tokens
Training...





[50 | 2176.67] loss=0.64 avg=0.64
[100 | 4239.08] loss=0.04 avg=0.34
[150 | 6303.85] loss=0.02 avg=0.23
[200 | 8386.76] loss=0.02 avg=0.18
 guns and arrows
What kind of fish they had in Ukraine
Her dad kept a pig and my wife bore
She begged for forgiveness and promised soon she would
The Indies and the mountains made beautiful by her soul
Love and loathing estranged
And where are they keeping the girls
Because if you can be with joy
Thou in me forever
And with me need appear
That the world is one
This deep blue sky is over it all
There were the Banns am Bastards
The girls in their ribbons they got
And the divil the one that did me part
Gone a girl a judge none
She could not be with fairer color
White is the hairdresser's shade
When I begged for forgiveness
That bug my teeth were no longer sharp
With a diamond he is a scratch
The place where we were wed Mary I can see the traces of tears
I love thee with a love that goes into every word
Thou in me forever
And I think Ill see that little

UnicodeEncodeError: 'charmap' codec can't encode character '\xc3' in position 1362: character maps to <undefined>

In [131]:
gpt2.reset_session(sess)

<tensorflow.python.client.session.Session at 0x1bb6a4e6650>

In [132]:
sess = gpt2.start_tf_sess()

gpt2.finetune(sess,
              dataset=file_name,
              model_name='124M',
              steps=2000,
              restore_from='checkpoint/en_run2/model-161',  # Path to the checkpoint
              run_name='en_run2',  
              print_every=50,
              sample_every=200,
              save_every=500
             )

Loading checkpoint None


ValueError: Can't load save_path when it is None.

In [125]:
gpt2.save_checkpoint(run_name='en_run2', checkpoint_dir='../poemsNLP')

AttributeError: module 'gpt_2_simple' has no attribute 'save_checkpoint'

In [127]:
gpt2.generate(sess, run_name='en_run2')

FailedPreconditionError: Graph execution error:

Detected at node 'sample_sequence_2/model/h10/ln_2/add_1/ReadVariableOp' defined at (most recent call last):
    File "C:\Python310\lib\runpy.py", line 196, in _run_module_as_main
      return _run_code(code, main_globals, None,
    File "C:\Python310\lib\runpy.py", line 86, in _run_code
      exec(code, run_globals)
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
      app.launch_new_instance()
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\traitlets\config\application.py", line 1043, in launch_instance
      app.start()
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelapp.py", line 725, in start
      self.io_loop.start()
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tornado\platform\asyncio.py", line 195, in start
      self.asyncio_loop.run_forever()
    File "C:\Python310\lib\asyncio\base_events.py", line 600, in run_forever
      self._run_once()
    File "C:\Python310\lib\asyncio\base_events.py", line 1896, in _run_once
      handle._run()
    File "C:\Python310\lib\asyncio\events.py", line 80, in _run
      self._context.run(self._callback, *self._args)
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelbase.py", line 513, in dispatch_queue
      await self.process_one()
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelbase.py", line 502, in process_one
      await dispatch(*args)
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelbase.py", line 409, in dispatch_shell
      await result
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
      reply_content = await reply_content
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\ipkernel.py", line 422, in do_execute
      res = shell.run_cell(
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\zmqshell.py", line 540, in run_cell
      return super().run_cell(*args, **kwargs)
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3009, in run_cell
      result = self._run_cell(
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3064, in _run_cell
      result = runner(coro)
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
      coro.send(None)
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3269, in run_cell_async
      has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3448, in run_ast_nodes
      if await self.run_code(code, result, async_=asy):
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
      exec(code_obj, self.user_global_ns, self.user_ns)
    File "C:\Users\User\AppData\Local\Temp\ipykernel_344\4165053288.py", line 1, in <module>
      gpt2.generate(sess, run_name='en_run2')
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\gpt_2.py", line 462, in generate
      output = sample.sample_sequence(
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\sample.py", line 67, in sample_sequence
      context_output = step(hparams, context[:, :-1])
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\sample.py", line 51, in step
      lm_output = model.model(hparams=hparams, X=tokens,
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\model.py", line 203, in model
      h, present = block(h, 'h%d' % layer, past=past, hparams=hparams)
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\model.py", line 158, in block
      m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams)
    File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\model.py", line 67, in norm
      x = x*g + b
Node: 'sample_sequence_2/model/h10/ln_2/add_1/ReadVariableOp'
Could not find variable model/h10/ln_2/b. This could mean that the variable has been deleted. In TF1, it can also mean the variable is uninitialized. Debug info: container=localhost, status error message=Container localhost does not exist. (Could not find resource: localhost/model/h10/ln_2/b)
	 [[{{node sample_sequence_2/model/h10/ln_2/add_1/ReadVariableOp}}]]

Original stack trace for 'sample_sequence_2/model/h10/ln_2/add_1/ReadVariableOp':
  File "C:\Python310\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Python310\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel_launcher.py", line 17, in <module>
    app.launch_new_instance()
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\traitlets\config\application.py", line 1043, in launch_instance
    app.start()
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelapp.py", line 725, in start
    self.io_loop.start()
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tornado\platform\asyncio.py", line 195, in start
    self.asyncio_loop.run_forever()
  File "C:\Python310\lib\asyncio\base_events.py", line 600, in run_forever
    self._run_once()
  File "C:\Python310\lib\asyncio\base_events.py", line 1896, in _run_once
    handle._run()
  File "C:\Python310\lib\asyncio\events.py", line 80, in _run
    self._context.run(self._callback, *self._args)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelbase.py", line 513, in dispatch_queue
    await self.process_one()
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelbase.py", line 502, in process_one
    await dispatch(*args)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelbase.py", line 409, in dispatch_shell
    await result
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\kernelbase.py", line 729, in execute_request
    reply_content = await reply_content
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\ipkernel.py", line 422, in do_execute
    res = shell.run_cell(
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\ipykernel\zmqshell.py", line 540, in run_cell
    return super().run_cell(*args, **kwargs)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3009, in run_cell
    result = self._run_cell(
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3064, in _run_cell
    result = runner(coro)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\async_helpers.py", line 129, in _pseudo_sync_runner
    coro.send(None)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3269, in run_cell_async
    has_raised = await self.run_ast_nodes(code_ast.body, cell_name,
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3448, in run_ast_nodes
    if await self.run_code(code, result, async_=asy):
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\IPython\core\interactiveshell.py", line 3508, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "C:\Users\User\AppData\Local\Temp\ipykernel_344\4165053288.py", line 1, in <module>
    gpt2.generate(sess, run_name='en_run2')
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\gpt_2.py", line 462, in generate
    output = sample.sample_sequence(
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\sample.py", line 67, in sample_sequence
    context_output = step(hparams, context[:, :-1])
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\sample.py", line 51, in step
    lm_output = model.model(hparams=hparams, X=tokens,
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\model.py", line 203, in model
    h, present = block(h, 'h%d' % layer, past=past, hparams=hparams)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\model.py", line 158, in block
    m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\gpt_2_simple\src\model.py", line 67, in norm
    x = x*g + b
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\util\traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1459, in binary_op_wrapper
    return func(x, y, name=name)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\util\traceback_utils.py", line 150, in error_handler
    return fn(*args, **kwargs)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\util\dispatch.py", line 1176, in op_dispatch_handler
    return dispatch_target(*args, **kwargs)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\ops\math_ops.py", line 1805, in _add_dispatch
    y = ops.convert_to_tensor(y, dtype_hint=x.dtype.base_dtype, name="y")
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\profiler\trace.py", line 183, in wrapped
    return func(*args, **kwargs)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\framework\ops.py", line 1625, in convert_to_tensor
    ret = conversion_func(
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 2274, in _dense_var_to_tensor
    return var._dense_var_to_tensor(dtype=dtype, name=name, as_ref=as_ref)  # pylint: disable=protected-access
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 1516, in _dense_var_to_tensor
    return self.value()
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 592, in value
    return self._read_variable_op()
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 753, in _read_variable_op
    result = read_and_set_handle(no_copy)
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\ops\resource_variable_ops.py", line 743, in read_and_set_handle
    result = gen_resource_variable_ops.read_variable_op(
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\ops\gen_resource_variable_ops.py", line 594, in read_variable_op
    _, _, _op, _outputs = _op_def_library._apply_op_helper(
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\framework\op_def_library.py", line 795, in _apply_op_helper
    op = g._create_op_internal(op_type_name, inputs, dtypes=None,
  File "c:\Users\User\PoemsNLP\poems\lib\site-packages\tensorflow\python\framework\ops.py", line 3814, in _create_op_internal
    ret = Operation(


* length: The length of the generated text in terms of tokens. It determines how many tokens the generated output will contain. Note that the actual length of the output may vary depending on the model's behavior.

* temperature: A parameter that controls the randomness of the generated text. Higher values (e.g., above 1.0) result in more random and diverse output, while lower values (e.g., below 1.0) make the output more focused and deterministic.

* prefix: A starting prompt or seed text from which the generation begins. The generated text will continue from the given prefix.

* nsamples: The number of independent samples to generate. Each sample is a separate generated output. Setting a higher value for nsamples will result in multiple generated texts.

* batch_size: The number of samples to generate in parallel. Specifying a higher batch_size can improve generation speed but requires more computational resources.

In [128]:
text = gpt2.generate(sess,
              length=100,
              temperature=0.7,
              prefix="Stay",
              nsamples=2,
              batch_size=3,
              )

AssertionError: 

## Evaluation 
To evaluate the generated poems, a measure of perplexity is calculated. Perplexity is a common metric used to assess the quality and fluency of language models. It measures how well a model predicts the next word in a sequence. The lower the perplexity, the better the model's performance. The perplexity is calculated by comparing the predicted probabilities of the true labels in the test set and averaging the log-likelihoods. A lower perplexity indicates a higher level of coherence and fluency in the generated poems.

In [49]:
test_data = pd.read_csv("test_data.csv")

# Apply the same preprocessing steps as done for training data
test_data["cleaned_poem"] = test_data["poem"].apply(remove_enstopwords)
test_data["cleaned_poem"] = test_data["poem"].apply(remove_special_chars)
test_data["cleaned_poem"] = test_data["poem"].apply(remove_punctuations)
test_data["normalized_poem"] = test_data["cleaned_poem"].apply(normalize_text)

# Tokenize the test data
test_sequences = tokenizer.texts_to_sequences(test_data["normalized_poem"])
test_sequences = pad_sequences(test_sequences, maxlen=max_sequence_len - 1, padding='pre')
test_labels = tf.keras.utils.to_categorical(test_sequences[:, -1], num_classes=vocabulary_size+1)


In [78]:
# Use the model to predict probabilities for the test data
test_predictions = model.predict(test_sequences)

# Calculate the log-likelihoods of the true labels
true_label_indices = np.argmax(test_labels, axis=1)
log_likelihoods = np.log(test_predictions[np.arange(len(test_sequences)), true_label_indices])

# Calculate perplexity
perplexity = np.exp(-np.mean(log_likelihoods))
print("Perplexity:", perplexity)


Perplexity: 8834701.0


Some possible actions to improve the perplexity value include increasing the size and diversity of the training data, refining the model architecture, adjusting hyperparameters, or employing more advanced techniques such as transfer learning or fine-tuning.

In [88]:
train_data = pd.read_csv("train_data.csv")

# Apply the same preprocessing steps as done for training data
train_data["cleaned_poem"] = train_data["poem"].apply(remove_enstopwords)
train_data["cleaned_poem"] = train_data["poem"].apply(remove_special_chars)
train_data["cleaned_poem"] = train_data["poem"].apply(remove_punctuations)
train_data["normalized_poem"] = train_data["cleaned_poem"].apply(normalize_text)

# Tokenize the test data
train_sequences = tokenizer.texts_to_sequences(train_data["normalized_poem"])
train_sequences = pad_sequences(train_sequences, maxlen=max_sequence_len - 1, padding='pre')
train_labels = tf.keras.utils.to_categorical(train_sequences[:, -1], num_classes=vocabulary_size+1)

embeddings similarity

In [62]:
embedding_weights = model.get_layer('embedding').get_weights()[0]


In [63]:
generated_poems = []

for poem in output_text.split("\n"):
    generated_poem = []
    for word in poem.split():
        word_index = tokenizer.word_index.get(word)
        if word_index is not None:
            embedding_vector = embedding_weights[word_index]
            generated_poem.append(embedding_vector)
    generated_poems.append(generated_poem)


In [70]:
from sklearn.metrics.pairwise import cosine_similarity

similarity_scores = []

for generated_poem in generated_poems:
    poem_similarity_scores = []
    for original_poem in en_df["tokens"]:
        original_poem_embedding = []
        for word in original_poem:
            if word in tokenizer.word_index and tokenizer.word_index[word] in embedding_weights:
                word_index = tokenizer.word_index[word]
                original_poem_embedding.append(embedding_weights[word_index])
        if len(original_poem_embedding) > 0:
            original_poem_embedding = np.mean(original_poem_embedding, axis=0)
            generated_poem_embedding = np.mean(generated_poem, axis=0)
            original_poem_embedding = np.reshape(original_poem_embedding, (1, -1))
            generated_poem_embedding = np.reshape(generated_poem_embedding, (1, -1))
            similarity_score = cosine_similarity(original_poem_embedding, generated_poem_embedding)[0][0]
            poem_similarity_scores.append(similarity_score)
    if len(poem_similarity_scores) > 0:
        similarity_scores.append(np.mean(poem_similarity_scores))

if len(similarity_scores) > 0:
    average_similarity = np.nanmean(similarity_scores)
else:
    average_similarity = 0.0

print("Average Similarity:", average_similarity)


Average Similarity: 0.0
