In [29]:
# Importing necessary libraries for data manipulation, neural networks, and visualization.
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import time
import matplotlib.pyplot as plt

In [30]:
# Read the poem corpus from its CSV file into a pandas DataFrame.
DATA_PATH = "poems-100.csv"
df = pd.read_csv(DATA_PATH)

In [31]:
# Build the token stream: take the first CSV column, skip missing cells, join the
# remaining entries into one lowercase string, and split it on whitespace.
# Missing cells are dropped first because `astype(str)` would otherwise turn each NaN
# into the literal word "nan" and leak it into the vocabulary.
poem_texts = df.iloc[:, 0].dropna().astype(str).tolist()
text = " ".join(poem_texts).lower()
tokens = text.split()

In [32]:
# The vocabulary is the set of distinct tokens, kept in sorted order; report how many there are.
vocab = list(set(tokens))
vocab.sort()
vocab_size = len(vocab)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 6989


In [33]:
# Two lookup tables over the vocabulary: word -> integer id (the word's position in
# `vocab`) and the inverse id -> word, obtained by flipping the first table.
word_to_idx = dict(zip(vocab, range(len(vocab))))
idx_to_word = dict(zip(word_to_idx.values(), word_to_idx.keys()))

In [34]:
# Encode the token stream as integer ids by looking every word up in `word_to_idx`.
indexed_tokens = list(map(word_to_idx.__getitem__, tokens))

In [35]:
# Slide a fixed-size window over the encoded corpus: each training example is
# `sequence_length` consecutive ids and its label is the id right after the window.
sequence_length = 5

window_starts = range(len(indexed_tokens) - sequence_length)
inputs = [indexed_tokens[s:s + sequence_length] for s in window_starts]
targets = [indexed_tokens[s + sequence_length] for s in window_starts]

In [36]:
# Pack the windows and their labels into PyTorch tensors.
# The dtype is pinned to int64: `nn.Embedding` takes integer indices and
# `nn.CrossEntropyLoss` takes integer class labels, and without an explicit dtype an
# empty list would silently become a float32 tensor.
X = torch.tensor(inputs, dtype=torch.long)
y = torch.tensor(targets, dtype=torch.long)

In [37]:
# Next-word prediction network: embedding lookup -> single-layer RNN -> linear projection onto the vocabulary.
class RNN_Embedding(nn.Module):
    """Score every vocabulary word as the continuation of a window of token ids.

    Args:
        vocab_size: number of distinct tokens; sizes the embedding table and the output layer.
        embed_size: dimensionality of each token embedding.
        hidden_size: width of the RNN hidden state.
    """

    def __init__(self, vocab_size, embed_size, hidden_size):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embed_size)
        self.rnn = nn.RNN(input_size=embed_size, hidden_size=hidden_size, batch_first=True)
        self.fc = nn.Linear(in_features=hidden_size, out_features=vocab_size)

    def forward(self, x):
        """Map token ids of shape (batch, seq) to logits of shape (batch, vocab_size)."""
        embedded = self.embedding(x)            # (batch, seq, embed_size): dense vector per token id
        hidden_states, _ = self.rnn(embedded)   # (batch, seq, hidden_size): RNN output at every step
        final_state = hidden_states[:, -1]      # (batch, hidden_size): output at the last time step only
        return self.fc(final_state)             # project the last step onto the vocabulary

In [38]:
# Hyperparameters, model, loss function and optimiser for the next-word predictor.
seed = 42              # fixes the random weight initialisation so re-runs start from identical weights
embed_size = 100       # dimensionality of each word embedding
hidden_size = 128      # width of the RNN hidden state
epochs = 50            # full passes over the training windows
batch_size = 32        # windows per optimiser step
learning_rate = 0.001  # Adam step size

# Seed before building the model: its parameters are drawn from PyTorch's global RNG.
torch.manual_seed(seed)
model = RNN_Embedding(vocab_size, embed_size, hidden_size)

criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

In [39]:
# Train the model with mini-batch gradient descent: for every epoch, walk the data in
# order in chunks of `batch_size`, take one optimiser step per chunk, and record the
# mean of the per-batch losses. Also measures the elapsed time of the whole run.
losses = []

# Invariant across epochs, so computed once: number of batches, counting a final partial one.
num_batches = (len(X) + batch_size - 1) // batch_size
if num_batches == 0:
    # Without this guard the average below would fail with a bare ZeroDivisionError.
    raise ValueError("No training examples: the corpus must be longer than sequence_length")

# generate_text() leaves the model in eval mode; switch back explicitly so this cell
# behaves the same when it is re-run after text generation.
model.train()

start_time = time.perf_counter()  # monotonic clock, intended for measuring durations

for epoch in range(epochs):

    epoch_loss = 0.0

    for i in range(0, len(X), batch_size):

        batch_inputs = X[i:i+batch_size]
        batch_targets = y[i:i+batch_size]

        optimizer.zero_grad()                      # clear gradients left over from the previous step
        outputs = model(batch_inputs)              # logits, one row per window in the batch
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    avg_loss = epoch_loss / num_batches

    losses.append(avg_loss)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

end_time = time.perf_counter()

# Kept under explicit names because the results cell further down reports them.
embedding_training_time = end_time - start_time
embedding_final_loss = losses[-1]

print("\nTraining Time:", embedding_training_time, "seconds")

Epoch 1/50, Loss: 7.3169
Epoch 2/50, Loss: 6.2197
Epoch 3/50, Loss: 5.6579
Epoch 4/50, Loss: 5.1357
Epoch 5/50, Loss: 4.6315
Epoch 6/50, Loss: 4.1092
Epoch 7/50, Loss: 3.5915
Epoch 8/50, Loss: 3.1180
Epoch 9/50, Loss: 2.6941
Epoch 10/50, Loss: 2.3113
Epoch 11/50, Loss: 1.9706
Epoch 12/50, Loss: 1.6726
Epoch 13/50, Loss: 1.4135
Epoch 14/50, Loss: 1.1882
Epoch 15/50, Loss: 0.9937
Epoch 16/50, Loss: 0.8257
Epoch 17/50, Loss: 0.6833
Epoch 18/50, Loss: 0.5636
Epoch 19/50, Loss: 0.4659
Epoch 20/50, Loss: 0.3852
Epoch 21/50, Loss: 0.3209
Epoch 22/50, Loss: 0.2683
Epoch 23/50, Loss: 0.2287
Epoch 24/50, Loss: 0.1999
Epoch 25/50, Loss: 0.1739
Epoch 26/50, Loss: 0.1523
Epoch 27/50, Loss: 0.1393
Epoch 28/50, Loss: 0.1282
Epoch 29/50, Loss: 0.1172
Epoch 30/50, Loss: 0.1089
Epoch 31/50, Loss: 0.1063
Epoch 32/50, Loss: 0.0977
Epoch 33/50, Loss: 0.0893
Epoch 34/50, Loss: 0.0871
Epoch 35/50, Loss: 0.0839
Epoch 36/50, Loss: 0.0793
Epoch 37/50, Loss: 0.0736
Epoch 38/50, Loss: 0.0673
Epoch 39/50, Loss: 0.

In [40]:
# Summarise the run (model name, training time, final loss) in a one-row DataFrame and save it as CSV.
# The table gets its own name so that the poem DataFrame `df` loaded at the top of the
# notebook is not overwritten; re-running the tokenisation cell afterwards still sees the poems.
results_df = pd.DataFrame([{
    "Model": "Trainable Embedding (PyTorch)",
    "Training_Time_Seconds": embedding_training_time,
    "Final_Loss": embedding_final_loss
}])

results_df.to_csv("embedding_results.csv", index=False)

In [41]:
# Greedy text generation with the trained model, starting from a sequence of seed words.
def generate_text(start_words, length=20):
    """Extend `start_words` by `length` words, always taking the most likely next word.

    Reads the notebook globals `model`, `word_to_idx`, `idx_to_word` and
    `sequence_length`. The model is switched to eval mode and left there.

    Args:
        start_words: non-empty sequence of seed words, each present in `word_to_idx`.
            It is copied, never modified.
        length: number of words to generate.

    Returns:
        The seed words followed by the generated words, joined by single spaces.

    Raises:
        ValueError: if `start_words` is empty.
        KeyError: if a seed word is not in the vocabulary.
    """
    words = list(start_words)  # private copy; also accepts tuples and other iterables
    if not words:
        # An empty prompt would otherwise reach the model as an empty float tensor
        # and fail there with an unrelated-looking dtype error.
        raise ValueError("start_words must contain at least one word")

    unknown = [w for w in words if w not in word_to_idx]
    if unknown:
        raise KeyError(f"start_words contains words outside the vocabulary: {unknown}")

    model.eval()

    with torch.no_grad():  # inference only, no gradient tracking needed
        for _ in range(length):
            # The model is fed only the most recent `sequence_length` words.
            seq = [word_to_idx[w] for w in words[-sequence_length:]]
            logits = model(torch.tensor([seq]))

            # Softmax preserves ordering, so the argmax of the raw logits is already
            # the most probable word; no normalisation is needed for greedy decoding.
            next_word_idx = torch.argmax(logits, dim=1).item()

            words.append(idx_to_word[next_word_idx])

    return " ".join(words)

In [42]:
# Seed the generator with the opening `sequence_length` tokens of the corpus and print a 30-word continuation.
start = tokens[0:sequence_length]
generated = generate_text(start, 30)
print(generated)

o my luve's like a red, red rose that’s newly sprung in june; o my luve's like the melodie that’s sweetly play'd in tune. as fair art thou, my bonnie lass, so deep in luve
