In [37]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import time
import matplotlib.pyplot as plt

In [38]:
# Read the poems CSV into a pandas DataFrame.
POEMS_CSV = "poems-100.csv"
df = pd.read_csv(POEMS_CSV)

In [39]:
# Build the corpus from the first CSV column. Missing cells are dropped first:
# astype(str) would otherwise turn NaN into the literal token "nan".
poem_texts = df.iloc[:, 0].dropna().astype(str).tolist()
# Join all poems with single spaces and lowercase the result.
text = " ".join(poem_texts).lower()
# split() with no argument splits on any run of whitespace, so line breaks
# inside a poem separate tokens just like spaces do.
tokens = text.split()

In [40]:
# Vocabulary: every distinct token, put in sorted order so that the
# position of each word is deterministic for a given corpus.
vocab = list(set(tokens))
vocab.sort()
vocab_size = len(vocab)
print("Vocabulary Size:", vocab_size)

Vocabulary Size: 6989


In [41]:
# Two lookup tables over the vocabulary: integer id -> word, and the
# inverse word -> integer id (ids are positions in the sorted vocab list).
idx_to_word = dict(enumerate(vocab))
word_to_idx = {word: idx for idx, word in idx_to_word.items()}

In [42]:
# Context window: how many consecutive words form one model input.
sequence_length = 5

# Map the whole corpus to vocabulary ids once, then slide a window over it.
token_ids = [word_to_idx[w] for w in tokens]
# No examples at all when the corpus is not longer than one window.
num_examples = max(len(token_ids) - sequence_length, 0)

# Example k takes ids k .. k+sequence_length-1 as its input sequence ...
inputs = [token_ids[k:k + sequence_length] for k in range(num_examples)]
# ... and the id immediately after that window as its target, which is
# simply the corpus shifted left by one window.
targets = token_ids[sequence_length:]

In [43]:
# Define a function to convert a batch of word indices into one-hot encoded tensors.
def one_hot_batch(batch_indices, vocab_size):
    """Convert a batch of index sequences into a float one-hot tensor.

    Args:
        batch_indices: Rectangular batch of integer indices (for example a
            list of equal-length lists) laid out as ``(batch, seq_len)``.
            Every index must satisfy ``0 <= index < vocab_size``.
        vocab_size: Length of the one-hot axis.

    Returns:
        Tensor of shape ``(batch, seq_len, vocab_size)`` in the default float
        dtype, all zeros except a single 1.0 per ``(batch, step)`` position.
        An empty batch (``[]``) yields shape ``(0, 0, vocab_size)`` instead of
        raising ``IndexError``.
    """
    indices = torch.as_tensor(batch_indices, dtype=torch.long)

    # An empty batch arrives as a 0-element 1-D tensor; give it the
    # (batch, seq_len) layout so the result still has three axes.
    if indices.dim() == 1 and indices.numel() == 0:
        indices = indices.reshape(0, 0)

    one_hot = torch.zeros(*indices.shape, vocab_size)
    # One vectorised write along the vocabulary axis replaces the
    # per-element Python double loop.
    one_hot.scatter_(-1, indices.unsqueeze(-1), 1.0)
    return one_hot

In [44]:
# Features stay as plain Python lists of word indices; only the target
# indices are turned into a tensor here.
X, y = inputs, torch.tensor(targets)

In [45]:
# Define the RNN model for word prediction using one-hot encoding.
class RNN_OneHot(nn.Module):
    """Sequence classifier: one ``nn.RNN`` layer followed by a linear layer.

    The linear layer is applied to the RNN output at the last time step only,
    so each input sequence produces a single vector of ``output_size`` scores.
    """

    def __init__(self, input_size, hidden_size, output_size):
        """Create the recurrent layer and the output projection.

        Args:
            input_size: Number of features per time step.
            hidden_size: Width of the RNN hidden state.
            output_size: Number of scores produced per sequence.
        """
        super().__init__()
        # batch_first=True: tensors are laid out as (batch, seq, feature).
        self.rnn = nn.RNN(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
        )
        # Maps a hidden vector to one score per output class.
        self.fc = nn.Linear(in_features=hidden_size, out_features=output_size)

    def forward(self, x):
        """Return the linear projection of the RNN output at the final time step."""
        step_outputs, _final_hidden = self.rnn(x)
        # Keep only the last position along the sequence axis.
        final_step = step_outputs[:, -1, :]
        return self.fc(final_step)

In [46]:
# Fix the RNG seed before building the model so the random weight
# initialisation is repeatable from one run of the notebook to the next
# (on the same software/hardware setup).
SEED = 42
torch.manual_seed(SEED)

# Width of the RNN hidden state.
hidden_size = 128
# Input features and output scores both span the vocabulary: inputs are
# one-hot word vectors and the output is one score per candidate next word.
model = RNN_OneHot(vocab_size, hidden_size, vocab_size)

# Cross-entropy between the model's scores and the target word index.
criterion = nn.CrossEntropyLoss()
# Adam over all model parameters.
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Number of passes over the data and number of examples per update step.
epochs = 50
batch_size = 32

In [47]:
# Mean batch loss of each epoch, in epoch order.
losses = []

# The batch count is the same for every epoch, so compute it once.
# Ceiling division counts the final, possibly shorter, batch as well.
num_batches = (len(X) + batch_size - 1) // batch_size
if num_batches == 0:
    # Fail early with a clear message instead of a ZeroDivisionError below.
    raise ValueError("No training examples: the corpus must be longer than sequence_length.")

# generate_text() switches the model to eval mode; put it back in training
# mode explicitly so re-running this cell always trains in training mode.
model.train()

# perf_counter() is monotonic, so the measured duration cannot be skewed by
# system clock adjustments during training.
start_time = time.perf_counter()

for epoch in range(epochs):

    # Running sum of the batch losses for this epoch.
    epoch_loss = 0.0

    # Walk through the dataset in order, batch_size examples at a time.
    for i in range(0, len(X), batch_size):

        batch_inputs = X[i:i+batch_size]
        batch_targets = y[i:i+batch_size]

        # One-hot encoding is done per batch rather than for the whole dataset.
        batch_onehot = one_hot_batch(batch_inputs, vocab_size)

        # Standard update step: clear old gradients, forward, loss, backward, step.
        optimizer.zero_grad()
        outputs = model(batch_onehot)
        loss = criterion(outputs, batch_targets)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()

    # Average of the per-batch losses (each batch counts equally).
    avg_loss = epoch_loss / num_batches
    losses.append(avg_loss)

    print(f"Epoch {epoch+1}/{epochs}, Loss: {avg_loss:.4f}")

end_time = time.perf_counter()

# Summary figures reused later when the results are saved.
onehot_training_time = end_time - start_time
onehot_final_loss = losses[-1]

print("\nTraining Time:", onehot_training_time, "seconds")

Epoch 1/50, Loss: 7.3192
Epoch 2/50, Loss: 6.4223
Epoch 3/50, Loss: 5.9624
Epoch 4/50, Loss: 5.6523
Epoch 5/50, Loss: 5.4486
Epoch 6/50, Loss: 5.1171
Epoch 7/50, Loss: 4.7126
Epoch 8/50, Loss: 4.3307
Epoch 9/50, Loss: 3.9245
Epoch 10/50, Loss: 3.5165
Epoch 11/50, Loss: 3.1032
Epoch 12/50, Loss: 2.7079
Epoch 13/50, Loss: 2.3529
Epoch 14/50, Loss: 2.0466
Epoch 15/50, Loss: 1.7829
Epoch 16/50, Loss: 1.5497
Epoch 17/50, Loss: 1.3453
Epoch 18/50, Loss: 1.1660
Epoch 19/50, Loss: 1.0096
Epoch 20/50, Loss: 0.8741
Epoch 21/50, Loss: 0.7558
Epoch 22/50, Loss: 0.6530
Epoch 23/50, Loss: 0.5590
Epoch 24/50, Loss: 0.4764
Epoch 25/50, Loss: 0.4032
Epoch 26/50, Loss: 0.3407
Epoch 27/50, Loss: 0.2873
Epoch 28/50, Loss: 0.2417
Epoch 29/50, Loss: 0.2025
Epoch 30/50, Loss: 0.1725
Epoch 31/50, Loss: 0.1468
Epoch 32/50, Loss: 0.1282
Epoch 33/50, Loss: 0.1097
Epoch 34/50, Loss: 0.0970
Epoch 35/50, Loss: 0.0868
Epoch 36/50, Loss: 0.0785
Epoch 37/50, Loss: 0.0699
Epoch 38/50, Loss: 0.0637
Epoch 39/50, Loss: 0.

In [48]:
# Summarise this run as a one-row table. It is stored under its own name so
# the poems DataFrame `df` loaded at the top of the notebook is not
# overwritten (re-running the tokenisation cell would otherwise read this
# results table instead of the poems).
onehot_results = pd.DataFrame(
    [
        {
            "Model": "One-Hot Encoding (PyTorch)",
            "Training_Time_Seconds": onehot_training_time,
            "Final_Loss": onehot_final_loss,
        }
    ]
)

# Persist the summary; index=False keeps the row index out of the CSV.
onehot_results.to_csv("onehot_results.csv", index=False)

In [49]:
# Define a function to generate text using the trained model.
def generate_text(start_words, length=20):
    """Greedily extend ``start_words`` by ``length`` words using the trained model.

    Uses the notebook-level ``model``, ``word_to_idx``, ``idx_to_word``,
    ``vocab_size``, ``sequence_length`` and ``one_hot_batch``. The model is
    switched to eval mode and is left in that mode when the function returns.

    Args:
        start_words: Seed words (any iterable of strings). The caller's
            object is not modified. When ``length > 0`` it must be non-empty
            and its last ``sequence_length`` words must be in the vocabulary.
        length: Number of words to append.

    Returns:
        The seed words followed by the generated words, joined by spaces.

    Raises:
        ValueError: If words are requested but ``start_words`` is empty.
        KeyError: If one of the context words is not in ``word_to_idx``.
    """
    model.eval()
    # list() copies the input and also accepts tuples or other iterables,
    # not only objects that provide .copy().
    words = list(start_words)

    if length > 0:
        if not words:
            raise ValueError("start_words must contain at least one word")
        # Only the trailing window is ever fed to the model; generated words
        # always come from the vocabulary, so checking the seed is enough.
        unknown = [w for w in words[-sequence_length:] if w not in word_to_idx]
        if unknown:
            raise KeyError(f"words not in vocabulary: {unknown}")

    # No gradients are needed for inference.
    with torch.no_grad():
        for _ in range(length):
            # Encode the most recent window as a batch of one sequence.
            context = [word_to_idx[w] for w in words[-sequence_length:]]
            scores = model(one_hot_batch([context], vocab_size))

            # softmax is monotonic, so the arg-max of the raw scores is the
            # most probable word; dim=1 selects along the vocabulary axis.
            next_word_idx = torch.argmax(scores, dim=1).item()
            words.append(idx_to_word[next_word_idx])

    return " ".join(words)

In [50]:
# Seed generation with the opening words of the corpus (one context window).
start = tokens[:sequence_length]
# Extend the seed by 30 predicted words and show the resulting text.
generated = generate_text(start, 30)
print(generated)

o my luve's like a red, red rose that’s newly sprung in june; o my luve's like the melodie that’s sweetly play'd in tune. as fair art thou, my bonnie lass, so deep in luve
