In [4]:
from datasets import load_dataset
import torch
import torch.nn as nn
import numpy as np
import random


In [5]:
# Load the dataset
dataset = load_dataset("roneneldan/TinyStories")

# Select a small subset of the dataset
num_samples = 25000 # Specify the number of samples you want to use
texts = dataset['train']['text'][:num_samples]

# Concatenate all texts into a single string
text = ' '.join(texts)

# Create a character vocabulary
chars = sorted(list(set(text)))
vocab_size = len(chars)
char_to_idx = {ch: i for i, ch in enumerate(chars)}
idx_to_char = {i: ch for i, ch in enumerate(chars)}

# Encode text as integer sequences
encoded_text = np.array([char_to_idx[ch] for ch in text])

In [6]:
class CharRNN(nn.Module):
    def __init__(self, vocab_size, hidden_size, num_layers):
        super(CharRNN, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        self.lstm = nn.LSTM(hidden_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        out, hidden = self.lstm(x, hidden)
        out = self.fc(out.reshape(out.size(0) * out.size(1), out.size(2)))
        return out, hidden

    def init_hidden(self, batch_size):
        weight = next(self.parameters()).data
        hidden = (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
                  weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
        return hidden


In [7]:
seq_length = 150  # Sequence length
batch_size = 64

def get_batches(arr, batch_size, seq_length):
    num_batches = (len(arr) - 1) // (batch_size * seq_length)
    arr = arr[:num_batches * batch_size * seq_length]
    arr = arr.reshape((batch_size, -1))
    for n in range(0, arr.shape[1], seq_length):
        x = arr[:, n:n + seq_length]
        y = arr[:, n + 1:n + seq_length + 1]
        if x.shape[1] == seq_length and y.shape[1] == seq_length:
            yield x, y

model = CharRNN(vocab_size, hidden_size=256, num_layers=2)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)


In [8]:
epochs = 10
model.train()

for epoch in range(epochs):
    hidden = model.init_hidden(batch_size)
    for x, y in get_batches(encoded_text, batch_size, seq_length):
        x = torch.tensor(x, dtype=torch.long)
        y = torch.tensor(y, dtype=torch.long)
        hidden = tuple([each.data for each in hidden])

        model.zero_grad()
        output, hidden = model(x, hidden)
        loss = criterion(output, y.view(-1))
        loss.backward()
        optimizer.step()

    print(f'Epoch: {epoch + 1}/{epochs}, Loss: {loss.item():.4f}')


Epoch: 1/10, Loss: 0.9209
Epoch: 2/10, Loss: 0.8596
Epoch: 3/10, Loss: 0.8255
Epoch: 4/10, Loss: 0.8077
Epoch: 5/10, Loss: 0.7942
Epoch: 6/10, Loss: 0.7863
Epoch: 7/10, Loss: 0.7788
Epoch: 8/10, Loss: 0.7773
Epoch: 9/10, Loss: 0.7705
Epoch: 10/10, Loss: 0.7763


In [19]:
def generate(model, start_str, predict_len=100, temperature=0.8):
    model.eval()
    hidden = model.init_hidden(1)
    start_input = torch.tensor([char_to_idx[ch] for ch in start_str], dtype=torch.long).unsqueeze(0)
    predicted = start_str

    with torch.no_grad():
        for p in range(len(start_str) - 1):
            _, hidden = model(start_input[:, p].unsqueeze(0), hidden)
        inp = start_input[:, -1]

        for _ in range(predict_len):
            output, hidden = model(inp.unsqueeze(0), hidden)
            output_dist = output.data.view(-1).div(temperature).exp()
            top_i = torch.multinomial(output_dist, 1)[0]

            predicted_char = idx_to_char[top_i.item()]
            predicted += predicted_char
            inp = torch.tensor([top_i], dtype=torch.long)

    return predicted

# Generate text
start_string = "rabbit and girl"
generated_text = generate(model, start_string, predict_len=200)
print(generated_text)


rabbit and girls married the house with a lot of fun. The boy looked so confused and a lot of things. But then he remembered to be harmless. 

However, they played a gate and their girl and the letter was warm.

But


In [12]:
torch.save(model.state_dict(),'storygen.pth')