In [15]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
import numpy as np

# Load the PTB dataset
text = open("ptb.train.txt", "r").read() # Replace this with the actual path to the PTB dataset
text = text.split()

# Prepare the vocabulary and create mapping from words to integers
vocab = set(text)
word_to_int = {word: ii for ii, word in enumerate(vocab)}
int_to_word = {ii: word for ii, word in enumerate(vocab)}

# Convert the text data to integer form
encoded = np.array([word_to_int[word] for word in text])

# Split the encoded text into chunks of length `seq_length`
seq_length = 32
data = []
target = []
for i in range(0, len(encoded) - seq_length):
    data.append(encoded[i: i + seq_length])
    target.append(encoded[i + seq_length])

# Convert the data and target to tensors
data = torch.LongTensor(data)
target = torch.LongTensor(target)

# Create a TensorDataset from data and target tensors
dataset = TensorDataset(data, target)

# Create a DataLoader from the TensorDataset
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# Define the language model
class LM(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim, n_layers):
        super(LM, self).__init__()
        self.hidden_dim = hidden_dim
        self.n_layers = n_layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        x = self.embedding(x)
        x, hidden = self.lstm(x, hidden)
        print('x.shape', x.shape)
        x = x.contiguous().view(-1, x.shape[2])
        print('x.shape', x.shape)
        x = self.fc(x)
        print('x.shape', x.shape)
        return x, hidden
    
    def init_hidden(self, batch_size):
        return (torch.zeros(self.n_layers, batch_size, self.hidden_dim),
                torch.zeros(self.n_layers, batch_size, self.hidden_dim))

# Initialize the language model
model = LM(len(vocab), 128, 512, 2)

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())

# Train the language model
n_epochs = 10
for epoch in range(n_epochs):
    hidden = model.init_hidden(32)
    for i, batch in enumerate(dataloader):
        x, y = batch
        # print(x.shape, y.shape)
        optimizer.zero_grad()
        output, hidden = model(x, hidden)        
        loss = criterion(output, y)
        loss.backward()
        optimizer.step()

        if (i + 1) % 100 == 0:
            print(f'Epoch [{epoch + 1}/{n_epochs}], Step [{i + 1}/{len(dataloader)}], Loss: {loss.item():.4f}')

    torch.save(model, f'Epoch_{epoch}')

x.shape torch.Size([32, 32, 512])
x.shape torch.Size([1024, 512])
x.shape torch.Size([1024, 9999])


ValueError: Expected input batch_size (1024) to match target batch_size (32).