In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
with open('shakespeare.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [3]:
# Tokenization
tokens = text.split()

In [4]:
# Vocabulary creation
vocab = sorted(set(tokens))
token_to_idx = {token: idx for idx, token in enumerate(vocab)}
idx_to_token = {idx: token for token, idx in token_to_idx.items()}

In [5]:
# Sequence creation
seq_length = 30
sequences = []
for i in range(len(tokens) - seq_length):
    seq = tokens[i:i + seq_length]
    sequences.append(seq)

In [6]:
# Encoding
encoded_sequences = [[token_to_idx[token] for token in seq] for seq in sequences]
encoded_sequences = np.array(encoded_sequences)

In [7]:
# Train-test split
X_train, X_val = train_test_split(encoded_sequences, test_size=0.2, random_state=42)

In [8]:
class TextDataset(Dataset):
    def __init__(self, sequences):
        self.sequences = sequences

    def __len__(self):
        return len(self.sequences)

    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx][:-1], dtype=torch.long), torch.tensor(self.sequences[idx][1:], dtype=torch.long)

train_dataset = TextDataset(X_train)
val_dataset = TextDataset(X_val)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

In [9]:
class LSTMModel(nn.Module):
    def __init__(self, vocab_size, embed_size, hidden_size, num_layers):
        super(LSTMModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)
        self.lstm = nn.LSTM(embed_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.fc(x)
        return x

In [10]:
# Hyperparameters
vocab_size = len(vocab)
embed_size = 128
hidden_size = 256
num_layers = 2
num_epochs = 5  # Reduced number of epochs for quicker testing
batch_size = 128  # Increased batch size

model = LSTMModel(vocab_size, embed_size, hidden_size, num_layers)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Check if GPU is available
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

LSTMModel(
  (embedding): Embedding(67161, 128)
  (lstm): LSTM(128, 256, num_layers=2, batch_first=True)
  (fc): Linear(in_features=256, out_features=67161, bias=True)
)

In [11]:
import numpy as np
print(np.__version__)

1.26.4


In [12]:
# Training loop
for epoch in range(num_epochs):
    model.train()
    for inputs, targets in train_loader:
        inputs, targets = inputs.to(device), targets.to(device)
        outputs = model(inputs)
        loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    model.eval()
    val_loss = 0
    with torch.no_grad():
        for inputs, targets in val_loader:
            inputs, targets = inputs.to(device), targets.to(device)
            outputs = model(inputs)
            loss = criterion(outputs.view(-1, vocab_size), targets.view(-1))
            val_loss += loss.item()

    print(f'Epoch {epoch+1}/{num_epochs}, Training Loss: {loss.item()}, Validation Loss: {val_loss/len(val_loader)}')

Epoch 1/5, Training Loss: 4.7308244705200195, Validation Loss: 4.611315283227622
Epoch 2/5, Training Loss: 3.8419575691223145, Validation Loss: 3.4938875328267263
Epoch 3/5, Training Loss: 3.2860748767852783, Validation Loss: 2.870502651941299
Epoch 4/5, Training Loss: 2.8938043117523193, Validation Loss: 2.476369350645488
Epoch 5/5, Training Loss: 2.640303134918213, Validation Loss: 2.2196526603168385


In [13]:
torch.save(model.state_dict(), 'task3/model.pth')