In [1]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import OneHotEncoder
from collections import defaultdict

# Read text file
file = open('sherlock.txt', 'r')
text = file.read()

# Tokenize text
tokenizer = defaultdict(lambda: len(tokenizer))  # unique index for each word
tokenized_text = [tokenizer[word] for word in text.split()]

# Set total word count
total_word_count = len(tokenizer)

# Generate input sequences for model training
input_sequences = []
for line in text.split('\n'):
    token_list = [tokenizer[word] for word in line.split() if word]
    for i in range(1, len(token_list)):
        n_grams = token_list[:i + 1]
        input_sequences.append(n_grams)

# Pad sequences

In [2]:
max_sequence_length = max(len(seq) for seq in input_sequences)
input_sequences = np.array([np.pad(seq, (max_sequence_length - len(seq), 0), 'constant') for seq in input_sequences])

# Split data into X and y
X = torch.tensor(input_sequences[:, :-1], dtype=torch.long)
y = torch.tensor(input_sequences[:, -1], dtype=torch.long)

# Define model
class TextGenerationModel(nn.Module):
    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super(TextGenerationModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        out, _ = self.lstm(x)
        out = self.fc(out[:, -1, :])
        return out

In [3]:
vocab_size = total_word_count
embedding_dim = 100
hidden_dim = 100
model = TextGenerationModel(vocab_size, embedding_dim, hidden_dim)

# Set up training
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)
X, y = X.to(device), y.to(device)

# Train model
epochs = 10
batch_size = 32
dataset = TensorDataset(X, y)
data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

for epoch in range(epochs):
    for batch_X, batch_y in data_loader:
        batch_X, batch_y = batch_X.to(device), batch_y.to(device)
        
        # Forward pass
        outputs = model(batch_X)
        loss = criterion(outputs, batch_y)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    
    print(f'Epoch [{epoch+1}/{epochs}], Loss: {loss.item():.4f}')

Epoch [1/10], Loss: 6.0895
Epoch [2/10], Loss: 5.9101
Epoch [3/10], Loss: 4.9870
Epoch [4/10], Loss: 4.1733
Epoch [5/10], Loss: 4.3087
Epoch [6/10], Loss: 4.5993
Epoch [7/10], Loss: 2.9191
Epoch [8/10], Loss: 3.5271
Epoch [9/10], Loss: 2.9597
Epoch [10/10], Loss: 2.4367
