In [1]:
import torch

# Check if CUDA is available
device = torch.device('cpu')
if torch.cuda.is_available():
    device = torch.device('cuda')

torch.set_default_device(device)
print(f"Using device = {torch.get_default_device()}")

Using device = cuda:0


In [None]:
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from collections import Counter
import string

class SimpleRNN(nn.Module):
    """Word-level language model: embedding -> single-layer RNN -> linear projection.

    Args:
        vocab_size: Number of distinct token ids; sizes the embedding table
            and the width of the output logits.
        embedding_dim: Size of each token embedding.
        hidden_dim: Size of the recurrent hidden state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.RNN(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Compute per-timestep vocabulary logits.

        Args:
            x: Integer token ids of shape (batch_size, sequence_length).
            hidden: Optional initial hidden state of shape
                (1, batch_size, hidden_dim). When omitted, ``nn.RNN`` starts
                from an all-zero state, so callers that do not carry state
                across calls can simply write ``model(x)``.

        Returns:
            Tuple ``(out, hidden)`` where ``out`` has shape
            (batch_size, sequence_length, vocab_size) and ``hidden`` is the
            final hidden state of shape (1, batch_size, hidden_dim).
        """
        embeds = self.embedding(x)
        out, hidden = self.rnn(embeds, hidden)
        out = self.fc(out)  # (batch_size, sequence_length, vocab_size)
        return out, hidden

    def init_hidden(self, batch_size):
        """Return an all-zero hidden state of shape (1, batch_size, hidden_dim).

        The tensor is allocated with the same device and dtype as the model's
        weights, so it is usable regardless of the global default device.
        """
        return self.fc.weight.new_zeros(1, batch_size, self.hidden_dim)

In [None]:
class TextDataset(Dataset):
    """Sliding-window next-word dataset over a whitespace-tokenised text.

    Each sample is ``(sequence, target)``: ``sequence`` holds the ids of
    ``sequence_length`` consecutive words and ``target`` is the id of the word
    that follows them.

    Args:
        text: Raw text; it is split on whitespace to obtain words.
        sequence_length: Number of context words per sample.
    """

    def __init__(self, text, sequence_length):
        self.text = text
        self.sequence_length = sequence_length

        # Create vocabulary. Sorting makes the word -> id mapping reproducible
        # across runs (plain set iteration order is not), and discarding a
        # literal '<PAD>' word avoids a duplicate entry for the padding token,
        # which always sits at index 0.
        words = text.split()
        self.vocab = ['<PAD>'] + sorted(set(words) - {'<PAD>'})
        self.word_to_idx = {word: idx for idx, word in enumerate(self.vocab)}
        self.idx_to_word = {idx: word for word, idx in self.word_to_idx.items()}

        # Create sequences: one (context window, next word) pair per position.
        # The range is empty when the text has <= sequence_length words.
        self.sequences = []
        self.targets = []
        for start in range(len(words) - sequence_length):
            stop = start + sequence_length
            self.sequences.append([self.word_to_idx[w] for w in words[start:stop]])
            self.targets.append(self.word_to_idx[words[stop]])

    def __len__(self):
        """Number of (sequence, target) samples."""
        return len(self.sequences)

    def __getitem__(self, idx):
        """Return sample ``idx`` as ``(sequence_tensor, target_tensor)``."""
        return (torch.tensor(self.sequences[idx]),
                torch.tensor(self.targets[idx]))

    def vocab_size(self):
        """Number of vocabulary entries, including the '<PAD>' token."""
        return len(self.vocab)

In [None]:
def train_loop(dataloader, model, loss_fn, optimizer):
    """Run one optimisation pass over ``dataloader``.

    Args:
        dataloader: Iterable yielding ``(X, y)`` batches.
        model: Module to train. It may be a plain ``model(X) -> logits``
            module, or a recurrent one that exposes ``init_hidden(batch_size)``
            and returns ``(logits, hidden)``.
        loss_fn: Callable ``loss_fn(logits, y)`` returning a scalar loss.
        optimizer: Optimizer over ``model``'s parameters.
    """
    model.train()
    for batch, (X, y) in enumerate(dataloader):
        # Recurrent models need an initial hidden state; each batch starts
        # from a fresh one because batches are independent samples.
        if hasattr(model, "init_hidden"):
            output = model(X, model.init_hidden(X.size(0)))
        else:
            output = model(X)

        # Recurrent models return (logits, hidden); only the logits feed the loss.
        y_hat = output[0] if isinstance(output, tuple) else output

        # With per-timestep logits (batch, seq, vocab) and one target per
        # sample, the prediction for the next word is the last timestep.
        if y_hat.dim() == 3 and y.dim() == 1:
            y_hat = y_hat[:, -1, :]

        loss = loss_fn(y_hat, y)

        # compute gradient, apply the update, then clear grads for the next batch
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        if batch % 100 == 0:
            print(f"loss={loss.item()} batch={batch}")

In [None]:
def train_model(model, train_loader, num_epochs, learning_rate=0.001):
    """Train ``model`` with Adam and cross-entropy loss.

    Args:
        model: Module whose parameters are optimised.
        train_loader: Iterable of training batches, passed to ``train_loop``.
        num_epochs: Number of passes over ``train_loader``. An iterable of
            epoch labels is also accepted and iterated as-is.
        learning_rate: Adam learning rate.
    """
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # An int epoch count must be expanded with range(); iterating the int
    # directly raises TypeError.
    epochs = range(num_epochs) if isinstance(num_epochs, int) else num_epochs

    model.train()
    for i in epochs:
        print(f"epoch {i}")
        train_loop(train_loader, model, loss_fn, optimizer)

In [None]:
# Example usage
def main():
    """Train a ``SimpleRNN`` next-word model on a one-sentence toy corpus.

    Returns:
        Tuple ``(model, dataset)``: the trained model and the ``TextDataset``
        holding its vocabulary.
    """
    # Sample text
    text = """I love learning deep learning in Stats 507."""

    # Hyperparameters
    sequence_length = 3
    embedding_dim = 64
    hidden_dim = 128
    batch_size = 2
    num_epochs = 10

    # Create dataset
    dataset = TextDataset(text, sequence_length)

    # With shuffle=True the sampler draws its permutation on the default
    # device; when that is CUDA (see torch.set_default_device), the built-in
    # CPU generator is rejected, so hand it a generator on the same device.
    shuffle_generator = torch.Generator(device=torch.get_default_device())
    train_loader = DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=True,
        generator=shuffle_generator,
    )

    # Initialize model
    model = SimpleRNN(dataset.vocab_size(), embedding_dim, hidden_dim)

    # Train the model
    train_model(model, train_loader, num_epochs)

    return model, dataset

# Run the demo when this code executes as the top-level module, binding the
# values returned by main() to the global names `model` and `dataset`.
if __name__ == "__main__":
    model, dataset = main()

['quick', 'example', 'removing', 'stopwords', 'normalizing', 'text']


In [None]:
# Read the whole corpus; the context manager closes the file handle as soon
# as the read completes instead of leaving it open.
with open("harrypotter.txt", "rt", encoding="ISO-8859-1") as corpus_file:
    text = corpus_file.read()

# Show only a short preview rather than echoing the entire file into the output.
text[:500]

In [29]:
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# lowercase and remove punctuation
# (the pattern keeps only a-z and whitespace, so digits and apostrophes are dropped too)
text = re.sub(r'[^a-z\s]', '', text.lower())

# tokenize and remove stopwords
# Build the stopword set once: calling stopwords.words() inside the
# comprehension re-evaluates it for every token and scans a list each time.
english_stopwords = set(stopwords.words('english'))
words = [w for w in word_tokenize(text) if w not in english_stopwords]

# lemmatize each remaining token
lem = WordNetLemmatizer()
words = [lem.lemmatize(w) for w in words]

# Preview the first tokens only; printing the full list floods the output.
print(words[:50])



In [30]:
# Number of distinct entries in `words`
len(set(words))

5329

In [1]:
@torch.no_grad()
def complete_text(model, ds, input_string, num_words=50, device=None):
    """Extend ``input_string`` by sampling ``num_words`` words from ``model``.

    Args:
        model: Recurrent model exposing ``init_hidden(batch_size)`` and
            ``model(x, hidden) -> (logits, hidden)`` with logits of shape
            (batch, seq, vocab).
        ds: Object providing ``word_to_idx``, ``idx_to_word`` and
            ``sequence_length`` (e.g. a ``TextDataset``).
        input_string: Prompt; it is split on whitespace and words missing
            from ``ds.word_to_idx`` are dropped silently.
        num_words: Number of words to sample and append.
        device: Device for the input tensors. ``None`` (default) uses the
            device of the model's parameters, falling back to CPU for a
            parameter-free model.

    Returns:
        The in-vocabulary prompt words followed by the sampled words, joined
        by single spaces.
    """
    model.eval()

    # Follow the model's device unless the caller pins one; a fixed "cpu"
    # default fails as soon as the model lives on the GPU.
    if device is None:
        device = next((p.device for p in model.parameters()), torch.device("cpu"))

    # 1. Convert to word tokens, dropping out-of-vocabulary words
    tokens = [ds.word_to_idx[w] for w in input_string.split() if w in ds.word_to_idx]

    # 2. Pad (on the left) or truncate (keeping the tail) to the context window
    seq_len = ds.sequence_length
    pad_idx = ds.word_to_idx["<PAD>"]
    if len(tokens) >= seq_len:
        ctx = tokens[-seq_len:]
    else:
        ctx = [pad_idx] * (seq_len - len(tokens)) + tokens
    ctx = torch.tensor(ctx, device=device).unsqueeze(0)  # shape (1, seq_len)

    # 3. Initialize hidden state on the same device as the inputs
    h = model.init_hidden(batch_size=1).to(device)

    # 4. Prime the RNN with the whole context in one pass. The logits at the
    #    last position already predict the first new word, so the last context
    #    token must NOT be fed again before sampling.
    out, h = model(ctx, h)
    logits = out[:, -1, :]

    # 5. Autoregressive generation
    generated = tokens[:]  # actual non-pad part
    for step in range(num_words):
        probs = torch.softmax(logits, dim=-1)
        next_token = torch.multinomial(probs, 1).item()
        generated.append(next_token)

        # Advance the RNN only if another word is still needed.
        if step + 1 < num_words:
            x = torch.tensor([[next_token]], device=device)
            out, h = model(x, h)
            logits = out[:, -1, :]

    # 6. Decode back to words
    return " ".join(ds.idx_to_word[i] for i in generated)

NameError: name 'torch' is not defined

In [None]:
# Continue a short prompt with the model and dataset returned by main().
# The device is passed explicitly so the input tensors are created where the
# model's parameters live.
complete_text(
    model,
    dataset,
    "I love",
    num_words=10,
    device=next(model.parameters()).device,
)