In [4]:
!pip uninstall torch torchtext -y
!pip install torch torchtext --index-url https://download.pytorch.org/whl/cu118
!pip install 'portalocker>=2.0.0'
!pip install 'numpy<2'

Found existing installation: torch 2.2.0+cu118
Uninstalling torch-2.2.0+cu118:
  Successfully uninstalled torch-2.2.0+cu118
Found existing installation: torchtext 0.17.0+cpu
Uninstalling torchtext-0.17.0+cpu:
  Successfully uninstalled torchtext-0.17.0+cpu
Looking in indexes: https://download.pytorch.org/whl/cu118
Collecting torch
  Using cached https://download.pytorch.org/whl/cu118/torch-2.7.0%2Bcu118-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchtext
  Using cached https://download.pytorch.org/whl/torchtext-0.17.0%2Bcpu-cp311-cp311-linux_x86_64.whl (2.0 MB)
Collecting sympy>=1.13.3 (from torch)
  Using cached https://download.pytorch.org/whl/sympy-1.13.3-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cudnn-cu11==9.1.0.70 (from torch)
  Using cached https://download.pytorch.org/whl/cu118/nvidia_cudnn_cu11-9.1.0.70-py3-none-manylinux2014_x86_64.whl (663.9 MB)
Collecting nvidia-nccl-cu11==2.21.5 (from torch)
  Using cached https://download.pytorch.org/whl/



In [19]:
import os
# The allocator configuration is written to the environment before torch is
# imported, so it is already in place when torch first reads it.
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"
import torch
# Use the GPU when torch reports one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
import torch.nn as nn
import torch.optim as optim
from torchtext.datasets import IMDB
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence # Pads variable-length sequences in collate_batch
import torch.nn.functional as F # softmax used when sampling in generate_text
import torch.quantization
from torch.cuda.amp import autocast, GradScaler
import warnings
# NOTE(review): this silences every warning category for the rest of the
# session, including deprecation notices from the libraries imported above.
warnings.filterwarnings("ignore")

Using device: cuda


In [23]:
# --- 0. Setup Global Variables and Special Tokens ---
# The position of each special token in this list is its vocabulary index,
# so the index constants are derived from the list order.
special_tokens = ['<unk>', '<pad>', '<bos>', '<eos>']
UNK_IDX, PAD_IDX, BOS_IDX, EOS_IDX = range(len(special_tokens))

In [24]:
# --- 1. Data Loading and Preprocessing ---
# Tokenizer shared by vocabulary building and the text pipeline.
tokenizer = get_tokenizer('basic_english')
# IMDB train/test splits; the loops below unpack each item as a 2-tuple and
# use only the second element (the review text).
train_iter, test_iter = IMDB(split=('train', 'test'))

def yield_tokens(data_iter):
    """Lazily yield the token list of every text in `data_iter`.

    `data_iter` must produce 2-tuples; the first element is ignored and the
    second is passed to the module-level `tokenizer`.
    """
    yield from (tokenizer(text) for _label, text in data_iter)

In [39]:
# Vocabulary over every token of the training split. The special tokens are
# placed first so their indices match UNK_IDX / PAD_IDX / BOS_IDX / EOS_IDX.
vocab = build_vocab_from_iterator(
    iterator=yield_tokens(train_iter),
    min_freq=1,
    specials=special_tokens,
    special_first=True,
)
# Tokens missing from the vocabulary resolve to the <unk> index.
vocab.set_default_index(UNK_IDX)

# Raw string -> list of token IDs, via the module-level tokenizer and vocab.
def text_pipeline(text):
    """Tokenize `text` and map the resulting tokens to vocabulary indices."""
    tokens = tokenizer(text)
    return vocab(tokens)

# Upper bound on tokens per example, counting the <bos>/<eos> markers.
MAX_SEQUENCE_LENGTH = 16

def collate_batch(batch):
    """Turn a list of (label, text) items into next-token training tensors.

    Each text is encoded as <bos> + tokens + <eos>, cut down to
    MAX_SEQUENCE_LENGTH (keeping a trailing <eos>), and right-padded with
    PAD_IDX to the longest sequence in the batch.

    Returns:
        (input_seq, target_seq): the padded batch without its last column and
        without its first column respectively, so target_seq[:, t] is the
        token that follows input_seq[:, t].
    """
    sequences = []
    for _label, raw_text in batch:
        ids = [BOS_IDX, *text_pipeline(raw_text), EOS_IDX]
        if len(ids) > MAX_SEQUENCE_LENGTH:
            # Drop the overflow, then re-terminate with <eos>.
            ids = ids[:MAX_SEQUENCE_LENGTH - 1]
            ids.append(EOS_IDX)
        sequences.append(torch.tensor(ids, dtype=torch.long))

    padded = pad_sequence(sequences, batch_first=True, padding_value=PAD_IDX)
    return padded[:, :-1], padded[:, 1:]

In [40]:
# --- 2. Model Definition (Text Generator) ---
class TextGenerator(nn.Module):
    """Embedding -> LSTM -> Linear next-token language model.

    Args:
        vocab_size: number of rows in the embedding table and number of
            output logits per position.
        embed_dim: size of each token embedding.
        hidden_dim: size of the LSTM hidden state.
        pad_idx: index of the padding token. When omitted, the module-level
            PAD_IDX is used.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, pad_idx=None):
        super().__init__()
        if pad_idx is None:
            pad_idx = PAD_IDX
        # `padding_idx` keeps the PAD row out of gradient updates.
        self.embedding = nn.Embedding(vocab_size, embed_dim, padding_idx=pad_idx)
        # `batch_first=True` matches the (batch_size, seq_len) token input.
        self.lstm = nn.LSTM(embed_dim, hidden_dim, batch_first=True)
        # Projects every LSTM output vector to one logit per vocabulary entry.
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.init_weights()
        self.hidden_dim = hidden_dim

    def init_weights(self):
        """Initialise embedding/output weights uniformly in [-0.1, 0.1].

        The LSTM keeps PyTorch's default initialisation.
        """
        initrange = 0.1
        nn.init.uniform_(self.embedding.weight, -initrange, initrange)
        # The uniform fill above also overwrites the padding row. Because that
        # row never receives gradient updates it would stay at its random
        # value forever, so restore it to zeros.
        pad = self.embedding.padding_idx
        if pad is not None:
            with torch.no_grad():
                self.embedding.weight[pad].zero_()
        nn.init.uniform_(self.fc.weight, -initrange, initrange)
        nn.init.zeros_(self.fc.bias)

    def forward(self, text, hidden=None):
        """Compute next-token logits for every position of `text`.

        Args:
            text: token IDs of shape (batch_size, seq_len).
            hidden: optional LSTM state from a previous call, used to continue
                a sequence across calls (e.g. token-by-token generation).

        Returns:
            (logits, hidden): logits of shape (batch_size, seq_len, vocab_size)
            and the LSTM state after the last position.
        """
        embedded = self.embedding(text)               # (batch, seq, embed_dim)
        output, hidden = self.lstm(embedded, hidden)  # (batch, seq, hidden_dim)
        output = self.fc(output)                      # (batch, seq, vocab_size)
        return output, hidden

# Model hyperparameters: embedding width, LSTM state size, and the output
# size, which is tied to the vocabulary built above.
EMBED_DIM, HIDDEN_DIM = 8, 16
VOCAB_SIZE = len(vocab)

# Build the network, then move it to the selected device.
model = TextGenerator(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM)
model = model.to(device)

In [41]:
# --- 3. Training Setup ---
BATCH_SIZE = 16   # examples per batch, for both splits
num_epochs = 2    # kept small so the example runs quickly

# Both loaders batch through collate_batch; only the training loader shuffles.
train_dataloader = DataLoader(
    train_iter,
    batch_size=BATCH_SIZE,
    shuffle=True,
    collate_fn=collate_batch,
)
test_dataloader = DataLoader(
    test_iter,
    batch_size=BATCH_SIZE,
    shuffle=False,
    collate_fn=collate_batch,
)

# Next-token cross entropy; positions whose target is PAD do not contribute.
criterion = nn.CrossEntropyLoss(ignore_index=PAD_IDX)
optimizer = optim.Adam(model.parameters(), lr=1e-3)

In [44]:
# --- 4. Training Loop ---
# Single module-level gradient scaler; train() reads it as a global, so the
# same instance is reused across all batches and epochs.
scaler = GradScaler() # Initialize once

In [45]:
def train(dataloader, model, criterion, optimizer, epoch):
    """Run one training epoch with loss scaling and gradient clipping.

    Relies on the module-level `scaler` (a GradScaler) for loss scaling.

    Args:
        dataloader: iterable of (input_ids, target_ids) batches, both shaped
            (batch_size, seq_len).
        model: module returning (logits, hidden) for a batch of input ids.
        criterion: loss taking (N, num_classes) logits and (N,) targets.
        optimizer: optimizer over `model.parameters()`.
        epoch: epoch number, used only in the progress messages.

    Returns:
        Mean of the per-batch losses, or nan when `dataloader` yields nothing.
    """
    model.train()
    # Run on whatever device the model lives on (same approach as
    # generate_text) instead of depending on a notebook-level global.
    model_device = next(model.parameters()).device
    use_amp = model_device.type == "cuda"

    total_loss = 0.0
    total_batches = 0
    for idx, (data, targets) in enumerate(dataloader):
        data, targets = data.to(model_device), targets.to(model_device)

        optimizer.zero_grad()
        with autocast(enabled=use_amp):
            predicted_logits, _ = model(data)
            # reshape (not view): the targets are a column slice of the padded
            # batch and are non-contiguous whenever `.to()` makes no copy.
            loss = criterion(
                predicted_logits.reshape(-1, predicted_logits.size(-1)),
                targets.reshape(-1),
            )

        scaler.scale(loss).backward()
        # Gradients are still multiplied by the loss scale here; unscale them
        # first so the clipping threshold applies to their true magnitude.
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        scaler.step(optimizer)
        scaler.update()

        batch_loss = loss.item()
        total_loss += batch_loss
        total_batches += 1

        if idx % 100 == 0 and idx > 0:
            print(f'Epoch {epoch}, Batch {idx}: Loss: {batch_loss:.4f}')

    mean_loss = total_loss / total_batches if total_batches else float("nan")
    print(f'Epoch {epoch}: Train Loss: {mean_loss:.4f}')
    return mean_loss

def evaluate(dataloader, model, criterion):
    """Compute the mean per-batch loss of `model` over `dataloader`.

    The model is switched to eval mode and no gradients are recorded.

    Args:
        dataloader: iterable of (input_ids, target_ids) batches, both shaped
            (batch_size, seq_len).
        model: module returning (logits, hidden) for a batch of input ids.
        criterion: loss taking (N, num_classes) logits and (N,) targets.

    Returns:
        Mean of the per-batch losses, or nan when `dataloader` yields nothing.
    """
    model.eval()
    # Follow the model's own device so the function also works for a model
    # that lives elsewhere than the notebook's default device.
    model_device = next(model.parameters()).device

    total_loss = 0.0
    total_batches = 0
    with torch.no_grad():
        for data, targets in dataloader:
            data, targets = data.to(model_device), targets.to(model_device)

            predicted_logits, _ = model(data)
            # reshape (not view): the targets are a column slice of the padded
            # batch and are non-contiguous whenever `.to()` makes no copy.
            loss = criterion(
                predicted_logits.reshape(-1, predicted_logits.size(-1)),
                targets.reshape(-1),
            )
            total_loss += loss.item()
            total_batches += 1

    return total_loss / total_batches if total_batches else float("nan")

In [46]:
# Train the model: each epoch runs one pass over the training loader and then
# measures the loss on the test loader.
for epoch in range(1, num_epochs + 1):  # epochs are numbered from 1 in the log lines
    train(train_dataloader, model, criterion, optimizer, epoch)
    # NOTE(review): the value named `val_loss` is computed on the test split;
    # no separate validation split is created in this notebook.
    val_loss = evaluate(test_dataloader, model, criterion)
    print(f'Epoch {epoch}: Test Loss: {val_loss:.4f}')

Epoch 1, Batch 100: Loss: 6.6406
Epoch 1, Batch 200: Loss: 6.5776
Epoch 1, Batch 300: Loss: 6.4466
Epoch 1, Batch 400: Loss: 6.8973
Epoch 1, Batch 500: Loss: 6.5997
Epoch 1, Batch 600: Loss: 6.1975
Epoch 1, Batch 700: Loss: 6.5565
Epoch 1, Batch 800: Loss: 6.2795
Epoch 1, Batch 900: Loss: 6.4087
Epoch 1, Batch 1000: Loss: 6.7015
Epoch 1, Batch 1100: Loss: 6.0879
Epoch 1, Batch 1200: Loss: 6.2047
Epoch 1, Batch 1300: Loss: 6.3930
Epoch 1, Batch 1400: Loss: 5.9754
Epoch 1, Batch 1500: Loss: 5.9356
Epoch 1: Train Loss: 6.5010
Epoch 1: Test Loss: 6.4479
Epoch 2, Batch 100: Loss: 6.3337
Epoch 2, Batch 200: Loss: 6.7159
Epoch 2, Batch 300: Loss: 6.4747
Epoch 2, Batch 400: Loss: 6.7742
Epoch 2, Batch 500: Loss: 6.2233
Epoch 2, Batch 600: Loss: 6.2449
Epoch 2, Batch 700: Loss: 6.4533
Epoch 2, Batch 800: Loss: 6.2593
Epoch 2, Batch 900: Loss: 6.3526
Epoch 2, Batch 1000: Loss: 6.4142
Epoch 2, Batch 1100: Loss: 6.3068
Epoch 2, Batch 1200: Loss: 6.3856
Epoch 2, Batch 1300: Loss: 6.6566
Epoch 2, Ba

In [47]:
# --- 5. Quantization (Post-Training Dynamic) ---
print("\nApplying quantization...")

# Fresh copy of the architecture in eval mode, carrying the trained weights;
# the trained `model` itself is left untouched.
model_to_quantize = TextGenerator(VOCAB_SIZE, EMBED_DIM, HIDDEN_DIM).eval()
model_to_quantize.load_state_dict(model.state_dict())

# Dynamic int8 quantization of the Linear and LSTM submodules.
quantized_model = torch.quantization.quantize_dynamic(
    model_to_quantize,
    qconfig_spec={nn.Linear, nn.LSTM},
    dtype=torch.qint8,
)
print("Quantization complete.")


Applying quantization...
Quantization complete.


In [56]:
# --- 6. Text Generation Example ---
print("\nExample text generation:")

def generate_text(model, vocab, start_text, max_len=50, temperature=0.8):
    """Sample a continuation of `start_text` from `model`.

    The whole prompt is fed through the model once so that the recurrent
    state reflects every prompt token; after that, tokens are generated one
    at a time, each step feeding the previously sampled token together with
    the carried-over hidden state.

    Args:
        model: module with `forward(token_ids, hidden) -> (logits, hidden)`.
        vocab: object providing `lookup_tokens(list_of_ids)`.
        start_text: prompt string; encoded with the module-level
            `text_pipeline` and prefixed with <bos>.
        max_len: maximum number of tokens to sample.
        temperature: divisor applied to the logits before softmax; values
            below 1 sharpen the distribution, values above 1 flatten it.

    Returns:
        The prompt plus sampled tokens joined by single spaces, with the
        <bos>, <eos> and <pad> tokens removed.

    Raises:
        ValueError: if `temperature` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError("temperature must be > 0")

    model.eval()
    generated_ids = [BOS_IDX] + text_pipeline(start_text)
    model_device = next(model.parameters()).device

    hidden = None
    # First step: the full prompt, shape (1, prompt_len). Feeding only its
    # last token would start generation from an empty recurrent state and
    # ignore the rest of the prompt.
    step_input = torch.tensor([generated_ids], dtype=torch.long, device=model_device)

    with torch.no_grad():
        for _ in range(max_len):
            output_logits, hidden = model(step_input, hidden)

            # Only the logits at the last position predict the next token.
            prediction_logits = output_logits[:, -1, :] / temperature
            probabilities = F.softmax(prediction_logits, dim=-1)
            next_token_id = torch.multinomial(probabilities, num_samples=1).item()

            generated_ids.append(next_token_id)
            if next_token_id == EOS_IDX:
                break

            # Later steps: just the new token, shape (1, 1); the context is
            # carried by `hidden`.
            step_input = torch.tensor([[next_token_id]], dtype=torch.long, device=model_device)

    # Drop special tokens by id, so ordinary tokens are never altered.
    hidden_ids = {BOS_IDX, EOS_IDX, PAD_IDX}
    visible_ids = [tok for tok in generated_ids if tok not in hidden_ids]
    return ' '.join(vocab.lookup_tokens(visible_ids))


Example text generation:


In [57]:
# Sample a continuation from the full-precision model.
start_phrase = "This movie was"
generated_sentence = generate_text(
    model=model,
    vocab=vocab,
    start_text=start_phrase,
    max_len=30,
)
print(f"Prompt: '{start_phrase}'\nGenerated (Original): '{generated_sentence}'")

Prompt: 'This movie was'
Generated (Original): 'this movie was foundationally compared one . performance t little'


In [58]:
# Sample a continuation from the dynamically quantized model.
start_phrase_quant = "I did not like"
generated_sentence_quant = generate_text(
    model=quantized_model,
    vocab=vocab,
    start_text=start_phrase_quant,
    max_len=30,
)
print(f"Prompt: '{start_phrase_quant}'\nGenerated (Quantized): '{generated_sentence_quant}'")

Prompt: 'I did not like'
Generated (Quantized): 'i did not like air-conditioned as i is it enjoyed . this a theat four . , a stupid incognizant , , was his movie a be , ' this this was'
