In [17]:
import os
import pdfplumber
import torch
import torch.nn as nn
from torch.nn import functional as F

# --------------------- Hyperparameters ---------------------
batch_size = 16      # sequences per batch
block_size = 32      # context length
max_iters = 5000
eval_interval = 100
learning_rate = 1e-3
device = 'cuda' if torch.cuda.is_available() else 'cpu'
eval_iters = 200
n_embd = 64
n_head = 4
n_layer = 4
dropout = 0.0
# -----------------------------------------------------------

torch.manual_seed(1337)

# current_path = os.path.dirname(os.path.abspath(__file__))
# File paths
input_pdf = "datasets/alice.pdf"

# Step 1. Extract text from the input PDF using pdfplumber
extracted_pages = []
with pdfplumber.open(input_pdf) as pdf:
    for page in pdf.pages:
        text = page.extract_text()  # you can also extract tables or images as needed
        extracted_pages.append(text)

alice_book = " ".join(extracted_pages)
# 1) Load and combine corpora
with open('datasets/alice_qa.txt', 'r', encoding='utf-8') as f:
    qa_data = f.read()

# Combine text and build vocabulary
text = alice_book + "\n\n" + qa_data
chars = sorted(list(set(text)))
vocab_size = len(chars)
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }
encode = lambda s: [stoi[c] for c in s]
decode = lambda l: ''.join(itos[i] for i in l)

# Create train/val splits
data = torch.tensor(encode(alice_book), dtype=torch.long)
qa_data_tensor = torch.tensor(encode(qa_data), dtype=torch.long)
n = int(0.9 * len(data))
m = int(0.9 * len(qa_data_tensor))
train_data = torch.cat((data[:n], qa_data_tensor[:m]))
test_data = torch.cat((data[n:], qa_data_tensor[m:]))
# train_data = data[:n]
# test_data   = data[n:]

# Batch generator
def get_batch(split):
    data_split = train_data if split=='train' else test_data
    ix = torch.randint(len(data_split) - block_size, (batch_size,))
    x = torch.stack([data_split[i:i+block_size] for i in ix])
    y = torch.stack([data_split[i+1:i+block_size+1] for i in ix])
    return x.to(device), y.to(device)

# Loss estimation
@torch.no_grad()
def estimate_loss():
    out = {}
    model.eval()
    for split in ('train','val'):
        losses = torch.zeros(eval_iters, device=device)
        for k in range(eval_iters):
            X, Y = get_batch(split)
            _, loss = model(X, Y)
            losses[k] = loss
        out[split] = losses.mean().item()
    model.train()
    return out

# --------------------- Model Definition ---------------------

class Head(nn.Module):
    """ One head of self-attention """
    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B,T,hs)
        q = self.query(x) # (B,T,hs)
        # attention scores
        wei = q @ k.transpose(-2,-1) * (C ** -0.5)  # (B,T,T)
        wei = wei.masked_fill(self.tril[:T,:T]==0, float('-inf'))
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        # aggregate values
        v = self.value(x)                           # (B,T,hs)
        out = wei @ v                               # (B,T,hs)
        return out

class MultiHeadAttention(nn.Module):
    """ Multiple heads of self-attention """
    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)  # (B,T,n_embd)
        out = self.dropout(self.proj(out))
        return out

class FeedForward(nn.Module):
    """ Simple Feed-Forward Network """
    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )
    def forward(self, x):
        return self.net(x)

class Block(nn.Module):
    """ Transformer Block """
    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)
    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

class BigramLanguageModel(nn.Module):
    def __init__(self):
        super().__init__()
        self.token_embedding_table    = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[
            Block(n_embd, n_head=n_head) for _ in range(n_layer)
        ])
        self.ln_f = nn.LayerNorm(n_embd)
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape
        tok_emb = self.token_embedding_table(idx)                   # (B,T,C)
        pos_emb = self.position_embedding_table(
            torch.arange(T, device=device)
        )                                                           # (T,C)
        x = tok_emb + pos_emb                                       # (B,T,C)
        x = self.blocks(x)                                          # (B,T,C)
        x = self.ln_f(x)                                            # (B,T,C)
        logits = self.lm_head(x)                                    # (B,T,vocab)
        if targets is None:
            return logits, None
        # reshape for loss
        B, T, V = logits.shape
        logits = logits.view(B*T, V)
        targets = targets.view(B*T)
        loss = F.cross_entropy(logits, targets)
        return logits, loss

    def generate(self, idx, max_new_tokens):
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -block_size:]
            logits, _ = self(idx_cond)
            logits = logits[:, -1, :]               # (B, vocab)
            probs = F.softmax(logits, dim=-1)       # (B, vocab)
            idx_next = torch.multinomial(probs, num_samples=1)  # (B,1)
            idx = torch.cat([idx, idx_next], dim=1)            # (B, T+1)
        return idx

# Instantiate model
model = BigramLanguageModel().to(device)
print(f"Number of parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad)}")
print("running on", device)
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)

# ----------------------- Inference ----------------------------
def answer_question(question, max_len=100):
    prompt = f"Q: {question}\nA:"
    idx = torch.tensor([encode(prompt)], device=device)
    out = model.generate(idx, max_new_tokens=max_len)[0].tolist()
    # only decode the new tokens after the prompt
    generated = decode(out[len(prompt):])
    return generated.strip()

# Example
# print("Q: Who wrote Hamlet?")
# print("A:", answer_question("Who wrote Hamlet?"))


CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, defaulting to MediaBox
CropBox missing from /Page, def

Number of parameters: 213212
running on cuda


In [18]:
# ----------------------- Training Loop -----------------------
for iter in range(max_iters):

      # Evaluate loss periodically
      if iter % eval_interval == 0 or iter == max_iters-1:
          losses = estimate_loss()
          print(f"step {iter}: train loss {losses['train']:.4f},"
                f" val loss {losses['val']:.4f}")

      xb, yb = get_batch('train')
      logits, loss = model(xb, yb)
      optimizer.zero_grad(set_to_none=True)
      loss.backward()
      optimizer.step()

step 0: train loss 4.7317, val loss 4.7558
step 100: train loss 2.6597, val loss 3.0354
step 200: train loss 2.5089, val loss 2.9801
step 300: train loss 2.4153, val loss 2.9246
step 400: train loss 2.3215, val loss 2.8999
step 500: train loss 2.2377, val loss 2.8414
step 600: train loss 2.1688, val loss 2.8064
step 700: train loss 2.0919, val loss 2.7396
step 800: train loss 2.0302, val loss 2.7021
step 900: train loss 1.9726, val loss 2.6693
step 1000: train loss 1.9422, val loss 2.6546
step 1100: train loss 1.8956, val loss 2.6211
step 1200: train loss 1.8587, val loss 2.5202
step 1300: train loss 1.8351, val loss 2.6143
step 1400: train loss 1.7850, val loss 2.5377
step 1500: train loss 1.7650, val loss 2.5588
step 1600: train loss 1.7494, val loss 2.4881
step 1700: train loss 1.7253, val loss 2.4535
step 1800: train loss 1.7040, val loss 2.4303
step 1900: train loss 1.6784, val loss 2.4210
step 2000: train loss 1.6600, val loss 2.3970
step 2100: train loss 1.6561, val loss 2.4052


In [19]:
from pathlib import Path

# 1. Create models directory
MODEL_PATH = Path("models")
MODEL_PATH.mkdir(parents=True, exist_ok=True)

# 2. Create model save path
MODEL_NAME = "alice_model_1.pth"
MODEL_SAVE_PATH = MODEL_PATH / MODEL_NAME

In [20]:
# 3. Save the model state dict
import os
if not os.path.exists(MODEL_SAVE_PATH):
  print(f"Saving model to: {MODEL_SAVE_PATH}")
  torch.save(obj=model.state_dict(), # only saving the state_dict() only saves the models learned parameters
            f=MODEL_SAVE_PATH)

Saving model to: models\alice_model_1.pth


In [None]:
# Load the model for evaluation
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f"Loading model from: {MODEL_SAVE_PATH}", "on device:", device)
save_model = BigramLanguageModel()
save_model.load_state_dict(torch.load(MODEL_SAVE_PATH, map_location=device))
model = save_model
model.to(device)
model.eval() # Set the model to evaluation mode

Loading model from: models\alice_model_1.pth on device: cuda


BigramLanguageModel(
  (token_embedding_table): Embedding(92, 64)
  (position_embedding_table): Embedding(32, 64)
  (blocks): Sequential(
    (0): Block(
      (sa): MultiHeadAttention(
        (heads): ModuleList(
          (0-3): 4 x Head(
            (key): Linear(in_features=64, out_features=16, bias=False)
            (query): Linear(in_features=64, out_features=16, bias=False)
            (value): Linear(in_features=64, out_features=16, bias=False)
            (dropout): Dropout(p=0.0, inplace=False)
          )
        )
        (proj): Linear(in_features=64, out_features=64, bias=True)
        (dropout): Dropout(p=0.0, inplace=False)
      )
      (ffwd): FeedForward(
        (net): Sequential(
          (0): Linear(in_features=64, out_features=256, bias=True)
          (1): ReLU()
          (2): Linear(in_features=256, out_features=64, bias=True)
          (3): Dropout(p=0.0, inplace=False)
        )
      )
      (ln1): LayerNorm((64,), eps=1e-05, elementwise_affine=True)
   

In [21]:
# Generate some sample text from the trained model
context = torch.zeros((1, 1), dtype=torch.long, device=device)
generated_text = decode(model.generate(context, max_new_tokens=200)[0].tolist())
print("Generated text:")
print(generated_text)

Generated text:

alreed to
herself her with take it had both that it?”)owed the Lent, “Itanwambled as wan was intering diffulling that those
busands saw chame for this, but the
works!”
Just thee of a camchasir, she wa


In [1]:
# Example usage of the question-answering functionality:
question = 'Who wrote "Alice Adventures in Wonderland"?'
answer = answer_question(question, max_len=100)
print("\nQuestion:", question)
print("Answer:", answer)

NameError: name 'answer_question' is not defined