# Light Q&A Demo with RNN, LSTM, GRU, Transformer, BERT
Tiny dataset demo for educational purposes. Seq2Seq models train quickly, BERT uses pretrained model.

In [1]:
!pip install -q tensorflow transformers sentencepiece --upgrade
print('Install complete')

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.1/40.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m620.7/620.7 MB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.6/11.6 MB[0m [31m73.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.5/5.5 MB[0m [31m81.3 MB/s[0m eta [36m0:00:00[0m
[?25h[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
tensorflow-decision-forests 1.12.0 requires tensorflow==2.19.0, but you have tensorflow 2.20.0 which is incompatible.
tensorflow-text 2.19.0 requires tensorflow<2.20,>=2.19.0, but you have tensorflow 2.20.0 which is incompatible.
tf-keras 2.19.0 requires tensorflow<2.20,>=2.19, but you have tensorflow 2.20.0 which is incompatible.[0m[

## 1) Tiny Q&A Dataset

In [2]:

import random

data = [
    {"context": "Albert Einstein was a physicist who developed the theory of relativity.",
     "question": "Who developed the theory of relativity?", "answer": "Albert Einstein"},
    {"context": "The capital of France is Paris.",
     "question": "What is the capital of France?", "answer": "Paris"},
    {"context": "The Sun rises in the east.",
     "question": "Where does the Sun rise?", "answer": "in the east"},
    {"context": "Python is a popular programming language.",
     "question": "Which language is popular for programming?", "answer": "Python"},
    {"context": "Mount Everest is the highest mountain in the world.",
     "question": "What is the highest mountain?", "answer": "Mount Everest"}
]
random.shuffle(data)
for d in data:
    print("Q:", d["question"], "| A:", d["answer"])


Q: Which language is popular for programming? | A: Python
Q: Who developed the theory of relativity? | A: Albert Einstein
Q: Where does the Sun rise? | A: in the east
Q: What is the capital of France? | A: Paris
Q: What is the highest mountain? | A: Mount Everest


## 2) Tokenization & Sequence Preparation

In [3]:

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import numpy as np

all_texts = [d["context"] + " " + d["question"] for d in data]
all_answers = [d["answer"] for d in data]

src_tok = Tokenizer(filters='', oov_token='<oov>')
src_tok.fit_on_texts(all_texts)
src_vocab_size = len(src_tok.word_index)+1

tgt_tok = Tokenizer(filters='', oov_token='<oov>')
tgt_tok.fit_on_texts(all_answers)
tgt_vocab_size = len(tgt_tok.word_index)+1

src_seqs = src_tok.texts_to_sequences(all_texts)
tgt_seqs = tgt_tok.texts_to_sequences(all_answers)

max_src_len = max(len(s) for s in src_seqs)
max_tgt_len = max(len(s) for s in tgt_seqs)

encoder_input = pad_sequences(src_seqs, maxlen=max_src_len, padding='post')
decoder_input = pad_sequences([s[:-1] for s in tgt_seqs], maxlen=max_tgt_len-1, padding='post')
decoder_target = pad_sequences([s[1:] for s in tgt_seqs], maxlen=max_tgt_len-1, padding='post')

print('src_vocab_size:', src_vocab_size, 'tgt_vocab_size:', tgt_vocab_size)


src_vocab_size: 41 tgt_vocab_size: 11


## 3) Seq2Seq Models Helper (RNN/LSTM/GRU)

In [4]:

from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, SimpleRNN, LSTM, GRU, Dense, TimeDistributed

def build_seq2seq(cell_type='rnn', embedding_dim=64, latent_dim=64):
    encoder_inputs = Input(shape=(max_src_len,))
    enc_emb = Embedding(src_vocab_size, embedding_dim, mask_zero=True)(encoder_inputs)
    if cell_type=='rnn':
        _, state_h = SimpleRNN(latent_dim, return_state=True)(enc_emb)
        encoder_states = [state_h]
    elif cell_type=='lstm':
        _, state_h, state_c = LSTM(latent_dim, return_state=True)(enc_emb)
        encoder_states = [state_h, state_c]
    elif cell_type=='gru':
        _, state_h = GRU(latent_dim, return_state=True)(enc_emb)
        encoder_states = [state_h]
    else:
        raise ValueError('Invalid cell_type')

    decoder_inputs = Input(shape=(max_tgt_len-1,))
    dec_emb = Embedding(tgt_vocab_size, embedding_dim, mask_zero=True)(decoder_inputs)
    if cell_type=='rnn':
        dec_outputs, _ = SimpleRNN(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=encoder_states[0])
    elif cell_type=='lstm':
        dec_outputs, _, _ = LSTM(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=encoder_states)
    elif cell_type=='gru':
        dec_outputs, _ = GRU(latent_dim, return_sequences=True, return_state=True)(dec_emb, initial_state=encoder_states[0])

    outputs = TimeDistributed(Dense(tgt_vocab_size, activation='softmax'))(dec_outputs)
    model = Model([encoder_inputs, decoder_inputs], outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model


## 4) Train & Demo Seq2Seq Models

In [5]:

def train_seq2seq(model, epochs=50):
    model.fit([encoder_input, decoder_input], decoder_target[..., None], epochs=epochs, batch_size=4, verbose=0)
    return model

def greedy_decode(model, src_text):
    seq = src_tok.texts_to_sequences([src_text])
    seq = pad_sequences(seq, maxlen=max_src_len, padding='post')
    dec_in = np.zeros((1, max_tgt_len-1), dtype='int32')
    dec_in[0,0] = tgt_tok.word_index.get('<oov>', 1)  # start token simplified
    for t in range(1, max_tgt_len-1):
        preds = model.predict([seq, dec_in], verbose=0)
        next_id = np.argmax(preds[0, t-1, :])
        dec_in[0,t] = next_id
    out_words = [tgt_tok.index_word.get(i,'') for i in dec_in[0]]
    return ' '.join([w for w in out_words if w != ''])

models = {}
for cell in ['rnn','lstm','gru']:
    print('\nTraining', cell.upper())
    m = build_seq2seq(cell_type=cell, embedding_dim=64, latent_dim=64)
    train_seq2seq(m, epochs=100)  # very light training
    models[cell] = m
    for d in data:
        pred = greedy_decode(m, d['context'] + ' ' + d['question'])
        print("Q:", d['question'], "| GOLD:", d['answer'], "| PRED:", pred)



Training RNN
Q: Which language is popular for programming? | GOLD: Python | PRED: <oov> the
Q: Who developed the theory of relativity? | GOLD: Albert Einstein | PRED: <oov> einstein
Q: Where does the Sun rise? | GOLD: in the east | PRED: <oov> the
Q: What is the capital of France? | GOLD: Paris | PRED: <oov> einstein
Q: What is the highest mountain? | GOLD: Mount Everest | PRED: <oov> everest

Training LSTM
Q: Which language is popular for programming? | GOLD: Python | PRED: <oov> everest
Q: Who developed the theory of relativity? | GOLD: Albert Einstein | PRED: <oov> einstein
Q: Where does the Sun rise? | GOLD: in the east | PRED: <oov> the
Q: What is the capital of France? | GOLD: Paris | PRED: <oov> everest
Q: What is the highest mountain? | GOLD: Mount Everest | PRED: <oov> everest

Training GRU
Q: Which language is popular for programming? | GOLD: Python | PRED: <oov> everest
Q: Who developed the theory of relativity? | GOLD: Albert Einstein | PRED: <oov> einstein
Q: Where does t

## 5) Tiny Transformer Encoder-Decoder

In [6]:

from tensorflow.keras.layers import MultiHeadAttention, LayerNormalization

def build_tiny_transformer(src_vocab, tgt_vocab, embedding_dim=64, num_heads=2, ff_dim=128):
    enc_in = Input(shape=(max_src_len,))
    enc_emb = Embedding(src_vocab, embedding_dim)(enc_in)
    enc_att = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(enc_emb, enc_emb)
    enc_out = LayerNormalization(epsilon=1e-6)(enc_att + enc_emb)
    ff = Dense(ff_dim, activation='relu')(enc_out)
    ff = Dense(embedding_dim)(ff)
    enc_outputs = LayerNormalization(epsilon=1e-6)(ff + enc_out)

    dec_in = Input(shape=(max_tgt_len-1,))
    dec_emb = Embedding(tgt_vocab, embedding_dim)(dec_in)
    cross_att = MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(dec_emb, enc_outputs)
    dec_out = LayerNormalization(epsilon=1e-6)(cross_att + dec_emb)
    ff2 = Dense(ff_dim, activation='relu')(dec_out)
    ff2 = Dense(embedding_dim)(ff2)
    dec_out2 = LayerNormalization(epsilon=1e-6)(ff2 + dec_out)
    outputs = TimeDistributed(Dense(tgt_vocab, activation='softmax'))(dec_out2)

    model = Model([enc_in, dec_in], outputs)
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

trans_model = build_tiny_transformer(src_vocab_size, tgt_vocab_size)
trans_model.fit([encoder_input, decoder_input], decoder_target[..., None], epochs=100, batch_size=4, verbose=0)

for d in data:
    seq = src_tok.texts_to_sequences([d['context'] + ' ' + d['question']])
    seq = pad_sequences(seq, maxlen=max_src_len, padding='post')
    dec_in = np.zeros((1, max_tgt_len-1), dtype='int32')
    dec_in[0,0] = tgt_tok.word_index.get('<oov>', 1)
    for t in range(1, max_tgt_len-1):
        preds = trans_model.predict([seq, dec_in], verbose=0)
        next_id = np.argmax(preds[0, t-1, :])
        dec_in[0,t] = next_id
    out_words = [tgt_tok.index_word.get(i,'') for i in dec_in[0]]
    pred_text = ' '.join([w for w in out_words if w != ''])
    print("Q:", d['question'], "| GOLD:", d['answer'], "| PRED:", pred_text)


Q: Which language is popular for programming? | GOLD: Python | PRED: <oov>
Q: Who developed the theory of relativity? | GOLD: Albert Einstein | PRED: <oov> einstein
Q: Where does the Sun rise? | GOLD: in the east | PRED: <oov> east
Q: What is the capital of France? | GOLD: Paris | PRED: <oov>
Q: What is the highest mountain? | GOLD: Mount Everest | PRED: <oov> everest


## 6) BERT Q&A using Pretrained Model

In [7]:

from transformers import pipeline

qa_pipeline = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

print("\nBERT Predictions:")
for d in data:
    result = qa_pipeline(question=d['question'], context=d['context'])
    print("Q:", d['question'], "| GOLD:", d['answer'], "| PRED:", result['answer'])


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/451 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/265M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Device set to use cpu



BERT Predictions:
Q: Which language is popular for programming? | GOLD: Python | PRED: Python
Q: Who developed the theory of relativity? | GOLD: Albert Einstein | PRED: Albert Einstein
Q: Where does the Sun rise? | GOLD: in the east | PRED: the east
Q: What is the capital of France? | GOLD: Paris | PRED: Paris
Q: What is the highest mountain? | GOLD: Mount Everest | PRED: Mount Everest


## Notes
- Seq2Seq models and Transformer are trained on a tiny dataset.
- BERT uses a pretrained model for extractive QA.
- This notebook is educational and results are illustrative.