#
# ---------------------------------------------------------------------------------
#  FFFFF   A   QQQQ      BBBB   OOO  TTTTT
#  F      A A  Q  Q      B  B  O   O   T
#  FFF   AAAAA Q  Q      BBBB  O   O   T
#  F     A   A Q  Q      B  B  O   O   T
#  F     A   A  QQ Q     BBBB   OOO    T
# 
#  Welcome to the "Build Your First FAQ Language Model" Lab!
#
#  In this notebook, you will build a complete language model from scratch
#  and train it to be a simple FAQ chatbot.
#
# ---------------------------------------------------------------------------------
#

# =================================================================================
#  ✅ Part 1: Setup and Data Preparation
# =================================================================================
#
#  First, we need to prepare our data. The "fuel" for our AI model is a text file
#  containing questions and answers. We will also import the necessary libraries.
#
# ---------------------------------------------------------------------------------


# --- Step 1.1: Import Libraries ---
# We'll be using PyTorch for building our neural network.

In [36]:
import torch
import torch.nn as nn
from torch.nn import functional as F
import ipywidgets as widgets
from IPython.display import display

In [None]:
# --- Step 1.2: Create and Load the Dataset ---
# In a real workshop, you would have students generate this text file using the
# prompt template and save it as 'faq.txt'. For this example, we'll define it here.
#
# ----------------- YOUR FAQ DATA GOES HERE -----------------
#
#  Instructions:
#  1. Use the prompt template provided in the workshop to generate your FAQ content.
#  2. Paste the generated text into the triple-quoted string below.
#  3. Make sure the file is saved in the same directory as this notebook.
#

In [3]:
faq_text = """
Q: What are the store hours?
A: Our store is open from 9 AM to 8 PM, Monday to Saturday.

Q: What is the return policy?
A: You can return any item within 30 days of purchase with a valid receipt.

Q: Do you offer gift wrapping?
A: Yes, we offer complimentary gift wrapping for all in-store purchases.

Q: Where are you located?
A: We are located at 123 Main Street, Anytown, USA.

Q: Can I place an order online?
A: Yes, you can place an order through our website at www.example-store.com.

Q: What payment methods do you accept?
A: We accept all major credit cards, debit cards, and mobile payments.

Q: Is there parking available?
A: Yes, there is a free parking lot available for all our customers behind the store.

Q: Do you have a loyalty program?
A: Yes, you can sign up for our free loyalty program to earn points on every purchase.
"""

In [None]:
# --- Step 1.2: Read data from file ---
# Read `faq.txt`file created by the student
#
# Uncomment below
#with open('faq.txt', 'r', encoding='utf-8') as f:
#    faq_text = f.read()

In [None]:
# ----------------------------------------------------------

# Let's see the data we're working with

In [5]:
print("--- Sample of our dataset ---")
print(faq_text[:200])
print("-----------------------------\n")

--- Sample of our dataset ---

Q: What are the store hours?
A: Our store is open from 9 AM to 8 PM, Monday to Saturday.

Q: What is the return policy?
A: You can return any item within 30 days of purchase with a valid receipt.

Q:
-----------------------------



In [None]:
# --- Step 1.3: Create the Vocabulary ---
# Our model can't understand letters. It only understands numbers. So, we need to
# create a "vocabulary" and map each unique character to a unique integer.

In [11]:
chars = sorted(list(set(faq_text)))
vocab_size = len(chars)

print(f"Our vocabulary contains {vocab_size} unique characters.")
print(f"Vocabulary: {''.join(chars)}\n")

Our vocabulary contains 49 unique characters.
Vocabulary: 
 ,-.012389:?ACDIMOPQSUWYabcdefghijklmnoprstuvwxy



In [None]:
# Create the mapping from character to integer (stoi) and integer to character (itos)

In [12]:
stoi = { ch:i for i,ch in enumerate(chars) }
itos = { i:ch for i,ch in enumerate(chars) }

In [None]:
# Define our encoding and decoding functions

In [13]:
encode = lambda s: [stoi[c] for c in s] # encoder: take a string, output a list of integers
decode = lambda l: ''.join([itos[i] for i in l]) # decoder: take a list of integers, output a string

In [None]:
# Let's test them out

In [14]:
test_string = "store hours"
encoded_string = encode(test_string)
decoded_string = decode(encoded_string)
print(f"Original: '{test_string}'")
print(f"Encoded: {encoded_string}")
print(f"Decoded: '{decoded_string}'\n")

Original: 'store hours'
Encoded: [42, 43, 39, 41, 29, 1, 32, 39, 44, 41, 42]
Decoded: 'store hours'



In [None]:
# --- Step 1.4: Tokenize the Dataset ---
# Now, we'll convert our entire text dataset into a single sequence of numbers.
# PyTorch uses a data structure called "tensors" to work with numbers.

In [15]:
data = torch.tensor(encode(faq_text), dtype=torch.long)
print(f"Dataset shape: {data.shape}")
print(f"First 100 tokens: {data[:100]}\n")

Dataset shape: torch.Size([842])
First 100 tokens: tensor([ 0, 20, 11,  1, 23, 32, 25, 43,  1, 25, 41, 29,  1, 43, 32, 29,  1, 42,
        43, 39, 41, 29,  1, 32, 39, 44, 41, 42, 12,  0, 13, 11,  1, 18, 44, 41,
         1, 42, 43, 39, 41, 29,  1, 33, 42,  1, 39, 40, 29, 38,  1, 30, 41, 39,
        37,  1, 10,  1, 13, 17,  1, 43, 39,  1,  9,  1, 19, 17,  2,  1, 17, 39,
        38, 28, 25, 48,  1, 43, 39,  1, 21, 25, 43, 44, 41, 28, 25, 48,  4,  0,
         0, 20, 11,  1, 23, 32, 25, 43,  1, 33])



In [None]:
# --- Step 1.5: Split Data into Training and Validation Sets ---
# We use most of our data for training the model (learning) and a small part
# for validation (to check if it's learning correctly without just memorizing).

In [16]:
n = int(0.9 * len(data)) # first 90% will be train, rest val
train_data = data[:n]
val_data = data[n:]

In [None]:
# =================================================================================
#  ✅ Part 2: Understanding Context and Batches
# =================================================================================
#
#  A language model learns by seeing a chunk of text (context) and trying to
#  predict the very next character.
#
# ---------------------------------------------------------------------------------


In [None]:
# --- Step 2.1: Define Context Size ---
# `block_size` is the maximum length of the context the model can see.

In [18]:
block_size = 64
print(f"A single training example (x): {train_data[:block_size].tolist()}")
print(f"The target for each character in x (y): {train_data[1:block_size+1].tolist()}\n")

A single training example (x): [0, 20, 11, 1, 23, 32, 25, 43, 1, 25, 41, 29, 1, 43, 32, 29, 1, 42, 43, 39, 41, 29, 1, 32, 39, 44, 41, 42, 12, 0, 13, 11, 1, 18, 44, 41, 1, 42, 43, 39, 41, 29, 1, 33, 42, 1, 39, 40, 29, 38, 1, 30, 41, 39, 37, 1, 10, 1, 13, 17, 1, 43, 39, 1]
The target for each character in x (y): [20, 11, 1, 23, 32, 25, 43, 1, 25, 41, 29, 1, 43, 32, 29, 1, 42, 43, 39, 41, 29, 1, 32, 39, 44, 41, 42, 12, 0, 13, 11, 1, 18, 44, 41, 1, 42, 43, 39, 41, 29, 1, 33, 42, 1, 39, 40, 29, 38, 1, 30, 41, 39, 37, 1, 10, 1, 13, 17, 1, 43, 39, 1, 9]



In [None]:
# --- Step 2.2: Create a Batch Function ---
# We train the model on small, random chunks of data called "batches". This
# helps the training process be more efficient and stable.

In [19]:
batch_size = 32 # How many independent sequences will we process in parallel?

In [20]:
def get_batch(split):
    # Generate a small batch of data of inputs x and targets y
    data = train_data if split == 'train' else val_data
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i+block_size] for i in ix])
    y = torch.stack([data[i+1:i+block_size+1] for i in ix])
    return x, y

In [None]:
# Let's see a sample batch

In [21]:
xb, yb = get_batch('train')
print("--- Sample Batch ---")
print(f"Inputs (xb) shape: {xb.shape}")
print(f"Targets (yb) shape: {yb.shape}")
print("--------------------\n")

--- Sample Batch ---
Inputs (xb) shape: torch.Size([32, 64])
Targets (yb) shape: torch.Size([32, 64])
--------------------



In [None]:
# =================================================================================
#  ✅ Part 3: Building the Transformer Model (From Scratch!)
# =================================================================================
#
#  This is the most exciting part! We will build the core components of the
#  Transformer architecture, which is the foundation of models like GPT.
#
# ---------------------------------------------------------------------------------


In [None]:
# --- Hyperparameters ---
# These are the settings for our model. You can experiment with these later!

In [22]:
n_embd = 128       # The size of the embedding for each character
n_head = 4         # The number of attention heads
n_layer = 4        # The number of transformer blocks
dropout = 0.2      # A regularization technique to prevent overfitting

In [None]:
# -----------------------
# --- Step 3.1: The Self-Attention Head ---
# This is the fundamental component. An "attention head" allows the model to look
# at other characters in the context and decide which ones are most important
# for predicting the next character.

In [23]:
class Head(nn.Module):
    """ one head of self-attention """

    def __init__(self, head_size):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)   # (B,T,C)
        q = self.query(x) # (B,T,C)
        # compute attention scores ("affinities")
        wei = q @ k.transpose(-2, -1) * C**-0.5 # (B, T, C) @ (B, C, T) -> (B, T, T)
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf')) # (B, T, T)
        wei = F.softmax(wei, dim=-1) # (B, T, T)
        wei = self.dropout(wei)
        # perform the weighted aggregation of the values
        v = self.value(x) # (B,T,C)
        out = wei @ v # (B, T, T) @ (B, T, C) -> (B, T, C)
        return out

In [None]:
# --- Step 3.2: Multi-Head Attention ---
# To make the model more powerful, we use multiple attention heads in parallel
# and combine their results.

In [24]:
class MultiHeadAttention(nn.Module):
    """ multiple heads of self-attention in parallel """

    def __init__(self, num_heads, head_size):
        super().__init__()
        self.heads = nn.ModuleList([Head(head_size) for _ in range(num_heads)])
        self.proj = nn.Linear(n_embd, n_embd)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        out = torch.cat([h(x) for h in self.heads], dim=-1)
        out = self.dropout(self.proj(out))
        return out

In [None]:
# --- Step 3.3: The Feed-Forward Network ---
# After the attention mechanism, each character's representation is passed
# through a simple neural network to process the information gathered.

In [25]:
class FeedForward(nn.Module):
    """ a simple linear layer followed by a non-linearity """

    def __init__(self, n_embd):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(n_embd, 4 * n_embd),
            nn.ReLU(),
            nn.Linear(4 * n_embd, n_embd),
            nn.Dropout(dropout),
        )

    def forward(self, x):
        return self.net(x)

In [None]:
# --- Step 3.4: The Transformer Block ---
# Now we combine the attention and feed-forward components into a single
# "Transformer Block". A real LLM is just many of these blocks stacked together.

In [26]:
class Block(nn.Module):
    """ Transformer block: communication followed by computation """

    def __init__(self, n_embd, n_head):
        super().__init__()
        head_size = n_embd // n_head
        self.sa = MultiHeadAttention(n_head, head_size)
        self.ffwd = FeedForward(n_embd)
        self.ln1 = nn.LayerNorm(n_embd)
        self.ln2 = nn.LayerNorm(n_embd)

    def forward(self, x):
        x = x + self.sa(self.ln1(x))
        x = x + self.ffwd(self.ln2(x))
        return x

In [None]:
# --- Step 3.5: The Full Language Model ---
# Finally, we assemble everything into our complete language model!

In [27]:
class LanguageModel(nn.Module):

    def __init__(self):
        super().__init__()
        # each token directly reads off the logits for the next token from a lookup table
        self.token_embedding_table = nn.Embedding(vocab_size, n_embd)
        self.position_embedding_table = nn.Embedding(block_size, n_embd)
        self.blocks = nn.Sequential(*[Block(n_embd, n_head=n_head) for _ in range(n_layer)])
        self.ln_f = nn.LayerNorm(n_embd) # final layer norm
        self.lm_head = nn.Linear(n_embd, vocab_size)

    def forward(self, idx, targets=None):
        B, T = idx.shape

        # idx and targets are both (B,T) tensor of integers
        tok_emb = self.token_embedding_table(idx) # (B,T,C)
        pos_emb = self.position_embedding_table(torch.arange(T)) # (T,C)
        x = tok_emb + pos_emb # (B,T,C)
        x = self.blocks(x) # (B,T,C)
        x = self.ln_f(x) # (B,T,C)
        logits = self.lm_head(x) # (B,T,vocab_size)

        if targets is None:
            loss = None
        else:
            B, T, C = logits.shape
            logits = logits.view(B*T, C)
            targets = targets.view(B*T)
            loss = F.cross_entropy(logits, targets)

        return logits, loss

    def generate(self, idx, max_new_tokens):
        # idx is (B, T) array of indices in the current context
        for _ in range(max_new_tokens):
            # crop idx to the last block_size tokens
            idx_cond = idx[:, -block_size:]
            # get the predictions
            logits, loss = self(idx_cond)
            # focus only on the last time step
            logits = logits[:, -1, :] # becomes (B, C)
            # apply softmax to get probabilities
            probs = F.softmax(logits, dim=-1) # (B, C)
            # sample from the distribution
            idx_next = torch.multinomial(probs, num_samples=1) # (B, 1)
            # append sampled index to the running sequence
            idx = torch.cat((idx, idx_next), dim=1) # (B, T+1)
        return idx

In [None]:
# Let's create an instance of our model!

In [28]:
model = LanguageModel()
print("Language Model created successfully!")

Language Model created successfully!


In [None]:
# =================================================================================
#  ✅ Part 4: Training the Model
# =================================================================================
#
#  Now we'll feed our data to the model and let it learn. This process involves
#  showing the model batches of data, calculating how "wrong" its predictions
#  are (the "loss"), and then slightly adjusting its internal parameters to
#  make it better.
#
# ---------------------------------------------------------------------------------


In [None]:
# --- Step 4.1: Create the Optimizer ---
# The optimizer is the algorithm that adjusts the model's parameters.
# AdamW is a popular and effective choice.

In [29]:
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-3)

In [None]:
# --- Step 4.2: The Training Loop ---
# This loop will run for a set number of steps. In each step, we'll get a
# batch of data, ask the model for a prediction, and update the model.
#
# NOTE: This will take a few minutes to run!
#

In [30]:
max_iters = 5000 # How many training steps? (More is better, but takes longer)
eval_interval = 500 # How often to check the validation loss?

In [31]:
print("\n--- Starting Training ---")
for iter in range(max_iters):

    # every once in a while evaluate the loss on train and val sets
    if iter % eval_interval == 0:
        # We'll create a function to estimate the loss to avoid repeating code
        @torch.no_grad()
        def estimate_loss():
            out = {}
            model.eval()
            for split in ['train', 'val']:
                losses = torch.zeros(200)
                for k in range(200):
                    X, Y = get_batch(split)
                    logits, loss = model(X, Y)
                    losses[k] = loss.item()
                out[split] = losses.mean()
            model.train()
            return out
        losses = estimate_loss()
        print(f"step {iter}: train loss {losses['train']:.4f}, val loss {losses['val']:.4f}")

    # sample a batch of data
    xb, yb = get_batch('train')

    # evaluate the loss
    logits, loss = model(xb, yb)
    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

print("--- Training Complete! ---\n")


--- Starting Training ---
step 0: train loss 4.0805, val loss 4.0609
step 500: train loss 0.0755, val loss 5.1587
step 1000: train loss 0.0627, val loss 4.7291
step 1500: train loss 0.0612, val loss 5.0100
step 2000: train loss 0.0591, val loss 5.1008
step 2500: train loss 0.0593, val loss 4.9941
step 3000: train loss 0.0584, val loss 4.9174
step 3500: train loss 0.0588, val loss 5.2184
step 4000: train loss 0.0572, val loss 5.0626
step 4500: train loss 0.0562, val loss 5.0061
--- Training Complete! ---



In [None]:
# =================================================================================
#  ✅ Part 5: Generating Answers with our FAQ Bot!
# =================================================================================
#
#  This is the moment of truth! Let's use our trained model to answer questions.
#  We'll give it a question as a "prompt" and see what it generates.
#
# ---------------------------------------------------------------------------------


In [None]:
# --- Step 5.1: The Generation Function ---
# Let's write a simple function to interact with our bot.

In [32]:
def ask_bot(question):
    """
    Takes a question string and uses the model to generate an answer.
    """
    # Prepare the prompt for the model
    prompt = f"Q: {question}\nA:"
    print(prompt, end='') # Print the prompt without a newline

    # Encode the prompt and create a tensor
    context = torch.tensor(encode(prompt), dtype=torch.long).unsqueeze(0)

    # Generate the answer
    generated_output = model.generate(context, max_new_tokens=50)[0].tolist()

    # Decode and print the result
    answer = decode(generated_output)
    # We only want the generated part, so we find where the answer starts
    answer_part = answer[len(prompt):]
    print(answer_part.split('Q:')[0]) # Stop printing if it starts a new question


In [None]:
# --- Step 5.2: Let's test it! ---

In [34]:
print("--- Ask the FAQ Bot ---")
ask_bot("what are the store hours?") # Using lowercase
ask_bot("do you offer gift wrapping") # Lowercase and no question mark
ask_bot("return policy?") # A more ambiguous, partial question
print("-----------------------")

--- Ask the FAQ Bot ---
Q: what are the store hours?
A: Our store is open from 9 AM to 8 PM, Monday to Sa
Q: do you offer gift wrapping
A: Yes, we offer complimentary gift wrapping for all
Q: parking?
A: Yes, there is a free parking lot available for al
-----------------------


In [None]:
# =================================================================================
#  ✅ Part 6: Generating Answers with our Interactive FAQ Bot!
# =================================================================================
#
#  Now, let's create a simple user interface right here in the notebook
#  so you can interact with your bot.
#
# ---------------------------------------------------------------------------------


In [None]:
# --- Step 6.1: Build the User Interface ---
# We will use the `ipywidgets` library to create a text box and a button.

# Create the widgets

In [37]:
question_input = widgets.Text(
    value='What are the store hours?',
    placeholder='Type your question here',
    description='Question:',
    disabled=False,
    layout=widgets.Layout(width='80%')
)

submit_button = widgets.Button(
    description='Ask Bot',
    button_style='success', # 'success', 'info', 'warning', 'danger' or ''
    tooltip='Click to get an answer',
    icon='question-circle'
)

output_area = widgets.Output()

In [None]:
# --- Step 5.2: Define the Button Click Action ---
# This function will run every time you click the "Ask Bot" button.


In [42]:
def on_button_clicked(b):
    with output_area:
        output_area.clear_output() # Clear previous answers
        question = question_input.value

        # Prepare the prompt for the model
        prompt = f"Q: {question}\nA:"
        print(prompt, end='')

        # Encode the prompt and create a tensor
        known_chars_prompt = ''.join([c for c in prompt if c in stoi])
        context = torch.tensor(encode(known_chars_prompt), dtype=torch.long).unsqueeze(0)

        # Generate the answer
        generated_output = model.generate(context, max_new_tokens=100)[0].tolist()

        # Decode and print the result
        answer = decode(generated_output)
        answer_part = answer[len(known_chars_prompt):]
        # Stop printing if it starts a new question or gets stuck in a loop
        final_answer = answer_part.split('Q:')[0].split('\n\n')[0]
        print(final_answer)

In [38]:
# Link the button to the function


NameError: name 'on_button_clicked' is not defined

In [43]:
submit_button.on_click(on_button_clicked)

In [None]:
# --- Step 5.3: Display the Bot! ---
# Now, we display the UI elements. Type a question and click the button!

In [44]:
print("--- Interactive FAQ Bot ---")
display(question_input, submit_button, output_area)

--- Interactive FAQ Bot ---


Text(value='do you have parking?', description='Question:', layout=Layout(width='80%'), placeholder='Type your…

Button(button_style='success', description='Ask Bot', icon='question-circle', style=ButtonStyle(), tooltip='Cl…

Output(outputs=({'name': 'stdout', 'text': 'Q: do you have parking?\nA: Yes, there is a free parkilable for al…

In [None]:
# --- Final Thoughts & Next Steps ---
#
# Congratulations! You've built a language model from scratch.
#
# This is a "seed" project. It's not perfect, but it demonstrates the core concepts.
# To make it better, you could:
#   1.  Add more data to your faq.txt file.
#   2.  Train for more iterations (increase `max_iters`).
#   3.  Experiment with the hyperparameters (e.g., `n_embd`, `n_head`, `n_layer`).
#   4.  Build a simple user interface for it.
#