### Tokenization

In [None]:
import nltk
# Download the 'punkt_tab' tokenizer resource into the local nltk_data directory.
# Presumably this is the model data sent_tokenize / word_tokenize load at runtime —
# confirm against the installed NLTK version.
nltk.download('punkt_tab')
from nltk.tokenize import sent_tokenize
from nltk.tokenize import word_tokenize

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Small demo corpus (two book blurbs) used by the sentence/word tokenization cells below.
# The triple-quoted literal starts and ends with a newline.
corpus = '''
"Speech and Language Processing" by Dan Jurafsky and James H. Martin: This textbook covers a wide range of topics in natural language processing (NLP) and speech recognition.
"Designing Bots: Creating Conversational Experiences" by Amir Shevat: This book focuses on the design and development of conversational interfaces. It provides practical insights and best practices for creating engaging and effective chatbots.
'''

In [None]:
# Split the corpus into sentences and show them one per line.
documents = sent_tokenize(corpus)
for doc_sentence in documents:
    print(doc_sentence)


"Speech and Language Processing" by Dan Jurafsky and James H. Martin: This textbook covers a wide range of topics in natural language processing (NLP) and speech recognition.
"Designing Bots: Creating Conversational Experiences" by Amir Shevat: This book focuses on the design and development of conversational interfaces.
It provides practical insights and best practices for creating engaging and effective chatbots.


In [None]:
# Break every sentence into word-level tokens and print each token list.
for doc_sentence in documents:
    word_tokens = word_tokenize(doc_sentence)
    print(word_tokens)

["''", 'Speech', 'and', 'Language', 'Processing', "''", 'by', 'Dan', 'Jurafsky', 'and', 'James', 'H.', 'Martin', ':', 'This', 'textbook', 'covers', 'a', 'wide', 'range', 'of', 'topics', 'in', 'natural', 'language', 'processing', '(', 'NLP', ')', 'and', 'speech', 'recognition', '.']
['``', 'Designing', 'Bots', ':', 'Creating', 'Conversational', 'Experiences', "''", 'by', 'Amir', 'Shevat', ':', 'This', 'book', 'focuses', 'on', 'the', 'design', 'and', 'development', 'of', 'conversational', 'interfaces', '.']
['It', 'provides', 'practical', 'insights', 'and', 'best', 'practices', 'for', 'creating', 'engaging', 'and', 'effective', 'chatbots', '.']


### BPE

Interactive tokenizer playground (supports several models, including GPT-4o): https://tiktokenizer.vercel.app/?model=gpt2

In [None]:
%%capture
!pip install tiktoken -q

In [None]:
# Example sentence shared by the tokenizer cells below (tiktoken, the hand-written BPE, BERT).
sentence = "Is the distance between Bengaluru and Delhi more than 2000 kms?"

In [None]:
import tiktoken

# Encode the example sentence with the "gpt2" encoding.
tokenizer = tiktoken.get_encoding("gpt2")
token_ids = tokenizer.encode(sentence)
print("Token IDs:", token_ids)

# Decode the ids one at a time so the text of every individual token is visible.
decoded_tokens = []
for token_id in token_ids:
    decoded_tokens.append(tokenizer.decode([token_id]))
print("Tokens:", decoded_tokens)

Token IDs: [3792, 262, 5253, 1022, 28630, 14717, 290, 12517, 517, 621, 4751, 479, 907, 30]
Tokens: ['Is', ' the', ' distance', ' between', ' Bengal', 'uru', ' and', ' Delhi', ' more', ' than', ' 2000', ' k', 'ms', '?']


In [None]:
import tiktoken

# Same experiment with the "o200k_base" encoding.
tokenizer = tiktoken.get_encoding("o200k_base") # GPT-4o mini
token_ids = tokenizer.encode(sentence)
print("Token IDs:", token_ids)

# Decode the ids one at a time so the text of every individual token is visible.
decoded_tokens = []
for token_id in token_ids:
    decoded_tokens.append(tokenizer.decode([token_id]))
print("Tokens:", decoded_tokens)

Token IDs: [3031, 290, 9324, 2870, 174589, 326, 30076, 945, 1572, 220, 1179, 15, 109434, 30]
Tokens: ['Is', ' the', ' distance', ' between', ' Bengaluru', ' and', ' Delhi', ' more', ' than', ' ', '200', '0', ' kms', '?']


In [None]:
# Look up the text behind a single token id (64) in whichever tokenizer is currently bound to `tokenizer`.
tokenizer.decode([64])

'a'

In [None]:
import unicodedata

def get_stats(ids, counts=None):
    """Count how often each pair of neighbouring ids occurs.

    Args:
        ids: sequence of token ids (must support slicing).
        counts: optional dict to update in place; a new dict is created when omitted.

    Returns:
        The dict mapping ``(left, right)`` -> number of occurrences.
    """
    if counts is None:
        counts = {}
    for left, right in zip(ids, ids[1:]):
        key = (left, right)
        counts[key] = counts.get(key, 0) + 1
    return counts

def merge(ids, pair, idx):
    """Replace every non-overlapping occurrence of ``pair`` in ``ids`` with ``idx``.

    The scan runs left to right, so in ``[a, a, a]`` the pair ``(a, a)`` is merged
    once and the trailing ``a`` is kept.

    Args:
        ids: list of token ids.
        pair: two-element sequence ``(first, second)`` to look for.
        idx: id that replaces each occurrence of the pair.

    Returns:
        A new list; ``ids`` itself is not modified.
    """
    merged = []
    last = len(ids) - 1
    skip_next = False
    for pos, current in enumerate(ids):
        if skip_next:
            # This element was consumed as the second half of a merged pair.
            skip_next = False
            continue
        if pos < last and current == pair[0] and ids[pos + 1] == pair[1]:
            merged.append(idx)
            skip_next = True
        else:
            merged.append(current)
    return merged

def train(text, vocab_size, verbose=False):
    """Learn byte-level BPE merges from ``text``.

    Starts from the 256 single-byte tokens and repeatedly merges the most frequent
    adjacent pair into a new token id (256, 257, ...).

    Args:
        text: training string; it is encoded to UTF-8 bytes.
        vocab_size: target vocabulary size, at least 256.
        verbose: when True, print the pair statistics and the state after every merge.

    Returns:
        ``(merges, vocab)`` where ``merges`` maps ``(id, id)`` -> new id and ``vocab``
        maps id -> bytes. Fewer than ``vocab_size - 256`` merges are produced when the
        text runs out of adjacent pairs (e.g. empty or one-byte text).
    """
    assert vocab_size >= 256
    num_merges = vocab_size - 256

    def show(token_id):
        # Text of a single token id; only used to build the verbose output.
        return decode([int(token_id)], vocab)

    if verbose:
        print(f'Text: {list(text)}')
    ids = list(text.encode("utf-8"))
    if verbose:
        print(f'ids: {ids}')

    merges = {}
    vocab = {idx: bytes([idx]) for idx in range(256)}
    for i in range(num_merges):
        stats = get_stats(ids)
        if not stats:
            # Fewer than two tokens left: nothing to merge, and max() would raise.
            break
        pair = max(stats, key=stats.get)
        idx = 256 + i
        if verbose:
            print(f'\nIteration {str(i+1)}:')
            temp = [{(show(k[0]), show(k[1])): v} for k, v in stats.items()]
            print(f'Frequency of pairs: {temp}')
            print(f'Merging pairs: {(show(pair[0]), show(pair[1]))} ==> {idx}')
        ids = merge(ids, pair, idx)
        merges[pair] = idx
        vocab[idx] = vocab[pair[0]] + vocab[pair[1]]
        if verbose:
            # The decoded views are only built when they are actually printed.
            print(f'Merged ids: {ids}')
            toks = [show(token_id) for token_id in ids]
            print(f'Compressed Text: {toks}')
    return merges, vocab

def encode(text, merges):
    """Tokenize ``text`` with previously learned BPE merges.

    The text is turned into UTF-8 bytes; then the adjacent pair with the lowest
    merge id (i.e. the earliest learned merge) is merged repeatedly until no pair
    present in the sequence appears in ``merges``.

    Args:
        text: string to encode.
        merges: dict mapping ``(id, id)`` -> merged id, as returned by ``train``.

    Returns:
        List of token ids.
    """
    ids = list(text.encode("utf-8"))

    def rank(candidate):
        # Pairs that were never learned sort last.
        return merges.get(candidate, float("inf"))

    while True:
        if len(ids) < 2:
            break
        best = min(get_stats(ids), key=rank)
        if best not in merges:
            break
        ids = merge(ids, best, merges[best])
    return ids

def decode(ids, vocab):
    """Turn token ids back into text.

    Args:
        ids: iterable of token ids.
        vocab: mapping id -> bytes.

    Returns:
        The concatenated bytes decoded as UTF-8; invalid byte sequences become
        the replacement character instead of raising.
    """
    raw = bytearray()
    for token_id in ids:
        raw += vocab[token_id]
    return bytes(raw).decode("utf-8", errors="replace")

In [None]:
# Walk through three BPE merges on a tiny string, printing every intermediate step.
mergesRequired=3
merges, vocab = train('pay papaya', vocab_size = 256+mergesRequired, verbose=True)
# Show only the entries added by training (the last `mergesRequired` items of each dict).
print(f'\nMerged ids to new id: {list(merges.items())[-mergesRequired:]}')
print(f'\nNew vocabulary (id, byte):{list(vocab.items())[-mergesRequired:]}')

Text: ['p', 'a', 'y', ' ', 'p', 'a', 'p', 'a', 'y', 'a']
ids: [112, 97, 121, 32, 112, 97, 112, 97, 121, 97]

Iteration 1:
Frequency of pairs: [{('p', 'a'): 3}, {('a', 'y'): 2}, {('y', ' '): 1}, {(' ', 'p'): 1}, {('a', 'p'): 1}, {('y', 'a'): 1}]
Merging pairs: ('p', 'a') ==> 256
Merged ids: [256, 121, 32, 256, 256, 121, 97]
Compressed Text: ['pa', 'y', ' ', 'pa', 'pa', 'y', 'a']

Iteration 2:
Frequency of pairs: [{('pa', 'y'): 2}, {('y', ' '): 1}, {(' ', 'pa'): 1}, {('pa', 'pa'): 1}, {('y', 'a'): 1}]
Merging pairs: ('pa', 'y') ==> 257
Merged ids: [257, 32, 256, 257, 97]
Compressed Text: ['pay', ' ', 'pa', 'pay', 'a']

Iteration 3:
Frequency of pairs: [{('pay', ' '): 1}, {(' ', 'pa'): 1}, {('pa', 'pay'): 1}, {('pay', 'a'): 1}]
Merging pairs: ('pay', ' ') ==> 258
Merged ids: [258, 256, 257, 97]
Compressed Text: ['pay ', 'pa', 'pay', 'a']

Merged ids to new id: [((112, 97), 256), ((256, 121), 257), ((257, 32), 258)]

New vocabulary (id, byte):[(256, b'pa'), (257, b'pay'), (258, b'pay ')]


In [None]:
# Same walk-through on a string that also contains digits and extra spaces.
mergesRequired=3
merges, vocab = train('25 pay 5 papaya', vocab_size = 256+mergesRequired, verbose=True)
# Show only the entries added by training (the last `mergesRequired` items of each dict).
print(f'\nMerged ids to new id: {list(merges.items())[-mergesRequired:]}')
print(f'\nNew vocabulary (id, byte):{list(vocab.items())[-mergesRequired:]}')

Text: ['2', '5', ' ', 'p', 'a', 'y', ' ', '5', ' ', 'p', 'a', 'p', 'a', 'y', 'a']
ids: [50, 53, 32, 112, 97, 121, 32, 53, 32, 112, 97, 112, 97, 121, 97]

Iteration 1:
Frequency of pairs: [{('2', '5'): 1}, {('5', ' '): 2}, {(' ', 'p'): 2}, {('p', 'a'): 3}, {('a', 'y'): 2}, {('y', ' '): 1}, {(' ', '5'): 1}, {('a', 'p'): 1}, {('y', 'a'): 1}]
Merging pairs: ('p', 'a') ==> 256
Merged ids: [50, 53, 32, 256, 121, 32, 53, 32, 256, 256, 121, 97]
Compressed Text: ['2', '5', ' ', 'pa', 'y', ' ', '5', ' ', 'pa', 'pa', 'y', 'a']

Iteration 2:
Frequency of pairs: [{('2', '5'): 1}, {('5', ' '): 2}, {(' ', 'pa'): 2}, {('pa', 'y'): 2}, {('y', ' '): 1}, {(' ', '5'): 1}, {('pa', 'pa'): 1}, {('y', 'a'): 1}]
Merging pairs: ('5', ' ') ==> 257
Merged ids: [50, 257, 256, 121, 32, 257, 256, 256, 121, 97]
Compressed Text: ['2', '5 ', 'pa', 'y', ' ', '5 ', 'pa', 'pa', 'y', 'a']

Iteration 3:
Frequency of pairs: [{('2', '5 '): 1}, {('5 ', 'pa'): 2}, {('pa', 'y'): 2}, {('y', ' '): 1}, {(' ', '5 '): 1}, {('pa', 'pa

In [None]:
# Trained for Vocab of 512
# Read the training corpus with a context manager so the file handle is closed.
with open("india.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
merges, vocab = train(text, vocab_size=512, verbose=False)
token_ids = encode(sentence, merges)
# Decode each id separately to show where the learned token boundaries fall.
tokens = [decode([tokenID], vocab) for tokenID in token_ids]
print('Tokens generated:', tokens)
print('Decoded sentence:', decode(token_ids, vocab))

Tokens generated: ['I', 's ', 'the ', 'di', 'st', 'anc', 'e ', 'b', 'et', 'w', 'e', 'en ', 'B', 'en', 'g', 'al', 'ur', 'u', ' and ', 'D', 'el', 'hi', ' m', 'or', 'e ', 'th', 'an ', '20', '00 ', 'k', 'm', 's', '?']
Decoded sentence: Is the distance between Bengaluru and Delhi more than 2000 kms?


In [None]:
# Trained for Vocab of 1024
# Read the training corpus with a context manager so the file handle is closed.
with open("india.txt", "r", encoding="utf-8") as corpus_file:
    text = corpus_file.read()
merges, vocab = train(text, vocab_size=1024, verbose=False)
token_ids = encode(sentence, merges)
# Decode each id separately to show where the learned token boundaries fall.
tokens = [decode([tokenID], vocab) for tokenID in token_ids]
print('Tokens generated:', tokens)
print('Decoded sentence:', decode(token_ids, vocab))

Tokens generated: ['I', 's ', 'the ', 'di', 'st', 'ance ', 'between ', 'Beng', 'al', 'ur', 'u', ' and ', 'Delhi', ' m', 'or', 'e ', 'than ', '20', '00 ', 'k', 'm', 's', '?']
Decoded sentence: Is the distance between Bengaluru and Delhi more than 2000 kms?


In [None]:
# Difference between RegexTokeniser - GPT-3

### WordPiece

In [None]:
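# WordPiece ranks candidate merges by a likelihood score, freq(pair) / (freq(first) * freq(second)), instead of raw pair frequency as in BPE
# The vocabulary also covers standard special characters such as ! ? @ ~
# Special tokens used for BERT, e.g. [SEP], [CLS], [MASK], [UNK], [PAD], [EOS]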

In [None]:
from transformers import BertTokenizer
# NOTE: rebinds the notebook-wide name `tokenizer` (a tiktoken encoding in the earlier cells).
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# https://www.tensorflow.org/text/api_docs/python/text/BertTokenizer
# Encode the shared example sentence; the output below shows [CLS]/[SEP] wrapped around it.
token_ids = tokenizer.encode(sentence)
print("Token IDs:", token_ids)
# Map the ids back to their vocabulary strings ('##' prefixes mark word continuations in the output).
decoded_tokens = tokenizer.convert_ids_to_tokens(token_ids)
print("Tokens:", decoded_tokens)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Token IDs: [101, 2003, 1996, 3292, 2090, 8191, 14129, 1998, 6768, 2062, 2084, 2456, 2463, 2015, 1029, 102]
Tokens: ['[CLS]', 'is', 'the', 'distance', 'between', 'bengal', '##uru', 'and', 'delhi', 'more', 'than', '2000', 'km', '##s', '?', '[SEP]']


In [None]:
import math
from collections import defaultdict

def get_token_probabilities(vocab):
    """Convert token counts into relative frequencies.

    Args:
        vocab: mapping token -> count.

    Returns:
        Dict mapping token -> count / total. When the total count is not positive,
        every token maps to 0; an empty vocabulary yields an empty dict.
    """
    total_count = sum(vocab.values())
    probabilities = {}
    for token, count in vocab.items():
        probabilities[token] = count / total_count if total_count > 0 else 0
    return probabilities

def get_stats_wordpiece(ids, vocab, vocab_id_reverse):
    """Score every adjacent pair in ``ids`` with a WordPiece-style likelihood ratio.

    score(pair) = P(pair) / (P(first) * P(second)), where P(pair) is the observed
    count of the adjacent pair divided by the total token count of ``vocab`` and
    P(token) is the token's count in ``vocab`` divided by the same total. The
    ranking is therefore the same as freq(pair) / (freq(first) * freq(second)).

    The numerator uses the pair count observed in ``ids``. Looking the merged string
    up in ``vocab`` instead gives the same constant for every pair that has not been
    merged yet, so frequent pairs could never be preferred.

    Args:
        ids: sequence of tokens, as integer ids or as token strings.
        vocab: mapping token string -> count.
        vocab_id_reverse: mapping integer id -> token string.

    Returns:
        Dict mapping ``(left, right)`` -> score. Tokens missing from ``vocab`` get
        probability 1e-7; a pair whose token has probability 0 scores 0.
    """
    # Hoisted: the total is the same for every pair.
    total_count = sum(vocab.values())

    def token_text(token):
        return vocab_id_reverse[token] if isinstance(token, int) else token

    def token_probability(token_str):
        # Membership test + indexing so a defaultdict vocab is not grown by lookups.
        if token_str not in vocab:
            return 1e-7
        return vocab[token_str] / total_count if total_count > 0 else 0

    pair_counts = defaultdict(int)
    for pair in zip(ids, ids[1:]):
        pair_counts[pair] += 1

    pair_scores = {}
    for pair, count in pair_counts.items():
        prob_pair = count / total_count if total_count > 0 else 1e-7
        prob1 = token_probability(token_text(pair[0]))
        prob2 = token_probability(token_text(pair[1]))
        pair_scores[pair] = prob_pair / (prob1 * prob2) if prob1 != 0 and prob2 != 0 else 0

    return pair_scores

def merge_wordpiece(ids, pair, new_token, vocab, vocab_id_reverse):
    """Merge every occurrence of ``pair`` in ``ids`` and register the merged token.

    Args:
        ids: list of tokens (integer ids, or token strings).
        pair: ``(first, second)`` to replace, compared against the elements of ``ids``.
        new_token: string of the merged token.
        vocab: mapping token string -> count; updated in place.
        vocab_id_reverse: mapping integer id -> token string; updated in place.

    Returns:
        ``(new_ids, vocab, vocab_id_reverse)`` where ``new_ids`` is the merged sequence
        expressed as integer ids.

    NOTE(review): the counts of the two merged parts in ``vocab`` are left unchanged;
    confirm whether they should be reduced by the number of merges.
    """
    # Replace non-overlapping occurrences left to right, marking them with the token string.
    new_ids = []
    i = 0
    while i < len(ids):
        if i < len(ids) - 1 and ids[i] == pair[0] and ids[i + 1] == pair[1]:
            new_ids.append(new_token)
            i += 2
        else:
            new_ids.append(ids[i])
            i += 1

    # Register the merged token and add the number of occurrences just produced.
    if new_token not in vocab:
        vocab[new_token] = 0
    vocab[new_token] += sum(1 for token in new_ids if token == new_token)

    if new_token not in vocab_id_reverse.values():
        new_id = max(vocab_id_reverse.keys()) + 1 if vocab_id_reverse else 0
        vocab_id_reverse[new_id] = new_token

    # Build the string -> id index once instead of scanning the map for every element;
    # setdefault keeps the first id when a string appears more than once.
    token_to_id = {}
    for token_id, token in vocab_id_reverse.items():
        token_to_id.setdefault(token, token_id)

    new_ids_int = []
    for token in new_ids:
        if isinstance(token, int):
            new_ids_int.append(token)
        elif token in token_to_id:
            new_ids_int.append(token_to_id[token])
        else:
            # Unknown token string: reported and dropped, as before.
            print("Error")

    return new_ids_int, vocab, vocab_id_reverse

def train_wordpiece(text, vocab_size, verbose=False):
    """Train a character-level WordPiece-style vocabulary on ``text``.

    Args:
        text: training string; every distinct character becomes an initial token.
        vocab_size: target number of tokens; ``vocab_size - len(initial vocab)`` merges
            are attempted (none when that number is not positive).
        verbose: when True, print each merge with its score.

    Returns:
        ``(merges, vocab, vocab_id_reverse)``: ``merges`` maps ``(id, id)`` -> merged
        token string, ``vocab`` maps token string -> count, and ``vocab_id_reverse``
        maps id -> token string.
    """
    # 1. Initial vocabulary: one entry per character, with its count.
    vocab = defaultdict(int)
    for char in text:
        vocab[char] += 1

    vocab_id_reverse = {idx: char for idx, char in enumerate(vocab.keys())}

    # 2. Initial ids, via a direct char -> id index (no scan of the map per character).
    char_to_id = {char: idx for idx, char in vocab_id_reverse.items()}
    ids = [char_to_id[char] for char in text]

    def token_text(token):
        # Resolve an integer id to its string; strings pass through unchanged.
        return vocab_id_reverse[token] if isinstance(token, int) else token

    merges = {}
    num_merges = vocab_size - len(vocab)

    for i in range(num_merges):
        # 3. Score the adjacent pairs.
        pair_scores = get_stats_wordpiece(ids, vocab, vocab_id_reverse)
        if not pair_scores:
            break  # Stop if no more pairs can be merged

        # 4. Best pair and the strings of its two parts.
        pair = max(pair_scores, key=pair_scores.get)
        first_text = token_text(pair[0])
        second_text = token_text(pair[1])

        # 5. New token is the concatenation of both parts.
        new_token = first_text + second_text

        # 6. Merge in the id sequence and update both vocab maps.
        ids, vocab, vocab_id_reverse = merge_wordpiece(ids, pair, new_token, vocab, vocab_id_reverse)

        # 7. Record the merge.
        merges[pair] = new_token

        if verbose:
            print(
                f"Merge {i + 1}/{num_merges}: ({first_text}, {second_text}) -> {new_token} (likelihood: {pair_scores[pair]:.4f})"
            )

    return merges, vocab, vocab_id_reverse

In [None]:
# Train the WordPiece demo on a short repeated phrase; the merges are printed as they happen.
# NOTE: rebinds the notebook-wide names `text`, `merges` and `vocab` used by the BPE cells.
text = "pay papaya pay papaya pay papaya pay papaya"
vocab_size = 10
merges, vocab, vocab_id_reverse = train_wordpiece(text, vocab_size, verbose=True)
print("Final Merges:", merges)
print("Final Vocab:", vocab)
print("Final Vocab reverse:", vocab_id_reverse)

Merge 1/6: (y,  ) -> y  (likelihood: 0.0000)
Merge 2/6: (y , p) -> y p (likelihood: 0.0000)
Merge 3/6: (a, y p) -> ay p (likelihood: 0.0000)
Merge 4/6: (p, ay p) -> pay p (likelihood: 0.0000)
Merge 5/6: ( , pay p) ->  pay p (likelihood: 0.0000)
Merge 6/6: (a,  pay p) -> a pay p (likelihood: 0.0000)
Final Merges: {(2, 3): 'y ', (4, 0): 'y p', (1, 5): 'ay p', (0, 6): 'pay p', (3, 7): ' pay p', (1, 8): 'a pay p'}
Final Vocab: defaultdict(<class 'int'>, {'p': 12, 'a': 16, 'y': 8, ' ': 7, 'y ': 4, 'y p': 4, 'ay p': 4, 'pay p': 4, ' pay p': 3, 'a pay p': 3})
Final Vocab reverse: {0: 'p', 1: 'a', 2: 'y', 3: ' ', 4: 'y ', 5: 'y p', 6: 'ay p', 7: 'pay p', 8: ' pay p', 9: 'a pay p'}


### Text Generation using GPT 2

In [None]:
%%capture
!pip install transformers
!pip install --upgrade numpy

In [None]:
## Import necessary libraries
from transformers import GPT2LMHeadModel, GPT2Tokenizer

# Load the pre-trained GPT-2 model and tokenizer
# NOTE: rebinds the notebook-wide name `tokenizer` (the BERT tokenizer in an earlier cell);
# generate_text below reads `tokenizer` and `model` as globals.
model_name = "gpt2"  # You can use "gpt2-medium", "gpt2-large", "gpt2-xl" for larger models
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# Function to generate text
def generate_text(prompt, max_length=100):
    """Continue ``prompt`` with the globally loaded GPT-2 ``model`` / ``tokenizer``.

    Args:
        prompt: text to continue.
        max_length: maximum total length in tokens (prompt + continuation).

    Returns:
        The decoded text (prompt followed by the continuation), without special tokens.
    """
    # Calling the tokenizer (instead of .encode) also returns the attention mask.
    encoded = tokenizer(prompt, return_tensors="pt")
    outputs = model.generate(
        encoded["input_ids"],
        # Passing mask and pad id explicitly avoids the "attention mask and the pad
        # token id were not set" warning; GPT-2 has no pad token, so EOS is reused.
        attention_mask=encoded["attention_mask"],
        pad_token_id=tokenizer.eos_token_id,
        max_length=max_length,
        num_return_sequences=1,
        no_repeat_ngram_size=2,
        # early_stopping is dropped: it only affects beam search, which is not used here.
    )
    return tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# Input prompt
# NOTE: `prompt` and `generated_text` become notebook-wide names.
prompt = "Once upon a time"

# Generate and display the text
# Uses generate_text's default max_length of 100 tokens.
generated_text = generate_text(prompt)
print(generated_text)

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Once upon a time, the world was a place of great beauty and great danger. The world of the gods was the place where the great gods were born, and where they were to live.

The world that was created was not the same as the one that is now. It was an endless, endless world. And the Gods were not born of nothing. They were created of a single, single thing. That was why the universe was so beautiful. Because the cosmos was made of two


### Simple Weather Conversational Agent (GPT-2)

In [None]:
# This script demonstrates:
# 1. Baseline: GPT-2 without tools (shows its limitations)
# 2. Tool Definition: define the weather function schema with Pydantic
# 3. Tool Selection: GPT-2 is prompted for structured output and decides when to use the tool
# 4. Execution: fetch live weather information through the Tavily web-search API
# 5. Response Generation: GPT-2 turns the retrieved data into a natural-language answer

In [None]:
%%capture
!pip install transformers torch pydantic tavily-python requests

In [None]:
import os
import json
import requests
from typing import Optional, Literal
from pydantic import BaseModel, Field
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import torch

In [None]:
# Read the Tavily API key from the environment. The second argument is only a placeholder
# used when the variable is unset — do not paste a real key into the notebook.
TAVILY_API_KEY = os.getenv("TAVILY_API_KEY", "Your_TAVILY_KEY")

In [None]:
# Load GPT-2 model
# NOTE: rebinds the notebook-wide `tokenizer` and `model`; the agent functions below read them as globals.
print("Loading GPT-2 model...")
tokenizer = GPT2Tokenizer.from_pretrained("gpt2-medium")  # Using medium for better quality
model = GPT2LMHeadModel.from_pretrained("gpt2-medium")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token
# Put the model into evaluation mode (inference only).
model.eval()
print("✓ GPT-2 model loaded successfully\n")

Loading GPT-2 model...


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/718 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.52G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

✓ GPT-2 model loaded successfully



In [None]:
class FunctionCall(BaseModel):
    """Structured output for function calls.

    Pydantic model for the decision made in the tool-selection stage: either call
    a function (with its name and a location) or answer directly.
    """
    # Required; restricted by Literal to exactly these two values.
    action: Literal["call_function", "direct_response"] = Field(
        description="Whether to call a function or respond directly"
    )
    # Optional; defaults to None.
    function_name: Optional[str] = Field(
        default=None,
        description="Name of the function to call (get_current_weather)"
    )
    # Optional; defaults to None.
    location: Optional[str] = Field(
        default=None,
        description="Location extracted from user query"
    )
    # Required free-text justification (no default).
    reasoning: str = Field(
        description="Why this action was chosen"
    )

In [None]:
class WeatherToolDefinition(BaseModel):
    """Tool schema definition.

    Every field has a default, so ``WeatherToolDefinition()`` yields the complete
    description of the single weather tool.
    """
    # Name of the tool.
    name: str = "get_current_weather"
    # Human-readable summary of what the tool does.
    description: str = "Get current weather information for a specific location"
    # Parameter specification: one entry per parameter name.
    parameters: dict = {
        "location": {
            "type": "string",
            "description": "City name, e.g., Mumbai, London, New York",
            "required": True
        }
    }

In [None]:
def generate_with_gpt2(prompt: str, max_length: int = 150, temperature: float = 0.7) -> str:
    """
    Generate a sampled continuation of ``prompt`` using the global GPT-2 model.

    Args:
        prompt: text to continue; truncated to 1024 tokens.
        max_length: maximum number of tokens to generate after the prompt.
        temperature: sampling temperature.

    Returns:
        Only the newly generated text (prompt excluded), stripped of surrounding whitespace.
    """
    # Calling the tokenizer (instead of .encode) also returns the attention mask.
    encoded = tokenizer(prompt, return_tensors="pt", max_length=1024, truncation=True)
    input_ids = encoded["input_ids"]
    prompt_len = input_ids.shape[1]

    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            attention_mask=encoded["attention_mask"],
            max_length=prompt_len + max_length,
            temperature=temperature,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
            no_repeat_ngram_size=3
        )

    # Drop the prompt by token position. Slicing the decoded string by len(prompt)
    # cuts in the wrong place whenever the decoded prompt differs from the input
    # text (truncated prompt, tokenization clean-up).
    new_tokens = outputs[0][prompt_len:]
    response = tokenizer.decode(new_tokens, skip_special_tokens=True).strip()

    return response

In [None]:
def extract_json_from_text(text: str) -> Optional[dict]:
    """
    Extract a JSON object from GPT-2 output (handles imperfect formatting).

    Strategy, in order:
      1. Parse the span from the first '{' to the last '}'.
      2. Parse the same span with newlines and double spaces collapsed.
      3. Scan for the first '{' from which a complete JSON object can be decoded,
         ignoring whatever the model wrote after it.

    Args:
        text: raw model output.

    Returns:
        The parsed object as a dict, or None when no JSON object can be recovered.
    """
    start_idx = text.find('{')
    end_idx = text.rfind('}')
    if start_idx == -1 or end_idx == -1:
        return None

    json_str = text[start_idx:end_idx + 1]
    cleaned_str = json_str.replace('\n', ' ').replace('  ', ' ')
    for candidate in (json_str, cleaned_str):
        try:
            return json.loads(candidate)
        except (ValueError, RecursionError):
            continue

    # The model often keeps writing after the object, so the greedy span above is
    # not valid JSON. raw_decode stops at the end of the first complete value.
    decoder = json.JSONDecoder()
    for source in (text, text.replace('\n', ' ')):
        pos = source.find('{')
        while pos != -1:
            try:
                parsed, _ = decoder.raw_decode(source, pos)
                return parsed
            except (ValueError, RecursionError):
                pos = source.find('{', pos + 1)

    return None

In [None]:
def stage_1_baseline(query: str) -> str:
    """
    Stage 1: Baseline - ask GPT-2 directly, with no tools available.

    Prints the stage banner, the query and the model's answer, then a short note
    on why the answer cannot contain real-time data.

    Args:
        query: the user's question.

    Returns:
        GPT-2's raw answer text.
    """
    rule = "=" * 80
    print(
        "\n" + rule,
        "STAGE 1: BASELINE - GPT-2 WITHOUT TOOLS",
        rule,
        "Limitation: GPT-2 cannot access real-time weather data\n",
        sep="\n",
    )

    prompt = f"You are a weather assistant.\n\nUser: {query}\nAssistant:"
    answer = generate_with_gpt2(prompt, max_length=100)

    print(
        f"Query: {query}",
        f"GPT-2 Response: {answer}\n",
        "❌ Problem: GPT-2 has no way to access current weather data",
        "   It will either hallucinate or admit it doesn't know\n",
        sep="\n",
    )

    return answer

In [None]:
def stage_2_tool_definition() -> WeatherToolDefinition:
    """
    Stage 2: Tool Definition - build the weather tool schema and print it.

    Returns:
        A WeatherToolDefinition created with its default field values.
    """
    rule = "=" * 80
    print(
        "\n" + rule,
        "STAGE 2: TOOL DEFINITION - Define Weather Function Schema",
        rule,
        sep="\n",
    )

    tool = WeatherToolDefinition()
    schema_json = json.dumps(tool.model_dump(), indent=2)

    print(
        "Pydantic Tool Schema:",
        schema_json,
        "\n✓ Schema defined using Pydantic for validation",
        "✓ Will be used to prompt GPT-2 for structured output\n",
        sep="\n",
    )

    return tool

In [None]:
def stage_3_tool_selection(query: str, tool: WeatherToolDefinition) -> FunctionCall:
    """
    Stage 3: Tool Selection using GPT-2 with structured output.

    Prompts GPT-2 for a JSON decision, parses and validates it as a FunctionCall,
    and falls back to keyword matching (create_fallback_decision) when the output
    cannot be parsed, fails validation, or asks for a function call without
    giving a location.

    Args:
        query: the user's question.
        tool: tool schema whose name and description are inserted into the prompt.

    Returns:
        The validated decision, or the keyword-based fallback decision.
    """
    print("\n" + "="*80)
    print("STAGE 3: TOOL SELECTION - GPT-2 Decides with Structured Output")
    print("="*80)

    # The prompt ends with an opening brace so the model continues the JSON object.
    prompt = f"""You are a decision-making assistant. Analyze the user's query and decide if you need to call a weather function.

Available Tool:
- Function: {tool.name}
- Description: {tool.description}
- Parameters: location (string, required)

User Query: "{query}"

Task: Output ONLY a JSON object with this structure:
{{
  "action": "call_function" or "direct_response",
  "function_name": "get_current_weather" (if action is call_function),
  "location": "extracted city name" (if action is call_function),
  "reasoning": "why you chose this action"
}}

Decision JSON:
{{"""

    print(f"Query: {query}")
    print("\n🤖 GPT-2 is analyzing the query and generating structured output...\n")

    response = generate_with_gpt2(prompt, max_length=150, temperature=0.3)

    # Add back the opening brace that was part of the prompt.
    json_response = "{" + response

    print(f"Raw GPT-2 Output:\n{json_response}\n")

    parsed_json = extract_json_from_text(json_response)
    if not parsed_json:
        print("⚠️  Could not parse JSON from GPT-2 output")
        return create_fallback_decision(query)

    try:
        function_call = FunctionCall(**parsed_json)
    except Exception as e:
        print(f"⚠️  Pydantic validation failed: {e}")
        return create_fallback_decision(query)

    # The schema allows a missing location, but a function call without one would
    # trigger a weather lookup for "None"; treat it like an unusable answer.
    if function_call.action == "call_function" and not (function_call.location or "").strip():
        print("⚠️  GPT-2 chose call_function without a location")
        return create_fallback_decision(query)

    print("✓ Successfully parsed structured output:")
    print(f"  - Action: {function_call.action}")
    if function_call.function_name:
        print(f"  - Function: {function_call.function_name}")
        print(f"  - Location: {function_call.location}")
    print(f"  - Reasoning: {function_call.reasoning}\n")

    return function_call

In [None]:
def create_fallback_decision(query: str) -> FunctionCall:
    """
    Fallback: use simple keyword matching if GPT-2 output parsing fails.

    A query counts as a weather query when it contains one of a few keywords
    (substring match). The location is taken from the words following the first
    'in' / 'at' / 'for': consecutive capitalized words are joined so multi-word
    names such as "New York" survive and existing casing ("NYC") is kept; if the
    next word is not capitalized, that single word is used, capitalized.

    Args:
        query: the user's question.

    Returns:
        A call_function decision when both a weather keyword and a location are
        found, otherwise a direct_response decision.
    """
    query_lower = query.lower()
    weather_keywords = ['weather', 'temperature', 'rain', 'sunny', 'climate', 'forecast']
    is_weather_query = any(keyword in query_lower for keyword in weather_keywords)

    location = None
    if is_weather_query:
        words = query.split()
        # Only prepositions that have a following word can introduce a location.
        for i, word in enumerate(words[:-1]):
            if word.lower() not in ('in', 'at', 'for'):
                continue

            name_parts = []
            for candidate in words[i + 1:]:
                cleaned = candidate.strip('?.,!')
                if not cleaned or not cleaned[0].isupper():
                    break
                name_parts.append(cleaned)
                if cleaned != candidate:
                    # Punctuation attached to the word ends the place name.
                    break

            if name_parts:
                location = " ".join(name_parts)
            else:
                # Lower-case input such as "in mumbai": single-word heuristic.
                location = words[i + 1].strip('?.,!').capitalize()
            break

    if is_weather_query and location:
        print(f"📋 Using fallback: Detected weather query for {location}")
        return FunctionCall(
            action="call_function",
            function_name="get_current_weather",
            location=location,
            reasoning="Detected weather query using keyword matching"
        )

    return FunctionCall(
        action="direct_response",
        reasoning="Not a weather query or could not extract location"
    )

In [None]:
def get_weather_tavily(location: str) -> dict:
    """
    Look up current weather for ``location`` through a Tavily web search.

    Args:
        location: place name inserted into the search query.

    Returns:
        On success: a dict with "location", "source", "success" (True) and "data",
        a list of up to two {"title", "content"} entries with the content cut to
        300 characters. On any exception (including a missing tavily package):
        a dict with "location", "success" (False) and "error".
    """
    try:
        from tavily import TavilyClient

        client = TavilyClient(api_key=TAVILY_API_KEY)
        search_query = f"current weather in {location} temperature humidity conditions today"
        response = client.search(search_query, max_results=3)

        # Keep only the first two hits, with trimmed content.
        snippets = [
            {
                "title": hit.get("title", ""),
                "content": hit.get("content", "")[:300],
            }
            for hit in response.get('results', [])[:2]
        ]

        return {
            "location": location,
            "source": "tavily_web_search",
            "success": True,
            "data": snippets,
        }

    except Exception as exc:
        return {
            "location": location,
            "success": False,
            "error": f"Tavily search failed: {str(exc)}",
        }

In [None]:
def stage_4_execution(function_call: FunctionCall) -> Optional[str]:
    """
    Stage 4: Execution - run the weather lookup chosen in stage 3.

    Args:
        function_call: the decision; only ``action`` and ``location`` are read.

    Returns:
        None when the action is not "call_function"; otherwise the lookup result
        (success or error) serialized as indented JSON.
    """
    if function_call.action != "call_function":
        return None

    rule = "=" * 80
    print("\n" + rule, "STAGE 4: EXECUTION - Call Tavily Weather API", rule, sep="\n")
    print(f"\n🔧 Executing weather lookup for: {function_call.location}")

    weather_data = get_weather_tavily(function_call.location)

    if not weather_data.get("success"):
        print(f"❌ Error: {weather_data.get('error')}\n")
    else:
        print("✓ Weather data retrieved successfully\n")
        print("Sample data:")
        # Preview at most one result, cut to 150 characters.
        preview = weather_data.get("data", [])[:1]
        for item in preview:
            print(f"  {item['content'][:150]}...\n")

    return json.dumps(weather_data, indent=2)

In [None]:
def stage_5_response_generation(query: str, weather_data: str, location: str) -> str:
    """
    Stage 5: Response Generation - turn the retrieved weather data into an answer.

    Args:
        query: the user's question.
        weather_data: JSON string produced by stage 4.
        location: place name shown in the prompt.

    Returns:
        GPT-2's answer based on the retrieved snippets. When the lookup failed or
        returned no text, a fixed apology is returned instead: prompting the model
        with an empty weather section would only produce an invented forecast.
    """
    print("\n" + "="*80)
    print("STAGE 5: RESPONSE GENERATION - GPT-2 Creates Natural Answer")
    print("="*80)

    # Collect up to two snippets, 200 characters each.
    data = json.loads(weather_data)
    snippets = []
    if data.get("success") and data.get("data"):
        snippets = data["data"][:2]
    weather_summary = "".join(item.get("content", "")[:200] + " " for item in snippets)

    if not weather_summary.strip():
        final_response = f"Sorry, I couldn't retrieve current weather information for {location}."
        print("\n⚠️  No weather data available; skipping GPT-2 generation\n")
        print(f"{final_response}\n")
        return final_response

    prompt = f"""You are a friendly weather assistant.

User asked: "{query}"

Weather Information for {location}:
{weather_summary[:400]}

Provide a natural, conversational response about the weather. Be concise and friendly.

Response:"""

    print("\n🤖 GPT-2 generating natural response...\n")

    final_response = generate_with_gpt2(prompt, max_length=100, temperature=0.7)

    print("✓ Final Natural Response:")
    print(f"{final_response}\n")

    return final_response

In [None]:
def run_complete_demo(query: str = "What's the weather like in Mumbai?"):
    """
    Run the complete 5-stage demonstration with GPT-2.

    Args:
        query: the user question to run through all stages; defaults to the
            original example so ``run_complete_demo()`` behaves as before.
    """
    print("\n" + "="*80)
    print("WEATHER CONVERSATIONAL AGENT - GPT-2 + PYDANTIC")
    print("5 STAGES DEMONSTRATION")
    print("="*80)

    # Stage 1: Baseline (GPT-2 without tools)
    stage_1_baseline(query)

    # Stage 2: Tool Definition
    tool = stage_2_tool_definition()

    # Stage 3: Tool Selection (GPT-2 with structured output)
    function_call = stage_3_tool_selection(query, tool)

    # Stage 4: Execution (only when a function call was chosen)
    weather_data = None
    if function_call.action == "call_function":
        weather_data = stage_4_execution(function_call)

    # Stage 5: Response Generation
    if weather_data:
        stage_5_response_generation(query, weather_data, function_call.location)
    else:
        print("\n✓ Query handled without function call\n")

    print("\n" + "="*80)
    print("DEMONSTRATION COMPLETE")
    print("="*80)
    print("\nKey Takeaways:")
    print("1. ❌ Baseline GPT-2 cannot access real-time data")
    print("2. ✓ Pydantic models define structured schemas")
    print("3. ✓ Careful prompting makes GPT-2 output structured JSON")
    print("4. ✓ Tavily API provides real-time weather via web search")
    print("5. ✓ GPT-2 transforms technical data into natural responses")
    print("\nNote: GPT-2 requires more careful prompting than modern LLMs")
    print("      with native function calling (like GPT-4)")
    print("="*80 + "\n")

In [None]:
# Run all five stages end to end; requires the GPT-2 model cell and the function cells above to have been run.
run_complete_demo()


WEATHER CONVERSATIONAL AGENT - GPT-2 + PYDANTIC
5 STAGES DEMONSTRATION

STAGE 1: BASELINE - GPT-2 WITHOUT TOOLS
Limitation: GPT-2 cannot access real-time weather data

Query: What's the weather like in Mumbai?
GPT-2 Response: I'm here to help you.
, (not really)

,

;

: A weather assistant in Mumbai, how come you have to come up with a new name?

A weather assistant, a weather helper, a meteorologist, a doctor, a scientist, a lawyer, a social worker, a housekeeper, a maid, a cook, a baker, a gardener, a chauffeur, a hotelier, a taxi driver, a security guard

❌ Problem: GPT-2 has no way to access current weather data
   It will either hallucinate or admit it doesn't know


STAGE 2: TOOL DEFINITION - Define Weather Function Schema
Pydantic Tool Schema:
{
  "name": "get_current_weather",
  "description": "Get current weather information for a specific location",
  "parameters": {
    "location": {
      "type": "string",
      "description": "City name, e.g., Mumbai, London, New York",
