In [2]:
import tensorflow as tf
import numpy as np
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer
import json
import os

In [3]:
# Load the GPT-2 tokenizer and TensorFlow model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = TFGPT2LMHeadModel.from_pretrained('gpt2')

# Define the allowed letters and get their corresponding token IDs.
allowed_letters = ['A', 'B', 'C', 'D']
allowed_token_ids = [tokenizer.encode(letter, add_prefix_space=False)[0] for letter in allowed_letters]

def create_prompt_with_content(question_obj):
    """
    Create a prompt that uses the provided context, question, and options.
    Instructs the model to output only one letter (A, B, C, or D) with no extra text.
    """
    content = question_obj.get("content", "")
    options = question_obj['options']
    
    # Map options to letters (A, B, C, D, etc.)
    option_map = {i: chr(65 + i) for i in range(len(options))}
    options_text = "\n".join([f"{option_map[i]}. {option}" for i, option in enumerate(options)])
    
    prompt = (
        
        f"Content: {content}\n\n"
        f"Question: {question_obj['question']}\n\n"
        f"Options:\n{options_text}\n\n"
        "Please choose the correct option by outputting only one letter (A, B, C, or D) with no extra text.\n"
        "Your Answer: "
    )
    return prompt

def generate_one_allowed_token(prompt, allowed_token_ids, temperature=1, do_sample=True):
    """
    Generate one token after the prompt, restricting selection to allowed_token_ids.
    If do_sample=True, sample from the distribution using the provided temperature.
    """
    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='tf')
    
    # Run the model to get logits with return_dict=True
    outputs = model(input_ids, return_dict=True)
    logits = outputs.logits  # shape: (batch_size, seq_length, vocab_size)
    
    # Get logits for the last token (the next-token logits)
    last_token_logits = logits[:, -1, :]  # shape: (1, vocab_size)
    last_token_logits_np = last_token_logits.numpy()  # convert to numpy array
    
    # Create a masked logits vector: set non-allowed tokens to a very low score.
    masked_logits = np.full(last_token_logits_np.shape, -1e9)
    for token_id in allowed_token_ids:
        masked_logits[0, token_id] = last_token_logits_np[0, token_id]
    
    if do_sample:
        # Apply temperature scaling
        scaled_logits = masked_logits / temperature
        
        # Compute probabilities using softmax
        exp_logits = np.exp(scaled_logits)
        probs = exp_logits / np.sum(exp_logits)
        
        # Sample one token from allowed tokens
        next_token_id = int(np.random.choice(len(probs[0]), p=probs[0]))
    else:
        # Deterministic: take the highest probability token
        next_token_id = int(np.argmax(masked_logits))
        
    return next_token_id

def evaluate_answer(question_obj, generated_letter):
    """
    Evaluate whether the generated letter corresponds to the correct answer.
    Maps the correct answer text to its corresponding letter and compares.
    """
    correct_answer_text = question_obj["correct_answer"]
    options = question_obj["options"]
    try:
        correct_index = options.index(correct_answer_text)
        correct_letter = chr(65 + correct_index)
    except ValueError:
        return False, None
    
    is_correct = (generated_letter == correct_letter)
    return is_correct, correct_letter

# # Example question object
# question_obj = {
#     "id": "hp_004",
#     "question": "What are the animal mascots of Gryffindor, Ravenclaw, Hufflepuff, and Slytherin, respectively?",
#     "options": [
#       "Lion, Snake, Rat, Cow",
#       "Lion, Eagle, Snake, Badger",
#       "Sheep, Pig, Snake, Cow",
#       "Lion, Hawk, Snake, Otter"
#     ],
#     "correct_answer": "Lion, Eagle, Snake, Badger",
#     "content": "Each house in Hogwarts has a corresponding animal: Gryffindor (Lion), Ravenclaw (Eagle), Hufflepuff (Badger), and Slytherin (Snake), as explained in 'Harry Potter and the Philosopher’s Stone'."
#   }

# # Create the prompt (with content)
# prompt = create_prompt_with_content(question_obj)

# # Generate one token restricted to the allowed letters
# next_token_id = generate_one_allowed_token(prompt, allowed_token_ids)
# answer_generated = tokenizer.decode([next_token_id]).strip()

# # Evaluate the answer
# is_correct, correct_letter = evaluate_answer(question_obj, answer_generated)

# print("Prompt:\n", prompt)
# print("Generated Answer:", answer_generated)
# print("Evaluation:", "Correct" if is_correct else f"Incorrect (expected {correct_letter})")


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


In [4]:
# Change this file_path
# D:\NLP_Project\NLP_project\data\Harry_Potter_Data_updated.json
# D:\NLP_Project\NLP_project\data\Harry_Potter_Data_updated_shuffled.json
file_path = os.path.abspath(r"D:\NLP_Project\NLP_project\data\Harry_Potter_Data_updated_shuffled.json")
with open(file_path, "r") as f:
    data = json.load(f)

In [5]:
count = 0
correct_count = 0 
generate_answers = []
for item in data:
    question_obj = item

    # Create the prompt (with content)
    prompt = create_prompt_with_content(question_obj)

    # Generate one token restricted to the allowed letters
    next_token_id = generate_one_allowed_token(prompt, allowed_token_ids)
    answer_generated = tokenizer.decode([next_token_id]).strip()

    # Evaluate the answer
    is_correct, correct_letter = evaluate_answer(question_obj, answer_generated)
    count += 1

    if is_correct:
        correct_count += 1
    
    # record the generated answer
    
    generate_answers.append(answer_generated)
    # print("Prompt:\n", prompt)
    # print("Generated Answer:", answer_generated)
    # print("Evaluation:", "Correct" if is_correct else f"Incorrect (expected {correct_letter})")
# print num of A B C D 
print(f'A: {generate_answers.count("A")}')
print(f'B: {generate_answers.count("B")}')
print(f'C: {generate_answers.count("C")}')
print(f'D: {generate_answers.count("D")}')
print(f'correct_count: {correct_count}')
print(f'count: {count}')
print(f'accuracy: {correct_count/count}')


A: 70
B: 9
C: 13
D: 28
correct_count: 13
count: 120
accuracy: 0.10833333333333334


In [6]:
# Few-shot examples to include in the prompt.
few_shot_examples = (
    "Example 1:\n"
    "Content: In 'Harry Potter and the Chamber of Secrets', it is revealed that a mysterious basilisk terrorizes the school.\n"
    "Question: What creature is terrorizing Hogwarts in this example?\n"
    "Options:\n"
    "A. Dragon\n"
    "B. Basilisk\n"
    "C. Troll\n"
    "D. Unicorn\n"
    "Your Answer: B\n\n"
    "Example 2:\n"
    "Content: In 'Harry Potter and the Prisoner of Azkaban', it is shown that Remus Lupin is a werewolf.\n"
    "Question: What is Remus Lupin's secret condition?\n"
    "Options:\n"
    "A. Vampire\n"
    "B. Werewolf\n"
    "C. Ghost\n"
    "D. Muggle\n"
    "Your Answer: B\n\n"
)

def create_few_shot_prompt_with_content(question_obj, few_shot_text=few_shot_examples):
    """
    Create a prompt that includes a few-shot demonstration of how to answer,
    followed by the current question with its content, question, and options.
    Instructs the model to output only one letter (A, B, C, or D) with no extra text.
    """
    content = question_obj.get("content", "")
    options = question_obj['options']
    
    # Map options to letters (A, B, C, D, etc.)
    option_map = {i: chr(65 + i) for i in range(len(options))}
    options_text = "\n".join([f"{option_map[i]}. {option}" for i, option in enumerate(options)])
    
    test_prompt = (
        f"Content: {content}\n\n"
        f"Question: {question_obj['question']}\n\n"
        f"Options:\n{options_text}\n\n"
        "Please choose the correct option by outputting only one letter (A, B, C, or D) with no extra text.\n"
        "Your Answer: "
    )
    
    # Combine the few-shot examples with the test question.
    full_prompt = few_shot_text + test_prompt
    return full_prompt

# The rest of your code for generation and evaluation remains the same.
def generate_one_allowed_token(prompt, allowed_token_ids, temperature=1, do_sample=True):
    """
    Generate one token after the prompt, restricting selection to allowed_token_ids.
    If do_sample=True, sample from the distribution using the provided temperature.
    """
    # Tokenize the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='tf')
    
    # Run the model to get logits with return_dict=True
    outputs = model(input_ids, return_dict=True)
    logits = outputs.logits  # shape: (batch_size, seq_length, vocab_size)
    
    # Get logits for the last token (the next-token logits)
    last_token_logits = logits[:, -1, :]  # shape: (1, vocab_size)
    last_token_logits_np = last_token_logits.numpy()  # convert to numpy array
    
    # Create a masked logits vector: set non-allowed tokens to a very low score.
    masked_logits = np.full(last_token_logits_np.shape, -1e9)
    for token_id in allowed_token_ids:
        masked_logits[0, token_id] = last_token_logits_np[0, token_id]
    
    if do_sample:
        # Apply temperature scaling
        scaled_logits = masked_logits / temperature
        
        # Compute probabilities using softmax
        exp_logits = np.exp(scaled_logits)
        probs = exp_logits / np.sum(exp_logits)
        
        # Sample one token from allowed tokens
        next_token_id = int(np.random.choice(len(probs[0]), p=probs[0]))
    else:
        # Deterministic: take the highest probability token
        next_token_id = int(np.argmax(masked_logits))
        
    return next_token_id

def evaluate_answer(question_obj, generated_letter):
    """
    Evaluate whether the generated letter corresponds to the correct answer.
    Maps the correct answer text to its corresponding letter and compares.
    """
    correct_answer_text = question_obj["correct_answer"]
    options = question_obj["options"]
    try:
        correct_index = options.index(correct_answer_text)
        correct_letter = chr(65 + correct_index)
    except ValueError:
        return False, None
    
    is_correct = (generated_letter == correct_letter)
    return is_correct, correct_letter

# Example question object (test question)
question_obj = {
    "id": "hp_001",
    "question": "What curse caused the death of Harry Potter's parents?",
    "options": [
        "Cruciatus Curse",
        "Imperius Curse",
        "Avada Kedavra",
        "Soul Extraction Curse"
    ],
    "correct_answer": "Avada Kedavra",
    "content": "In 'Harry Potter and the Philosopher's Stone', it is revealed that Lord Voldemort killed James and Lily Potter using the Avada Kedavra curse. This is one of the three Unforgivable Curses and causes instant death without physical harm."
}

# Create the few-shot prompt (with content)
prompt = create_few_shot_prompt_with_content(question_obj)

# Generate one token restricted to the allowed letters
next_token_id = generate_one_allowed_token(prompt, allowed_token_ids, temperature=1, do_sample=True)
answer_generated = tokenizer.decode([next_token_id]).strip()

# Evaluate the answer
is_correct, correct_letter = evaluate_answer(question_obj, answer_generated)

print("Full Prompt:\n", prompt)
print("Generated Answer:", answer_generated)
print("Evaluation:", "Correct" if is_correct else f"Incorrect (expected {correct_letter})")


Full Prompt:
 Example 1:
Content: In 'Harry Potter and the Chamber of Secrets', it is revealed that a mysterious basilisk terrorizes the school.
Question: What creature is terrorizing Hogwarts in this example?
Options:
A. Dragon
B. Basilisk
C. Troll
D. Unicorn
Your Answer: B

Example 2:
Content: In 'Harry Potter and the Prisoner of Azkaban', it is shown that Remus Lupin is a werewolf.
Question: What is Remus Lupin's secret condition?
Options:
A. Vampire
B. Werewolf
C. Ghost
D. Muggle
Your Answer: B

Content: In 'Harry Potter and the Philosopher's Stone', it is revealed that Lord Voldemort killed James and Lily Potter using the Avada Kedavra curse. This is one of the three Unforgivable Curses and causes instant death without physical harm.

Question: What curse caused the death of Harry Potter's parents?

Options:
A. Cruciatus Curse
B. Imperius Curse
C. Avada Kedavra
D. Soul Extraction Curse

Please choose the correct option by outputting only one letter (A, B, C, or D) with no extra te

In [7]:
import tensorflow as tf
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# # Load pre-trained GPT-2 tokenizer and TensorFlow model
# tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# model = TFGPT2Model.from_pretrained("gpt2", output_hidden_states=True)

# model.trainable = False  # We don't need to fine-tune here
# Load model with LM head
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained("gpt2", output_hidden_states=True)
model.trainable = False  # For inference
# Define a function to compute the embedding for a given text passage
def embed_text(text):
    # Tokenize the input text and convert to TensorFlow tensors
    inputs = tokenizer(text, return_tensors="tf")
    # Forward pass through GPT-2 to get hidden states (returns a tuple)
    outputs = model(inputs)
    # Get the last hidden state (the first element of the tuple)
    last_hidden_state = outputs[0]
    # Average the token embeddings to form a single sentence embedding
    sentence_embedding = tf.reduce_mean(last_hidden_state, axis=1)
    return sentence_embedding

# Example dataset: a list of chunk dictionaries
data = [
    {
        "title_num": 1,
        "title": "Harry Potter and the Philosopher's Stone",
        "chapter_num": 1,
        "chapter_name": "The Boy Who Lived",
        "passage": (
            "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say "
            "that they were perfectly normal, thank you very much. They were the last "
            "people you'd expect to be involved in anything strange or mysterious, because "
            "they just didn't hold with such nonsense. Mr. Dursley was the director of a firm "
            "called Grunnings, which made drills. He was a big, beefy man with hardly any neck, "
            "although he did have a very large mustache. Mrs. Dursley was thin and blonde and had "
            "nearly twice the usual amount of neck, which came in very useful as she spent so much "
            "of her time craning over garden fences, spying on the neighbors."
        )
    }
    # Add additional chunks as needed...
]

# Process each chunk and compute its embedding
embeddings = []
for chunk in data:
    passage = chunk["passage"]
    embedding = embed_text(passage)
    embeddings.append(embedding)

# Print the shape of each embedding to verify
for i, emb in enumerate(embeddings):
    print(f"Embedding for chunk {i+1}: {emb.shape}")


All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Embedding for chunk 1: (1, 50257)


In [14]:
import tensorflow as tf
import numpy as np
from transformers import TFGPT2LMHeadModel, GPT2Tokenizer

# Load model with LM head (already including output_hidden_states=True)
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = TFGPT2LMHeadModel.from_pretrained("gpt2", output_hidden_states=True)
model.trainable = False  # For inference

# Define the embedding function (using average pooling over last hidden state)
def embed_text(text):
    inputs = tokenizer(text, return_tensors="tf")
    outputs = model(inputs)  # outputs is a tuple; use outputs[0] for last hidden state
    last_hidden_state = outputs[0]
    sentence_embedding = tf.reduce_mean(last_hidden_state, axis=1)
    return sentence_embedding  # shape: (1, hidden_dim)

# Example dataset: a list of document chunks
data = [
    {
        "title_num": 1,
        "title": "Harry Potter and the Philosopher's Stone",
        "chapter_num": 1,
        "chapter_name": "The Boy Who Lived",
        "passage": (
            "Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say "
            "that they were perfectly normal, thank you very much. They were the last "
            "people you'd expect to be involved in anything strange or mysterious, because "
            "they just didn't hold with such nonsense. Mr. Dursley was the director of a firm "
            "called Grunnings, which made drills. He was a big, beefy man with hardly any neck, "
            "although he did have a very large mustache. Mrs. Dursley was thin and blonde and had "
            "nearly twice the usual amount of neck, which came in very useful as she spent so much "
            "of her time craning over garden fences, spying on the neighbors."
        )
    }
    # You can add more chunks if needed
]

# Precompute and store embeddings for all passages
embeddings = []
for chunk in data:
    passage = chunk["passage"]
    emb = embed_text(passage)
    embeddings.append(emb)

# Define a function to compute cosine similarity between two embeddings
def cosine_similarity(a, b):
    a_norm = tf.nn.l2_normalize(a, axis=1)
    b_norm = tf.nn.l2_normalize(b, axis=1)
    return tf.reduce_sum(a_norm * b_norm, axis=1)  # returns a tensor with similarity score

# --- Retrieval and QA Pipeline ---

# 1. Define your question
question = "What is Mr. Dursley’s role at Grunnings?"

# 2. Compute the embedding for the question
question_emb = embed_text(question)  # shape: (1, hidden_dim)

# 3. Compare with stored passage embeddings to find the most relevant passage
similarities = []
for emb in embeddings:
    sim = cosine_similarity(question_emb, emb)
    similarities.append(sim.numpy()[0])  # Extract scalar similarity

best_idx = np.argmax(similarities)
retrieved_passage = data[best_idx]["passage"]

# 4. Construct a prompt that includes the retrieved passage
prompt = (
    "You are an assistant with expert knowledge of the Harry Potter series. "
    "Based on the following passage, answer the question concisely in one sentence.\n\n"
    "Passage:\n" + retrieved_passage + "\n\n"
    "Question: " + question + "\n"
    "Answer:"
)

# 5. Tokenize and generate the answer
inputs = tokenizer(prompt, return_tensors="tf")
outputs = model.generate(
    **inputs,
    max_length=250,
    pad_token_id=50256,
    num_beams=5,
    no_repeat_ngram_size=3,
    early_stopping=True
)
generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
print("Generated Answer:", generated_text)



All model checkpoint layers were used when initializing TFGPT2LMHeadModel.

All the layers of TFGPT2LMHeadModel were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2LMHeadModel for predictions without further training.


Generated Answer: You are an assistant with expert knowledge of the Harry Potter series. Based on the following passage, answer the question concisely in one sentence.

Passage:
Mr. and Mrs. Dursley, of number four, Privet Drive, were proud to say that they were perfectly normal, thank you very much. They were the last people you'd expect to be involved in anything strange or mysterious, because they just didn't hold with such nonsense. Mr. Dursley was the director of a firm called Grunnings, which made drills. He was a big, beefy man with hardly any neck, although he did have a very large mustache. Mrs. Dursley was thin and blonde and had nearly twice the usual amount of neck, which came in very useful as she spent so much of her time craning over garden fences, spying on the neighbors.

Question: What is Mr. Dursley’s role at Grunnings?
Answer:

He is the head of the firm, and he is responsible for all of the work that goes on at the firm. He is also responsible for the production of