In [4]:
import json
import tensorflow as tf
from transformers import TFGPT2Model, GPT2Tokenizer
from tqdm import tqdm

# Load GPT-2 model in TensorFlow with hidden states enabled
model = TFGPT2Model.from_pretrained("gpt2", output_hidden_states=True)
model.trainable = False  # Disable training mode

# Load the tokenizer and set a padding token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token  # Set pad_token to eos_token

def get_embedding(texts, model, tokenizer):
    inputs = tokenizer(
        texts,
        return_tensors="tf",
        padding=True,
        truncation=True,
        add_special_tokens=True
    )
    
    outputs = model(inputs, return_dict=True)
    
    last_hidden_state = outputs.last_hidden_state
    
    embeddings = tf.reduce_mean(last_hidden_state, axis=1)
    return embeddings.numpy()


# Load the dataset from file (assumes a JSON file with a list of entries)
#input_file = r"D:\NLP_Project\NLP_project\data\hp1_chunked.json"
input_file = r'/Users/aaroncui/Desktop/UCL/NLP/NLP_project/data/hp1_chunked.json'
dataset = []

with open(input_file, "r", encoding="utf-8") as f:
    for line in f:
        line = line.strip()
        if line:  # Skip empty lines
            dataset.append(json.loads(line))

print(f"Loaded {len(dataset)} JSON objects.")
batch_size = 32
new_dataset = []

# Process the dataset in batches
for i in tqdm(range(0, len(dataset), batch_size)):
    batch = dataset[i:i+batch_size]
    # Extract the passage field from each document
    texts = [doc["passage"] for doc in batch]
    embeddings = get_embedding(texts, model, tokenizer)
    
    # Append the computed embedding to each document
    for doc, emb in zip(batch, embeddings):
        doc["embeddings"] = emb.tolist()  # Convert numpy array to list for JSON serialization
        new_dataset.append(doc)

# Save the new dataset with embeddings to a new JSON file
output_file = r"D:\NLP_Project\NLP_project\data\hp1_chunked_embedded_tf.json"
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(new_dataset, f, indent=2)


All PyTorch model weights were used when initializing TFGPT2Model.

All the weights of TFGPT2Model were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


Loaded 726 JSON objects.


100%|██████████| 23/23 [01:52<00:00,  4.87s/it]


In [5]:
import tensorflow as tf

def compute_attention_score(query_text, passage_text, model, tokenizer):
    """
    Computes a relevance score between a query and a passage using token-level attention.
    This function:
      1. Obtains token-level embeddings (last hidden states) for both query and passage.
      2. Computes a similarity matrix (dot-product) between each query token and each passage token.
      3. Applies softmax over the passage tokens to get attention weights.
      4. Uses the attention weights to form an attended representation of the passage for each query token.
      5. Computes a cosine similarity between the query tokens and the attended representations.
      6. Averages these similarities to produce a single relevance score.
    """
    # Tokenize query and passage separately.
    query_inputs = tokenizer(query_text, return_tensors="tf", truncation=True, padding=True)
    passage_inputs = tokenizer(passage_text, return_tensors="tf", truncation=True, padding=True)
    
    # Get token-level embeddings (last_hidden_state).
    # Note: Use return_dict=True so we can access the outputs by name.
    query_outputs = model(query_inputs, return_dict=True)
    passage_outputs = model(passage_inputs, return_dict=True)
    
    # Extract the last hidden state tensors.
    # Shape: [1, query_length, hidden_size] and [1, passage_length, hidden_size]
    query_embeds = query_outputs.last_hidden_state  # shape: (1, q_len, hidden)
    passage_embeds = passage_outputs.last_hidden_state  # shape: (1, p_len, hidden)
    
    # Compute similarity scores between every query token and every passage token.
    # This gives a similarity matrix of shape [1, query_length, passage_length]
    sim_matrix = tf.matmul(query_embeds, passage_embeds, transpose_b=True)
    sim_matrix = tf.squeeze(sim_matrix, axis=0)  # shape: (q_len, p_len)
    
    # For each query token, compute attention weights over passage tokens.
    # Using softmax along the passage dimension.
    attn_weights = tf.nn.softmax(sim_matrix, axis=-1)  # shape: (q_len, p_len)
    
    # Compute an attended passage representation for each query token:
    # For each query token, it's a weighted sum of passage token embeddings.
    # passage_embeds[0] has shape: (p_len, hidden)
    attended_passage = tf.matmul(attn_weights, passage_embeds[0])  # shape: (q_len, hidden)
    
    # Normalize query embeddings and attended passage representations.
    query_norm = tf.nn.l2_normalize(query_embeds[0], axis=-1)        # shape: (q_len, hidden)
    attended_norm = tf.nn.l2_normalize(attended_passage, axis=-1)       # shape: (q_len, hidden)
    
    # Compute cosine similarity for each query token vs. its attended passage vector.
    token_similarities = tf.reduce_sum(query_norm * attended_norm, axis=-1)  # shape: (q_len,)
    
    # Aggregate the token-level similarities to produce a single relevance score.
    relevance_score = tf.reduce_mean(token_similarities)  # scalar
    
    return relevance_score.numpy()

# Example usage:
query = "Who is the director of Grunnings?"
candidate_passages = [
    "Mr. Dursley was the director of a firm called Grunnings, which made drills.",
    "Harry attended Hogwarts and learned about magic.",
    "The board meeting was held in London at a fancy hotel."
]

# Assuming you already have your TensorFlow GPT-2 model and tokenizer loaded (with pad token set, etc.)
# For each candidate passage, compute an attention-based relevance score.
scores = []
for passage in candidate_passages:
    score = compute_attention_score(query, passage, model, tokenizer)
    scores.append(score)
    print(f"Passage: {passage}\nScore: {score}\n")

# Re-rank candidate passages based on the computed scores.
ranked_indices = sorted(range(len(scores)), key=lambda i: scores[i], reverse=True)
ranked_passages = [candidate_passages[i] for i in ranked_indices]

print("Ranked Passages:")
for i, passage in enumerate(ranked_passages):
    print(f"{i+1}. {passage}")


Passage: Mr. Dursley was the director of a firm called Grunnings, which made drills.
Score: 0.968715250492096

Passage: Harry attended Hogwarts and learned about magic.
Score: 0.9665529727935791

Passage: The board meeting was held in London at a fancy hotel.
Score: 0.9707780480384827

Ranked Passages:
1. The board meeting was held in London at a fancy hotel.
2. Mr. Dursley was the director of a firm called Grunnings, which made drills.
3. Harry attended Hogwarts and learned about magic.


In [15]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Assume:
# - all_passages: a list of all passage texts in your dataset.
# - passage_embeddings: a numpy array of shape (num_passages, hidden_dim) with pre-computed embeddings.
# - get_embedding: function that computes a query embedding.
# - compute_attention_score: function defined earlier for attention-based scoring.
# - model and tokenizer are already loaded.

query = "What is Harry Potter's wand made of?"
# Step 1: Compute query embedding using the same method used for passages
query_embedding = get_embedding([query], model, tokenizer)  # shape: (1, hidden_dim)
passage_embeddings = np.array([doc["embeddings"] for doc in new_dataset])
# Step 2: Compute cosine similarities with all passage embeddings
similarities = cosine_similarity(query_embedding, passage_embeddings)[0]
# Assuming new_dataset is your loaded dataset
all_passages = [doc["passage"] for doc in new_dataset]

# Step 3: Retrieve the top-N candidate passages based on cosine similarity
N = 50  # you can adjust this number as needed
top_indices = np.argsort(similarities)[::-1][:N]
initial_candidates = [all_passages[i] for i in top_indices]

# Step 4: Re-rank the candidates using the attention-based scoring function
attention_scores = []
for passage in initial_candidates:
    score = compute_attention_score(query, passage, model, tokenizer)
    attention_scores.append(score)

# Sort the candidate passages by the attention score (higher means more relevant)
re_ranked_indices = np.argsort(attention_scores)[::-1]
final_candidates = [initial_candidates[i] for i in re_ranked_indices]

# Optionally, pick the top 5 for further downstream processing in your QA system.
top_5_candidates = final_candidates[:5]

print("Final Ranked Passages:")
# Print the top 5 passages and scores
for i, passage in enumerate(top_5_candidates, 1):
    print(f"{i}. {passage}")
    print(f"Attention Score: {attention_scores[re_ranked_indices[i-1]]:.4f}\n")

Final Ranked Passages:
1. He had been given a week in his cupboard for this, even though he had tried to explain that he couldn't explain how it had grown back so quickly. Another time, Aunt Petunia had been trying to force him into a revolting old sweater of Dudley's (brown with orange puff balls). The harder she tried to pull it over his head, the smaller it seemed to become, until finally it might have fitted a hand puppet, but certainly wouldn't fit Harry. Aunt Petunia had decided it must have shrunk in the wash and, to his great relief, Harry wasn't punished. On the other hand, he'd gotten into terrible trouble for being found on the roof of the school kitchens. Dudley's gang had been chasing him as usual when, as much to Harry's surprise as anyone else's, there he was sitting on the chimney. The Dursleys had received a very angry letter from Harry's headmistress telling them Harry had been climbing school buildings. But all he'd tried to do (as he shouted at Uncle Vernon through 

In [8]:
from transformers import TFAutoModelForQuestionAnswering, AutoTokenizer

qa_model_name = "distilbert-base-uncased-distilled-squad"
qa_model = TFAutoModelForQuestionAnswering.from_pretrained(qa_model_name)
qa_tokenizer = AutoTokenizer.from_pretrained(qa_model_name)

def answer_question(question, context, qa_model, qa_tokenizer):
    inputs = qa_tokenizer.encode_plus(question, context, return_tensors="tf")
    outputs = qa_model(inputs)
    start_scores, end_scores = outputs[0], outputs[1]  # Access tuple elements directly
    start_index = tf.math.argmax(start_scores, axis=1).numpy()[0]
    end_index = tf.math.argmax(end_scores, axis=1).numpy()[0] + 1
    answer_ids = inputs["input_ids"][0][start_index:end_index]
    answer = qa_tokenizer.decode(answer_ids)
    return answer

query = "Who is the headmaster at Hogwarts?"
# Use the top retrieved passage for answer extraction (or combine multiple passages)
selected_context = retrieved_passages[0]["passage"]
extracted_answer = answer_question(query, selected_context, qa_model, qa_tokenizer)
print("Extracted Answer:", extracted_answer)


Some layers from the model checkpoint at distilbert-base-uncased-distilled-squad were not used when initializing TFDistilBertForQuestionAnswering: ['dropout_19']
- This IS expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertForQuestionAnswering from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some layers of TFDistilBertForQuestionAnswering were not initialized from the model checkpoint at distilbert-base-uncased-distilled-squad and are newly initialized: ['dropout_96']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Extracted Answer: [CLS]


In [16]:
import numpy as np
import tensorflow as tf
import json
from sklearn.metrics.pairwise import cosine_similarity
from transformers import TFGPT2Model, GPT2Tokenizer

# --- Setup: Load Model, Tokenizer, and Pre-computed Data ---

# Load GPT-2 model (TensorFlow) with hidden states enabled
model = TFGPT2Model.from_pretrained("gpt2", output_hidden_states=True)
model.trainable = False  # Inference mode

# Load the GPT-2 tokenizer and set the pad token
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

# Assume you have your dataset already loaded as new_dataset (list of dicts)
# and that each dict contains a "passage" field.
# For example, you might have loaded it from a JSONL file:
# with open("hp1_chunked_embedded_tf.json", "r", encoding="utf-8") as f:
#     new_dataset = [json.loads(line) for line in f if line.strip()]
#
# Also assume that you have pre-computed passage embeddings (using a function like get_embedding)
# stored in a numpy array named `passage_embeddings` of shape (num_passages, hidden_dim).
# And that all_passages is a list of all passage texts:
all_passages = [doc["passage"] for doc in new_dataset]
# For this demo, passage_embeddings must be computed beforehand for each passage.
# For example:
# passage_embeddings = np.array([get_embedding([doc["passage"]], model, tokenizer)[0] for doc in new_dataset])

# --- Define Functions ---

def get_embedding(texts, model, tokenizer):
    """
    Compute a single vector embedding per text by averaging the GPT-2 token-level embeddings.
    """
    inputs = tokenizer(
        texts,
        return_tensors="tf",
        padding=True,
        truncation=True,
        add_special_tokens=True
    )
    outputs = model(inputs, return_dict=True)
    # Average over the sequence (token) dimension.
    last_hidden_state = outputs.last_hidden_state  # shape: (batch_size, seq_len, hidden_size)
    embeddings = tf.reduce_mean(last_hidden_state, axis=1)
    return embeddings.numpy()

def compute_attention_score(query_text, passage_text, model, tokenizer):
    """
    Computes a relevance score between a query and a passage using token-level attention.
    Steps:
      1. Obtain token-level embeddings (last hidden states) for both query and passage.
      2. Compute a dot-product similarity matrix between query and passage tokens.
      3. Apply softmax over passage tokens to get attention weights.
      4. Compute an attended passage representation for each query token.
      5. Calculate cosine similarities at the token level and average them.
    """
    # Tokenize query and passage separately.
    query_inputs = tokenizer(query_text, return_tensors="tf", truncation=True, padding=True)
    passage_inputs = tokenizer(passage_text, return_tensors="tf", truncation=True, padding=True)
    
    # Get token-level embeddings.
    query_outputs = model(query_inputs, return_dict=True)
    passage_outputs = model(passage_inputs, return_dict=True)
    
    query_embeds = query_outputs.last_hidden_state  # (1, q_len, hidden)
    passage_embeds = passage_outputs.last_hidden_state  # (1, p_len, hidden)
    
    # Compute similarity matrix between query and passage tokens.
    sim_matrix = tf.matmul(query_embeds, passage_embeds, transpose_b=True)  # (1, q_len, p_len)
    sim_matrix = tf.squeeze(sim_matrix, axis=0)  # (q_len, p_len)
    
    # Compute attention weights for each query token over passage tokens.
    attn_weights = tf.nn.softmax(sim_matrix, axis=-1)  # (q_len, p_len)
    
    # Compute attended passage representation for each query token.
    attended_passage = tf.matmul(attn_weights, passage_embeds[0])  # (q_len, hidden)
    
    # Normalize embeddings
    query_norm = tf.nn.l2_normalize(query_embeds[0], axis=-1)      # (q_len, hidden)
    attended_norm = tf.nn.l2_normalize(attended_passage, axis=-1)     # (q_len, hidden)
    
    # Compute cosine similarity per query token.
    token_similarities = tf.reduce_sum(query_norm * attended_norm, axis=-1)  # (q_len,)
    
    # Aggregate token-level similarities to a single scalar score.
    relevance_score = tf.reduce_mean(token_similarities)
    return relevance_score.numpy()

def compute_keyword_bonus(passage, keywords):
    """
    Computes a bonus score based on the presence of keywords in the passage.
    For each keyword found, adds a fixed bonus.
    """
    bonus = 0.0
    passage_lower = passage.lower()
    for kw in keywords:
        if kw in passage_lower:
            bonus += 0.1  # Adjust bonus value as needed.
    return bonus

# --- Two-Stage Retrieval and Re-ranking ---

# Define your query.
query = "What is Harry Potter's wand made of?"

# Stage 1: Use cosine similarity to retrieve a candidate set.
query_embedding = get_embedding([query], model, tokenizer)  # (1, hidden_dim)
# Compute cosine similarities between the query and all passage embeddings.
similarities = cosine_similarity(query_embedding, passage_embeddings)[0]

# Retrieve the top N candidate passages.
N = 50  # Adjust N based on your dataset and desired recall.
top_indices = np.argsort(similarities)[::-1][:N]
initial_candidates = [all_passages[i] for i in top_indices]

# Stage 2: Re-rank candidates using attention-based scoring enhanced with a keyword bonus.
# Define keywords that are critical for the query.
keywords = ["wand", "holly", "phoenix"]

final_scores = []
for passage in initial_candidates:
    attn_score = compute_attention_score(query, passage, model, tokenizer)
    keyword_bonus = compute_keyword_bonus(passage, keywords)
    final_score = attn_score + keyword_bonus
    final_scores.append(final_score)
    # Optionally, print intermediate results.
    print("Passage:", passage)
    print("Attention Score:", attn_score, "Keyword Bonus:", keyword_bonus, "Final Score:", final_score)
    print("---------")

# Re-rank the candidate passages based on the combined final score.
re_ranked_indices = np.argsort(final_scores)[::-1]
final_candidates = [initial_candidates[i] for i in re_ranked_indices]

# Print the top 5 final ranked passages.
print("\nFinal Ranked Passages:")
for i, passage in enumerate(final_candidates[:5], 1):
    print(f"{i}. {passage}")


All model checkpoint layers were used when initializing TFGPT2Model.

All the layers of TFGPT2Model were initialized from the model checkpoint at gpt2.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFGPT2Model for predictions without further training.


Passage: Nearly Headless Nick was always happy to point new Gryffindors in the right direction, but Peeves the Poltergeist was worth two locked doors and a trick staircase if you met him when you were late for class. He would drop wastepaper baskets on your head, pull rugs from under your feet, pelt you with bits of chalk, or sneak up behind you, invisible, grab your nose, and screech, "GOT YOUR CONK!" Even worse than Peeves, if that was possible, was the caretaker, Argus Filch. Harry and Ron managed to get on the wrong side of him on their very first morning. Filch found them trying to force their way through a door that unluckily turned out to be the entrance to the out-of-bounds corridor on the third floor. He wouldn't believe they were lost, was sure they were trying to break into it on purpose, and was threatening to lock them in the dungeons when they were rescued by Professor Quirrell, who was passing. Filch owned a cat called Mrs. Norris, a scrawny, dust-colored creature with b