## Using Neo4j

### Neo4j Creation

In [None]:
import os
from neo4j1 import GraphDatabase
from tqdm import tqdm
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
import threading

# configurations
RAW_DATA_DIR = "./freebase-easy-14-04-14"
FACTS_FILE = os.path.join(RAW_DATA_DIR, "facts.txt")
LINKS_FILE = os.path.join(RAW_DATA_DIR, "freebase-links.txt")

# Neo4j connection details
NEO4J_URI = "neo4j://127.0.0.1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "sakhil123"
NEO4J_Database = "kg1"

CHUNK_SIZE = 5000
NUM_WORKERS = 8
QUEUE_SIZE = 20  # Max chunks in memory at once

class Neo4jDirectImporter:
    def __init__(self, uri, user, password, database=None):
        try:
            self.driver = GraphDatabase.driver(uri, auth=(user, password), max_connection_pool_size=100)
            self.database = database
            self.driver.verify_connectivity()
            print(f"Successfully connected to Neo4j database '{database}'.")
        except Exception as e:
            print(f"Failed to connect to Neo4j. Error: {e}")
            raise

    def close(self):
        self.driver.close()

    def setup_constraints(self):
        print("Setting up database constraints and indexes...")
        with self.driver.session(database=self.database) as session:
            session.run("CREATE CONSTRAINT entity_unique_id IF NOT EXISTS FOR (n:Entity) REQUIRE n.entityId IS UNIQUE")
            try:
                session.run("CREATE INDEX entity_id_index IF NOT EXISTS FOR (n:Entity) ON (n.entityId)")
            except:
                pass
        print("Constraints and indexes are set.")
    
    def import_nodes_in_chunks(self):
        print("\n--- Pass 1: Importing Nodes ---")
        entities_seen_in_db = set()
        entities_to_add_chunk = set()

        with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f, desc="Scanning for and importing nodes"):
                try:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        subject, _, obj = parts[0], parts[1], parts[2]
                        
                        if subject not in entities_seen_in_db:
                            entities_to_add_chunk.add(subject)
                        if obj not in entities_seen_in_db:
                            entities_to_add_chunk.add(obj)

                        if len(entities_to_add_chunk) >= CHUNK_SIZE:
                            self._write_node_chunk(entities_to_add_chunk)
                            entities_seen_in_db.update(entities_to_add_chunk)
                            entities_to_add_chunk.clear()
                except Exception:
                    pass
            
            if entities_to_add_chunk:
                self._write_node_chunk(entities_to_add_chunk)

    def _write_node_chunk(self, chunk):
        with self.driver.session(database=self.database) as session:
            query = """
            UNWIND $nodes AS node_data
            MERGE (e:Entity {entityId: node_data.id})
            ON CREATE SET e.name = node_data.name
            """
            node_list = [{'id': name, 'name': name} for name in chunk]
            session.run(query, nodes=node_list)

    def import_relationships_streaming(self):
        """Memory-efficient streaming version - doesn't load entire file into RAM"""
        print("\n--- Pass 2: Importing Relationships (Streaming with 8 threads) ---")
        
        chunk_queue = Queue(maxsize=QUEUE_SIZE)
        
        def producer():
            """Read file and produce chunks"""
            chunk = []
            with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    try:
                        parts = line.strip().split('\t')
                        if len(parts) >= 3:
                            subject, predicate, obj = parts[0], parts[1], parts[2]
                            chunk.append({'sub': subject, 'obj': obj, 'pred': predicate})
                            
                            if len(chunk) >= CHUNK_SIZE:
                                chunk_queue.put(chunk)
                                chunk = []
                    except Exception:
                        pass
                
                if chunk:
                    chunk_queue.put(chunk)
            
            # Signal end of data
            for _ in range(NUM_WORKERS):
                chunk_queue.put(None)
        
        def consumer():
            """Process chunks from queue"""
            while True:
                chunk = chunk_queue.get()
                if chunk is None:
                    break
                try:
                    self._write_relationship_chunk_parallel(chunk)
                    pbar.update(1)
                except Exception as e:
                    print(f"Error in chunk: {e}")
                finally:
                    chunk_queue.task_done()
        
        # Start producer thread
        producer_thread = threading.Thread(target=producer)
        producer_thread.start()
        
        # Count total chunks for progress bar (approximate)
        print("Counting lines for progress tracking...")
        with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
            total_lines = sum(1 for _ in f)
        estimated_chunks = (total_lines // CHUNK_SIZE) + 1
        
        # Start consumer threads with progress bar
        with tqdm(total=estimated_chunks, desc="Importing relationships") as pbar:
            consumers = []
            for _ in range(NUM_WORKERS):
                t = threading.Thread(target=consumer)
                t.start()
                consumers.append(t)
            
            # Wait for completion
            producer_thread.join()
            for t in consumers:
                t.join()

    def _write_relationship_chunk_parallel(self, chunk):
        """Thread-safe relationship chunk writer"""
        with self.driver.session(database=self.database) as session:
            query = """
            UNWIND $rels AS rel
            MATCH (a:Entity {entityId: rel.sub})
            MATCH (b:Entity {entityId: rel.obj})
            CALL apoc.create.relationship(a, rel.pred, {}, b) YIELD rel as r
            RETURN count(r)
            """
            session.run(query, rels=chunk)

importer = Neo4jDirectImporter(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_Database)

# 1. Set up constraints for performance
importer.setup_constraints()

# 2. Import all nodes
importer.import_nodes_in_chunks()

# 3. Import all relationships (streaming - memory efficient!)
importer.import_relationships_streaming()

print("\nData import complete!")

if 'importer' in locals() and importer.driver:
    importer.close()
    print("Database connection closed.")

Successfully connected to Neo4j database 'kg1'.
Setting up database constraints and indexes...
Constraints and indexes are set.

--- Pass 1: Importing Nodes ---


Scanning for and importing nodes: 241898031it [13:41, 294579.06it/s]



--- Pass 2: Importing Relationships (Streaming with 8 threads) ---
Counting lines for progress tracking...


Importing relationships: 100%|██████████| 48380/48380 [1:05:13<00:00, 12.36it/s]


Data import complete!
Database connection closed.





In [1]:
import os
from neo4j import GraphDatabase
from tqdm import tqdm
import re
from concurrent.futures import ThreadPoolExecutor, as_completed
from queue import Queue
import threading

# Configuration
RAW_DATA_DIR = "./freebase-easy-14-04-14"
FACTS_FILE = os.path.join(RAW_DATA_DIR, "facts.txt")
LINKS_FILE = os.path.join(RAW_DATA_DIR, "freebase-links.txt")

NEO4J_URI = "neo4j://127.0.0.1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "sakhil123"
NEO4J_Database = "kg1"

CHUNK_SIZE = 5000
NUM_WORKERS = 16
QUEUE_SIZE = 20

class Neo4jFreebaseImporter:
    def __init__(self, uri, user, password, database=None):
        self.driver = GraphDatabase.driver(uri, auth=(user, password), max_connection_pool_size=100)
        self.database = database
        self.mid_to_name = {}  # MID → Human name mapping
        print(f"Connected to Neo4j database '{database}'")
    
    def close(self):
        self.driver.close()
    
    def extract_id_and_name(self, entity_str):
        """
        Extract ID and name from entity string.
        
        Rules:
        1. "3 Big Bags (m/0l24w26)" → id="m.0l24w26", name="3 Big Bags"
        2. "Dink"                    → id=None, name="Dink"
        3. "m.0d060g"               → id="m.0d060g", name=from links or "m.0d060g"
        4. Literals like "224.826"^^<...> → id=None, name=original (skip node creation)
        
        Returns: (entity_id, entity_name, is_literal)
        """
        entity_str = entity_str.strip()
        
        # Check if it's a literal value (starts with quote or has ^^)
        if entity_str.startswith('"') or '^^<http://' in entity_str:
            return None, entity_str, True  # is_literal=True
        
        # Case 1: Check if it has MID in parentheses: "Name (m/xxxxx)" or "Name (g/xxxxx)"
        match = re.search(r'^(.+?)\s*\(([mg]/[\w]+)\)\s*$', entity_str)
        if match:
            name = match.group(1).strip()
            mid_with_slash = match.group(2)  # e.g., "m/0l24w26"
            entity_id = mid_with_slash.replace('/', '.')  # Convert to "m.0l24w26"
            return entity_id, name, False
        
        # Case 2: Check if it's a pure MID: "m.xxxxx" or "g.xxxxx"
        if re.match(r'^[mg]\.\w+$', entity_str):
            entity_id = entity_str
            # Try to get name from links file
            entity_name = self.mid_to_name.get(entity_id, entity_id)
            return entity_id, entity_name, False
        
        # Case 3: Plain text without brackets → id=None, use text as name
        return None, entity_str, False
    
    def load_freebase_links(self):
        """Load MID → Human-readable name mappings"""
        print("\n--- Loading Freebase Links (MID → Name) ---")
        
        with open(LINKS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f, desc="Loading links"):
                parts = line.strip().split('\t')
                if len(parts) >= 2:
                    mid = parts[0]  # e.g., "m.0d060g"
                    name = parts[1]  # e.g., "Barack Obama"
                    self.mid_to_name[mid] = name
        
        print(f"✅ Loaded {len(self.mid_to_name)} MID → Name mappings")
    
    def setup_constraints(self):
        print("Setting up database constraints...")
        with self.driver.session(database=self.database) as session:
            # Use name as unique key (since some entities don't have MIDs)
            session.run("CREATE CONSTRAINT entity_unique_name IF NOT EXISTS FOR (n:Entity) REQUIRE n.name IS UNIQUE")
        print("Constraints set.")
    
    def import_nodes_corrected(self):
        """Import nodes WITH proper ID and name extraction (NO multithreading - single pass)"""
        print("\n--- Pass 1: Importing Nodes ---")
        
        entities_seen = set()  # Track by name (our unique key)
        entities_to_add = []
        literal_count = 0
        
        with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f, desc="Scanning for nodes"):
                try:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        subject_raw = parts[0]
                        object_raw = parts[2]
                        
                        # Process subject
                        subject_id, subject_name, is_literal_subj = self.extract_id_and_name(subject_raw)
                        if not is_literal_subj and subject_name not in entities_seen:
                            entities_to_add.append({
                                'id': subject_id,      # Can be None
                                'name': subject_name
                            })
                            entities_seen.add(subject_name)
                        elif is_literal_subj:
                            literal_count += 1
                        
                        # Process object
                        object_id, object_name, is_literal_obj = self.extract_id_and_name(object_raw)
                        if not is_literal_obj and object_name not in entities_seen:
                            entities_to_add.append({
                                'id': object_id,
                                'name': object_name
                            })
                            entities_seen.add(object_name)
                        elif is_literal_obj:
                            literal_count += 1
                        
                        # Write chunk
                        if len(entities_to_add) >= CHUNK_SIZE:
                            self._write_node_chunk(entities_to_add)
                            entities_to_add.clear()
                            
                except Exception:
                    pass
            
            # Write remaining
            if entities_to_add:
                self._write_node_chunk(entities_to_add)
        
        print(f"✅ Imported {len(entities_seen)} unique entities")
        print(f"   Skipped {literal_count} literal values")
    
    def _write_node_chunk(self, chunk):
        """Thread-safe node writer"""
        with self.driver.session(database=self.database) as session:
            query = """
            UNWIND $nodes AS node_data
            MERGE (e:Entity {name: node_data.name})
            ON CREATE SET e.entityId = node_data.id
            """
            session.run(query, nodes=chunk)
    
    def import_relationships_streaming_multithreaded(self):
        """Import relationships WITH multithreading (memory-efficient streaming)"""
        print("\n--- Pass 2: Importing Relationships (Streaming with 8 threads) ---")
        
        chunk_queue = Queue(maxsize=QUEUE_SIZE)
        
        def producer():
            """Read file and produce chunks"""
            chunk = []
            with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
                for line in f:
                    try:
                        parts = line.strip().split('\t')
                        if len(parts) >= 3:
                            subject_raw = parts[0]
                            predicate = parts[1]
                            object_raw = parts[2]
                            
                            # Extract names (we'll match on names)
                            _, subject_name, is_literal_subj = self.extract_id_and_name(subject_raw)
                            _, object_name, is_literal_obj = self.extract_id_and_name(object_raw)
                            
                            # Skip if either is a literal
                            if is_literal_subj or is_literal_obj:
                                continue
                            
                            chunk.append({
                                'sub': subject_name,
                                'pred': predicate,
                                'obj': object_name
                            })
                            
                            if len(chunk) >= CHUNK_SIZE:
                                chunk_queue.put(chunk)
                                chunk = []
                    except Exception:
                        pass
                
                if chunk:
                    chunk_queue.put(chunk)
            
            # Signal end of data
            for _ in range(NUM_WORKERS):
                chunk_queue.put(None)
        
        def consumer():
            """Process chunks from queue"""
            while True:
                chunk = chunk_queue.get()
                if chunk is None:
                    break
                try:
                    self._write_relationship_chunk_parallel(chunk)
                    pbar.update(1)
                except Exception as e:
                    print(f"Error in chunk: {e}")
                finally:
                    chunk_queue.task_done()
        
        # Start producer thread
        producer_thread = threading.Thread(target=producer)
        producer_thread.start()
        
        # Count total chunks for progress bar (approximate)
        print("Counting lines for progress tracking...")
        with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
            total_lines = sum(1 for _ in f)
        estimated_chunks = (total_lines // CHUNK_SIZE) + 1
        
        # Start consumer threads with progress bar
        with tqdm(total=estimated_chunks, desc="Importing relationships") as pbar:
            consumers = []
            for _ in range(NUM_WORKERS):
                t = threading.Thread(target=consumer)
                t.start()
                consumers.append(t)
            
            # Wait for completion
            producer_thread.join()
            for t in consumers:
                t.join()
        
        print("✅ Relationship import complete!")
    
    def _write_relationship_chunk_parallel(self, chunk):
        """Thread-safe relationship chunk writer (MATCHES ON NAME)"""
        with self.driver.session(database=self.database) as session:
            query = """
            UNWIND $rels AS rel
            MATCH (a:Entity {name: rel.sub})
            MATCH (b:Entity {name: rel.obj})
            CALL apoc.create.relationship(a, rel.pred, {}, b) YIELD rel as r
            RETURN count(r)
            """
            session.run(query, rels=chunk)

# ============================================================================
# USAGE
# ============================================================================

importer = Neo4jFreebaseImporter(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_Database)

# Step 1: Load MID → Name mappings (for pure MIDs without names)
importer.load_freebase_links()

# Step 2: Set up constraints (using name as unique key)
# importer.setup_constraints()

# Step 3: Import nodes (single-threaded scan, chunked writes)
# importer.import_nodes_corrected()

# Step 4: Import relationships (MULTITHREADED - 8 worker threads)
importer.import_relationships_streaming_multithreaded()

print("\n✅ Freebase import complete!")
importer.close()

Connected to Neo4j database 'kg1'

--- Loading Freebase Links (MID → Name) ---


Loading links: 21719311it [00:19, 1133520.25it/s]


✅ Loaded 21718921 MID → Name mappings

--- Pass 2: Importing Relationships (Streaming with 8 threads) ---
Counting lines for progress tracking...


Importing relationships:  81%|████████  | 39024/48380 [44:07<10:34, 14.74it/s]   

✅ Relationship import complete!

✅ Freebase import complete!





### Neo4j Retriever Build

In [2]:
from neo4j import GraphDatabase
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

# Neo4j connection details
NEO4J_URI = "neo4j://127.0.0.1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "sakhil123"
NEO4J_Database = "kg1"

class Neo4jGraphRetriever:
    def __init__(self, uri, user, password, database=None):
        self.driver = GraphDatabase.driver(uri, auth=(user, password))
        self.database = database
        print(f"Connected to Neo4j database '{database}'")
    
    def close(self):
        self.driver.close()
    
    def find_candidate_entities(self, mention, limit=100):
        """Find candidate entities matching the mention"""
        with self.driver.session(database=self.database) as session:
            query = """
            MATCH (e:Entity)
            WHERE toLower(e.name) CONTAINS toLower($mention)
            RETURN e.entityId AS entityId, e.name AS name
            LIMIT $limit
            """
            result = session.run(query, mention=mention, limit=limit)
            candidates = [(record["entityId"], record["name"]) for record in result]
        return candidates
    
    def find_actual_entity(self, mention):
        """Find the actual entity matching the mention exactly (MATCHES ON NAME)"""
        with self.driver.session(database=self.database) as session:
            query = """
            MATCH (e:Entity)
            WHERE toLower(e.name) = toLower($mention)
            RETURN e.entityId AS entityId, e.name AS name
            LIMIT 1
            """
            result = session.run(query, mention=mention)
            record = result.single()
            if record:
                return (record["entityId"], record["name"])
        return None
    
    def rank_candidates_by_similarity(self, question, candidates, model, top_k=5):
        """Rank candidates using sentence similarity"""
        if not candidates:
            return []
        
        candidate_texts = [name for _, name in candidates]
        question_embedding = model.encode([question])
        candidate_embeddings = model.encode(candidate_texts)
        
        similarities = cosine_similarity(question_embedding, candidate_embeddings)[0]
        
        scored_candidates = []
        for i, (entity_id, name) in enumerate(candidates):
            scored_candidates.append((entity_id, name, similarities[i]))
        
        scored_candidates.sort(key=lambda x: x[2], reverse=True)
        return scored_candidates[:top_k]
    
    def find_relation_paths_with_answers(self, start_entity_name, max_hops=2, max_paths=1000):
        """
        Find relation paths AND their final answer objects
        
        **UPDATED: Matches on 'name' instead of 'entityId'**
        
        Args:
            start_entity_name: The NAME of the starting entity (not entityId!)
            max_hops: Maximum number of hops
            max_paths: Maximum number of paths to return
        
        Returns: List of dictionaries with relation paths and answer entities
        """
        with self.driver.session(database=self.database) as session:
            if max_hops == 1:
                query = """
                MATCH (start:Entity {name: $start_entity})-[r]->(answer:Entity)
                RETURN type(r) AS rel, 
                    collect({id: answer.entityId, name: answer.name})[..20] AS answers
                LIMIT $max_paths
                """
                result = session.run(query, start_entity=start_entity_name, max_paths=max_paths)
                
                paths_with_answers = []
                for record in result:
                    rel = record["rel"]
                    answers = record["answers"]
                    paths_with_answers.append({
                        'relation_path': (rel,),
                        'answer_entities': answers
                    })
                return paths_with_answers
            
            elif max_hops == 2:
                # First, get all unique 2-hop relation path combinations
                query = """
                MATCH (start:Entity {name: $start_entity})-[r1]->(mid:Entity)-[r2]->(answer:Entity)
                WITH DISTINCT type(r1) AS rel1, type(r2) AS rel2
                RETURN rel1, rel2
                LIMIT $max_paths
                """
                result = session.run(query, start_entity=start_entity_name, max_paths=max_paths)
                relation_combinations = [(r["rel1"], r["rel2"]) for r in result]
                
                paths_with_answers = []
                seen_paths = set()
                
                for rel1, rel2 in relation_combinations:
                    # Add 1-hop path (if not already added)
                    if (rel1,) not in seen_paths:
                        one_hop_query = """
                        MATCH (start:Entity {name: $start_entity})-[r]->(answer:Entity)
                        WHERE type(r) = $rel
                        RETURN collect({id: answer.entityId, name: answer.name})[..20] AS answers
                        """
                        one_hop_result = session.run(one_hop_query, start_entity=start_entity_name, rel=rel1)
                        one_hop_record = one_hop_result.single()
                        if one_hop_record:
                            one_hop_answers = one_hop_record["answers"]
                            paths_with_answers.append({
                                'relation_path': (rel1,),
                                'answer_entities': one_hop_answers
                            })
                            seen_paths.add((rel1,))
                    
                    # Add 2-hop path with intermediate nodes
                    if (rel1, rel2) not in seen_paths:
                        two_hop_query = """
                        MATCH (start:Entity {name: $start_entity})-[r1]->(mid:Entity)-[r2]->(answer:Entity)
                        WHERE type(r1) = $rel1 AND type(r2) = $rel2
                        WITH collect(DISTINCT mid.name) AS intermediate_nodes,
                             collect(DISTINCT {id: answer.entityId, name: answer.name}) AS answers
                        RETURN intermediate_nodes[..20] AS intermediate_nodes,
                               answers[..20] AS answers
                        """
                        two_hop_result = session.run(two_hop_query, start_entity=start_entity_name, rel1=rel1, rel2=rel2)
                        two_hop_record = two_hop_result.single()
                        
                        if two_hop_record:
                            intermediate_nodes = two_hop_record["intermediate_nodes"]
                            answers = two_hop_record["answers"]
                            
                            paths_with_answers.append({
                                'relation_path': (rel1, rel2),
                                'answer_entities': answers,
                                'intermediate_nodes': intermediate_nodes
                            })
                            seen_paths.add((rel1, rel2))
                
                return paths_with_answers
        
        return []

In [3]:
# SPM Model
from sentence_transformers import SentenceTransformer
print("Loading similarity model...")
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

Loading similarity model...


### Loading Relevant data from WebQSP Train File

In [19]:
import json
import pandas as pd

#getting dataset
WebQSP_Path = "./WebQSP/data/WebQSP.train.json"
with open(WebQSP_Path, 'r', encoding="utf8") as file:
        webQSP_json = json.load(file)
questions = webQSP_json['Questions']
print(f"Loaded {len(questions)} questions from WebQSP train set.")

all_questions = []
all_mentions = []
all_mids = []  # Changed variable name to be plural
all_actual_mentions = []
all_chains = []
all_hops = []

# Make a dataframe from ProcessedQuestion and PotentialTopicEntityMention
for q in questions:
    if not q['Parses'][0]['InferentialChain']:
        continue
    question_text = q['RawQuestion']
    topic_entity_mention = q['Parses'][0]['PotentialTopicEntityMention']
    topic_entity_mid = q['Parses'][0]['TopicEntityMid']
    actual_topic_entity_mention = q['Parses'][0]['TopicEntityName']
    inferential_chain = q['Parses'][0]['InferentialChain']
    expected_hops = len(inferential_chain)
    
    all_questions.append(question_text)
    all_mentions.append(topic_entity_mention)
    all_mids.append(topic_entity_mid)  # Append to list
    all_actual_mentions.append(actual_topic_entity_mention)
    all_chains.append(inferential_chain)
    all_hops.append(expected_hops)

data = {
    'question': all_questions, 
    'mention': all_mentions,
    'entity_id': all_mids,  # Use the list instead of single value
    'actual_mention': all_actual_mentions, 
    'inferential_chain': all_chains, 
    'expected_hops': all_hops
}
data_df = pd.DataFrame(data)
print(f"Created DataFrame with {len(data_df)} rows")
data_df.head()

Loaded 3098 questions from WebQSP train set.
Created DataFrame with 3044 rows


Unnamed: 0,question,mention,entity_id,actual_mention,inferential_chain,expected_hops
0,what is the name of justin bieber brother?,justin bieber,m.06w2sn5,Justin Bieber,"[people.person.sibling_s, people.sibling_relat...",2
1,what character did natalie portman play in sta...,natalie portman,m.09l3p,Natalie Portman,"[film.actor.film, film.performance.character]",2
2,what country is the grand bahama island in?,grand bahama island,m.03st9j,Grand Bahama,[location.location.containedby],1
3,what kind of money to take to bahamas?,bahamas,m.0160w,Bahamas,[location.country.currency_used],1
4,what character did john noble play in lord of ...,john noble,m.02fgm7,John Noble,"[film.actor.film, film.performance.character]",2


In [6]:
#Neo4j Connection Configuration
NEO4J_URI = "neo4j://127.0.0.1:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "sakhil123"
NEO4J_Database = "kg1"

# Connect to Neo4j
retriever = Neo4jGraphRetriever(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD, NEO4J_Database)

Connected to Neo4j database 'kg1'


### Finding all Candidates

In [7]:
data_df['actual_candidate'] = None

##### Direct Search for Actual Candidate

In [8]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Configuration
MAX_WORKERS = 32  # Adjust based on your system

def fetch_actual_candidate(idx, mention):
    """Worker function to fetch actual candidate for a single mention"""
    try:
        candidate = retriever.find_actual_entity(mention)
        return idx, candidate, None
    except Exception as e:
        return idx, None, str(e)

# Find actual candidates for all questions (multithreaded)
print(f"Fetching actual candidates for {len(data_df)} questions using {MAX_WORKERS} threads...")

# Prepare tasks
tasks = [(idx, row['actual_mention']) for idx, row in data_df.iterrows()]

# Execute in parallel with progress bar
results = {}
errors = []

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit all tasks
    futures = {
        executor.submit(fetch_actual_candidate, idx, mention): idx
        for idx, mention in tasks
    }
    
    # Process results as they complete with REAL-TIME progress bar
    for future in tqdm(as_completed(futures), total=len(futures), desc="Finding Actual Candidates"):
        idx, candidate, error = future.result()
        
        if error:
            errors.append((idx, error))
            print(f"\nError at index {idx}: {error}")
        else:
            results[idx] = candidate

# Assign results back to dataframe
for idx, candidate in results.items():
    data_df.at[idx, 'actual_candidate'] = candidate

print(f"\nCompleted! Successfully processed {len(results)}/{len(data_df)} questions.")
if errors:
    print(f"Errors encountered: {len(errors)}")
    for idx, error in errors[:5]:  # Show first 5 errors
        print(f"  Index {idx}: {error}")

# Optional: Print statistics
found_count = len([c for c in results.values() if c is not None])
print(f"\nActual candidates found: {found_count}/{len(data_df)}")
    

Fetching actual candidates for 3044 questions using 32 threads...


Finding Actual Candidates: 100%|██████████| 3044/3044 [26:29<00:00,  1.91it/s]  


Completed! Successfully processed 3044/3044 questions.

Actual candidates found: 3019/3044





In [9]:
#Check data_df['actual_candidate'] to see if any are None and save their indices for further inspection
empty_candidates = data_df[data_df['actual_candidate'].isnull()].index.tolist()

In [10]:
empty_candidates

[37,
 191,
 487,
 638,
 852,
 918,
 1046,
 1190,
 1273,
 1290,
 1463,
 1477,
 1553,
 1576,
 1670,
 1707,
 1902,
 1965,
 2085,
 2256,
 2330,
 2506,
 2848,
 2872,
 3008]

In [17]:
data_df.iloc[3008,:]

question                what book did w.e.b. dubois wrote?
mention                                 w . e . b . dubois
mid                                               m.09c7w0
actual_mention                            W. E. B. Du Bois
inferential_chain    [book.author.book_editions_published]
expected_hops                                            1
actual_candidate                                      None
Name: 3008, dtype: object

In [11]:
empty_df = data_df.loc[empty_candidates]

In [87]:
#drop rows with empty actual_candidate
data_df = data_df.drop(index=empty_candidates).reset_index(drop=True)

In [88]:
data_df.to_pickle("webqsp_train_with_actual_candidates.pkl")

In [20]:
data_df.head()

Unnamed: 0,question,mention,entity_id,actual_mention,inferential_chain,expected_hops
0,what is the name of justin bieber brother?,justin bieber,m.06w2sn5,Justin Bieber,"[people.person.sibling_s, people.sibling_relat...",2
1,what character did natalie portman play in sta...,natalie portman,m.09l3p,Natalie Portman,"[film.actor.film, film.performance.character]",2
2,what country is the grand bahama island in?,grand bahama island,m.03st9j,Grand Bahama,[location.location.containedby],1
3,what kind of money to take to bahamas?,bahamas,m.0160w,Bahamas,[location.country.currency_used],1
4,what character did john noble play in lord of ...,john noble,m.02fgm7,John Noble,"[film.actor.film, film.performance.character]",2


In [90]:
import pickle
#save data_df and empty_df
data_df.to_pickle("WebQSP_train_with_actual_candidates.pkl")
empty_df.to_pickle("WebQSP_train_empty_actual_candidates.pkl")

#save in csv as well
data_df.to_csv("WebQSP_train_with_actual_candidates.csv", index=False)
empty_df.to_csv("WebQSP_train_empty_actual_candidates.csv", index=False)

In [6]:
import pandas as pd
data_df = pd.read_pickle("webqsp_train_with_actual_candidates.pkl")

### BFS

In [7]:
data_df.head()

Unnamed: 0,question,mention,mid,actual_mention,inferential_chain,expected_hops,actual_candidate,top_k_5_paths,top_k_10_paths,top_k_30_paths,top_p_0.1_paths,top_p_0.2_paths,top_p_0.3_paths,top_p_0.5_paths,ranked_paths
0,what is the name of justin bieber brother?,justin bieber,m.09c7w0,Justin Bieber,"[people.person.sibling_s, people.sibling_relat...",2,"(Justin Bieber, Justin Bieber)",,,,,,,,
1,what character did natalie portman play in sta...,natalie portman,m.09c7w0,Natalie Portman,"[film.actor.film, film.performance.character]",2,"(Natalie Portman, Natalie Portman)",,,,,,,,
2,what country is the grand bahama island in?,grand bahama island,m.09c7w0,Grand Bahama,[location.location.containedby],1,"(Grand Bahama, Grand Bahama)",,,,,,,,
3,what kind of money to take to bahamas?,bahamas,m.09c7w0,Bahamas,[location.country.currency_used],1,"(Bahamas, Bahamas)",,,,,,,,
4,what character did john noble play in lord of ...,john noble,m.09c7w0,John Noble,"[film.actor.film, film.performance.character]",2,"(John Noble, John Noble)",,,,,,,,


In [None]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm

# Configuration
MAX_WORKERS = 32  # Adjust based on your system

# **FIX: Initialize the column first**
data_df['actual_candidate_paths'] = None

def fetch_paths_for_actual_candidate(idx, actual_candidate, max_hops):
    """Worker function to fetch relation paths WITH answers for the actual/ground-truth candidate"""
    try:
        if not actual_candidate or actual_candidate is None:
            return idx, None, None
        
        # Unpack the actual candidate tuple
        entity_id, entity_name = actual_candidate
        
        # **UPDATED: Pass entity_name instead of entity_id**
        paths_with_answers = retriever.find_relation_paths_with_answers(
            entity_name, max_hops=max_hops, max_paths=1000  # ← Use name, not ID
        )
        
        if not paths_with_answers:
            return idx, None, None
        
        # Store the actual candidate with its paths and answers
        actual_candidate_data = {
            'candidate_id': entity_id,
            'candidate_name': entity_name,
            'paths_with_answers': paths_with_answers
        }
        
        return idx, actual_candidate_data, None
        
    except Exception as e:
        return idx, None, str(e)

# Fetch paths for actual candidates only (multithreaded)
print(f"Fetching relation paths WITH answers for ACTUAL candidates across {len(data_df)} questions using {MAX_WORKERS} threads...")

# Prepare tasks - use actual_candidate and expected_hops from dataframe
tasks = [
    (idx, row['actual_candidate'], row['expected_hops'])
    for idx, row in data_df.iterrows()
]

# Execute in parallel with progress bar
results = {}
errors = []

with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
    # Submit all tasks
    futures = {
        executor.submit(fetch_paths_for_actual_candidate, idx, candidate, hops): idx
        for idx, candidate, hops in tasks
    }
    
    # Process results as they complete with REAL-TIME progress bar
    for future in tqdm(as_completed(futures), total=len(futures), desc="BFS on Actual Candidates"):
        idx, candidate_data, error = future.result()
        
        if error:
            errors.append((idx, error))
            print(f"\nError at index {idx}: {error}")
        elif candidate_data:
            results[idx] = candidate_data

# Assign results back to dataframe
for idx, candidate_data in results.items():
    data_df.at[idx, 'actual_candidate_paths'] = candidate_data

print(f"\nCompleted! Successfully processed {len(results)}/{len(data_df)} questions.")
if errors:
    print(f"Errors encountered: {len(errors)}")
    for idx, error in errors[:5]:  # Show first 5 errors
        print(f"  Index {idx}: {error}")

# Optional: Print statistics
successful_count = len([r for r in results.values() if r])
print(f"\nActual candidates with paths found: {successful_count}/{len(data_df)}")

# Optional: Print total paths statistics
total_paths = sum(
    len(results[idx]['paths_with_answers']) 
    for idx in results 
    if results[idx]
)
avg_paths = total_paths / successful_count if successful_count > 0 else 0
print(f"Average paths per actual candidate: {avg_paths:.2f}")

# Save intermediate results
print("\nSaving results with actual candidate paths and answers...")
data_df.to_csv('./webqsp_with_actual_candidate_paths.csv', index=False)
print("Done!")

Fetching relation paths WITH answers for ACTUAL candidates across 3017 questions using 32 threads...


BFS on Actual Candidates: 100%|██████████| 3017/3017 [01:51<00:00, 27.16it/s]



Completed! Successfully processed 2941/3017 questions.

Actual candidates with paths found: 2941/3017
Average paths per actual candidate: 80.43

Saving results with actual candidate paths and answers...
Done!


In [12]:
data_df.question[1]

'what character did natalie portman play in star wars?'

In [13]:
data_df.inferential_chain[1]

['film.actor.film', 'film.performance.character']

In [14]:
data_df['actual_candidate_paths'][1]

{'candidate_id': 'Natalie Portman',
 'candidate_name': 'Natalie Portman',
 'paths_with_answers': [{'relation_path': ('is-a',),
   'answer_entities': [{'id': 'Author', 'name': 'Author'},
    {'id': 'Musical Artist', 'name': 'Musical Artist'},
    {'id': 'Person', 'name': 'Person'},
    {'id': 'Topic', 'name': 'Topic'},
    {'id': 'Actor', 'name': 'Actor'},
    {'id': 'Film actor', 'name': 'Film actor'},
    {'id': 'Film director', 'name': 'Film director'},
    {'id': 'Person or entity appearing in film',
     'name': 'Person or entity appearing in film'},
    {'id': 'Film Crew', 'name': 'Film Crew'},
    {'id': 'Performance Artist', 'name': 'Performance Artist'},
    {'id': 'TV Actor', 'name': 'TV Actor'},
    {'id': 'Ranked item', 'name': 'Ranked item'},
    {'id': 'Film writer', 'name': 'Film writer'},
    {'id': 'Award Winner', 'name': 'Award Winner'},
    {'id': 'Award Nominee', 'name': 'Award Nominee'},
    {'id': 'Film Producer', 'name': 'Film Producer'},
    {'id': 'TV programme 

In [140]:
from tqdm import tqdm

def filter_tuples_by_expected_hops(processed_paths_tuples, expected_hops):
    """
    Filter path tuples based on expected hop count.
    - If expected_hops = 1: Keep only 3-element tuples (subject, relation, object)
    - If expected_hops = 2: Keep only 5-element tuples (subject, rel1, intermediate, rel2, object)
    
    Returns: Filtered list of tuples
    """
    if not processed_paths_tuples:
        return []
    
    filtered_tuples = []
    
    for path_tuple in processed_paths_tuples:
        tuple_length = len(path_tuple)
        
        if expected_hops == 1 and tuple_length == 3:
            # Keep 1-hop paths (3 elements)
            filtered_tuples.append(path_tuple)
        elif expected_hops == 2 and tuple_length == 5:
            # Keep 2-hop paths (5 elements)
            filtered_tuples.append(path_tuple)
    
    return filtered_tuples

# Apply filtering to all rows
print("Filtering path tuples based on expected hop count...")
print(f"Processing {len(data_df)} questions...")

for idx, row in tqdm(data_df.iterrows(), total=len(data_df), desc="Filtering tuples"):
    processed_paths_tuples = row.get('processed_paths_tuples')
    expected_hops = row.get('expected_hops')
    
    if processed_paths_tuples and expected_hops:
        # Filter tuples based on expected hops
        filtered_tuples = filter_tuples_by_expected_hops(processed_paths_tuples, expected_hops)
        
        # Update the dataframe with filtered tuples
        data_df.at[idx, 'processed_paths_tuples'] = filtered_tuples

# Show statistics
print("\n" + "="*80)
print("Filtering Statistics")
print("="*80)

total_before = data_df['processed_paths_tuples'].apply(lambda x: len(x) if x else 0).sum()
print(f"\nTotal tuples before filtering: {total_before}")

# Count by hop type
hop_1_count = len(data_df[data_df['expected_hops'] == 1])
hop_2_count = len(data_df[data_df['expected_hops'] == 2])
print(f"\nQuestions by hop count:")
print(f"  1-hop questions: {hop_1_count}")
print(f"  2-hop questions: {hop_2_count}")

# Show sample
print("\n" + "="*80)
print("SAMPLE: Filtered Tuples")
print("="*80)

# Find a 2-hop example
two_hop_idx = data_df[data_df['expected_hops'] == 2].index[0] if len(data_df[data_df['expected_hops'] == 2]) > 0 else 0

if len(data_df) > 0 and data_df['processed_paths_tuples'][two_hop_idx]:
    sample_tuples = data_df['processed_paths_tuples'][two_hop_idx]
    print(f"\nQuestion (2-hop): {data_df['question'][two_hop_idx]}")
    print(f"Expected hops: {data_df['expected_hops'][two_hop_idx]}")
    print(f"Candidate: {data_df['actual_candidate'][two_hop_idx][1]}")
    print(f"Total tuples after filtering: {len(sample_tuples)}")
    print(f"\nSample Tuples (showing first 5):")
    for i, path_tuple in enumerate(sample_tuples[:5], 1):
        print(f"{i}. Length {len(path_tuple)}: {path_tuple}")

# Save filtered results
print("\n\nSaving filtered results...")
data_df.to_pickle('./data_df_with_filtered_tuples.pkl')
print("✅ Saved to data_df_with_filtered_tuples.pkl")

Filtering path tuples based on expected hop count...
Processing 2919 questions...


Filtering tuples: 100%|██████████| 2919/2919 [00:00<00:00, 18855.41it/s]


Filtering Statistics

Total tuples before filtering: 618019

Questions by hop count:
  1-hop questions: 1829
  2-hop questions: 1090

SAMPLE: Filtered Tuples

Question (2-hop): what is the name of justin bieber brother?
Expected hops: 2
Candidate: Justin Bieber
Total tuples after filtering: 912

Sample Tuples (showing first 5):
1. Length 5: ('Justin Bieber', 'is-a', 'Author', 'Specialization Of', ('Writer', 'Artist', 'Performance Artist', 'Musician (Profession)', 'Singer', 'Manager', 'Music Producer', 'Business executive', 'Music executive', 'Coach', 'Ironmaster', 'Film Crew'))
2. Length 5: ('Justin Bieber', 'is-a', 'Musician (Profession)', 'Specialization Of', ('Writer', 'Artist', 'Performance Artist', 'Musician (Profession)', 'Singer', 'Manager', 'Music Producer', 'Business executive', 'Music executive', 'Coach', 'Ironmaster', 'Film Crew'))
3. Length 5: ('Justin Bieber', 'is-a', 'Singer', 'Specialization Of', ('Writer', 'Artist', 'Performance Artist', 'Musician (Profession)', 'Singe




✅ Saved to data_df_with_filtered_tuples.pkl


### Rank Processed Path Strings by Top K Cosine Similarity

In [148]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity

# Configuration
MAX_WORKERS = 32

def rank_processed_path_strings(idx, question, processed_paths, model, top_k=10):
    """Worker function to rank processed path strings using cosine similarity"""
    try:
        if not processed_paths or len(processed_paths) == 0:
            return idx, None, None
        
        # Encode question and path strings
        question_embedding = model.encode([question])
        path_embeddings = model.encode(processed_paths)
        
        # Calculate cosine similarities
        similarities = cosine_similarity(question_embedding, path_embeddings)[0]
        
        # Create scored paths with their similarity scores
        scored_paths = []
        for i, path_string in enumerate(processed_paths):
            scored_paths.append({
                'path_string': path_string,
                'similarity': float(similarities[i])
            })
        
        # Sort by similarity (descending) and take top-k
        scored_paths.sort(key=lambda x: x['similarity'], reverse=True)
        top_k_ranked = scored_paths[:top_k]
        
        return idx, top_k_ranked, None
        
    except Exception as e:
        return idx, None, str(e)

# Initialize column for ranked string paths
data_df['ranked_string_paths_top_5'] = None
data_df['ranked_string_paths_top_10'] = None
data_df['ranked_string_paths_top_30'] = None

print("Ranking processed path strings by similarity to questions...")
print(f"Processing {len(data_df)} questions using {MAX_WORKERS} threads...")

# We'll run this for different k values
for k_value, column_name in [(5, 'ranked_string_paths_top_5'), 
                              (10, 'ranked_string_paths_top_10'), 
                              (30, 'ranked_string_paths_top_30')]:
    
    print(f"\n{'='*80}")
    print(f"Ranking with top-{k_value}")
    print('='*80)
    
    # Prepare tasks
    tasks = [
        (idx, row['question'], row['processed_paths_tuples'])
        for idx, row in data_df.iterrows()
    ]
    
    # Execute in parallel with progress bar
    results = {}
    errors = []
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks
        futures = {
            executor.submit(rank_processed_path_strings, idx, question, paths, model, top_k=k_value): idx
            for idx, question, paths in tasks
        }
        
        # Process results as they complete
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Ranking (k={k_value})"):
            idx, ranked_paths, error = future.result()
            
            if error:
                errors.append((idx, error))
                print(f"\nError at index {idx}: {error}")
            elif ranked_paths:
                results[idx] = ranked_paths
    
    # Assign results back to dataframe
    for idx, ranked_paths in results.items():
        data_df.at[idx, column_name] = ranked_paths
    
    print(f"Completed! Successfully ranked {len(results)}/{len(data_df)} questions for k={k_value}.")
    if errors:
        print(f"Errors encountered: {len(errors)}")

# Show sample results
print("\n" + "="*80)
print("SAMPLE: Top-5 Ranked Path Strings for Question 0")
print("="*80 + "\n")

if len(data_df) > 0 and data_df['ranked_string_paths_top_5'][0]:
    print(f"Question: {data_df['question'][0]}")
    print(f"Candidate: {data_df['actual_candidate'][0][1]}")
    print(f"\nTop-5 Ranked Paths:")
    for i, path_data in enumerate(data_df['ranked_string_paths_top_5'][0], 1):
        print(f"{i}. [Score: {path_data['similarity']:.4f}] {path_data['path_string']}")

# Save results
print("\nSaving results with ranked string paths...")
data_df.to_pickle('./data_df_with_ranked_string_paths.pkl')
print("Done!")

Ranking processed path strings by similarity to questions...
Processing 2919 questions using 32 threads...

Ranking with top-5


Ranking (k=5): 100%|██████████| 2919/2919 [12:58<00:00,  3.75it/s]



Completed! Successfully ranked 2919/2919 questions for k=5.

Ranking with top-10


Ranking (k=10): 100%|██████████| 2919/2919 [12:51<00:00,  3.78it/s]



Completed! Successfully ranked 2919/2919 questions for k=10.

Ranking with top-30


Ranking (k=30): 100%|██████████| 2919/2919 [11:33<00:00,  4.21it/s]



Completed! Successfully ranked 2919/2919 questions for k=30.

SAMPLE: Top-5 Ranked Path Strings for Question 0

Question: what is the name of justin bieber brother?
Candidate: Justin Bieber

Top-5 Ranked Paths:
1. [Score: 0.8522] ('Justin Bieber', 'Sibling', 'Jazmyn Bieber', 'is-a', ('Person', 'Topic'))
2. [Score: 0.8522] ('Justin Bieber', 'Sibling', 'Jaxon Bieber', 'is-a', ('Person', 'Topic'))
3. [Score: 0.8522] ('Justin Bieber', 'Sibling', 'Jazmyn Bieber', 'Date of birth', ('"2008-05-30"^^<http://www.w3.org/2001/XMLSchema#date>', '"2009-11-20"^^<http://www.w3.org/2001/XMLSchema#date>'))
4. [Score: 0.8522] ('Justin Bieber', 'Sibling', 'Jaxon Bieber', 'Date of birth', ('"2008-05-30"^^<http://www.w3.org/2001/XMLSchema#date>', '"2009-11-20"^^<http://www.w3.org/2001/XMLSchema#date>'))
5. [Score: 0.8522] ('Justin Bieber', 'Sibling', 'Jazmyn Bieber', 'Gender', ('Female', 'Male'))

Saving results with ranked string paths...
Done!
Done!


In [7]:
data_df=pd.read_pickle('./data_df_with_ranked_string_paths.pkl')

In [9]:
data_df.ranked_string_paths_top_5[0]

[{'path_string': ('Justin Bieber',
   'Sibling',
   'Jazmyn Bieber',
   'is-a',
   ('Person', 'Topic')),
  'similarity': 0.8521941304206848},
 {'path_string': ('Justin Bieber',
   'Sibling',
   'Jaxon Bieber',
   'is-a',
   ('Person', 'Topic')),
  'similarity': 0.8521941304206848},
 {'path_string': ('Justin Bieber',
   'Sibling',
   'Jazmyn Bieber',
   'Date of birth',
   ('"2008-05-30"^^<http://www.w3.org/2001/XMLSchema#date>',
    '"2009-11-20"^^<http://www.w3.org/2001/XMLSchema#date>')),
  'similarity': 0.8521941304206848},
 {'path_string': ('Justin Bieber',
   'Sibling',
   'Jaxon Bieber',
   'Date of birth',
   ('"2008-05-30"^^<http://www.w3.org/2001/XMLSchema#date>',
    '"2009-11-20"^^<http://www.w3.org/2001/XMLSchema#date>')),
  'similarity': 0.8521941304206848},
 {'path_string': ('Justin Bieber',
   'Sibling',
   'Jazmyn Bieber',
   'Gender',
   ('Female', 'Male')),
  'similarity': 0.8521941304206848}]

### Rank Processed Path Strings by Top P (Nucleus Sampling)

In [12]:
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm import tqdm
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Configuration
MAX_WORKERS = 32

def rank_processed_path_strings_top_p(idx, question, processed_paths, model, p=0.5):
    """
    Worker function to rank processed path strings using top-p (nucleus sampling)
    
    Top-p sampling: Select the smallest set of paths whose cumulative similarity >= p
    This means we keep adding paths (sorted by similarity) until their cumulative 
    probability mass reaches threshold p.
    """
    try:
        if not processed_paths or len(processed_paths) == 0:
            return idx, None, None
        
        # Encode question and path strings
        question_embedding = model.encode([question])
        path_embeddings = model.encode(processed_paths)
        
        # Calculate cosine similarities
        similarities = cosine_similarity(question_embedding, path_embeddings)[0]
        
        # Create scored paths with their similarity scores
        scored_paths = []
        for i, path_string in enumerate(processed_paths):
            scored_paths.append({
                'path_string': path_string,
                'similarity': float(similarities[i])
            })
        
        # Sort by similarity (descending)
        scored_paths.sort(key=lambda x: x['similarity'], reverse=True)
        
        # Normalize similarities to get probability distribution (softmax)
        sim_scores = np.array([p['similarity'] for p in scored_paths])
        # Use softmax to convert to probabilities
        exp_scores = np.exp(sim_scores - np.max(sim_scores))  # Subtract max for numerical stability
        probabilities = exp_scores / exp_scores.sum()
        
        # Calculate cumulative probabilities
        cumulative_probs = np.cumsum(probabilities)
        
        # Find the cutoff index where cumulative probability >= p
        cutoff_idx = np.searchsorted(cumulative_probs, p) + 1  # +1 to include the threshold-crossing element
        cutoff_idx = min(cutoff_idx, len(scored_paths))  # Ensure we don't exceed array bounds
        
        # Select top-p paths
        top_p_paths = scored_paths[:cutoff_idx]
        
        # Add probability information to each path
        for i, path in enumerate(top_p_paths):
            path['probability'] = float(probabilities[i])
            path['cumulative_probability'] = float(cumulative_probs[i])
        
        return idx, top_p_paths, None
        
    except Exception as e:
        return idx, None, str(e)

# Initialize columns for top-p ranked string paths
data_df['ranked_string_paths_top_p_0.1'] = None
data_df['ranked_string_paths_top_p_0.2'] = None
data_df['ranked_string_paths_top_p_0.3'] = None
data_df['ranked_string_paths_top_p_0.5'] = None

print("Ranking processed path strings using Top-P (nucleus sampling)...")
print(f"Processing {len(data_df)} questions using {MAX_WORKERS} threads...")

# Run for different p values
for p_value, column_name in [(0.1, 'ranked_string_paths_top_p_0.1'), 
                              (0.2, 'ranked_string_paths_top_p_0.2'), 
                              (0.3, 'ranked_string_paths_top_p_0.3'),
                              (0.5, 'ranked_string_paths_top_p_0.5')]:
    
    print(f"\n{'='*80}")
    print(f"Ranking with top-p = {p_value}")
    print('='*80)
    
    # Prepare tasks
    tasks = [
        (idx, row['question'], row['processed_paths_tuples'])
        for idx, row in data_df.iterrows()
    ]
    
    # Execute in parallel with progress bar
    results = {}
    errors = []
    
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        # Submit all tasks
        futures = {
            executor.submit(rank_processed_path_strings_top_p, idx, question, paths, model, p=p_value): idx
            for idx, question, paths in tasks
        }
        
        # Process results as they complete
        for future in tqdm(as_completed(futures), total=len(futures), desc=f"Ranking (p={p_value})"):
            idx, ranked_paths, error = future.result()
            
            if error:
                errors.append((idx, error))
                print(f"\nError at index {idx}: {error}")
            elif ranked_paths:
                results[idx] = ranked_paths
    
    # Assign results back to dataframe
    for idx, ranked_paths in results.items():
        data_df.at[idx, column_name] = ranked_paths
    
    print(f"Completed! Successfully ranked {len(results)}/{len(data_df)} questions for p={p_value}.")
    if errors:
        print(f"Errors encountered: {len(errors)}")

# Show sample results for different p values
print("\n" + "="*80)
print("SAMPLE: Top-P Ranked Path Strings for Question 0")
print("="*80 + "\n")

if len(data_df) > 0:
    print(f"Question: {data_df['question'][0]}")
    print(f"Candidate: {data_df['actual_candidate'][0][1]}")
    
    for p_val, col_name in [(0.1, 'ranked_string_paths_top_p_0.1'),
                             (0.2, 'ranked_string_paths_top_p_0.2'),
                             (0.3, 'ranked_string_paths_top_p_0.3'),
                             (0.5, 'ranked_string_paths_top_p_0.5')]:
        if data_df[col_name][0]:
            print(f"\n--- Top-p = {p_val} ({len(data_df[col_name][0])} paths) ---")
            for i, path_data in enumerate(data_df[col_name][0][:3], 1):  # Show first 3
                print(f"{i}. [Sim: {path_data['similarity']:.4f}, Prob: {path_data['probability']:.4f}, "
                      f"Cum: {path_data['cumulative_probability']:.4f}]")
                print(f"   {path_data['path_string']}")

# Save results
print("\n\nSaving results with top-p ranked string paths...")
data_df.to_pickle('./data_df_with_top_p_ranked_string_paths.pkl')
print("Done!")

Ranking processed path strings using Top-P (nucleus sampling)...
Processing 2919 questions using 32 threads...

Ranking with top-p = 0.1


Ranking (p=0.1): 100%|██████████| 2919/2919 [14:45<00:00,  3.30it/s]


Completed! Successfully ranked 2919/2919 questions for p=0.1.

Ranking with top-p = 0.2


Ranking (p=0.2): 100%|██████████| 2919/2919 [10:21<00:00,  4.70it/s]


Completed! Successfully ranked 2919/2919 questions for p=0.2.

Ranking with top-p = 0.3


Ranking (p=0.3): 100%|██████████| 2919/2919 [07:55<00:00,  6.13it/s]


Completed! Successfully ranked 2919/2919 questions for p=0.3.

Ranking with top-p = 0.5


Ranking (p=0.5): 100%|██████████| 2919/2919 [07:51<00:00,  6.19it/s]


Completed! Successfully ranked 2919/2919 questions for p=0.5.

SAMPLE: Top-P Ranked Path Strings for Question 0

Question: what is the name of justin bieber brother?
Candidate: Justin Bieber

--- Top-p = 0.1 (86 paths) ---
1. [Sim: 0.8522, Prob: 0.0013, Cum: 0.0013]
   ('Justin Bieber', 'Sibling', 'Jazmyn Bieber', 'is-a', ('Person', 'Topic'))
2. [Sim: 0.8522, Prob: 0.0013, Cum: 0.0026]
   ('Justin Bieber', 'Sibling', 'Jaxon Bieber', 'is-a', ('Person', 'Topic'))
3. [Sim: 0.8522, Prob: 0.0013, Cum: 0.0038]
   ('Justin Bieber', 'Sibling', 'Jazmyn Bieber', 'Date of birth', ('"2008-05-30"^^<http://www.w3.org/2001/XMLSchema#date>', '"2009-11-20"^^<http://www.w3.org/2001/XMLSchema#date>'))

--- Top-p = 0.2 (174 paths) ---
1. [Sim: 0.8522, Prob: 0.0013, Cum: 0.0013]
   ('Justin Bieber', 'Sibling', 'Jazmyn Bieber', 'is-a', ('Person', 'Topic'))
2. [Sim: 0.8522, Prob: 0.0013, Cum: 0.0026]
   ('Justin Bieber', 'Sibling', 'Jaxon Bieber', 'is-a', ('Person', 'Topic'))
3. [Sim: 0.8522, Prob: 0.0013, C

In [None]:
data_df.question[2]

'where is jamarcus russell from?'

In [None]:
data_df.actual_candidate[2]

('JaMarcus Russell', 'JaMarcus Russell')

In [150]:
data_df.inferential_chain[2]

['location.location.containedby']

In [151]:
data_df.ranked_string_paths_top_30[2]

[{'path_string': ('Grand Bahama',
   'Area',
   ('"1373.0"^^<http://www.w3.org/2001/XMLSchema#float>',)),
  'similarity': 0.7593859434127808},
 {'path_string': ('Grand Bahama',
   'is-a',
   ('Statistical region',
    'Topic',
    'Location',
    'Island',
    'Geographical Feature')),
  'similarity': 0.7303248047828674},
 {'path_string': ('Grand Bahama',
   'Contained by',
   ('Northern Hemisphere',
    'Western Hemisphere',
    'North America',
    'Americas',
    'World Ocean',
    'Bahamas',
    'Atlantic Ocean',
    'Caribbean Sea',
    'Caribbean',
    'Lucayan Archipelago')),
  'similarity': 0.7257298231124878},
 {'path_string': ('Grand Bahama',
   'Population',
   ('"51756"^^<http://www.w3.org/2001/XMLSchema#int>',)),
  'similarity': 0.7179986238479614},
 {'path_string': ('Grand Bahama',
   'people/place_lived/location',
   ('Demetrius Pinder',)),
  'similarity': 0.7158254384994507},
 {'path_string': ('Grand Bahama', 'Nearby airport', ('West End Airport',)),
  'similarity': 0.6

### Extract Ground Truth Paths from Knowledge Graph

In [13]:
import json
import pandas as pd

# Load the JSON file
json_file_path = "./WebQSP/data/WebQSP.train.json"
with open(json_file_path, 'r', encoding="utf8") as file:
    webqsp_data = json.load(file)

questions = webqsp_data['Questions']

# Extract relevant information
extracted_data = []

for question in questions:
    raw_question = question['RawQuestion']
    
    # Process each parse
    if 'Parses' in question and len(question['Parses']) > 0:
        answer_list=[]
        for parse in question['Parses']:
            # Skip if no inferential chain
            if not parse.get('InferentialChain'):
                continue
            
            inferential_chain = parse['InferentialChain']
            topic_entity_name = parse.get('TopicEntityName')
            topic_entity_mid = parse.get('TopicEntityMid')
            
            # Extract answers
            answers = parse.get('Answers', [])
            if len(answers) == 0:
                continue
                
            for answer in answers:
                answer_list.append(answer.get('EntityName'))
                                
        extracted_data.append({
            'question': raw_question,
            'topic_entity': topic_entity_name,
            'topic_entity_mid': topic_entity_mid,
            'inferential_chain': inferential_chain,
            'answers': answer_list,
            'num_hops': len(inferential_chain)
        })

# Create DataFrame
answers_df = pd.DataFrame(extracted_data)

print(f"✅ Created answers_df with {len(answers_df)} rows")
print(f"\nDataFrame shape: {answers_df.shape}")
print(f"Columns: {list(answers_df.columns)}")

# Show statistics
print(f"\n{'='*80}")
print("Statistics:")
print(f"{'='*80}")
print(f"Total rows: {len(answers_df)}")
print(f"\nHop distribution:")
print(answers_df['num_hops'].value_counts().sort_index())

print(f"\nFirst few rows:")
print(answers_df.head(3))

# Save the DataFrame
answers_df.to_csv('answers_df.csv', index=False)
answers_df.to_pickle('answers_df.pkl')
print("\n✅ Saved to answers_df.csv and answers_df.pkl")

✅ Created answers_df with 3098 rows

DataFrame shape: (3098, 6)
Columns: ['question', 'topic_entity', 'topic_entity_mid', 'inferential_chain', 'answers', 'num_hops']

Statistics:
Total rows: 3098

Hop distribution:
num_hops
1    1911
2    1187
Name: count, dtype: int64

First few rows:
                                            question     topic_entity  \
0         what is the name of justin bieber brother?    Justin Bieber   
1  what character did natalie portman play in sta...  Natalie Portman   
2        what country is the grand bahama island in?     Grand Bahama   

  topic_entity_mid                                  inferential_chain  \
0        m.06w2sn5  [people.person.sibling_s, people.sibling_relat...   
1          m.09l3p      [film.actor.film, film.performance.character]   
2         m.03st9j                    [location.location.containedby]   

           answers  num_hops  
0   [Jaxon Bieber]         2  
1  [Padmé Amidala]         2  
2        [Bahamas]         1  

✅ 

In [14]:
len(answers_df)

3098

In [15]:
question_to_answers = {}
for idx, row in answers_df.iterrows():
    question = row['question']
    topic_entity = row['topic_entity']
    answers = row['answers']
    
    # Create ground truth tuples: (subject, object)
    answer_tuples = [(topic_entity, answers)]
    
    if question not in question_to_answers:
        question_to_answers[question] = []
    
    question_to_answers[question].extend(answer_tuples)

# Map answers to data_df using the question text
data_df['actual_candidate_answers'] = data_df['question'].map(
    lambda q: question_to_answers.get(q, [])
)

In [16]:
# rows where actual_candidate_answers list len is 0
empty_answer_candidates1 = data_df[data_df['actual_candidate_answers'].apply(len) == 0].index.tolist()

In [17]:
empty_answer_candidates1

[]

In [33]:
#drop not used columns such as top_k_5_paths
data_df = data_df.drop(columns=[
    'top_k_5_paths', 
    'top_k_10_paths', 
    'top_k_30_paths'
])

In [35]:
#drop not used columns like top_p_0.1_paths
data_df = data_df.drop(columns=[
    'top_p_0.1_paths', 
    'top_p_0.2_paths', 
    'top_p_0.3_paths', 
    'top_p_0.5_paths'
])

In [36]:
data_df.head()

Unnamed: 0,question,mention,mid,actual_mention,inferential_chain,expected_hops,actual_candidate,ranked_paths,actual_candidate_paths,processed_paths_tuples,ranked_string_paths_top_5,ranked_string_paths_top_10,ranked_string_paths_top_30,ranked_string_paths_top_p_0.1,ranked_string_paths_top_p_0.2,ranked_string_paths_top_p_0.3,ranked_string_paths_top_p_0.5,correct_path_tuples_v2,ground_truth_answers,actual_candidate_answers
0,what is the name of justin bieber brother?,justin bieber,m.09c7w0,Justin Bieber,"[people.person.sibling_s, people.sibling_relat...",2,"(Justin Bieber, Justin Bieber)",,"{'candidate_id': 'Justin Bieber', 'candidate_n...","[(Justin Bieber, is-a, Author, Specialization ...","[{'path_string': ('Justin Bieber', 'Sibling', ...","[{'path_string': ('Justin Bieber', 'Sibling', ...","[{'path_string': ('Justin Bieber', 'Sibling', ...","[{'path_string': ('Justin Bieber', 'Sibling', ...","[{'path_string': ('Justin Bieber', 'Sibling', ...","[{'path_string': ('Justin Bieber', 'Sibling', ...","[{'path_string': ('Justin Bieber', 'Sibling', ...","[(Justin Bieber, is-a, Author, Specialization ...",{m.0gxnnwq},"[(Justin Bieber, [Jaxon Bieber])]"
1,what character did natalie portman play in sta...,natalie portman,m.09c7w0,Natalie Portman,"[film.actor.film, film.performance.character]",2,"(Natalie Portman, Natalie Portman)",,"{'candidate_id': 'Natalie Portman', 'candidate...","[(Natalie Portman, is-a, Author, Specializatio...","[{'path_string': ('Natalie Portman', 'Film app...","[{'path_string': ('Natalie Portman', 'Film app...","[{'path_string': ('Natalie Portman', 'Film app...","[{'path_string': ('Natalie Portman', 'Film app...","[{'path_string': ('Natalie Portman', 'Film app...","[{'path_string': ('Natalie Portman', 'Film app...","[{'path_string': ('Natalie Portman', 'Film app...","[(Natalie Portman, is-a, Author, Specializatio...",{m.0drf_},"[(Natalie Portman, [Padmé Amidala])]"
2,what country is the grand bahama island in?,grand bahama island,m.09c7w0,Grand Bahama,[location.location.containedby],1,"(Grand Bahama, Grand Bahama)",,"{'candidate_id': 'Grand Bahama', 'candidate_na...","[(Grand Bahama, is-a, (Statistical region, Top...","[{'path_string': ('Grand Bahama', 'Area', ('""1...","[{'path_string': ('Grand Bahama', 'Area', ('""1...","[{'path_string': ('Grand Bahama', 'Area', ('""1...","[{'path_string': ('Grand Bahama', 'Area', ('""1...","[{'path_string': ('Grand Bahama', 'Area', ('""1...","[{'path_string': ('Grand Bahama', 'Area', ('""1...","[{'path_string': ('Grand Bahama', 'Area', ('""1...","[(Grand Bahama, is-a, Statistical region), (Gr...",{m.0160w},"[(Grand Bahama, [Bahamas])]"
3,what kind of money to take to bahamas?,bahamas,m.09c7w0,Bahamas,[location.country.currency_used],1,"(Bahamas, Bahamas)",,"{'candidate_id': 'Bahamas', 'candidate_name': ...","[(Bahamas, is-a, (Statistical region, Topic, L...","[{'path_string': ('Bahamas', 'Currency Used', ...","[{'path_string': ('Bahamas', 'Currency Used', ...","[{'path_string': ('Bahamas', 'Currency Used', ...","[{'path_string': ('Bahamas', 'Currency Used', ...","[{'path_string': ('Bahamas', 'Currency Used', ...","[{'path_string': ('Bahamas', 'Currency Used', ...","[{'path_string': ('Bahamas', 'Currency Used', ...","[(Bahamas, is-a, Statistical region), (Bahamas...",{m.01l6dm},"[(Bahamas, [Bahamian dollar])]"
4,what character did john noble play in lord of ...,john noble,m.09c7w0,John Noble,"[film.actor.film, film.performance.character]",2,"(John Noble, John Noble)",,"{'candidate_id': 'John Noble', 'candidate_name...","[(John Noble, is-a, Actor, is-a, (Topic, Liter...","[{'path_string': ('John Noble', 'Starring TV r...","[{'path_string': ('John Noble', 'Starring TV r...","[{'path_string': ('John Noble', 'Starring TV r...","[{'path_string': ('John Noble', 'Starring TV r...","[{'path_string': ('John Noble', 'Starring TV r...","[{'path_string': ('John Noble', 'Starring TV r...","[{'path_string': ('John Noble', 'Starring TV r...","[(John Noble, is-a, Actor, is-a, Topic), (John...",{m.0gp7f},"[(John Noble, [Denethor II])]"


In [132]:
data_df.drop(index=empty_answer_candidates1, inplace=True)
data_df.reset_index(drop=True, inplace=True)

In [133]:
len(data_df)

2919

### Validation: Evaluate All Ranking Approaches

In [29]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def extract_objects_from_path(path_tuple, expected_hops):
    """
    Extract objects from path tuple based on hop count.
    
    1-hop: (subject, relation, (obj1, obj2, ...)) 
           -> return [(obj1, obj2, ...)]
    
    2-hop: (subject, rel1, intermediate, rel2, (obj1, obj2, ...))
           -> return [intermediate, (obj1, obj2, ...)]
           Check BOTH intermediate (1st hop object) AND tail objects (2nd hop)
    
    Returns: List of object tuples/strings to check
    """
    try:
        if expected_hops == 1 and len(path_tuple) == 3:
            # 1-hop: only tail objects
            return [path_tuple[2]]  # [(obj1, obj2, ...)]
        
        elif expected_hops == 2 and len(path_tuple) == 5:
            # 2-hop: BOTH intermediate (index 2) AND tail (index 4)
            intermediate = path_tuple[2]  # Single intermediate node
            tail_objects = path_tuple[4]  # Tuple of tail objects
            
            # Return both for checking
            objects_to_check = []
            
            # Add intermediate as single item (wrap in list for consistency)
            if intermediate is not None:
                objects_to_check.append([intermediate])
            
            # Add tail objects tuple
            if tail_objects:
                objects_to_check.append(tail_objects)
            
            return objects_to_check
        
        return None
    except (IndexError, TypeError):
        return None

def check_answer_in_paths(ranked_paths, ground_truth_answers, expected_hops):
    """
    Check if AT LEAST ONE ground truth answer exists in ranked paths.
    
    For 1-hop: Check tail objects only
    For 2-hop: Check BOTH intermediate objects (1st hop) AND tail objects (2nd hop)
    
    ALL MATCHING IS CASE-INSENSITIVE.
    
    Args:
        ranked_paths: List of dicts with 'path_string' key (tuple format)
        ground_truth_answers: List of answer strings from ground truth
        expected_hops: 1 or 2
    
    Returns:
        (found: bool, matched_answers: list, match_location: str)
    """
    if not ranked_paths or not ground_truth_answers:
        return False, [], None
    
    # Step 1: Normalize ground truth answers (lowercase + strip)
    gt_answers_normalized = set()
    for ans in ground_truth_answers:
        if ans is not None:
            gt_answers_normalized.add(str(ans).lower().strip())
    
    if not gt_answers_normalized:
        return False, [], None
    
    # Step 2: Check each ranked path's objects
    matched_answers = set()
    match_location = None
    
    for path_data in ranked_paths:
        path_tuple = path_data.get('path_string')
        
        if not path_tuple or not isinstance(path_tuple, tuple):
            continue
        
        # Extract objects to check (intermediate + tail for 2-hop)
        objects_groups = extract_objects_from_path(path_tuple, expected_hops)
        
        if not objects_groups:
            continue
        
        # Check each group of objects
        for obj_group_idx, obj_group in enumerate(objects_groups):
            if not obj_group:
                continue
            
            # Handle both single items and tuples
            objects_to_check = obj_group if isinstance(obj_group, (list, tuple)) else [obj_group]
            
            for obj in objects_to_check:
                if obj is not None:
                    obj_normalized = str(obj).lower().strip()
                    
                    if obj_normalized in gt_answers_normalized:
                        matched_answers.add(obj_normalized)
                        
                        # Track where match was found
                        if expected_hops == 2:
                            if obj_group_idx == 0:
                                match_location = "intermediate (1st hop)"
                            else:
                                match_location = "tail (2nd hop)"
                        else:
                            match_location = "tail"
    
    # Found = True if at least one match
    found = len(matched_answers) > 0
    return found, list(matched_answers), match_location

def evaluate_ranking(data_df, ranked_column_name):
    """
    Evaluate ranking approach: TP if at least one correct answer found.
    
    Returns: dict with accuracy, tp, total, detailed_results
    """
    tp = 0
    total = 0
    detailed_results = []
    
    for idx, row in tqdm(data_df.iterrows(), total=len(data_df), 
                         desc=f"Evaluating {ranked_column_name}"):
        ranked_paths = row.get(ranked_column_name)
        ground_truth_tuples = row.get('actual_candidate_answers')
        expected_hops = row.get('expected_hops')
        
        if not ground_truth_tuples or len(ground_truth_tuples) == 0:
            continue
        
        # Extract answers from tuples: (subject, [ans1, ans2, ...])
        all_answers = []
        for subject, answers_list in ground_truth_tuples:
            if answers_list:
                all_answers.extend(answers_list)
        
        if not all_answers:
            continue
        
        total += 1
        
        # Check if any answer found in paths
        found, matched_answers, match_location = check_answer_in_paths(
            ranked_paths, all_answers, expected_hops
        )
        
        if found:
            tp += 1
        
        detailed_results.append({
            'question_idx': idx,
            'question': row['question'],
            'expected_hops': expected_hops,
            'ground_truth_answers': all_answers,
            'ranked_paths_count': len(ranked_paths) if ranked_paths else 0,
            'found': found,
            'matched_answers': matched_answers,
            'match_location': match_location  # NEW: Where was match found
        })
    
    accuracy = tp / total if total > 0 else 0.0
    
    return {
        'approach': ranked_column_name,
        'accuracy': accuracy,
        'tp': tp,
        'total': total,
        'detailed_results': detailed_results
    }

# ============================================================================
# MAIN EVALUATION
# ============================================================================

print("="*80)
print("VALIDATION: Check Intermediate (1st hop) + Tail (2nd hop) for 2-hop paths")
print("           Check Tail only for 1-hop paths")
print("           (Case-Insensitive)")
print("="*80)

approaches = [
    'ranked_string_paths_top_5',
    'ranked_string_paths_top_10',
    'ranked_string_paths_top_30',
    'ranked_string_paths_top_p_0.1',
    'ranked_string_paths_top_p_0.2',
    'ranked_string_paths_top_p_0.3',
    'ranked_string_paths_top_p_0.5'
]

all_results = {}

for approach in approaches:
    if approach not in data_df.columns:
        print(f"\n⚠️ Skipping {approach} (not found)")
        continue
    
    print(f"\n{'='*80}")
    print(f"Evaluating: {approach}")
    print('='*80)
    
    result = evaluate_ranking(data_df, approach)
    all_results[approach] = result
    
    print(f"\n📊 Accuracy: {result['accuracy']:.4f}")
    print(f"   TP: {result['tp']} / Total: {result['total']}")

# Summary table
print("\n" + "="*80)
print("SUMMARY")
print("="*80 + "\n")

summary_data = []
for approach, result in all_results.items():
    summary_data.append({
        'Approach': approach.replace('ranked_string_paths_', ''),
        'Accuracy': f"{result['accuracy']:.4f}",
        'TP': result['tp'],
        'Total': result['total']
    })

summary_df = pd.DataFrame(summary_data)
print(summary_df.to_string(index=False))

# Save results
validation_results_df = pd.DataFrame(summary_data)
validation_results_df.to_csv('./validation_intermediate_and_tail.csv', index=False)

for approach, result in all_results.items():
    detailed_df = pd.DataFrame(result['detailed_results'])
    filename = f"./validation_detailed_{approach.replace('ranked_string_paths_', '')}_int_tail.csv"
    detailed_df.to_csv(filename, index=False)

print("\n✅ Saved to validation_intermediate_and_tail.csv")

# Show match location statistics for 2-hop questions
print("\n" + "="*80)
print("MATCH LOCATION STATISTICS (2-hop questions only)")
print("="*80)

for approach, result in all_results.items():
    two_hop_results = [r for r in result['detailed_results'] if r['expected_hops'] == 2 and r['found']]
    
    if two_hop_results:
        intermediate_matches = len([r for r in two_hop_results if r['match_location'] and 'intermediate' in r['match_location']])
        tail_matches = len([r for r in two_hop_results if r['match_location'] and 'tail' in r['match_location']])
        
        print(f"\n{approach.replace('ranked_string_paths_', '')}:")
        print(f"  Matches in intermediate (1st hop): {intermediate_matches}")
        print(f"  Matches in tail (2nd hop): {tail_matches}")
        print(f"  Total 2-hop matches: {len(two_hop_results)}")

VALIDATION: Check Intermediate (1st hop) + Tail (2nd hop) for 2-hop paths
           Check Tail only for 1-hop paths
           (Case-Insensitive)

Evaluating: ranked_string_paths_top_5


Evaluating ranked_string_paths_top_5: 100%|██████████| 2919/2919 [00:00<00:00, 29922.85it/s]



📊 Accuracy: 0.5039
   TP: 1471 / Total: 2919

Evaluating: ranked_string_paths_top_10


Evaluating ranked_string_paths_top_10: 100%|██████████| 2919/2919 [00:00<00:00, 24851.97it/s]



📊 Accuracy: 0.5200
   TP: 1518 / Total: 2919

Evaluating: ranked_string_paths_top_30


Evaluating ranked_string_paths_top_30: 100%|██████████| 2919/2919 [00:00<00:00, 18164.19it/s]



📊 Accuracy: 0.5430
   TP: 1585 / Total: 2919

Evaluating: ranked_string_paths_top_p_0.1


Evaluating ranked_string_paths_top_p_0.1: 100%|██████████| 2919/2919 [00:00<00:00, 17749.13it/s]



📊 Accuracy: 0.4868
   TP: 1421 / Total: 2919

Evaluating: ranked_string_paths_top_p_0.2


Evaluating ranked_string_paths_top_p_0.2: 100%|██████████| 2919/2919 [00:00<00:00, 3208.06it/s]



📊 Accuracy: 0.5324
   TP: 1554 / Total: 2919

Evaluating: ranked_string_paths_top_p_0.3


Evaluating ranked_string_paths_top_p_0.3: 100%|██████████| 2919/2919 [00:00<00:00, 8425.75it/s]



📊 Accuracy: 0.5557
   TP: 1622 / Total: 2919

Evaluating: ranked_string_paths_top_p_0.5


Evaluating ranked_string_paths_top_p_0.5: 100%|██████████| 2919/2919 [00:00<00:00, 5339.33it/s]



📊 Accuracy: 0.5755
   TP: 1680 / Total: 2919

SUMMARY

 Approach Accuracy   TP  Total
    top_5   0.5039 1471   2919
   top_10   0.5200 1518   2919
   top_30   0.5430 1585   2919
top_p_0.1   0.4868 1421   2919
top_p_0.2   0.5324 1554   2919
top_p_0.3   0.5557 1622   2919
top_p_0.5   0.5755 1680   2919

✅ Saved to validation_intermediate_and_tail.csv

MATCH LOCATION STATISTICS (2-hop questions only)

top_5:
  Matches in intermediate (1st hop): 219
  Matches in tail (2nd hop): 19
  Total 2-hop matches: 238

top_10:
  Matches in intermediate (1st hop): 235
  Matches in tail (2nd hop): 26
  Total 2-hop matches: 261

top_30:
  Matches in intermediate (1st hop): 267
  Matches in tail (2nd hop): 49
  Total 2-hop matches: 316

top_p_0.1:
  Matches in intermediate (1st hop): 280
  Matches in tail (2nd hop): 53
  Total 2-hop matches: 333

top_p_0.2:
  Matches in intermediate (1st hop): 293
  Matches in tail (2nd hop): 90
  Total 2-hop matches: 383

top_p_0.3:
  Matches in intermediate (1st hop)

In [31]:
data_df.to_pickle('./data_df_final_train.pkl')

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm

def extract_objects_from_path(path_tuple, expected_hops):
    """
    Extract objects from path tuple based on hop count.
    
    1-hop: (subject, relation, (obj1, obj2, ...)) 
           -> return [(obj1, obj2, ...)]
    
    2-hop: (subject, rel1, intermediate, rel2, (obj1, obj2, ...))
           -> return [intermediate, (obj1, obj2, ...)]
           Check BOTH intermediate (1st hop object) AND tail objects (2nd hop)
    
    Returns: List of object tuples/strings to check
    """
    try:
        if expected_hops == 1 and len(path_tuple) == 3:
            # 1-hop: only tail objects
            return [path_tuple[2]]  # [(obj1, obj2, ...)]
        
        elif expected_hops == 2 and len(path_tuple) == 5:
            # 2-hop: BOTH intermediate (index 2) AND tail (index 4)
            intermediate = path_tuple[2]  # Single intermediate node
            tail_objects = path_tuple[4]  # Tuple of tail objects
            
            # Return both for checking
            objects_to_check = []
            
            # Add intermediate as single item (wrap in list for consistency)
            if intermediate is not None:
                objects_to_check.append([intermediate])
            
            # Add tail objects tuple
            if tail_objects:
                objects_to_check.append(tail_objects)
            
            return objects_to_check
        
        return None
    except (IndexError, TypeError):
        return None

def check_answer_in_all_paths(all_path_tuples, ground_truth_answers, expected_hops):
    """
    Check if AT LEAST ONE ground truth answer exists in ALL processed path tuples.
    
    For 1-hop: Check tail objects only (case-insensitive)
    For 2-hop: Check BOTH intermediate objects (1st hop) AND tail objects (2nd hop) (case-insensitive)
    
    ALL MATCHING IS CASE-INSENSITIVE WITH PROPER ITERATION THROUGH TUPLES.
    
    Args:
        all_path_tuples: List of path tuples (direct from processed_paths_tuples column)
        ground_truth_answers: List of answer strings from ground truth
        expected_hops: 1 or 2
    
    Returns:
        (found: bool, matched_answers: list, match_location: str, total_paths_checked: int)
    """
    if not all_path_tuples or not ground_truth_answers:
        return False, [], None, 0
    
    # Step 1: Normalize ground truth answers (lowercase + strip)
    gt_answers_normalized = set()
    for ans in ground_truth_answers:
        if ans is not None:
            gt_answers_normalized.add(str(ans).lower().strip())
    
    if not gt_answers_normalized:
        return False, [], None, 0
    
    # Step 2: Check each path tuple
    matched_answers = set()
    match_location = None
    total_paths_checked = 0
    
    for path_tuple in all_path_tuples:
        if not path_tuple or not isinstance(path_tuple, tuple):
            continue
        
        total_paths_checked += 1
        
        # Extract objects to check (intermediate + tail for 2-hop)
        objects_groups = extract_objects_from_path(path_tuple, expected_hops)
        
        if not objects_groups:
            continue
        
        # Check each group of objects (intermediate vs tail)
        for obj_group_idx, obj_group in enumerate(objects_groups):
            if not obj_group:
                continue
            
            # Handle both single items and tuples
            objects_to_check = obj_group if isinstance(obj_group, (list, tuple)) else [obj_group]
            
            # ITERATE through EACH object in the tuple/list
            for obj in objects_to_check:
                if obj is None:
                    continue
                
                # Normalize the object (case-insensitive)
                obj_normalized = str(obj).lower().strip()
                
                # Check if it matches any ground truth answer
                if obj_normalized in gt_answers_normalized:
                    matched_answers.add(obj_normalized)
                    
                    # Track where match was found (only set once)
                    if match_location is None:
                        if expected_hops == 2:
                            if obj_group_idx == 0:
                                match_location = "intermediate (1st hop)"
                            else:
                                match_location = "tail (2nd hop)"
                        else:
                            match_location = "tail"
    
    # Found = True if at least one match
    found = len(matched_answers) > 0
    return found, list(matched_answers), match_location, total_paths_checked

def evaluate_all_processed_tuples(data_df):
    """
    Evaluate using ALL processed_paths_tuples (UPPER BOUND).
    
    Returns: dict with accuracy, tp, total, detailed_results
    """
    tp = 0
    total = 0
    detailed_results = []
    
    for idx, row in tqdm(data_df.iterrows(), total=len(data_df), 
                         desc="Evaluating ALL processed tuples"):
        all_path_tuples = row.get('processed_paths_tuples')
        ground_truth_tuples = row.get('actual_candidate_answers')
        expected_hops = row.get('expected_hops')
        
        if not ground_truth_tuples or len(ground_truth_tuples) == 0:
            continue
        
        # Extract answers from tuples: (subject, [ans1, ans2, ...])
        all_answers = []
        for subject, answers_list in ground_truth_tuples:
            if answers_list:
                all_answers.extend(answers_list)
        
        if not all_answers:
            continue
        
        total += 1
        
        # Check if any answer found in ALL path tuples (CASE-INSENSITIVE)
        found, matched_answers, match_location, paths_checked = check_answer_in_all_paths(
            all_path_tuples, all_answers, expected_hops
        )
        
        if found:
            tp += 1
        
        detailed_results.append({
            'question_idx': idx,
            'question': row['question'],
            'expected_hops': expected_hops,
            'ground_truth_answers': all_answers,
            'total_paths_checked': paths_checked,
            'found': found,
            'matched_answers': matched_answers,
            'match_location': match_location
        })
    
    accuracy = tp / total if total > 0 else 0.0
    
    return {
        'approach': 'processed_paths_tuples (ALL PATHS)',
        'accuracy': accuracy,
        'tp': tp,
        'total': total,
        'detailed_results': detailed_results
    }


print("="*80)
print("VALIDATION: Using processed_paths_tuples (ALL PATHS from correct entity)")
print("           This should give UPPER BOUND accuracy")
print("           Check: Intermediate (1st hop) + Tail (2nd hop) for 2-hop")
print("                  Tail only for 1-hop")
print("           (Case-Insensitive, Iterates through each object)")
print("="*80)

result = evaluate_all_processed_tuples(data_df)

print(f"\n{'='*80}")
print(f"RESULTS: {result['approach']}")
print('='*80)
print(f"\n📊 Accuracy: {result['accuracy']:.4f}")
print(f"   TP: {result['tp']} / Total: {result['total']}")

# Show match location statistics
two_hop_results = [r for r in result['detailed_results'] if r['expected_hops'] == 2 and r['found']]

if two_hop_results:
    intermediate_matches = len([r for r in two_hop_results if r['match_location'] and 'intermediate' in r['match_location']])
    tail_matches = len([r for r in two_hop_results if r['match_location'] and 'tail' in r['match_location']])
    
    print(f"\n{'='*80}")
    print("MATCH LOCATION STATISTICS (2-hop questions)")
    print('='*80)
    print(f"Matches in intermediate (1st hop): {intermediate_matches}")
    print(f"Matches in tail (2nd hop): {tail_matches}")
    print(f"Total 2-hop matches: {len(two_hop_results)}")

# Path statistics
total_paths_checked = sum(r['total_paths_checked'] for r in result['detailed_results'])
avg_paths = total_paths_checked / result['total'] if result['total'] > 0 else 0

print(f"\n{'='*80}")
print("PATH STATISTICS")
print('='*80)
print(f"Total paths checked: {total_paths_checked}")
print(f"Average paths per question: {avg_paths:.2f}")

# Save results
detailed_df = pd.DataFrame(result['detailed_results'])
detailed_df.to_csv('./validation_ALL_processed_tuples_upper_bound.csv', index=False)
print(f"\n✅ Saved to validation_ALL_processed_tuples_upper_bound.csv")

# Show some negative examples
print(f"\n{'='*80}")
print("SAMPLE NEGATIVE CASES (if any)")
print('='*80)

negative_cases = [r for r in result['detailed_results'] if not r['found']]
if negative_cases:
    print(f"\nFound {len(negative_cases)} questions where answer NOT in ANY path")
    print("\nFirst 3 negative cases:")
    for i, case in enumerate(negative_cases[:3], 1):
        print(f"\n{i}. Question: {case['question']}")
        print(f"   Expected hops: {case['expected_hops']}")
        print(f"   Ground truth: {case['ground_truth_answers'][:3]}")
        print(f"   Paths checked: {case['total_paths_checked']}")
        print(f"   ❌ No match found in ANY path!")
else:
    print("\n✅ ALL questions had at least one correct answer in some path!")
    print("   This is the IDEAL scenario - perfect upper bound.")

print("\n" + "="*80)
print("COMPARISON WITH RANKED RESULTS")
print("="*80)

# Compare with top-10 ranked
if 'ranked_string_paths_top_10' in data_df.columns:
    top_10_accuracy = 0.607  # Replace with actual if available
    gap = result['accuracy'] - top_10_accuracy
    print(f"\nUpper bound (ALL paths): {result['accuracy']:.4f}")
    print(f"Top-10 ranked:           {top_10_accuracy:.4f}")
    print(f"Gap (lost by ranking):   {gap:.4f} ({gap*100:.1f}% of questions)")
    print(f"\nThis means ranking is pushing {result['tp'] - int(top_10_accuracy * result['total'])} correct answers out of top-10")

print("\n" + "="*80)

VALIDATION: Using processed_paths_tuples (ALL PATHS from correct entity)
           This should give UPPER BOUND accuracy
           Check: Intermediate (1st hop) + Tail (2nd hop) for 2-hop
                  Tail only for 1-hop
           (Case-Insensitive, Iterates through each object)


Evaluating ALL processed tuples: 100%|██████████| 2919/2919 [00:01<00:00, 2662.49it/s]


RESULTS: processed_paths_tuples (ALL PATHS)

📊 Accuracy: 0.5975
   TP: 1744 / Total: 2919

MATCH LOCATION STATISTICS (2-hop questions)
Matches in intermediate (1st hop): 334
Matches in tail (2nd hop): 141
Total 2-hop matches: 475

PATH STATISTICS
Total paths checked: 607360
Average paths per question: 208.07

✅ Saved to validation_ALL_processed_tuples_upper_bound.csv

SAMPLE NEGATIVE CASES (if any)

Found 1175 questions where answer NOT in ANY path

First 3 negative cases:

1. Question: what character did natalie portman play in star wars?
   Expected hops: 2
   Ground truth: ['Padmé Amidala']
   Paths checked: 1257
   ❌ No match found in ANY path!

2. Question: what character did john noble play in lord of the rings?
   Expected hops: 2
   Ground truth: ['Denethor II']
   Paths checked: 576
   ❌ No match found in ANY path!

3. Question: where are the nfl redskins from?
   Expected hops: 1
   Ground truth: ['Washington, D.C.']
   Paths checked: 10
   ❌ No match found in ANY path!

COM


