## File Based Search

In [None]:
import os, sys
from collections import defaultdict

# Configurations
freebase_dir    = "/run/media/sakhil/sakhil/IIIT H/ANLP/Project/freebase-easy-14-04-14"
facts_file      = os.path.join(freebase_dir, "facts.txt")

#to test on example
question="what does jamaican people speak?"
anchor_mention  = "Jamaican"   
predicted_hops  = 1            

# Candidate selection caps (INT or NONE)
CANDIDATE_LIMIT        = None  
MAX_NAME_LINES_SCANNED = None  

# BFS caps
MAX_FRONTIER_NODES_PER_DEPTH = 2000   
MAX_EDGES_PER_NODE_PER_DEPTH = 30       
MAX_PATHS_PER_DEPTH          = 50000  
PRINT_SAMPLE_LIMIT           = 50

# Skip noisy predicates to reduce branching (updated for Freebase-easy)
SKIP_PREDICATES_PREFIXES = {
    "Length",
    "Track #", 
    "is-a",
}

In [None]:
def norm_mid(mid: str) -> str:
    if mid.startswith("/m/"):
        return "m." + mid[3:]
    if mid.startswith("m/"):
        return "m." + mid[2:]
    return mid

def extract_entity_info(entity_str):
    entity_str = entity_str.strip()
    if entity_str.endswith(')') and '(m/' in entity_str:
        # Has MID: "Jamaica (m/03_r3)"
        parts = entity_str.rsplit('(m/', 1)
        if len(parts) == 2:
            name = parts[0].strip()
            mid = 'm.' + parts[1][:-1]  # remove closing paren, add dot
            return mid, name
    # No MID found, use entity string as both ID and name
    return entity_str, entity_str

def iter_facts(path, max_lines=None):
    n = 0
    with open(path, "r", encoding="utf-8", errors="ignore") as f:
        for raw in f:
            if max_lines is not None and n >= max_lines:
                break
            line = raw.strip()
            if not line or line.startswith("#"):
                continue
            parts = line.split("\t")
            if len(parts) < 3:
                parts = line.split()
            if len(parts) < 3:
                continue
            s, p, o = parts[0], parts[1], " ".join(parts[2:])
            # Remove trailing period if present
            if o.endswith('\t.'):
                o = o[:-2].strip()
            elif o.endswith('.'):
                o = o[:-1].strip()
            yield s, p, o
            n += 1

def clean_literal(obj: str) -> str:
    t = obj.strip()
    if t.startswith('"') and '"' in t[1:]:
        # remove first and last quote if balanced
        if t.endswith('"') or '"@' in t:
            t = t[1:]
            # split by last quote
            qpos = t.rfind('"')
            if qpos != -1:
                t = t[:qpos]
    # drop @lang if still present
    t = t.split('@')[0]
    return t

def fmt_path(seq): return " → ".join(seq) if seq else "(empty)"

In [None]:
if not os.path.exists(facts_file):
    print(f"[ERROR] facts.txt not found at: {facts_file}", file=sys.stderr)
    sys.exit(1)

mention_cf = anchor_mention.strip().casefold()
if not mention_cf:
    print("[ERROR] anchor_mention is empty.", file=sys.stderr)
    sys.exit(1)

print(f"[INFO] Searching anchor candidates for mention: '{anchor_mention}'", file=sys.stderr)

exact_hits = []   # (mid, label)
substr_hits = []  # (mid, label)
seen_entities = set()

def extract_entity_info(entity_str):
    """Extract MID and clean name from entity string like 'Jamaica (m/03_r3)' or 'Jamaica'"""
    entity_str = entity_str.strip()
    if entity_str.endswith(')') and '(m/' in entity_str:
        # Has MID: "Jamaica (m/03_r3)"
        parts = entity_str.rsplit('(m/', 1)
        if len(parts) == 2:
            name = parts[0].strip()
            mid = 'm.' + parts[1][:-1]  # remove closing paren, add dot
            return mid, name
    return entity_str, entity_str

# Search through facts for entities containing the mention
for s, p, o in iter_facts(facts_file, max_lines=MAX_NAME_LINES_SCANNED):
    # Check subject
    if mention_cf in s.casefold():
        mid, name = extract_entity_info(s)
        if mid not in seen_entities:
            if s.casefold() == mention_cf or name.casefold() == mention_cf:
                exact_hits.append((mid, name))
            else:
                substr_hits.append((mid, name))
            seen_entities.add(mid)
    
    # Check object (if it looks like an entity, not a literal)
    if not (o.startswith('"') or o.startswith('<http')) and mention_cf in o.casefold():
        mid, name = extract_entity_info(o)
        if mid not in seen_entities:
            if o.casefold() == mention_cf or name.casefold() == mention_cf:
                exact_hits.append((mid, name))
            else:
                substr_hits.append((mid, name))
            seen_entities.add(mid)

# Merge exact + substr respecting CANDIDATE_LIMIT
if CANDIDATE_LIMIT is None:
    candidates = exact_hits + substr_hits
else:
    need = CANDIDATE_LIMIT
    candidates = exact_hits[:need]
    if len(candidates) < need:
        candidates += substr_hits[:(need - len(candidates))]

if not candidates:
    print(f"[WARN] No candidates found for mention='{anchor_mention}'. "
          f"Try MAX_NAME_LINES_SCANNED=None and ensure the mention exists in your dump.", file=sys.stderr)
    sys.exit(0)

print("[INFO] Anchor candidates:")
for i, (mid, label) in enumerate(candidates, 1):
    print(f"  [{i}] {mid}   «{label}»")

[INFO] Searching anchor candidates for mention: 'Jamaican'


[INFO] Anchor candidates:
  [1] Jamaican   «Jamaican»
  [2] m.0lp70fh   «Jamaican»
  [3] m.0m38_m4   «Jamaican»
  [4] m.0mbvqy_   «Jamaican»
  [5] m.0mlgpx9   «Jamaican»
  [6] m.0mv164s   «Jamaican»
  [7] m.0n6kx9t   «Jamaican»
  [8] "Assertive 'no' in Jamaican Creole" (Book Edition)   «"Assertive 'no' in Jamaican Creole" (Book Edition)»
  [9] "Assertive 'no' in Jamaican Creole" (Book)   «"Assertive 'no' in Jamaican Creole" (Book)»
  [10] Jamaican Barbie   «Jamaican Barbie»
  [11] Trojan Jamaican R&B Box Set (Consumer product)   «Trojan Jamaican R&B Box Set (Consumer product)»
  [12] Jamaican Style (Consumer product)   «Jamaican Style (Consumer product)»
  [13] Roots Reggae: The Classic Jamaican Albums (Multi-Part Musical Release)   «Roots Reggae: The Classic Jamaican Albums (Multi-Part Musical Release)»
  [14] Love Is All I Had : A Tribute to the Queen of Jamaican Soul (Compact Disc Musical Release)   «Love Is All I Had : A Tribute to the Queen of Jamaican Soul (Compact Disc Musical R

In [None]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

SIMILARITY_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
TOP_K_CANDIDATES = 5

def calculate_candidate_similarity(question, candidates, model_name=SIMILARITY_MODEL, top_k=TOP_K_CANDIDATES):
    if not candidates:
        return []
    
    print(f"[INFO] Loading similarity model: {model_name}")
    model = SentenceTransformer(model_name)
    
    # Prepare texts for encoding
    question_text = question.strip()
    candidate_texts = []
    
    for mid, name in candidates:
        candidate_text = f"{name}"
        
        # Add some context if the name is very short
        if len(name.split()) < 2:
            candidate_text = f"entity named {name}"
            
        candidate_texts.append(candidate_text)
    
    print(f"[INFO] Encoding question and {len(candidates)} candidates...")
    
    # Encode question and candidates
    question_embedding = model.encode([question_text])
    candidate_embeddings = model.encode(candidate_texts)
    
    # Calculate cosine similarities
    similarities = cosine_similarity(question_embedding, candidate_embeddings)[0]
    
    # Create scored candidates
    scored_candidates = []
    for i, (mid, name) in enumerate(candidates):
        scored_candidates.append((mid, name, similarities[i]))
    
    # Sort by similarity (descending) and take top-k
    scored_candidates.sort(key=lambda x: x[2], reverse=True)
    top_candidates = scored_candidates[:top_k]
    
    print(f"[INFO] Top {len(top_candidates)} candidates by similarity:")
    for i, (mid, name, score) in enumerate(top_candidates, 1):
        print(f"  [{i}] {score:.4f} - {mid} «{name}»")
    
    return top_candidates

In [29]:
candidates1=calculate_candidate_similarity(question, candidates)

[INFO] Loading similarity model: sentence-transformers/all-MiniLM-L6-v2
[INFO] Encoding question and 1092 candidates...
[INFO] Top 5 candidates by similarity:
  [1] 0.8282 - m.0lsyw5q «Distinguished Jamaican English»
  [2] 0.8187 - How to speak Jamaican? (Book) «How to speak Jamaican? (Book)»
  [3] 0.8126 - Jamaican Creole English Language «Jamaican Creole English Language»
  [4] 0.8007 - Jamaican Creole «Jamaican Creole»
  [5] 0.7984 - Jamaican English «Jamaican English»


In [33]:
final_candidates = [(mid, name) for mid, name, score in candidates1]

In [32]:
candidates 

[('Jamaican', 'Jamaican'),
 ('m.0lp70fh', 'Jamaican'),
 ('m.0m38_m4', 'Jamaican'),
 ('m.0mbvqy_', 'Jamaican'),
 ('m.0mlgpx9', 'Jamaican'),
 ('m.0mv164s', 'Jamaican'),
 ('m.0n6kx9t', 'Jamaican'),
 ('"Assertive \'no\' in Jamaican Creole" (Book Edition)',
  '"Assertive \'no\' in Jamaican Creole" (Book Edition)'),
 ('"Assertive \'no\' in Jamaican Creole" (Book)',
  '"Assertive \'no\' in Jamaican Creole" (Book)'),
 ('Jamaican Barbie', 'Jamaican Barbie'),
 ('Trojan Jamaican R&B Box Set (Consumer product)',
  'Trojan Jamaican R&B Box Set (Consumer product)'),
 ('Jamaican Style (Consumer product)', 'Jamaican Style (Consumer product)'),
 ('Roots Reggae: The Classic Jamaican Albums (Multi-Part Musical Release)',
  'Roots Reggae: The Classic Jamaican Albums (Multi-Part Musical Release)'),
 ('Love Is All I Had : A Tribute to the Queen of Jamaican Soul (Compact Disc Musical Release)',
  'Love Is All I Had : A Tribute to the Queen of Jamaican Soul (Compact Disc Musical Release)'),
 ('(To Be) Young, 

In [23]:
print(candidates1)

[('m.0kt6gts', 'Jamaican People'), ('Jamaican Petrel', 'Jamaican Petrel')]


In [None]:
for rank, (start_mid, start_label) in enumerate(final_candidates, 1):
    print("\n" + "=" * 80)
    print(f"Candidate #{rank}: {start_mid}  «{start_label}»")
    print(f"Hops (target): {predicted_hops}")

    start = start_mid
    frontier = { start }
    paths_by_end = { start: [tuple()] }
    all_paths = []

    for depth in range(1, predicted_hops + 1):
        if not frontier:
            print(f"[INFO] Depth {depth}: empty frontier, stop.", file=sys.stderr)
            break

        print(f"[INFO] Depth {depth}: frontier={len(frontier)}", file=sys.stderr)

        # Stream file, collect only edges whose subject is in current frontier
        neighbors = defaultdict(list)
        kept_counts = defaultdict(int)

        for s, p, o in iter_facts(facts_file):
            # Extract entity ID from subject
            s_id, s_name = extract_entity_info(s)
            if s_id not in frontier:
                continue
                
            # Skip noisy predicates
            skip = False
            for pref in SKIP_PREDICATES_PREFIXES:
                if p.startswith(pref):
                    skip = True
                    break
            if skip:
                continue
            
            # Skip literal values and some system predicates
            if (o.startswith('"') or o.startswith('<http') or 
                p in ['is-a', 'Length', 'Track #']):
                continue

            if kept_counts[s_id] < MAX_EDGES_PER_NODE_PER_DEPTH:
                # Extract object ID
                o_id, o_name = extract_entity_info(o)
                neighbors[s_id].append((p, o_id))
                kept_counts[s_id] += 1

        # Extend paths by one hop
        if len(frontier) > MAX_FRONTIER_NODES_PER_DEPTH:
            frontier = set(list(frontier)[:MAX_FRONTIER_NODES_PER_DEPTH])

        next_paths_by_end = defaultdict(list)
        total_paths_this_depth = 0

        for node in frontier:
            base_paths = paths_by_end.get(node, [])
            if not base_paths:
                continue
            outs = neighbors.get(node, [])
            if not outs:
                continue

            for (p, o) in outs:
                for base in base_paths:
                    new_seq = base + (p,)
                    next_paths_by_end[o].append(new_seq)
                    all_paths.append((new_seq, o))
                    total_paths_this_depth += 1
                    if total_paths_this_depth >= MAX_PATHS_PER_DEPTH:
                        break
                if total_paths_this_depth >= MAX_PATHS_PER_DEPTH:
                    break
            if total_paths_this_depth >= MAX_PATHS_PER_DEPTH:
                break

        # Build next frontier with cap
        next_frontier = set()
        for o in next_paths_by_end.keys():
            if len(next_frontier) >= MAX_FRONTIER_NODES_PER_DEPTH:
                break
            next_frontier.add(o)

        # keep only paths to nodes we actually keep in frontier
        next_paths_by_end = { n: next_paths_by_end[n] for n in next_frontier }

        print(f"[INFO] Depth {depth}: paths_kept={total_paths_this_depth}, next_frontier={len(next_frontier)}", file=sys.stderr)

        frontier = next_frontier
        paths_by_end = next_paths_by_end

    # Output Sample
    print(f"Total paths (≤ {predicted_hops} hops): {len(all_paths)}")
    to_show = all_paths if PRINT_SAMPLE_LIMIT is None else all_paths[:PRINT_SAMPLE_LIMIT]
    for i, (seq, end_node) in enumerate(to_show, 1):
        print(f"[{i:04d}] {fmt_path(seq)}   =>   {end_node}")
    if PRINT_SAMPLE_LIMIT is not None and len(all_paths) > PRINT_SAMPLE_LIMIT:
        print(f"... (showing first {PRINT_SAMPLE_LIMIT} of {len(all_paths)} paths)")

print("\n[Done]", file=sys.stderr)


Candidate #1: m.0lsyw5q  «Distinguished Jamaican English»
Hops (target): 1


[INFO] Depth 1: frontier=1
[INFO] Depth 1: paths_kept=2, next_frontier=2
[INFO] Depth 1: frontier=1


Total paths (≤ 1 hops): 2
[0001] Recording   =>   Distinguished Jamaican English (Musical Recording)
[0002] Release   =>   Something Wicked This Way Comes (Consumer product) #85

Candidate #2: How to speak Jamaican? (Book)  «How to speak Jamaican? (Book)»
Hops (target): 1


[INFO] Depth 1: paths_kept=1, next_frontier=1
[INFO] Depth 1: frontier=1


Total paths (≤ 1 hops): 1
[0001] Author   =>   Ken Maxwell (Author)

Candidate #3: Jamaican Creole English Language  «Jamaican Creole English Language»
Hops (target): 1


[INFO] Depth 1: paths_kept=5, next_frontier=4
[INFO] Depth 1: frontier=1


Total paths (≤ 1 hops): 5
[0001] Language Family   =>   Creole language
[0002] Language Family   =>   English-based creole languages
[0003] Main Country   =>   Jamaica
[0004] Region   =>   Americas
[0005] Spoken in country   =>   Jamaica

Candidate #4: Jamaican Creole  «Jamaican Creole»
Hops (target): 1


[INFO] Depth 1: paths_kept=3, next_frontier=3
[INFO] Depth 1: frontier=1


Total paths (≤ 1 hops): 3
[0001] Author   =>   R. B. Le Page
[0002] Subject   =>   Creole dialects
[0003] Subject   =>   Slavery

Candidate #5: Jamaican English  «Jamaican English»
Hops (target): 1
Total paths (≤ 1 hops): 1
[0001] Spoken in country   =>   Jamaica


[INFO] Depth 1: paths_kept=1, next_frontier=1

[Done]


## SQlite Database Approach

In [None]:
# graph_retriever.py
import sqlite3
from collections import deque

class GraphRetriever:
    def __init__(self, db_file):
        """
        Initializes the retriever by connecting to the pre-built SQLite database.
        """
        try:
            self.db_file = db_file
            self.con = sqlite3.connect(self.db_file)
        except sqlite3.OperationalError as e:
            print(f"Error connecting to database at '{db_file}': {e}")
            print("Please make sure you have run the 'preprocess_kg.py' script first.")
            raise

    def get_mid_for_name(self, name: str) -> str | None:
        """Looks up the MID for a given entity name."""
        cur = self.con.cursor()
        res = cur.execute("SELECT mid FROM mappings WHERE name = ?", (name,))
        result = res.fetchone()
        return result[0] if result else None

    def get_name_for_mid(self, mid: str) -> str | None:
        """Looks up the entity name for a given MID."""
        cur = self.con.cursor()
        res = cur.execute("SELECT name FROM mappings WHERE mid = ?", (mid,))
        result = res.fetchone()
        return result[0] if result else None

    def get_relations_from_entity(self, entity_subject: str) -> list[tuple[str, str]]:
        """
        Retrieves all outgoing (predicate, object) pairs for a given subject entity.
        This query is fast due to the index on the 'subject' column.
        """
        cur = self.con.cursor()
        res = cur.execute("SELECT predicate, object FROM graph WHERE subject = ?", (entity_subject,))
        return res.fetchall()

    def find_relation_links(self, start_entity_subject: str, max_depth: int) -> set[tuple]:
        """
        Performs a Breadth-First Search (BFS) to find all relation paths
        up to a certain depth. This is the 'Relation Retrieval' step.
        """
        if max_depth <= 0:
            return set()

        q = deque([(start_entity_subject, [])])
        found_relation_links = set()

        while q:
            current_subject, path = q.popleft()

            if len(path) >= max_depth:
                continue

            # This is a fast, indexed SQL query
            neighbors = self.get_relations_from_entity(current_subject)

            for predicate, obj in neighbors:
                new_path = path + [predicate]
                
                # Add the complete relation path as a tuple (to make it hashable)
                found_relation_links.add(tuple(new_path))
                
                q.append((obj, new_path))
                
        return found_relation_links

    def close(self):
        """Closes the database connection."""
        self.con.close()


In [None]:
print("Initializing GraphRetriever...")
db_file = "./freeb_db"  
retriever = GraphRetriever(db_file)

# Example
topic_mid = "m.03_r3"
start_entity_name = retriever.get_mid_for_name('Nick Cannon')
print(f"Looking up MID for 'Nick Cannon': {start_entity_name}")

if start_entity_name:
    print(f"Found name for MID '{topic_mid}': '{start_entity_name}'")
    
    predicted_hops = 2
    
    print(f"Starting BFS with max depth {predicted_hops}...")
    relation_links = retriever.find_relation_links(start_entity_name, predicted_hops)
    
    print(f"\nFound {len(relation_links)} unique relation links:")
    # Print some of the found links
    for i, link in enumerate(list(relation_links)):
        if i >= 15: break 
        print(f"  -> {' -> '.join(link)}")

else:
    print(f"Could not find a name for the MID '{topic_mid}' in the mappings table.")

# Clean up the connection
retriever.close()
print("\nRetriever closed.")

Initializing GraphRetriever...
Looking up MID for 'Nick Cannon': None
Could not find a name for the MID 'm.03_r3' in the mappings table.

Retriever closed.


In [None]:
retriever = GraphRetriever(db_file)
    
relation_links = retriever.find_relation_links('Jamaica', 1)

print(f"\nFound {len(relation_links)} unique relation links:")
# Print some of the found links
for i, link in enumerate(list(relation_links)): 
    print(f"  -> {' -> '.join(link)}")

# Clean up the connection
retriever.close()
print("\nRetriever closed.")


Found 27 unique relation links:
  -> Calling Code
  -> FIPS Code
  -> Time zone(s)
  -> ISO Alpha 3
  -> Olympics participated in
  -> FIFA Code
  -> ISO Short Name
  -> Capital
  -> Internet TLD
  -> people/place_lived/location
  -> Area
  -> Official Language
  -> Size of armed forces
  -> Contained by
  -> Nearby airport
  -> Date founded
  -> Currency Used
  -> ISO Numeric
  -> Member of
  -> Form of Government
  -> Multi-event tournament participated in
  -> IOC Code
  -> Net migration
  -> ISO Alpha 2
  -> is-a
  -> Adjectival form
  -> Population

Retriever closed.


## Finding relevent paths from database

In [None]:
import sqlite3
from sentence_transformers import SentenceTransformer, util


def get_name_for_mid(con: sqlite3.Connection, mid: str) -> str | None:
    cursor = con.cursor()
    res = cursor.execute("SELECT name FROM mappings WHERE mid = ?", (mid,))
    result = res.fetchone()
    return result[0] if result else None

def retrieve_all_links_from_db(con: sqlite3.Connection, start_entity: str, max_depth: int) -> set:
    found_links = set()
    cursor = con.cursor()
    # Queue stores tuples of (current_entity, current_path_of_relations)
    queue = [(start_entity, [])]
    # Keep track of entities visited at the current depth to avoid redundant DB calls in cycles
    visited_entities = {start_entity}

    while queue:
        current_entity, path = queue.pop(0)

        if len(path) >= max_depth:
            continue

        # This is the fast, indexed query to the database
        res = cursor.execute("SELECT predicate, object FROM graph WHERE subject = ?", (current_entity,))
        
        for predicate, obj in res.fetchall():
            new_path = path + [predicate]
            found_links.add(tuple(new_path))
            
            if obj not in visited_entities:
                queue.append((obj, new_path))
                visited_entities.add(obj)
    
    return found_links

def rank_links_by_similarity(question: str, links: set, model: SentenceTransformer) -> list:
    if not links:
        return []

    link_list = list(links)
    formatted_links = [" ".join(link).replace("_", " ").replace(".", " ") for link in link_list]

    question_embedding = model.encode(question, convert_to_tensor=True)
    link_embeddings = model.encode(formatted_links, convert_to_tensor=True)

    cosine_scores = util.cos_sim(question_embedding, link_embeddings)[0]

    scored_links = list(zip(link_list, cosine_scores.tolist()))
    scored_links.sort(key=lambda x: x[1], reverse=True)

    return scored_links

def get_final_answer_from_db(con: sqlite3.Connection, start_entity: str, relation_path: tuple) -> list:
    cursor = con.cursor()
    # For a simple 1-hop path, this is a direct query
    if len(relation_path) == 1:
        relation = relation_path[0]
        res = cursor.execute("SELECT object FROM graph WHERE subject = ? AND predicate = ?", (start_entity, relation))
        return [row[0] for row in res.fetchall()]
    else:
        # This would need to be expanded for multi-hop paths
        print("This simple script currently only finds answers for 1-hop paths.")
        return []


In [None]:
DB_FILE = "./freeb_db"

# setup with example
question = "what language is spoken by jamaican people?"
topic_mid = "m.03_r3" # The MID for Jamaica from the WebQSP dataset
hops = 1

print("Loading sentence embedding model...")
model = SentenceTransformer('all-MiniLM-L6-v2')

print(f"Connecting to database: {DB_FILE}")
db_connection = sqlite3.connect(DB_FILE)
print("-" * 30)

# Get starting entity 
print(f"Step 1: Finding canonical name for MID '{topic_mid}'...")
start_entity = 'Jamaica' # get_name_for_mid(db_connection, topic_mid)

if not start_entity:
    raise ValueError(f"Could not find a name for MID '{topic_mid}' in the database.")
print(f"Found start entity: '{start_entity}'")
print("-" * 30)

# retrieve all possible links
print(f"Step 2: Finding all {hops}-hop links from '{start_entity}'...")
all_possible_links = retrieve_all_links_from_db(db_connection, start_entity, hops)
print(f"Found {len(all_possible_links)} possible relations.")
print("-" * 30)

# link ranking
print("Step 3: Ranking links by similarity to the question...")
ranked_links = rank_links_by_similarity(question, all_possible_links, model)
print("Top 10 ranked relations:")
for link, score in ranked_links[:10]:
    print(f"- Score: {score:.4f}, Relation: {' -> '.join(link)}")
print("-" * 30)

# final answer, just for testing top output of ranking algorithm
print("Step 4: Getting the answer using the top-ranked relation...")
if ranked_links:
    top_relation_path = ranked_links[0][0]
    answers = get_final_answer_from_db(db_connection, start_entity, top_relation_path)
    
    print("\n--- FINAL ANSWER ---")
    print(f"Question: {question}")
    print(f"Most Relevant Relation: '{' -> '.join(top_relation_path)}'")
    print(f"Answer(s): {answers}")
else:
    print("Could not find a relevant answer.")
    
# --- F. CLEANUP ---
db_connection.close()
print("\nDatabase connection closed.")


#E5


Loading sentence embedding model...
Connecting to database: ./freeb_db
------------------------------
Step 1: Finding canonical name for MID 'm.03_r3'...
Found start entity: 'Jamaica'
------------------------------
Step 2: Finding all 1-hop links from 'Jamaica'...
Found 27 possible relations.
------------------------------
Step 3: Ranking links by similarity to the question...
Top 10 ranked relations:
- Score: 0.3855, Relation: Official Language
- Score: 0.2844, Relation: people/place_lived/location
- Score: 0.1843, Relation: Population
- Score: 0.1612, Relation: Currency Used
- Score: 0.1524, Relation: Olympics participated in
- Score: 0.1154, Relation: Area
- Score: 0.0999, Relation: is-a
- Score: 0.0984, Relation: Contained by
- Score: 0.0916, Relation: Time zone(s)
- Score: 0.0788, Relation: Calling Code
------------------------------
Step 4: Getting the answer using the top-ranked relation...

--- FINAL ANSWER ---
Question: what language is spoken by jamaican people?
Most Relevant

## DB Creation

In [None]:
# preprocess_kg.py
import sqlite3
import os
from tqdm import tqdm
import re

# configurations
RAW_DATA_DIR = "freebase-easy-14-04-14/freebase-easy-14-04-14"
DB_FILE = "./freeb_db"

FACTS_FILE = os.path.join(RAW_DATA_DIR, "/run/media/sakhil/sakhil/IIIT H/ANLP/Project/freebase-easy-14-04-14/facts.txt")
LINKS_FILE = os.path.join(RAW_DATA_DIR, "/run/media/sakhil/sakhil/IIIT H/ANLP/Project/freebase-easy-14-04-14/freebase-links.txt")


# Delete the old DB file to ensure a fresh start
if os.path.exists(DB_FILE):
    print(f"Removing existing database at {DB_FILE}...")
    os.remove(DB_FILE)

# Connect to SQLite (this will create the file)
con = sqlite3.connect(DB_FILE)
cur = con.cursor()

print("--- Step 1: Processing Freebase Links File ---")

# Create a table for name <-> mid mappings
cur.execute("""
    CREATE TABLE mappings (
        name TEXT NOT NULL,
        mid TEXT NOT NULL
    );
""")
print("Created 'mappings' table.")

# Prepare for batch inserts
batch = []
batch_size = 100000

with open(LINKS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
    for line in tqdm(f, desc="Reading links file"):
        try:
            parts = line.strip().split('\t')
            if len(parts) >= 3 and parts[1] == 'freebase-entity':
                name = parts[0]
                # Regex to reliably extract the MID
                match = re.search(r'/ns/(m\..+)>', parts[2])
                if match:
                    mid = match.group(1)
                    batch.append((name, mid))
            
            if len(batch) >= batch_size:
                cur.executemany("INSERT INTO mappings VALUES(?, ?)", batch)
                batch = []
        except Exception:
            pass
# Insert any remaining records from the last batch
if batch:
    cur.executemany("INSERT INTO mappings VALUES(?, ?)", batch)

# Create indexes for fast lookups in both directions
print("Creating indexes on 'mappings' table...")
cur.execute("CREATE INDEX idx_map_name ON mappings (name);")
cur.execute("CREATE INDEX idx_map_mid ON mappings (mid);")
con.commit()
print("Finished processing links file.")

print("\n--- Step 2: Processing Facts File ---")

# Create the main table for the knowledge graph facts
cur.execute("""
    CREATE TABLE graph (
        subject TEXT NOT NULL,
        predicate TEXT NOT NULL,
        object TEXT NOT NULL
    );
""")
print("Created 'graph' table.")
batch = [] # Reset batch for the next file

with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
    for line in tqdm(f, desc="Reading facts file"):
        try:
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                # The final part is often a '.', which we can ignore
                subject, predicate, obj = parts[0], parts[1], parts[2]
                batch.append((subject, predicate, obj))
            
            if len(batch) >= batch_size:
                cur.executemany("INSERT INTO graph VALUES(?, ?, ?)", batch)
                batch = []
        except Exception:
            pass

# Insert any remaining records
if batch:
    cur.executemany("INSERT INTO graph VALUES(?, ?, ?)", batch)

# Create an index on the 'subject' column for fast BFS lookups
print("Creating index on 'graph' table (this is the final, longest step)...")
cur.execute("CREATE INDEX idx_graph_subject ON graph (subject);")

# Commit all changes and close the connection
print("Committing final changes to the database...")
con.commit()
con.close()

print(f"\nPreprocessing complete! Your SQLite database is ready at '{DB_FILE}'.")

Removing existing database at ./freeb_db...
--- Step 1: Processing Freebase Links File ---
Created 'mappings' table.


Reading links file: 0it [00:00, ?it/s]

Reading links file: 21719311it [00:31, 694448.82it/s]


Creating indexes on 'mappings' table...
Finished processing links file.

--- Step 2: Processing Facts File ---
Created 'graph' table.


Reading facts file: 241898031it [10:57, 367675.91it/s]


Creating index on 'graph' table (this is the final, longest step)...
Committing final changes to the database...

Preprocessing complete! Your SQLite database is ready at './freeb_db'.


## using Network X

In [None]:
# preprocess_kg_networkx.py
import networkx as nx
import os
import pickle
from tqdm import tqdm

# configurations
RAW_DATA_DIR = "/run/media/sakhil/sakhil/IIIT H/ANLP/Project/freebase-easy-14-04-14/"
OUT_DIR = "/run/media/sakhil/sakhil/IIIT H/ANLP/Project/NetworkXGraph/"

FACTS_FILE = os.path.join(RAW_DATA_DIR, "facts.txt")


os.makedirs(OUT_DIR, exist_ok=True)

print("--- Step 1: Building Mappings (Entity/Relation -> Integer ID) ---")
# This part is identical to the DGL version, as it's library-agnostic.
entity_map = {}
relation_map = {}

def add_to_map(item, item_map):
    if item not in item_map:
        item_map[item] = len(item_map)

# This will now read your actual large Facts.txt file
with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
    for line in tqdm(f, desc="Mapping entities and relations"):
        try:
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                subject, predicate, obj = parts[0], parts[1], parts[2]
                add_to_map(subject, entity_map)
                add_to_map(predicate, relation_map)
                add_to_map(obj, entity_map)
        except Exception:
            pass

print(f"Found {len(entity_map)} unique entities and {len(relation_map)} unique relations.")

map_path = os.path.join(OUT_DIR, "mappings.pkl")
with open(map_path, 'wb') as f:
    pickle.dump({'entities': entity_map, 'relations': relation_map}, f)
print(f"Mappings saved to {map_path}")


print("\n--- Step 2: Building the NetworkX Graph ---")
# Use a MultiDiGraph to handle multiple different edges between two nodes
G = nx.MultiDiGraph()

with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
    for line in tqdm(f, desc="Creating graph edges"):
        try:
            parts = line.strip().split('\t')
            if len(parts) >= 3:
                subject, predicate, obj = parts[0], parts[1], parts[2]
                src_id = entity_map[subject]
                dest_id = entity_map[obj]
                
                # Add an edge with the relation name as an attribute
                G.add_edge(src_id, dest_id, relation=predicate)
        except Exception:
            pass

# Add all nodes to the graph, including those that might not be a source node
G.add_nodes_from(range(len(entity_map)))

print(f"Graph created with {G.number_of_nodes()} nodes and {G.number_of_edges()} edges.")

# Save the graph using Python's pickle module
graph_path = os.path.join(OUT_DIR, "kg_graph.gpickle")
with open(graph_path, 'wb') as f:
    pickle.dump(G, f)
print(f"NetworkX graph saved to {graph_path}")
print("\nPreprocessing with NetworkX is complete!")

--- Step 1: Building Mappings (Entity/Relation -> Integer ID) ---


Mapping entities and relations: 173190144it [04:18, 670658.72it/s]


KeyboardInterrupt: 

In [None]:
# graph_retriever_networkx.py
import networkx as nx
import os
import pickle
from collections import deque

class GraphRetrieverNX:
    def __init__(self, graph_dir="/run/media/sakhil/sakhil/IIIT H/ANLP/Project/NetworkXGraph/"):
        print("Loading NetworkX graph and mappings...")
        # Load the saved graph using Python's pickle module
        graph_path = os.path.join(graph_dir, "kg_graph.gpickle")
        with open(graph_path, 'rb') as f:
            self.graph = pickle.load(f)

        # Load the mappings
        map_path = os.path.join(graph_dir, "mappings.pkl")
        with open(map_path, 'rb') as f:
            mappings = pickle.load(f)
            self.entity_map = mappings['entities']
            # We don't need the relation map for retrieval, but we'll keep the rev_entity_map
        
        self.rev_entity_map = {v: k for k, v in self.entity_map.items()}
        print("NetworkX retriever initialized successfully.")

    def find_relation_links(self, start_entity_name: str, max_depth: int) -> set:
        """Performs a manual BFS to find all relation paths up to a max_depth."""
        if start_entity_name not in self.entity_map:
            print(f"Warning: Entity '{start_entity_name}' not found in graph.")
            return set()
            
        start_node_id = self.entity_map[start_entity_name]
        
        # A queue for BFS will store tuples of (node_id, current_path_of_relations)
        queue = deque([(start_node_id, [])])
        
        found_links = set()

        while queue:
            current_node, current_path = queue.popleft()
            
            # Stop exploring if we've reached the maximum depth
            if len(current_path) >= max_depth:
                continue
                
            # G.edges(node, data=True) gets all outgoing edges with their attributes
            for _, neighbor, edge_data in self.graph.edges(current_node, data=True):
                relation = edge_data['relation']
                
                # Create the new path by extending the current one
                new_path = current_path + [relation]
                
                # Add the new path (as a tuple) to our results
                found_links.add(tuple(new_path))
                
                # Add the neighbor and its new path to the queue for further exploration
                queue.append((neighbor, new_path))
                    
        return found_links


# example
if __name__ == '__main__':
    # This assumes you have run the preprocessing script first.
    retriever = GraphRetrieverNX()
    
    start_entity_name = "Nick Cannon"
    predicted_hops = 2
    
    print(f"Starting BFS from '{start_entity_name}' with depth {predicted_hops}...")
    relation_links = retriever.find_relation_links(start_entity_name, predicted_hops)
    
    print(f"\nFound {len(relation_links)} unique relation links:")
    for link in sorted(list(relation_links)): # Sorted for consistent output
        print(f"  -> {' -> '.join(link)}")

Loading NetworkX graph and mappings...
NetworkX retriever initialized successfully.
Starting BFS from 'Nick Cannon' with depth 2...

Found 6 unique relation links:
  -> acted_in
  -> acted_in -> has_genre
  -> has_spouse
  -> has_spouse -> acted_in
  -> produced
  -> produced -> has_genre


## Using Neo4j

In [None]:
import os
from neo4j import GraphDatabase
from tqdm import tqdm
import re

# configurations
RAW_DATA_DIR = "./freebase-easy-14-04-14"
FACTS_FILE = os.path.join(RAW_DATA_DIR, "facts.txt")
LINKS_FILE = os.path.join(RAW_DATA_DIR, "freebase-links.txt") # Added this back


# Neo4j connection details
NEO4J_URI = "neo4j://localhost:7687"
NEO4J_USER = "neo4j"
NEO4J_PASSWORD = "kallind123" # The password you set in Neo4j Desktop

CHUNK_SIZE = 50000

class Neo4jDirectImporter:
    def __init__(self, uri, user, password):
        try:
            self.driver = GraphDatabase.driver(uri, auth=(user, password))
            self.driver.verify_connectivity()
            print("Successfully connected to Neo4j database.")
        except Exception as e:
            print(f"Failed to connect to Neo4j. Error: {e}")
            raise

    def close(self):
        self.driver.close()

    def setup_constraints(self):
        print("Setting up database constraints...")
        with self.driver.session() as session:
            # This constraint is CRITICAL for performance of the MERGE operation.
            query = "CREATE CONSTRAINT entity_unique_id IF NOT EXISTS FOR (n:Entity) REQUIRE n.entityId IS UNIQUE"
            session.run(query)
        print("Constraints are set.")
    
    def import_nodes_in_chunks(self):
        print("\n--- Pass 1: Importing Nodes ---")
        entities_seen_in_db = set()
        entities_to_add_chunk = set()

        with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f, desc="Scanning for and importing nodes"):
                try:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        subject, _, obj = parts[0], parts[1], parts[2]
                        
                        if subject not in entities_seen_in_db:
                            entities_to_add_chunk.add(subject)
                        if obj not in entities_seen_in_db:
                            entities_to_add_chunk.add(obj)

                        if len(entities_to_add_chunk) >= CHUNK_SIZE:
                            self._write_node_chunk(entities_to_add_chunk)
                            entities_seen_in_db.update(entities_to_add_chunk)
                            entities_to_add_chunk.clear()
                except Exception:
                    pass
            
            # Write the final chunk if any entities are left
            if entities_to_add_chunk:
                self._write_node_chunk(entities_to_add_chunk)

    def _write_node_chunk(self, chunk):
        with self.driver.session() as session:
            query = """
            UNWIND $nodes AS node_data
            MERGE (e:Entity {entityId: node_data.id})
            ON CREATE SET e.name = node_data.name
            """
            # Create a list of dictionaries for the query parameter
            node_list = [{'id': name, 'name': name} for name in chunk]
            session.run(query, nodes=node_list)

    def import_relationships_in_chunks(self):
        print("\n--- Pass 2: Importing Relationships ---")
        relations_chunk = []

        with open(FACTS_FILE, 'r', encoding='utf-8', errors='ignore') as f:
            for line in tqdm(f, desc="Importing relationships"):
                try:
                    parts = line.strip().split('\t')
                    if len(parts) >= 3:
                        subject, predicate, obj = parts[0], parts[1], parts[2]
                        relations_chunk.append({'sub': subject, 'obj': obj, 'pred': predicate})

                        if len(relations_chunk) >= CHUNK_SIZE:
                            self._write_relationship_chunk(relations_chunk)
                            relations_chunk.clear()
                except Exception:
                    pass
            
            # Write the final chunk
            if relations_chunk:
                self._write_relationship_chunk(relations_chunk)

    def _write_relationship_chunk(self, chunk):
        with self.driver.session() as session:
            query = """
            UNWIND $rels AS rel
            MATCH (a:Entity {entityId: rel.sub})
            MATCH (b:Entity {entityId: rel.obj})
            CALL apoc.create.relationship(a, rel.pred, {}, b) YIELD rel as r
            RETURN count(r)
            """
            session.run(query, rels=chunk)

importer = Neo4jDirectImporter(NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD)

# 1. Set up constraints for performance
importer.setup_constraints()

# 2. Import all nodes
importer.import_nodes_in_chunks()

# 3. Import all relationships
importer.import_relationships_in_chunks()

print("\nData import complete!")

if 'importer' in locals() and importer.driver:
    importer.close()
    print("Database connection closed.")

Successfully connected to Neo4j database.
Setting up database constraints...
Constraints are set.

--- Pass 1: Importing Nodes ---


Scanning for and importing nodes: 194180080it [43:55, 81273.07it/s] 

: 