In [1]:
import os
import sys
sys.path.insert(0, '..')
from utils.dataset_processing import process_data

In [2]:
import re
from typing import Dict, Any, List, Tuple

remove_underscore = True

def _norm(s):
    if s is None:
        return s
    return s.replace("_", " ") if remove_underscore else s

def _new_unknown(unk_list):
    u = f"unknown_{len(unk_list)}"
    unk_list.append(u)
    return u


# ============================================================
#          →→  CORRECT GRAPH CONSTRUCTION FOR FACTKG  ←←
# ============================================================
def _find_counterpart(evidence_map, wanted_rel, exclude):
    """Return the first entity other than `exclude` that has `wanted_rel` in any of its paths, else None."""
    for cand, groups in evidence_map.items():
        # An entity must never be selected as its own counterpart; otherwise an
        # entity carrying both `rel` and `~rel` would yield a self-loop triplet.
        if cand == exclude:
            continue
        if any(wanted_rel in g for g in groups):
            return cand
    return None


def generate(claim: str, sample: Dict[str, Any]):
    """Build (head, relation, tail) triplets from a sample's "Evidence" and store them on the sample.

    "Evidence" is read as a mapping from entity name to a list of relation
    paths, each path being a list of relation names. A relation prefixed with
    "~" is treated as inverse: the triplet is emitted with head and tail
    swapped and the "~" stripped.

    - Single-relation path: the other endpoint is the first *different*
      entity that lists the opposite-direction relation in any of its paths;
      if none exists, a fresh "unknown_<k>" placeholder is used.
    - Longer path: every hop ends in a fresh "unknown_<k>" placeholder, and
      the next hop starts from that placeholder.

    Parameters:
    - claim (str): Unused; kept so existing callers keep working.
    - sample (Dict[str, Any]): Sample dict. Only "Evidence" is read
      ("Entity_set" is not consulted). A missing or None "Evidence" is
      treated as empty.

    Returns:
    - The same `sample` object, mutated in place with a "triplet" key holding
      the de-duplicated list of triplets in first-seen order.
    """
    evidence = sample.get("Evidence") or {}

    unknown_list = []
    triplets = []

    # Evidence keyed by normalized entity name.
    evidence_map = {_norm(k): v for k, v in evidence.items()}

    for ent_norm, rel_groups in evidence_map.items():
        for rel_path in rel_groups:

            # CASE 1 — single hop: try to resolve the real entity on the other end.
            if len(rel_path) == 1:
                r = rel_path[0]
                inverse = r.startswith("~")
                rel = r[1:] if inverse else r

                # The counterpart is expected to list the same relation in the
                # opposite direction: "~rel" for a forward hop, "rel" for an inverse hop.
                wanted = rel if inverse else f"~{rel}"
                target_entity = _find_counterpart(evidence_map, wanted, ent_norm)

                # No counterpart found: fall back to a placeholder node.
                if target_entity is None:
                    target_entity = _new_unknown(unknown_list)

                if inverse:
                    triplets.append((target_entity, rel, ent_norm))
                else:
                    triplets.append((ent_norm, rel, target_entity))
                continue

            # CASE 2 — multi-hop path (an empty path emits nothing): walk from
            # the entity through a chain of fresh placeholder nodes.
            prev = ent_norm
            for r in rel_path:
                inverse = r.startswith("~")
                rel = r[1:] if inverse else r

                next_node = _new_unknown(unknown_list)

                if inverse:
                    triplets.append((next_node, rel, prev))
                else:
                    triplets.append((prev, rel, next_node))

                prev = next_node

    # Deduplicate while keeping first-seen order.
    triplets = list(dict.fromkeys(triplets))
    sample["triplet"] = triplets
    return sample


# --------------------------------------------------------------
def linearize(triplets: List[Tuple[str, str, str]]) -> str:
    """Render triplets as text, one "<e>head</e> || relation || <e>tail</e>" line per triplet.

    Lines are joined with newlines; an empty input yields an empty string.
    """
    rendered = []
    for head, relation, tail in triplets:
        rendered.append(f"<e>{head}</e> || {relation} || <e>{tail}</e>")
    return "\n".join(rendered)

def process_data(data: dict, remove_underscore: bool = True) -> Tuple[Dict, List]:
    """
    Create triplets for every sample in the given FactKG structure.

    Parameters:
    - data (dict): Mapping from claim to sample dict; each sample is passed to
      `generate`, which reads its 'Evidence'.
    - remove_underscore (bool): If True, replace underscores with spaces in entity names.

    Returns:
    - Tuple[Dict, List]: The updated data dictionary (same keys as `data`; the
      values are the sample objects returned by `generate`) and the list of
      distinct head/tail entities across all triplets, in arbitrary order.
      The entity list is intended for a later trie update.
    """
    # Kept function-local as in the original, but placed after the docstring so
    # the string above is actually registered as the function's __doc__.
    from tqdm import tqdm

    # Name normalization reads the module-level `remove_underscore` flag, so the
    # argument is applied to that flag for the duration of this call and the
    # previous value is restored afterwards. (A `global` statement cannot be
    # used here because the parameter has the same name.)
    module_ns = globals()
    previous_flag = module_ns.get("remove_underscore", True)
    module_ns["remove_underscore"] = remove_underscore
    try:
        updated_data = {}
        distinct_entities = set()
        for key in tqdm(list(data.keys()), desc="Processing data"):
            updated = generate(key, data[key])
            updated_data[key] = updated

            # The first and last element of each triplet are its entities.
            for triplet in updated["triplet"]:
                distinct_entities.add(triplet[0])
                distinct_entities.add(triplet[2])
    finally:
        module_ns["remove_underscore"] = previous_flag

    return updated_data, list(distinct_entities)

In [3]:
import pickle

# Data dir = the working directory's parent, then `resources`
# (i.e. <cwd>/../resources).
DATA_DIR = os.path.join(os.getcwd(), '..', 'resources')
print("Data Directory:", DATA_DIR)

TRAIN_FILE = 'factkg_train_5k.pickle'
TEST_FILE = 'factkg_test_1k.pickle'
VALID_FILE = 'factkg_val_300.pickle'

TRAIN_FILE_PATH = os.path.join(DATA_DIR, TRAIN_FILE)
TEST_FILE_PATH = os.path.join(DATA_DIR, TEST_FILE)
VALID_FILE_PATH = os.path.join(DATA_DIR, VALID_FILE)

# NOTE(security): pickle.load can execute arbitrary code while deserializing;
# only load pickle files that come from a trusted source.
with open(VALID_FILE_PATH, 'rb') as f:
    valid_data = pickle.load(f)

valid_updated_data, valid_distinct_entities = process_data(valid_data, remove_underscore=True)

Data Directory: d:\claimpkg\claimpkg-clone\src\test\..\resources


Processing data: 100%|██████████| 300/300 [00:00<00:00, 149458.51it/s]


In [5]:
from llm.pseudograph_checking_llm import PseudoGraphCheckingLLM

# Only the first 10 claims of the processed validation data are submitted.
keys = list(valid_updated_data)[:10]

pseudograph_llm = PseudoGraphCheckingLLM()
res = []
for key in keys:
    sample = valid_updated_data[key]
    triplets = sample["triplet"]
    linearized_triplets = linearize(triplets)

    # Submit the sample with its linearized triplets; responses are kept in claim order.
    response = pseudograph_llm.submit(sample, linearized_triplets)
    res.append(response)

In [6]:
# Print the responses returned by pseudograph_llm.submit, in the order the claims were submitted.
print(res)

['CORRECT', 'CORRECT', 'CORRECT', 'CORRECT', 'CORRECT', 'CORRECT', 'CORRECT', 'CORRECT', 'CORRECT', 'CORRECT']


In [11]:
# Position (in key order) of the sample to inspect.
INDEX = 3
keys = list(valid_updated_data)
# First line: the key (the claim text); second line: its processed sample dict.
print(keys[INDEX], valid_updated_data[keys[INDEX]], sep="\n")

Guiana Space Centre was the launch site of the rocket which was manufactured by Arianespace and had its final flight on 27th September, 2003.
{'Label': [True], 'Entity_set': ['"2003-09-27"', 'Arianespace', 'Guiana_Space_Centre'], 'Evidence': {'Arianespace': [['~manufacturer', 'sites'], ['~manufacturer', 'finalFlight']], 'Guiana_Space_Centre': [['~sites', 'manufacturer'], ['~sites', 'finalFlight']], '"2003-09-27"': [['~finalFlight', 'manufacturer'], ['~finalFlight', 'sites']]}, 'types': ['written', 'num3', 'multi hop'], 'triplet': [('unknown_0', 'manufacturer', 'Arianespace'), ('unknown_0', 'sites', 'unknown_1'), ('unknown_2', 'manufacturer', 'Arianespace'), ('unknown_2', 'finalFlight', 'unknown_3'), ('unknown_4', 'sites', 'Guiana Space Centre'), ('unknown_4', 'manufacturer', 'unknown_5'), ('unknown_6', 'sites', 'Guiana Space Centre'), ('unknown_6', 'finalFlight', 'unknown_7'), ('unknown_8', 'finalFlight', '"2003-09-27"'), ('unknown_8', 'manufacturer', 'unknown_9'), ('unknown_10', 'fina