In [1]:
import os
import sys
# Put the parent directory first on the module search path so imports can resolve
# against it. The relative '..' is interpreted against the current working directory.
sys.path.insert(0, '..')

In [2]:
import re
from typing import Dict, Any, List, Tuple

# Module-wide default: replace "_" with " " in entity names unless a caller says otherwise.
remove_underscore = True


def _norm(s, strip_underscores=None):
    """Normalize an entity name.

    Parameters:
    - s: Entity name, or None (returned unchanged).
    - strip_underscores: True/False forces the behaviour; None (default) defers
      to the module-level ``remove_underscore`` flag, read at call time.

    Returns:
    - ``s`` with underscores replaced by spaces when stripping is enabled,
      otherwise ``s`` itself.
    """
    if s is None:
        return s
    if strip_underscores is None:
        strip_underscores = remove_underscore
    return s.replace("_", " ") if strip_underscores else s


def _new_unknown(unk_list):
    """Allocate the next ``unknown_<k>`` placeholder and record it in ``unk_list``.

    ``k`` is the current length of ``unk_list``, so names stay unique as long as
    every placeholder is registered in the same list.
    """
    u = f"unknown_{len(unk_list)}"
    unk_list.append(u)
    return u


def _first_entity_by_relation(evidence_map):
    """Map every relation token to the first entity whose evidence mentions it.

    Entities and their relation paths are visited in insertion order, so the
    entity stored for a token is the same one a linear scan would hit first.
    """
    index = {}
    for entity, rel_groups in evidence_map.items():
        for rel_path in rel_groups:
            for token in rel_path:
                index.setdefault(token, entity)
    return index


# ============================================================
#               GRAPH CONSTRUCTION FOR FACTKG
# ============================================================

def generate(claim: str, sample: Dict[str, Any], remove_underscore=None):
    """Build the list of (head, relation, tail) triplets for one sample.

    Every relation path found under ``sample["Evidence"]`` is walked starting
    from the shared hidden node ``unknown_0``. A relation written ``~r`` is an
    inverse hop and yields ``(target, r, current)``; a plain ``r`` yields
    ``(current, r, target)``.

    * Intermediate hops always get a fresh ``unknown_<k>`` node as target.
    * The last hop targets the first entity whose evidence mentions the
      complementary token (``r`` for ``~r`` and ``~r`` for ``r``). When no such
      entity exists a fresh ``unknown_<k>`` node is used, so a triplet never
      contains ``None``.

    NOTE(review): paths are anchored at the hidden node, not at the entity that
    owns the evidence entry - confirm this is the intended graph shape.

    Parameters:
    - claim (str): Claim text. Currently unused; kept for interface stability.
    - sample (dict): Record that may contain an 'Evidence' mapping of
      entity -> list of relation paths. It is modified in place.
    - remove_underscore: True/False to force underscore stripping of entity
      names; None (default) uses the module-level ``remove_underscore`` flag.

    Returns:
    - dict: The same ``sample`` object with a de-duplicated, order-preserving
      list stored under the 'triplet' key.
    """
    evidence = sample.get("Evidence", {})
    evidence_map = {_norm(k, remove_underscore): v for k, v in evidence.items()}

    hidden = "unknown_0"
    unknown_list = [hidden]  # index 0 is taken by the shared hidden node
    target_by_token = _first_entity_by_relation(evidence_map)

    triplets = []
    for rel_groups in evidence_map.values():
        for rel_path in rel_groups:
            curr = hidden  # every path restarts from the shared hidden node
            last_hop = len(rel_path) - 1

            for i, r in enumerate(rel_path):
                inverse = r.startswith("~")
                rel = r[1:] if inverse else r

                target = None
                if i == last_hop:
                    # The real endpoint is the entity that mentions the
                    # complementary direction of this relation.
                    target = target_by_token.get(rel if inverse else f"~{rel}")
                if target is None:
                    # Internal hop, or a last hop without a named endpoint:
                    # continue through a new hidden node.
                    target = _new_unknown(unknown_list)

                triplets.append((target, rel, curr) if inverse else (curr, rel, target))
                curr = target

    # dict.fromkeys drops duplicates while keeping first-seen order.
    sample["triplet"] = list(dict.fromkeys(triplets))
    return sample


# --------------------------------------------------------------
def linearize(triplets: List[Tuple[str, str, str]]) -> str:
    """Render triplets one per line as ``<e>head</e> || relation || <e>tail</e>``."""
    return "\n".join(f"<e>{h}</e> || {r} || <e>{t}</e>" for h, r, t in triplets)



def process_data(data: dict, remove_underscore: bool = True) -> Tuple[Dict, List]:
    """
    Create triplets for every sample of a FactKG-style mapping.

    Parameters:
    - data (dict): Mapping of claim text -> sample dict (with 'Evidence').
      Each sample dict is updated in place with a 'triplet' entry.
    - remove_underscore (bool): If True, replace underscores with spaces in entity names.

    Returns:
    - Tuple[Dict, List]: The claim -> updated sample mapping, and a list of the
      distinct head/tail entities seen in all triplets (order is unspecified).
    """
    # The progress bar is cosmetic: fall back to plain iteration without tqdm.
    try:
        from tqdm import tqdm
    except ImportError:
        tqdm = None

    updated_data = {}
    distinct_entities = set()
    keys = list(data.keys())
    progress = tqdm(keys, desc="Processing data") if tqdm is not None else keys
    for key in progress:
        updated = generate(key, data[key], remove_underscore=remove_underscore)
        updated_data[key] = updated

        # Heads and tails (positions 0 and 2) are the entities of a triplet.
        for triplet in updated["triplet"]:
            distinct_entities.add(triplet[0])
            distinct_entities.add(triplet[2])

    return updated_data, list(distinct_entities)

In [3]:
import pickle

# The resources folder sits one level above the notebook's working directory.
DATA_DIR = os.path.join(os.getcwd(), '..', 'resources')
print("Data Directory:", DATA_DIR)

TRAIN_FILE = 'factkg_train_5k.pickle'
TEST_FILE = 'factkg_test_1k.pickle'
VALID_FILE = 'factkg_val_300.pickle'

TRAIN_FILE_PATH = os.path.join(DATA_DIR, TRAIN_FILE)
TEST_FILE_PATH = os.path.join(DATA_DIR, TEST_FILE)
VALID_FILE_PATH = os.path.join(DATA_DIR, VALID_FILE)


def _load_pickle(path):
    """Read and return the object stored in the pickle file at ``path``."""
    # SECURITY: unpickling can execute arbitrary code - only load trusted files.
    with open(path, 'rb') as f:
        return pickle.load(f)


# Load every split first, then build the triplets for each of them.
train_data = _load_pickle(TRAIN_FILE_PATH)
test_data = _load_pickle(TEST_FILE_PATH)
valid_data = _load_pickle(VALID_FILE_PATH)

train_updated_data, train_distinct_entities = process_data(train_data, remove_underscore=True)

test_updated_data, test_distinct_entities = process_data(test_data, remove_underscore=True)

valid_updated_data, valid_distinct_entities = process_data(valid_data, remove_underscore=True)

Data Directory: d:\claimpkg\claimpkg-clone\src\notebooks\..\resources


Processing data: 100%|██████████| 5000/5000 [00:00<00:00, 61507.81it/s]
Processing data: 100%|██████████| 1000/1000 [00:00<00:00, 132529.83it/s]
Processing data: 100%|██████████| 300/300 [00:00<00:00, 85481.74it/s]


In [4]:
# Spot-check one multi-hop validation claim: the cell output shows the stored record
# together with the 'triplet' list that generate() attached to it.
valid_updated_data['Guiana Space Centre was the launch site of the rocket which was manufactured by Arianespace and had its final flight on 27th September, 2003.']

# Claim text used as the lookup key above:
# Guiana Space Centre was the launch site of the rocket which was manufactured by Arianespace and had its final flight on 27th September, 2003

{'Label': [True],
 'Entity_set': ['"2003-09-27"', 'Arianespace', 'Guiana_Space_Centre'],
 'Evidence': {'Arianespace': [['~manufacturer', 'sites'],
   ['~manufacturer', 'finalFlight']],
  'Guiana_Space_Centre': [['~sites', 'manufacturer'],
   ['~sites', 'finalFlight']],
  '"2003-09-27"': [['~finalFlight', 'manufacturer'],
   ['~finalFlight', 'sites']]},
 'types': ['written', 'num3', 'multi hop'],
 'triplet': [('unknown_1', 'manufacturer', 'unknown_0'),
  ('unknown_1', 'sites', 'Guiana Space Centre'),
  ('unknown_2', 'manufacturer', 'unknown_0'),
  ('unknown_2', 'finalFlight', '"2003-09-27"'),
  ('unknown_3', 'sites', 'unknown_0'),
  ('unknown_3', 'manufacturer', 'Arianespace'),
  ('unknown_4', 'sites', 'unknown_0'),
  ('unknown_4', 'finalFlight', '"2003-09-27"'),
  ('unknown_5', 'finalFlight', 'unknown_0'),
  ('unknown_5', 'manufacturer', 'Arianespace'),
  ('unknown_6', 'finalFlight', 'unknown_0'),
  ('unknown_6', 'sites', 'Guiana Space Centre')]}

In [5]:
# Write each processed split back to DATA_DIR as its own pickle file.
# Note: the validation split is stored under a "dev" file name.
triplet_exports = (
    ('factkg_train_5k_triplets.pickle', train_updated_data),
    ('factkg_test_1k_triplets.pickle', test_updated_data),
    ('factkg_dev_300_triplets.pickle', valid_updated_data),
)
for export_name, export_payload in triplet_exports:
    with open(os.path.join(DATA_DIR, export_name), 'wb') as export_file:
        pickle.dump(export_payload, export_file)

In [6]:
# Union of the head/tail entities collected from the three splits.
all_join_entities = {
    *train_distinct_entities,
    *test_distinct_entities,
    *valid_distinct_entities,
}

print("Total distinct entities across all datasets:", len(all_join_entities))

Total distinct entities across all datasets: 5269


In [7]:
# Save the combined entity set as a pickle; it is meant to feed a later trie rebuild.
entities_pickle_path = os.path.join(DATA_DIR, 'factkg_all_distinct_entities.pickle')
with open(entities_pickle_path, 'wb') as entity_sink:
    pickle.dump(all_join_entities, entity_sink)