In [1]:
import os
import sys
sys.path.insert(0, '..')


In [2]:
import re
from typing import Dict, Any, List, Tuple

remove_underscore = True

def _norm(s):
    if s is None:
        return s
    return s.replace("_", " ") if remove_underscore else s

def _new_unknown(unk_list):
    u = f"unknown_{len(unk_list)}"
    unk_list.append(u)
    return u

def generate(claim: str, sample: Dict[str, Any]):
    # Normalize entities by adding the underscore
    entity_set = [_norm(e) for e in sample.get("Entity_set", [])]

    # Get evidences
    evidence = sample.get("Evidence", {})

    # Normalize keys
    evidence_map = { _norm(k): v for k, v in evidence.items() }

    # --------------------------------------------------------------
    # STEP 1 — Build direct matches (A --rel--> B)
    # --------------------------------------------------------------
    direct_edges = []

    for e1, groups1 in evidence_map.items():
        for rels1 in groups1:
            for r in rels1:
                inverse = r.startswith("~")
                rel = r[1:] if inverse else r

                # CASE 1: forward r → look for entity with ~r
                if not inverse:
                    for e2, groups2 in evidence_map.items():
                        if e1 == e2: continue
                        for rels2 in groups2:
                            if f"~{rel}" in rels2:
                                direct_edges.append((e1, rel, e2))

                # CASE 2: inverse ~r → look for entity with forward r
                else:
                    for e2, groups2 in evidence_map.items():
                        if e1 == e2: continue
                        for rels2 in groups2:
                            if rel in rels2:
                                direct_edges.append((e2, rel, e1))

    # --------------------------------------------------------------
    # STEP 2 — Now build multi-hop paths (using unknown_i)
    # --------------------------------------------------------------
    unknown_list = []
    triplets = []

    # dictionary to quickly check for direct pairs
    direct_set = set((h, r, t) for h, r, t in direct_edges)

    # Helper to detect if last hop of unit should link directly
    def find_direct_target(rel, exclude):
        for h, r, t in direct_edges:
            if r == rel:
                if h != exclude:
                    return t
        return None

    for ent, rel_groups in evidence_map.items():
        for rel_path in rel_groups:

            # If path has only 1 rel, and direct match exists → already captured
            if len(rel_path) == 1:
                r = rel_path[0]
                inv = r.startswith("~")
                rel = r[1:] if inv else r

                # direct already handled → skip
                continue

            # Multi-hop path → must use unknowns
            curr = ent
            for i, r in enumerate(rel_path):
                inv = r.startswith("~")
                rel = r[1:] if inv else r

                # last hop? try direct match
                if i == len(rel_path) - 1:
                    target = find_direct_target(rel, curr)
                    if target is None:
                        target = _new_unknown(unknown_list)
                else:
                    # internal hop → always unknown
                    target = _new_unknown(unknown_list)

                # build triplet
                if inv:
                    triplets.append((target, rel, curr))
                else:
                    triplets.append((curr, rel, target))

                curr = target

    # --------------------------------------------------------------
    # STEP 3 — Combine direct + multi-hop, deduplicate
    # --------------------------------------------------------------
    triplets.extend(direct_edges)
    final = list(dict.fromkeys(triplets))

    sample["triplet"] = final
    return sample


# --------------------------------------------------------------
def linearize(triplets: List[Tuple[str, str, str]]) -> str:
    return "\n".join(f"<e>{h}</e> || {r} || <e>{t}</e>" for h, r, t in triplets)



def process_data(data: dict, remove_underscore: bool = True) -> Tuple[Dict, List]:
    from tqdm import tqdm

    updated_data = {}
    distinct_entities = set()
    keys = list(data.keys())

    for key in tqdm(keys, desc="Processing data"):
        updated = generate(key, data[key])
        updated_data[key] = updated

        for h, r, t in updated["triplet"]:
            distinct_entities.add(h)
            distinct_entities.add(t)

    return updated_data, list(distinct_entities)


In [3]:
DATA_DIR = 'resources'
# Data dir = (1) working directory, (2) move out of test, (3) move out of src, and append to resources
DATA_DIR = os.path.join(os.getcwd(), '..', 'resources')
print("Data Directory:", DATA_DIR)

TRAIN_FILE = 'factkg_train_5k.pickle'
TEST_FILE = 'factkg_test_1k.pickle'
VALID_FILE = 'factkg_val_300.pickle'

TRAIN_FILE_PATH = os.path.join(DATA_DIR, TRAIN_FILE)
TEST_FILE_PATH = os.path.join(DATA_DIR, TEST_FILE)
VALID_FILE_PATH = os.path.join(DATA_DIR, VALID_FILE)

import pickle

train_data = None
test_data = None
valid_data = None
with open(TRAIN_FILE_PATH, 'rb') as f:
    train_data = pickle.load(f)
with open(TEST_FILE_PATH, 'rb') as f:
    test_data = pickle.load(f)
with open(VALID_FILE_PATH, 'rb') as f:
    valid_data = pickle.load(f)

train_updated_data, train_distinct_entities = process_data(train_data, remove_underscore=True)

test_updated_data, test_distinct_entities = process_data(test_data, remove_underscore=True)

valid_updated_data, valid_distinct_entities = process_data(valid_data, remove_underscore=True)

Data Directory: d:\claimpkg\claimpkg-clone\src\notebooks\..\resources


Processing data: 100%|██████████| 5000/5000 [00:00<00:00, 30026.62it/s]
Processing data: 100%|██████████| 1000/1000 [00:00<00:00, 86544.73it/s]
Processing data: 100%|██████████| 300/300 [00:00<00:00, 73292.82it/s]


In [4]:
# Concat the 3 set train, test, valid
concat_data = {}
concat_data.update(train_updated_data)
concat_data.update(test_updated_data)
concat_data.update(valid_updated_data)

print("Total number of items in concatenated data:", len(concat_data))

key_list = list(concat_data.keys())

Total number of items in concatenated data: 6300


In [5]:
# Get all types items from train, test, and valid datasets
types = set()
for data in [train_updated_data, test_updated_data, valid_updated_data]:
    for item in data:
        for element in data[item]['types']:
            types.add(element)
types = list(types)
print("Distinct types found:", len(types))
types

Distinct types found: 13


['num2',
 'num4',
 'multi claim',
 'negation',
 'num3',
 'coll:model',
 'num1',
 'existence',
 'written',
 'multi hop',
 'coll:presup',
 'substitution',
 'question']

# Explain

| Type           | Category            | Ý nghĩa                                |
| -------------- | ------------------- | -------------------------------------- |
| `written`      | claim style         | Văn phong tự nhiên                     |
| `coll:model`   | claim style         | Văn nói do model sinh                  |
| `coll:presup`  | claim style         | Dạng câu hỏi giả định (presupposition) |
| `num1`         | reasoning           | One-hop                                |
| `multi claim`  | reasoning           | Contains multiple facts                |
| `existence`    | reasoning           | Hỏi về sự tồn tại                      |
| `multi hop`    | reasoning           | Multi-hop reasoning                    |
| `negation`     | reasoning           | Phủ định                               |
| `num2`         | reasoning (complex) | Multi relation 2 chiều                 |
| `num3`         | reasoning (complex) | Multi relation 3 chiều                 |
| `substitution` | generation          | Claim tạo bằng thay thế thông tin      |


So, these won't work or need to concern.
(1) existence
(2) num[```i```]
(3) multi-hop

The next step is to find number of rows to concern

In [6]:
is_concern : list[bool] = []
# Get all types items from train, test, and valid datasets
for data in [train_updated_data, test_updated_data, valid_updated_data]:
    for item in data:
        if 'multi-hop' in data[item]['types']:
            is_concern.append(True)
            continue
        if 'num2' in data[item]['types']:
            is_concern.append(True)
            continue
        if 'num3' in data[item]['types']:
            is_concern.append(True)
            continue
        if 'num4' in data[item]['types']:
            is_concern.append(True)
            continue
        is_concern.append(False)

print("Number of concerned types:", sum(is_concern))
print("Total types checked:", len(is_concern))

# Append the is_concern flag to the concat_data
for idx, key in enumerate(key_list):
    concat_data[key]['is_concern'] = is_concern[idx]
    concat_data[key]['complexity'] = 'skipped'

# See the example
example_key = key_list[0]
print(f"Example Key: {example_key}")
print(f"DATA: {concat_data[example_key]}")

Number of concerned types: 4066
Total types checked: 6300
Example Key: is published by Lippincott Williams & Wilkins in the UK where English is the main language.
DATA: {'Label': [True], 'Entity_set': ['United_Kingdom', 'English_language', 'Lippincott_Williams_&_Wilkins', 'AIDS_(journal)'], 'Evidence': {'United_Kingdom': [['language'], ['~country']], 'AIDS_(journal)': [['country'], ['publisher']], 'English_language': [['~language']], 'Lippincott_Williams_&_Wilkins': [['~publisher']]}, 'types': ['coll:model', 'num4', 'multi claim'], 'triplet': [('United Kingdom', 'language', 'English language'), ('AIDS (journal)', 'country', 'United Kingdom'), ('AIDS (journal)', 'publisher', 'Lippincott Williams & Wilkins')], 'is_concern': True, 'complexity': 'skipped'}


In [7]:
INDEX = 20
print(f"Concern state: {is_concern[INDEX]}")
print(f"Key: {key_list[INDEX]}")
print(f"DATA: {concat_data[key_list[INDEX]]}")

Concern state: False
Key: It was Micol Fontana who did not have an award.
DATA: {'Label': [True], 'Entity_set': ['Micol_Fontana'], 'Evidence': {'Micol_Fontana': [['award']]}, 'types': ['coll:model', 'negation', 'existence'], 'triplet': [], 'is_concern': False, 'complexity': 'skipped'}


# Analyzing the difficult level of multi-hop or num-i

In [8]:
import re

PRONOUNS = {"he", "she", "they", "them", "his", "her", "their",
            "its", "it", "this artist", "the artist", "the city",
            "the governor", "the musician"}

def contains_pronoun(claim: str):
    text = claim.lower()
    return any(p in text for p in PRONOUNS)

def classify_multihop_complexity(claim: str, sample: dict):
    """
    Classify multi-hop difficulty into: easy, medium, hard
    """
    types = sample["types"]
    evidence = sample.get("Evidence")

    # Not multi-hop => always easy
    if "multi hop" not in types:
        return "easy"

    # Count relations per entity
    rel_counts = {ent: len(paths) for ent, paths in evidence.items()}

    # Count relation-types inside each hop
    hop_complexity = []
    for ent, paths in evidence.items():
        for hop in paths:
            hop_complexity.append(len(hop))

    max_rels_per_entity = max(rel_counts.values()) if rel_counts else 0
    max_hop_width = max(hop_complexity) if hop_complexity else 0

    # Number of entities
    num_entities = len(evidence)

    # Detect cycles -- if entity A has ~r followed by B has r again
    def has_inverse_cycles(evidence):
        inverse_pairs = {}
        for ent, paths in evidence.items():
            for hop in paths:
                for rel in hop:
                    if rel.startswith("~"):
                        inverse_pairs.setdefault(ent, []).append(rel[1:])
        # If multiple entities share inverse forms => likely cycle
        inverse_map = {}
        for ent, relations in inverse_pairs.items():
            for r in relations:
                inverse_map.setdefault(r, []).append(ent)
        return any(len(v) > 1 for v in inverse_map.values())

    # Rule: Easy cases
    if max_rels_per_entity == 1 and max_hop_width == 1:
        # no ambiguity, no complex inverse chains
        return "easy"

    # Rule: Hard cases (requiring GPT)
    # 1. Implicit subject/object → pronoun
    if contains_pronoun(claim):
        return "hard"

    # 2. Entity has > 2 relation paths → ambiguous multi-hop (num2, num3)
    if max_rels_per_entity >= 3:
        return "hard"

    # 3. Any hop uses >= 3 relations (multi-path)
    if max_hop_width >= 3:
        return "hard"

    # 4. Inverse cycles or loops
    if has_inverse_cycles(evidence):
        return "hard"

    # 5. Too many entities in multi-hop (structure ambiguous)
    if num_entities >= 4:
        return "hard"

    # If not easy, not hard → medium
    return "medium"

complexity_counts = {"easy": [], "medium": [], "hard": []}

for i, key in enumerate(concat_data):
    if is_concern[i] == False:
        continue

    item = concat_data[key]
    difficulty = classify_multihop_complexity(key, item)
    complexity_counts[difficulty].append(key)
    concat_data[key]['complexity'] = difficulty

In [9]:
# Count and show percentages
total_counts = sum(len(v) for v in complexity_counts.values())
for level, items in complexity_counts.items():
    count = len(items)
    percentage = (count / total_counts) * 100 if total_counts > 0 else 0
    print(f"{level.capitalize()}: {count} items ({percentage:.2f}%)")

Easy: 2315 items (56.94%)
Medium: 113 items (2.78%)
Hard: 1638 items (40.29%)


# Inspecting each classification results

In [10]:
INDEX = 3
key = complexity_counts['hard'][INDEX]
print(f"Key: {key}")
print(f"DATA: {concat_data[key]}")

Key: Paleobiology is the academic discipline of an academic journal (abbreviated to Acta Palaeontol. Pol) which has the ISSN number 0567-7920.
DATA: {'Label': [True], 'Entity_set': ['"0567-7920"', 'Paleobiology', '"Acta Palaeontol. Pol."'], 'Evidence': {'"Acta Palaeontol. Pol."': [['~abbreviation', 'discipline'], ['~abbreviation', 'issn']], 'Paleobiology': [['~discipline', 'abbreviation'], ['~discipline', 'issn']], '"0567-7920"': [['~issn', 'abbreviation'], ['~issn', 'discipline']]}, 'types': ['written', 'num3', 'multi hop'], 'triplet': [('unknown_0', 'abbreviation', '"Acta Palaeontol. Pol."'), ('unknown_0', 'discipline', 'Paleobiology'), ('unknown_1', 'abbreviation', '"Acta Palaeontol. Pol."'), ('unknown_1', 'issn', '"0567-7920"'), ('unknown_2', 'discipline', 'Paleobiology'), ('unknown_2', 'abbreviation', '"Acta Palaeontol. Pol."'), ('unknown_3', 'discipline', 'Paleobiology'), ('unknown_3', 'issn', '"0567-7920"'), ('unknown_4', 'issn', '"0567-7920"'), ('unknown_4', 'abbreviation', '"A

-> Easy set is good, but medium and hard set sucks. Let's extract it into a pickle file

In [11]:
# Export the classified data into a pickle file
OUTPUT_FILE = 'classified_book_dataset_6300.pickle'
OUTPUT_PATH = os.path.join(DATA_DIR, OUTPUT_FILE)
with open(OUTPUT_PATH, 'wb') as f:
    pickle.dump(concat_data, f)
print(f"Classified data saved to {OUTPUT_PATH}")

Classified data saved to d:\claimpkg\claimpkg-clone\src\notebooks\..\resources\classified_book_dataset_6300.pickle
