In [1]:
import pandas as pd
import Levenshtein
import json
import glob

In [2]:
# First, combine all entities and triplets
# Every per-batch JSON file is read in sorted filename order and its records
# are appended to one list, which is then written out as a single file.
# NOTE(review): no explicit encoding is passed to open(), so the platform
# default is used for reading and writing. ensure_ascii=False emits non-ASCII
# text verbatim -- confirm the default is UTF-8 wherever this runs.
batch_paths = sorted(glob.glob('data/batches/entities_and_triplets_*.json'))

all_data = []
for batch_path in batch_paths:
    with open(batch_path, 'r') as batch_file:
        # The loaded value is appended element-wise (presumably a JSON array).
        all_data += json.load(batch_file)

with open('data/raw_triplets.json', 'w') as output_file:
    json.dump(all_data, output_file, indent=4, ensure_ascii=False)

In [3]:
# Enrich triplets with entity type data
# For each triplet, the head and tail strings are compared with every entity
# recognised for the same instance. The most similar entity whose Levenshtein
# ratio is strictly above `threshold` supplies the word and entity type;
# otherwise the original string is kept and typed 'Unknown'.
def _closest_entity(text, candidates, min_ratio):
    """Return the candidate most similar to `text` as a {'word', 'entity'} dict.

    A candidate qualifies only if its Levenshtein ratio with `text` is strictly
    greater than `min_ratio`; among equal ratios the earliest candidate wins.
    If nothing qualifies, `text` itself is returned with entity 'Unknown'.
    """
    winner = None
    winner_ratio = min_ratio
    for candidate in candidates:
        candidate_word = candidate['word']
        ratio = Levenshtein.ratio(candidate_word, text)
        if ratio > winner_ratio:
            winner_ratio = ratio
            winner = {
                'word': candidate_word,
                'entity': candidate['entity']
            }
    if winner:
        return winner
    return {'word': text, 'entity': 'Unknown'}


with open('data/raw_triplets.json', 'r') as file:
    raw_data = json.load(file)

threshold = 0.8
enriched_data = []
# Flat list of every enriched triplet across all instances. It is not consumed
# in this cell -- presumably a cell not shown here uses it.
enriched_triplets_all = []

for instance in raw_data:
    triplets = instance['triplets']
    entities = instance['entities']
    url = instance['url']

    enriched_triplets = [
        {
            'head': _closest_entity(triplet['head'], entities, threshold),
            'relation': triplet['relation'],
            'tail': _closest_entity(triplet['tail'], entities, threshold),
        }
        for triplet in triplets
    ]

    # Keep the originals next to the enriched version so matches can be audited.
    enriched_data.append({
        'idx': instance['idx'],
        'url': url,
        'enriched_triplets': enriched_triplets,
        'original_triplets': triplets,
        'all_entities': entities
    })
    enriched_triplets_all += enriched_triplets

with open('data/enriched_triplets.json', 'w') as file:
    json.dump(enriched_data, file, indent=4, ensure_ascii=False)

In [5]:
# Whole-string replacements. custom_replacement() compares the stripped,
# lower-cased input with each key (also lower-cased), so key casing here is
# cosmetic; identity entries only normalise the casing of the output.
exact_replacements = {
    # Demonyms -> country
    'Russian': 'Russia', 'Chinese': 'China',
    'American': 'United States', 'British': 'United Kingdom', 'Polish': 'Poland',
    'Swedish': 'Sweden', 'French': 'France', 'South Korean': 'South Korea',
    'Dutch': 'Netherlands',
    # 'Ukranian' is a misspelling; it is kept so inputs that carry the same
    # typo still resolve, alongside the correct spelling.
    'Ukrainian': 'Ukraine', 'Ukranian': 'Ukraine',

    # Countries etc.
    'UK': 'United Kingdom', 'US': 'United States', 'UAE': 'United Arab Emirates',
    'Russian Federation': 'Russia', 'Czech': 'Czech Republic', 'Netherlands': 'Netherlands',
    'Korea': 'South Korea',
    'EU': 'European Union', 'UN': 'United Nations',

    # Abbreviations
    'WEC': 'Westinghouse', 'USNC': 'Ultra Safe Nuclear Corporation', 'GEH': 'GE Hitachi', 'GA': 'General Atomics',
    'GFP': 'Global First Power', 'ORNL': 'Oak Ridge National Laboratory',
    'IAEA': 'International Atomic Energy Agency'
}

# Canonical organisation names. custom_replacement() applies these to ORG
# entities with a case-insensitive substring test, and when several keys
# match, the one listed LAST wins -- so entry order is significant.
# NOTE(review): short keys such as 'ARC', 'NANO' and 'WEC' also match inside
# longer words (e.g. 'arc' in 'research'); 'WEC' here overrides the
# exact_replacements entry 'WEC' -> 'Westinghouse' for ORG entities. Confirm
# both are intended.
org_replacements = { # Replace if entity type == ORG (substring match)
    'ARC': 'ARC',
    'Babcock': 'Babcock and Wilcox',
    'BWX': 'BWX',
    'Elysium': 'Elysium',
    'Flibe': 'Flibe',
    'Framatome': 'Framatome',
    'Hitachi': 'GE Hitachi',
    'General Atomics': 'General Atomics',
    'Holos': 'HolosGen',
    'Holtec': 'Holtec International',
    'Hyperion': 'Hyperion Power',
    'Kairos': 'Kairos Power',
    'Moltex': 'Moltex Energy',
    'NANO': 'NANO Nuclear',
    'NuScale': 'NuScale',
    'Oak Ridge': 'Oak Ridge National Laboratory',
    'Oklo': 'Oklo',
    'StarCore': 'StarCore Nuclear',
    'TerraPower': 'TerraPower',
    # The misspelt key is kept for inputs carrying the same typo; both
    # spellings now resolve to the correctly spelt name.
    'Terrestial': 'Terrestrial',
    'Terrestrial': 'Terrestrial',
    'ThorCon': 'ThorCon',
    'Ultra Safe Nuclear': 'Ultra Safe Nuclear Corporation',
    'Berkeley': 'Berkeley',
    'Westinghouse': 'Westinghouse',
    'X-Energy': 'X-Energy',

    'Point Lepreau': 'Point Lepreau NPP',
    # No trailing space in the key: the input is stripped before matching, so
    # a key ending in a space could never match the bare name.
    'China General Nuclear': 'China General Nuclear Power Corporation',
    'Clinch River': 'Clinch River Site',
    'East Tennessee Technology': 'East Tennessee Technology Park',
    'Fukushima': 'Fukushima',
    'Jacobs UK': 'Jacobs',
    'Korea Electric Power': 'Korea Electric Power Company',
    'Magnox': 'Magnox',
    'ORLEN Synthos': 'ORLEN Synthos Green Energy',
    'Rolls Royce': 'Rolls-Royce',
    'Sizewell': 'Sizewell C',
    'WEC': 'WEC Group',
    'Temelin': 'Temelin NPP',
    'TransAlta': 'TransAlta Corporation'
}

# Canonical person names: custom_replacement() applies these to PER entities
# whenever the key occurs anywhere in the name (case-insensitive substring).
per_replacements = {'Grossi': 'Rafael Grossi'}

# Canonical location names: applied to LOC entities with the same
# case-insensitive substring rule.
loc_replacements = {'Point Lepreau': 'Point Lepreau NPP'}

# Character-level clean-up. custom_replacement() applies every entry with
# str.replace, in the order listed here, before any name matching happens.
# Most entries fold accented Latin letters to their plain ASCII counterpart.
letter_replacements = {
    'à': 'a', 'À': 'A',
    'á': 'a', 'Á': 'A',
    'ä': 'a', 'Ä': 'A',
    'ã': 'a', 'Ã': 'A',
    'å': 'a', 'Å': 'A',
    'ą': 'a', 'Ą': 'A',
    'ç': 'c', 'Ç': 'C',
    'č': 'c', 'Č': 'C',
    'ď': 'd', 'Ď': 'D',
    'é': 'e', 'É': 'E',
    'è': 'e', 'È': 'E',
    'ě': 'e', 'Ě': 'E',
    'ğ': 'g', 'Ğ': 'G',
    'í': 'i', 'Í': 'I',
    'ï': 'i', 'Ï': 'I',
    'ı': 'i', 'İ': 'I',
    'ł': 'l', 'Ł': 'L',
    'ñ': 'n', 'Ñ': 'N',
    'ň': 'n', 'Ň': 'N',
    'ń': 'n', 'Ń': 'N',
    'ó': 'o', 'Ó': 'O',
    'ö': 'o', 'Ö': 'O',
    'ô': 'o', 'Ô': 'O',
    'õ': 'o', 'Õ': 'O',
    'ø': 'o', 'Ø': 'O',
    'ř': 'r', 'Ř': 'R',
    'š': 's', 'Š': 'S',
    'ť': 't', 'Ť': 'T',
    'ú': 'u', 'Ú': 'U',
    'ü': 'u', 'Ü': 'U',
    'û': 'u', 'Û': 'U',
    'ù': 'u', 'Ù': 'U',
    'ý': 'y', 'Ý': 'Y',
    'ž': 'z', 'Ž': 'Z',
    # Punctuation: a full stop becomes a space; a space followed by an opening
    # curly quote is removed.
    # NOTE(review): there is no matching rule for the closing quote -- confirm.
    '.': ' ', ' “': '',
    '&': 'and',
}

def custom_replacement(word, entity=None):
    """Clean an entity name and map it to its canonical form.

    Steps, in order:
      1. Apply every `letter_replacements` substitution to `word`.
      2. Collapse whitespace runs to one space and trim both ends. The
         substitutions turn '.' into ' ', which would otherwise leave names
         such as 'Foo Inc.' with a trailing space.
      3. Look the whole cleaned name up in `exact_replacements`,
         case-insensitively.
      4. If `entity` is 'ORG', 'PER' or 'LOC', test each key of the matching
         table as a case-insensitive substring of the cleaned name. A hit
         overrides step 3; when several keys hit, the last one in the table
         wins.

    Args:
        word: Raw entity text (a str).
        entity: Entity type label. Only 'ORG', 'PER' and 'LOC' select a
            substring table; any other value (including None) skips step 4.

    Returns:
        The canonical name if a rule matched, otherwise the cleaned `word`.
    """
    for letter, replacement in letter_replacements.items():
        word = word.replace(letter, replacement)

    # Normalise whitespace so the returned name is identical for inputs that
    # differ only in punctuation or spacing.
    word = ' '.join(word.split())
    lower_word = word.lower()

    new = None

    for key, replacement in exact_replacements.items():
        if lower_word == key.lower():
            new = replacement

    # One substring table per entity type; unknown types get no table.
    substring_tables = {
        'ORG': org_replacements,
        'PER': per_replacements,
        'LOC': loc_replacements,
    }
    for key, replacement in substring_tables.get(entity, {}).items():
        # No break on purpose: a later matching key overrides an earlier one.
        if key.lower() in lower_word:
            new = replacement

    return new if new else word

In [7]:
# Make into Excel for data prep
# Flatten the typed triplets into one row each, canonicalise head and tail
# names with custom_replacement(), and export the de-duplicated table.
filepath = 'data/unfiltered/all_triplets' # use the unfiltered data

with open(f'{filepath}.json', 'r') as file:
    triplets = json.load(file)

only_triplets = []
for triplet in triplets:
    head_word, head_type = triplet['head']['word'], triplet['head']['entity']
    tail_word, tail_type = triplet['tail']['word'], triplet['tail']['entity']

    # Drop triplets DistilBERT was not able to identify, i.e. those where
    # either side carries the 'Unknown' entity type.
    if 'Unknown' in (head_type, tail_type):
        continue

    only_triplets.append({
        'head': custom_replacement(head_word, head_type),
        'h_ent': head_type,
        'relation': triplet['relation'],
        'tail': custom_replacement(tail_word, tail_type),
        't_ent': tail_type
    })

# Canonicalisation can make previously distinct rows identical, so duplicates
# are removed only after the replacement step.
df = pd.json_normalize(only_triplets).drop_duplicates()
df.to_excel('data/final_triplets.xlsx', index=False)