## Prepare Data for Knowledge Graph Creation

In [13]:
import pandas as pd
import json
import re
import os
import Levenshtein

In [20]:
# Load the scraped articles into a DataFrame.
# Alternative input file:
#filepath = '../scraping/articles/filtered_articles.json'
filepath = '../scraping/articles/all_articles.json'
# pd.read_json opens and closes the path on its own, so no surrounding
# open() is needed (the previous handle was opened but never read from).
articles = pd.read_json(filepath)

In [22]:
# Names used for replacement.
# Key   -> canonical organisation name. The keys are also turned into folder
#          names (lower-cased, spaces -> underscores) and used as the seeds of
#          the propagation step, so they must not be renamed casually.
# Value -> aliases whose mentions are counted in the article text. Aliases
#          written fully in upper case are matched case-sensitively by the
#          frequency cell; all others are matched case-insensitively.
name_dict = {
    'ARC': ['ARC'],
    'Babcock and Wilcox': ['Babcock'],
    'Berkeley': ['Berkeley'],
    'BWX': ['BWX'],
    'Elysium': ['Elysium'],
    'Flibe': ['Flibe'],
    'Framatome': ['Framatome'],
    'GE Hitachi': ['Hitachi', 'GEH'],
    'General Atomics': ['General Atomics'],
    'HolosGen': ['HolosGen', 'Holos'],
    'Holtec International': ['Holtec'],
    'Hyperion Power': ['Hyperion'],
    'Kairos Power': ['Kairos'],
    'Moltex Energy': ['Moltex'],
    'NANO Nuclear': ['NANO', 'NNE'],
    'NuScale': ['NuScale'],
    'Oak Ridge National Laboratory': ['Oak Ridge National Laboratory', 'ORNL'],
    'Oklo': ['Oklo'],
    'StarCore Nuclear': ['StarCore'],
    'TerraPower': ['TerraPower'],
    # NOTE(review): the key keeps its historical (misspelled) form because it
    # is baked into folder names; the correctly spelled alias is added so that
    # mentions of "Terrestrial ..." are actually counted.
    'Terrestial': ['Terrestial', 'Terrestrial'],
    'ThorCon': ['ThorCon'],
    'Ultra Safe Nuclear Corporation': ['Ultra Safe Nuclear Corporation', 'USNC'],
    'Westinghouse': ['Westinghouse', 'WEC'],
    'X-Energy': ['X-energy']
}

### Get frequency of names in articles (Optional)

In [None]:
# Get frequency of each start up name in each article

def _mention_pattern(alias):
    """Compile the mention-counting regex for one alias.

    Returns a ``(pattern, fold_case)`` pair. Aliases written fully in upper
    case are treated as acronyms and matched case-sensitively against the raw
    text (``fold_case`` is False). Every other alias is lower-cased and must be
    matched against the lower-cased text (``fold_case`` is True).

    The regex accepts the alias as a whole word, wrapped in parentheses, or as
    the start of a longer token made of word characters and hyphens.
    """
    fold_case = not alias.isupper()
    escaped = re.escape(alias.lower()) if fold_case else re.escape(alias)
    pattern = re.compile(rf'\b{escaped}\b|\({escaped}\)|\b{escaped}[-\w]*\b')
    return pattern, fold_case


# The patterns depend only on name_dict, so build them once instead of once
# per article and alias.
mention_patterns = {
    key: [_mention_pattern(alias) for alias in aliases]
    for key, aliases in name_dict.items()
}

articles_with_frequency = []

for _, article in articles.iterrows():
    sents = article['text']
    text = ' '.join(sents)  # 'text' is joined as a sequence of strings
    text_lower = text.lower()

    new_article = {
        'url': article['url'],
        'title': article['title'],
        'text': sents,
    }

    # One extra column per canonical name: total hits over all of its aliases.
    for key, patterns in mention_patterns.items():
        new_article[key] = sum(
            len(pattern.findall(text_lower if fold_case else text))
            for pattern, fold_case in patterns
        )

    articles_with_frequency.append(new_article)

filepath = 'data/articles_with_frequency.json'
os.makedirs(os.path.dirname(filepath), exist_ok=True)
# ensure_ascii=False emits raw non-ASCII characters, so the handle must be
# UTF-8; the platform default encoding may be unable to encode them.
with open(filepath, 'w', encoding='utf-8') as file:
    json.dump(articles_with_frequency, file, indent=4, ensure_ascii=False)

### Choose base directory

In [23]:
# Folder that the later cells read from and write to; exactly one is active.
#base_directory = 'data/triplets_with_frequency'
#base_directory = 'data/triplets_no_cutoff'
base_directory = 'data/triplets_all_time'


def _read_stripped_lines(path):
    """Return the lines of the text file at `path`, each stripped of surrounding whitespace."""
    with open(path, 'r') as handle:
        return [raw_line.strip() for raw_line in handle]


# Name lists used further down: the propagation step does not expand through
# any entity that appears in `stopwords`.
countries = _read_stripped_lines('stopwords/countries.txt')
capitals = _read_stripped_lines('stopwords/cities.txt')
startups = _read_stripped_lines('stopwords/startups.txt')
states = _read_stripped_lines('stopwords/states.txt')

stopwords = countries + capitals + startups + states

### Filter articles (optional)

In [8]:
SPLIT_FACTOR = 10 # Determines cutoff: 1 -> top 100%; 2 -> top 50%; 10 -> top 10%
base_directory = 'data/triplets_with_frequency'

#SPLIT_FACTOR = 1
#base_directory = 'data/triplets_no_cutoff' # for SP

with open('data/enriched_triplets.json', 'r') as file:
    all_triplets = json.load(file)

# Index the enriched triplets by URL once, so each article is a dictionary
# lookup instead of a full scan of all_triplets. Entries that share a URL keep
# their original relative order.
triplets_by_url = {}
for instance in all_triplets:
    triplets_by_url.setdefault(instance['url'], []).append(instance)

for name in name_dict:

    slug = name.lower().replace(' ', '_')
    # Kept under its own name so the `articles` frame loaded at the top of the
    # notebook is not overwritten by this optional cell.
    ranked_articles = pd.read_json(f"{base_directory}/{slug}/{slug}.json")

    if ranked_articles.empty:
        continue

    # Sort by frequency and keep the most frequent share.
    # NOTE(review): `cutoff + 1` keeps floor(len / SPLIT_FACTOR) + 1 rows, i.e.
    # always at least one article and one more than the nominal share; confirm
    # that this is the intended cutoff.
    ranked_articles = ranked_articles.sort_values(by=['frequency'], ascending=False)
    cutoff = len(ranked_articles) // SPLIT_FACTOR
    ranked_articles = ranked_articles.iloc[:cutoff+1]

    # Next, get the triplets of every retained article
    triplets = []
    for target_url, frequency in zip(ranked_articles['url'], ranked_articles['frequency']):
        freq = int(frequency)
        for instance in triplets_by_url.get(target_url, []):
            triplets.append({
                'url': instance['url'],
                'triplets': instance['enriched_triplets'],
                'target': name,
                'frequency': freq,
            })

    print(f'{name}: Triplets retrieved from {len(triplets)} articles')

    filepath = f"{base_directory}/{slug}/{slug}_triplets.json"
    os.makedirs(os.path.dirname(filepath), exist_ok=True)

    with open(filepath, 'w') as file:
        json.dump(triplets, file, indent=4, ensure_ascii=False)

ARC: Triplets retrieved from 4 articles
Babcock and Wilcox: Triplets retrieved from 2 articles
Berkeley: Triplets retrieved from 3 articles
BWX: Triplets retrieved from 6 articles
Elysium: Triplets retrieved from 1 articles
Flibe: Triplets retrieved from 2 articles
Framatome: Triplets retrieved from 17 articles
GE Hitachi: Triplets retrieved from 21 articles
General Atomics: Triplets retrieved from 3 articles
HolosGen: Triplets retrieved from 1 articles
Holtec International: Triplets retrieved from 12 articles
Kairos Power: Triplets retrieved from 2 articles
Moltex Energy: Triplets retrieved from 4 articles
NANO Nuclear: Triplets retrieved from 1 articles
NuScale: Triplets retrieved from 19 articles
Oak Ridge National Laboratory: Triplets retrieved from 7 articles
Oklo: Triplets retrieved from 3 articles
TerraPower: Triplets retrieved from 9 articles
ThorCon: Triplets retrieved from 1 articles
Ultra Safe Nuclear Corporation: Triplets retrieved from 5 articles
Westinghouse: Triplets ret

In [8]:
# Preprocess enriched triplets (for case of no filtering) to make next step possible
if base_directory == 'data/triplets_all_time':
    with open('data/enriched_triplets.json', 'r') as file:
        all_triplets = json.load(file)

    # Flatten the per-article lists into one list of triplets.
    triplets = [
        triplet
        for instance in all_triplets
        for triplet in instance['enriched_triplets']
    ]

    filepath = f"{base_directory}/all_triplets.json"
    # Create the output folder first so the write also works when the folder
    # does not exist yet (the other writer cells do the same).
    os.makedirs(base_directory, exist_ok=True)
    with open(filepath, 'w') as file:
        json.dump(triplets, file, indent=4, ensure_ascii=False)

### Script for Preprocessing Triplets

In [9]:
# Whole-name replacements: applied when the complete (case-folded) entity name
# equals the key, regardless of entity type.
exact_replacements = {
    # Demonyms
    'Russian': 'Russia', 'Chinese': 'China',
    'American': 'United States', 'British': 'United Kingdom', 'Polish': 'Poland',
    'Swedish': 'Sweden', 'French': 'France', 'South Korean': 'South Korea',
    # 'Ukranian' (misspelled) is kept for inputs that contain it.
    'Dutch': 'Netherlands', 'Ukranian': 'Ukraine', 'Ukrainian': 'Ukraine',

    # Countries etc.
    'UK': 'United Kingdom', 'US': 'United States', 'UAE': 'United Arab Emirates',
    'Russian Federation': 'Russia', 'Czech': 'Czech Republic', 'Netherlands': 'Netherlands',
    'Korea': 'South Korea',
    'EU': 'European Union', 'UN': 'United Nations',

    # Abbreviations
    'WEC': 'Westinghouse', 'USNC': 'Ultra Safe Nuclear Corporation', 'GEH': 'GE Hitachi', 'GA': 'General Atomics',
    'GFP': 'Global First Power', 'ORNL': 'Oak Ridge National Laboratory',
    'IAEA': 'International Atomic Energy Agency'
}

org_replacements = { # Replace if entity type == ORG (key found at a word start)
    'ARC': 'ARC',
    'Babcock': 'Babcock and Wilcox',
    'BWX': 'BWX',
    'Elysium': 'Elysium',
    'Flibe': 'Flibe',
    'Framatome': 'Framatome',
    'Hitachi': 'GE Hitachi',
    'General Atomics': 'General Atomics',
    'Holos': 'HolosGen',
    'Holtec': 'Holtec International',
    'Hyperion': 'Hyperion Power',
    'Kairos': 'Kairos Power',
    'Moltex': 'Moltex Energy',
    'NANO': 'NANO Nuclear',
    'NuScale': 'NuScale',
    'Oak Ridge': 'Oak Ridge National Laboratory',
    'Oklo': 'Oklo',
    'StarCore': 'StarCore Nuclear',
    'TerraPower': 'TerraPower',
    'Terrestial': 'Terrestial',
    # NOTE(review): correctly spelled names are mapped onto the existing
    # (misspelled) canonical label so they line up with the name_dict key.
    'Terrestrial': 'Terrestial',
    'ThorCon': 'ThorCon',
    'Ultra Safe Nuclear': 'Ultra Safe Nuclear Corporation',
    'Berkeley': 'Berkeley',
    'Westinghouse': 'Westinghouse',
    'X-Energy': 'X-Energy',

    'Point Lepreau': 'Point Lepreau NPP',
    'China General Nuclear': 'China General Nuclear Power Corporation',
    'Clinch River': 'Clinch River Site',
    'East Tennessee Technology': 'East Tennessee Technology Park',
    'Fukushima': 'Fukushima',
    'Jacobs UK': 'Jacobs',
    'Korea Electric Power': 'Korea Electric Power Company',
    'Magnox': 'Magnox',
    'ORLEN Synthos': 'ORLEN Synthos Green Energy',
    'Rolls Royce': 'Rolls-Royce',
    'Sizewell': 'Sizewell C',
    # NOTE(review): for ORG entities this overrides the exact 'WEC' ->
    # 'Westinghouse' mapping above, because typed replacements are applied
    # last; confirm which of the two is intended.
    'WEC': 'WEC Group',
    'Temelin': 'Temelin NPP',
    'TransAlta': 'TransAlta Corporation'
}

per_replacements = { # Replace if entity type == PER (key found at a word start)
    'Grossi': 'Rafael Grossi'
}

loc_replacements = { # Replace if entity type == LOC (key found at a word start)
    'Point Lepreau': 'Point Lepreau NPP'
}

# Character-level clean-up applied to every name before any lookup. The
# entries are applied in order, so '.' has already become ' ' by the time the
# ' “' entry runs.
letter_replacements = {
    'à': 'a', 'À': 'A',
    'á': 'a', 'Á': 'A',
    'ä': 'a', 'Ä': 'A',
    'ã': 'a', 'Ã': 'A',
    'å': 'a', 'Å': 'A',
    'ą': 'a', 'Ą': 'A',
    'ç': 'c', 'Ç': 'C',
    'č': 'c', 'Č': 'C',
    'ď': 'd', 'Ď': 'D',
    'é': 'e', 'É': 'E',
    'è': 'e', 'È': 'E',
    'ě': 'e', 'Ě': 'E',
    'ğ': 'g', 'Ğ': 'G',
    'í': 'i', 'Í': 'I',
    'ï': 'i', 'Ï': 'I',
    'ı': 'i', 'İ': 'I',
    'ł': 'l', 'Ł': 'L',
    'ñ': 'n', 'Ñ': 'N',
    'ň': 'n', 'Ň': 'N',
    'ń': 'n', 'Ń': 'N',
    'ó': 'o', 'Ó': 'O',
    'ö': 'o', 'Ö': 'O',
    'ô': 'o', 'Ô': 'O',
    'õ': 'o', 'Õ': 'O',
    'ø': 'o', 'Ø': 'O',
    'ř': 'r', 'Ř': 'R',
    'š': 's', 'Š': 'S',
    'ť': 't', 'Ť': 'T',
    'ú': 'u', 'Ú': 'U',
    'ü': 'u', 'Ü': 'U',
    'û': 'u', 'Û': 'U',
    'ù': 'u', 'Ù': 'U',
    'ý': 'y', 'Ý': 'Y',
    'ž': 'z', 'Ž': 'Z',
    '.': ' ', ' “': '',
    '&': 'and',
}


def _key_matches(key, word, lower_word):
    """Tell whether replacement key `key` occurs in an entity name.

    The key has to start at a word boundary. A plain substring test is not
    enough: 'arc' is contained in 'research', 'nano' in 'nanotechnology', and
    so on. Keys written fully in upper case are treated as acronyms and
    compared case-sensitively against `word`; all other keys are compared
    case-insensitively against `lower_word`. This mirrors how the frequency
    cell matches aliases.
    """
    if key.isupper():
        return re.search(rf'\b{re.escape(key)}', word) is not None
    return re.search(rf'\b{re.escape(key.lower())}', lower_word) is not None


def custom_replacement(word, entity=None):
    """Normalise one entity name.

    Steps, in order:
      1. apply `letter_replacements` character by character;
      2. collapse runs of whitespace and trim the ends (step 1 turns '.' into
         ' ', which otherwise leaves double and trailing blanks behind);
      3. look the whole name up, case-insensitively, in `exact_replacements`;
      4. for entity type 'ORG', 'PER' or 'LOC', scan the matching typed table;
         a hit here overrides step 3, and the last matching key wins.

    Args:
        word: entity name as extracted.
        entity: entity type label; only 'ORG', 'PER' and 'LOC' have a typed table.

    Returns:
        The replacement if any rule matched, otherwise the cleaned-up name.
    """
    for letter, replacement in letter_replacements.items():
        word = word.replace(letter, replacement)
    word = ' '.join(word.split())
    lower_word = word.lower()

    new = None
    for key, replacement in exact_replacements.items():
        if lower_word == key.lower():
            new = replacement

    typed_replacements = {
        'ORG': org_replacements,
        'PER': per_replacements,
        'LOC': loc_replacements,
    }.get(entity, {})
    for key, replacement in typed_replacements.items():
        if _key_matches(key, word, lower_word):
            new = replacement

    return new if new else word

In [10]:
# Make into Excel for data prep

def _triplets_to_frame(raw_triplets):
    """Turn raw triplet dicts into a de-duplicated DataFrame.

    Each input item is read as ``{'head': {'word', 'entity'}, 'relation',
    'tail': {'word', 'entity'}}``. Head and tail names are normalised with
    `custom_replacement`. A triplet is dropped when either entity type is
    'Unknown', or when head and tail end up identical in both name and type
    after normalisation.

    Returns:
        DataFrame with columns head, h_ent, relation, tail, t_ent and no
        duplicate rows (no columns at all when nothing is kept).
    """
    rows = []
    for triplet in raw_triplets:
        h_ent = triplet['head']['entity']
        t_ent = triplet['tail']['entity']

        # Drop triplets DistilBERT was not able to identify
        if h_ent == 'Unknown' or t_ent == 'Unknown':
            continue

        head = custom_replacement(triplet['head']['word'], h_ent)
        tail = custom_replacement(triplet['tail']['word'], t_ent)

        # Drop circular relations
        if head == tail and h_ent == t_ent:
            continue

        rows.append({
            'head': head,
            'h_ent': h_ent,
            'relation': triplet['relation'],
            'tail': tail,
            't_ent': t_ent
        })

    return pd.json_normalize(rows).drop_duplicates()


if base_directory == 'data/triplets_all_time':
    filepath = base_directory + '/all_triplets' # use the unfiltered data

    with open(filepath + '.json', 'r') as file:
        triplets = json.load(file)

    # Uses the same cleaning as the per-company branch below; previously this
    # branch alone kept circular relations.
    df = _triplets_to_frame(triplets)
    df.to_excel(filepath + '_no_dupes.xlsx', index=False)

else:
    for name in name_dict:

        slug = name.lower().replace(' ', '_')
        filepath = f"{base_directory}/{slug}/{slug}_triplets"

        # Names without a triplet file are skipped.
        try:
            with open(filepath + '.json', 'r') as file:
                data = json.load(file)
        except FileNotFoundError:
            continue

        df = _triplets_to_frame(
            triplet for instance in data for triplet in instance['triplets']
        )

        print(f"{name}: {len(df)} triplets")
        df.to_excel(filepath + '_no_dupes.xlsx', index=False)

### Propagation Algorithm

In [11]:
# Get connected triplets
def _expand_entity(entity_name, graph, triplets, visited, threshold, blocked):
    """Scan all triplets for one entity and yield the neighbours to visit next.

    Written as a generator so that `propagate` can drive the depth-first walk
    with an explicit stack: execution pauses at each ``yield``, the yielded
    neighbour is expanded completely, and the scan then resumes at the same
    triplet. Matching triplets are added to `graph` and the entity is added to
    `visited` as side effects.
    """
    if entity_name in visited:
        return
    visited.add(entity_name)

    entity_lower = entity_name.lower()  # loop-invariant

    for triplet in triplets:
        head = triplet[0]
        tail = triplet[3]

        tail_sim = Levenshtein.ratio(tail.lower(), entity_lower)
        head_sim = Levenshtein.ratio(head.lower(), entity_lower)

        triplet_tuple = (head, triplet[1], tail, triplet[4], triplet[2])

        # Propagate inward: tail->head
        if tail_sim > threshold and triplet_tuple not in graph:
            graph.add(triplet_tuple)
            if head not in blocked:
                print(f'Propagating inward from {entity_name} to {head} through {tail} (similarity: {tail_sim:.2f})')
                yield head

        # Propagate outward: head->tail
        if head_sim > threshold and triplet_tuple not in graph:
            graph.add(triplet_tuple)
            if tail not in blocked:
                print(f'Propagating outward from {entity_name} to {tail} through {head} (similarity: {head_sim:.2f})')
                yield tail


def propagate(entity_name, graph, triplets, visited, threshold=0.9, stop_entities=None):
    """Collect every triplet reachable from `entity_name`.

    A triplet is attached to an entity when its head or tail has a
    Levenshtein ratio above `threshold` (compared case-insensitively) with the
    entity name. The walk then continues from the other end of the triplet,
    unless that end is a stop entity. Each entity name is expanded at most
    once.

    The walk is depth-first and visits entities in the same order as a
    recursive implementation would, but it keeps its own stack of generators,
    so long chains of entities cannot hit Python's recursion limit.

    Args:
        entity_name: name to start from.
        graph: set that receives the collected tuples; mutated in place.
        triplets: sequence of rows indexed as [0]=head, [3]=tail; positions 1,
            4 and 2 are copied into the output tuple in that order
            (presumably h_ent, t_ent and relation, following the column order
            of the Excel export; confirm against the input file).
        visited: set of entity names already expanded; mutated in place.
        threshold: similarity that must be exceeded (strictly) to match.
        stop_entities: names that are recorded but never expanded. Defaults to
            the notebook-level `stopwords` list.

    Returns:
        The same `graph` set that was passed in.
    """
    # A set makes the "is this a stop entity" test constant-time.
    blocked = set(stopwords if stop_entities is None else stop_entities)

    pending = [_expand_entity(entity_name, graph, triplets, visited, threshold, blocked)]
    while pending:
        try:
            neighbour = next(pending[-1])
        except StopIteration:
            pending.pop()
        else:
            pending.append(_expand_entity(neighbour, graph, triplets, visited, threshold, blocked))

    return graph

In [12]:
# Input workbook for the propagation step.
# NOTE(review): no cell visible in this notebook writes this file; it is
# presumably the cleaned export with columns head, h_ent, relation, tail,
# t_ent (confirm).
df = pd.read_excel('data/final_triplets.xlsx')

# De-duplicate rows while keeping their order. Going through a set() made the
# row order, and with it the traversal and the printed log, differ between
# kernel sessions.
triplets = list(dict.fromkeys(df.itertuples(index=False, name=None)))

results = {}
for name in name_dict:
    print(f"Now propagating for {name}...")
    graph = propagate(name, set(), triplets, set(), threshold=0.95)
    # Sorted on the string form so the saved file is reproducible even if a
    # cell holds a non-string value.
    results[name] = sorted(graph, key=lambda row: tuple(map(str, row)))

filepath = f'{base_directory}/graphs.json'
print(f'Saving results to {filepath}...')
os.makedirs(base_directory, exist_ok=True)
# ensure_ascii=False writes raw non-ASCII characters, so force UTF-8.
with open(filepath, 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=4, ensure_ascii=False)

Now propagating for ARC...
Propagating inward from ARC to Charlotte through ARC (similarity: 1.00)
Propagating outward from Charlotte to Belews Creek through Charlotte (similarity: 1.00)
Propagating inward from Belews Creek to Roxboro through Belews Creek (similarity: 1.00)
Propagating inward from ARC to Roman Estrada through ARC (similarity: 1.00)
Propagating outward from Roman Estrada to NPPD through Roman Estrada (similarity: 1.00)
Propagating inward from ARC to Andrey Baklitsky through ARC (similarity: 1.00)
Propagating inward from ARC to Cleveland through ARC (similarity: 1.00)
Propagating outward from Cleveland to Glenn Research Centre through Cleveland (similarity: 1.00)
Propagating inward from ARC to Bogdan Neculaes through ARC (similarity: 1.00)
Propagating inward from ARC to Craig Stover through ARC (similarity: 1.00)
Propagating inward from ARC to Lukasz Gadowski through ARC (similarity: 1.00)
Propagating inward from ARC to Petten through ARC (similarity: 1.00)
Propagating o

In [8]:
# For UNFILTERED sub selection of articles
results = {}
loaded_triplets = {}  # filepath -> list of rows, or None when the file is missing

for name in name_dict:

    #filepath = f"{base_directory}/{name.lower().replace(' ', '_')}/{name.lower().replace(' ','_')}_triplets_no_dupes.xlsx"
    filepath = 'data/all_triplets_no_dupes.xlsx' # For unfiltered version of the code

    # Read each workbook only once; with the shared path above every name
    # uses the same file.
    if filepath not in loaded_triplets:
        try:
            df = pd.read_excel(filepath)
        except FileNotFoundError:
            print(f'{filepath} not found, skipping')
            loaded_triplets[filepath] = None
        else:
            # Order-preserving de-duplication keeps runs reproducible.
            loaded_triplets[filepath] = list(dict.fromkeys(df.itertuples(index=False, name=None)))

    triplets = loaded_triplets[filepath]
    if triplets is None:
        continue

    # Start from an empty graph for every name. Previously an empty triplet
    # list left `graph` holding the result of an earlier name or cell.
    graph = set()
    if len(triplets) > 0:
        graph = propagate(name, set(), triplets, set(), threshold=0.9)
    results[name] = sorted(graph, key=lambda row: tuple(map(str, row)))

filepath = f'{base_directory}/graphs.json'
print(f'Saving results to {filepath}...')
os.makedirs(base_directory, exist_ok=True)
# ensure_ascii=False writes raw non-ASCII characters, so force UTF-8.
with open(filepath, 'w', encoding='utf-8') as file:
    json.dump(results, file, indent=4, ensure_ascii=False)

print('\n')

print('\n')

Saving results to data/unfiltered/graphs.json...


