In [2]:
from pathlib import Path

# Bucket directory holding the parsed wiki JSON files explored below.
# Fail fast with a readable message if the data has not been generated yet.
path = Path('data/wiki_json/000')
assert path.exists(), f'{path} does not exist'


In [19]:
import logging
# Notebook-wide logging: INFO level, "<time> - <level> - <message>" lines.
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

In [3]:
# Pick one sample article (the fourth match) to explore interactively.
# NOTE(review): the matches are not sorted, so index 3 presumably selects a
# different file on another machine or filesystem - confirm before relying on it.
file = [*path.rglob('*.json')][3]
file

PosixPath('data/wiki_json/000/johann-friedrich-von-salm-grumbach.json')

In [4]:
import json

# Parse the sample article. The encoding is passed explicitly because the
# documents contain non-ASCII text and open() otherwise falls back to the
# platform default encoding.
with file.open(encoding='utf-8') as f:
    data = json.load(f)

In [5]:
# Show the whole parsed document to get a feel for its structure.
data

{'info': {'title': 'Johann Friedrich von Salm-Grumbach',
  'authors': [{'id': 178175, 'name': 'Invisigoth67'}],
  'bucket': '000',
  'file_name': 'johann-friedrich-von-salm-grumbach.wiki',
  'info': {'id': 12841755,
   'title': 'Johann Friedrich von Salm-Grumbach',
   'namespace': 0,
   'restrictions': []},
  'sha1': '6r2jugoc4e67bcef09vrc854ov5l0c8',
  'timestamp': '2024-05-26T14:14:44Z',
  'parent_id': 243041702,
  'namespace': {'name': '(Main/Article)', 'type': 'subject'}},
 'type': 'article',
 'title': 'Johann Friedrich von Salm-Grumbach',
 'sections': [{'section': {'idx': 0, 'title': 'Introduction', 'level': 1},
   'html': '<p><a href="/wiki/Datei:HUA-39185-Portret_van_Rijngraaf_Frederik_III_van_Salm_Salm_Grumbach_geboren_1745_bevelhebber_van_de_patriottische_vrijcorpsen_te_Utrecht_overleden_1794_Borstbee.jpg" title="HUA-39185-Portret van Rijngraaf Frederik III van Salm Salm Grumbach geboren 1745 bevelhebber van de patriottische vrijcorpsen te Utrecht overleden 1794 Borstbee.jpg">

In [6]:
# Inspect a single non-introduction section. As the output shows, raw section
# titles carry surrounding whitespace (' Kritik ') and `level` is a string here,
# whereas the introduction section stores it as an int.
data['sections'][6]

{'section': {'idx': 6, 'title': ' Kritik ', 'level': '2'},
 'html': '<h2> Kritik </h2>\n<p><a href="/wiki/Datei:HUA-32438-Afbeelding_van_een_stoet_genummerde_personen_op_weg_naar_een_kerkhof_Spotprent_op_de_Rijngraaf_van_Salm_en_zijn_verdediging_van_de_stad_Utrecht.jpg" title="HUA-32438-Afbeelding van een stoet genummerde personen op weg naar een kerkhof Spotprent op de Rijngraaf van Salm en zijn verdediging van de stad Utrecht.jpg">mini|Politische Karikatur die das Begräbnis des Rheingrafen Johann Friedrich von Salm-Grumbach metaphorisch als <a href="/wiki/Lachse" title="Lachse">Salm</a> 1787 dargestellt.</a>\n</p><p><a href="/wiki/Datei:Vers_op_de_vlucht_van_de_Rijngraaf_van_Salm%2C_1787_Op_de_Laaghartige_Vlucht_%28%29_van_den_Rhyngraaf_van_Salm%2C_opperbevelhebber_en_cheff_van_een_Corps_Troepen_%28..%29_October_1787_%28titel_op_object%29%2C_RP-P-OB-85.829.jpg" title="Vers op de vlucht van de Rijngraaf van Salm, 1787 Op de Laaghartige Vlucht () van den Rhyngraaf van Salm, opperbevelh

In [7]:
# `display` and `HTML` are public API of IPython.display; importing them from
# IPython.core.display is deprecated and emits a DeprecationWarning.
from IPython.display import display, HTML


# Render the section at index 2 of the sample article as rich HTML.
html = data['sections'][2]['html']

display(HTML(html))

  from IPython.core.display import display, HTML


In [8]:
# Render every section of the sample article, in document order.
for section_html in (sec['html'] for sec in data['sections']):
    display(HTML(section_html))

In [9]:
# List the top-level keys of the parsed document.
data.keys()

dict_keys(['info', 'type', 'title', 'sections', 'categories', 'links', 'non_section_links'])

In [11]:
# Category names as stored in the document; the output shows them without a
# namespace prefix.
data.get('categories')

['General (Niederlande)',
 'Söldnerführer',
 'Familienmitglied der Rheingrafen (Linie Salm-Grumbach)',
 'Träger des Weißen Adlerordens',
 'Deutscher',
 'Geboren 1743',
 'Gestorben 1819',
 'Mann']

In [13]:
import json
import logging
import multiprocessing as mp
import os
from pathlib import Path

import click
import pandas as pd 
from neo4j import GraphDatabase, Driver
from tqdm import tqdm
from tqdm.contrib.concurrent import process_map

In [14]:

# Connection settings come from the environment so real credentials never have
# to be committed with the notebook; the fallbacks are the previous local
# development values, so an unconfigured environment behaves as before.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USERNAME", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "password"),
    ),
)

In [15]:
def run_cypher_query(driver: Driver, query: str) -> None:
    """
    Run a Cypher query on the provided Neo4j driver and wait for it to finish.

    The query runs in its own session, which is closed before returning.

    :param driver: Neo4j driver instance.
    :param query: Cypher query string.
    :raises Exception: Whatever the driver raises if the query fails.
    """
    with driver.session() as session:
        # Consume the result explicitly: an error raised while the query is
        # still executing may otherwise be discarded when the session closes,
        # and the caller would wrongly treat the query as successful.
        session.run(query).consume()

def clear_database(driver: Driver) -> None:
    """
    Clear all nodes and relationships in the Neo4j database.
    This may not work with too many nodes as Neo4j may run out of memory.

    :param driver: Neo4j driver instance.
    :raises Exception: Whatever the driver raises if the delete fails; in that
        case the success message is not logged.
    """
    with driver.session() as session:
        # Consume the result so a failed delete surfaces here instead of being
        # discarded when the session closes.
        session.run("MATCH (n) DETACH DELETE n").consume()
    logger.info("All nodes and relationships deleted from the database.")

In [20]:
# Destructive: removes every node and relationship from the connected database.
clear_database(driver)

2024-09-06 07:05:56,348 - INFO - All nodes and relationships deleted from the database.


In [22]:
def clear():
    """
    Reset the database: delete all data, then (re)create the uniqueness
    constraints for articles and authors.

    Constraint creation is best-effort: a failing statement is logged as a
    warning and the remaining statements are still attempted.
    """
    clear_database(driver)

    # NOTE(review): `CREATE CONSTRAINT ON ... ASSERT` is the legacy syntax;
    # newer Neo4j servers expect `CREATE CONSTRAINT ... FOR ... REQUIRE`.
    # Confirm against the server version in use.
    queries = [
        "CREATE CONSTRAINT ON (a:Article) ASSERT a.title IS UNIQUE;",
        "CREATE CONSTRAINT ON (a:Author) ASSERT a.id IS UNIQUE;",
    ]
    with driver.session() as session:
        for query in queries:
            try:
                # Consume so that a failure is raised here, inside the try.
                session.run(query).consume()
            except Exception as e:
                # Presumably meant to tolerate already-existing constraints;
                # report the failure instead of hiding it completely.
                logger.warning(f"Could not apply constraint ({query}): {e}")

clear()

2024-09-06 07:06:13,740 - INFO - All nodes and relationships deleted from the database.


In [23]:
def import_data(driver: Driver, csv_dir: str, csv_files:list[Path], query_template: str) -> None:
    """
    Generic function to import data from CSV files using a Cypher query.

    Each file is imported independently: a failure is logged and the remaining
    files are still processed.

    :param driver: Neo4j driver instance.
    :param csv_dir: Directory the CSV files come from; only used to label the
        progress bar and the "no files" warning.
    :param csv_files: CSV files to import, one query execution per file.
    :param query_template: Cypher query template. ``{csv_file_path}`` is
        replaced with the POSIX-style path of each file via ``str.format``,
        so literal braces in the Cypher text must be doubled.
    """
    if not csv_files:
        logger.warning(f"No files found in: {csv_dir}")
        return

    for csv_file in tqdm(csv_files, desc=f"Importing {csv_dir}"):
        csv_file_path = csv_file.as_posix()
        query = query_template.format(csv_file_path=csv_file_path)
        try:
            run_cypher_query(driver, query)
            logger.info(f"Successfully imported {csv_file_path}")
        except Exception as e:
            logger.error(f"Error importing {csv_file_path}: {e}")

In [24]:
def get_authors(info):
    """
    Collect (id, name) pairs for the authors listed in an 'info' dictionary.

    A missing or falsy id falls back to the name and vice versa; authors with
    neither are left out.

    :param info: JSON dictionary containing 'authors' data.
    :return: List of tuples (author_id, author_name).
    """
    candidates = (
        (entry.get('id') or entry.get('name'), entry.get('name') or entry.get('id'))
        for entry in info["authors"]
    )
    return [(ident, label) for ident, label in candidates if ident]

def save_csv(objects, path, *args, **kwargs):
    """
    Write records to a CSV file without the index column.

    :param objects: Data accepted by the ``pd.DataFrame`` constructor.
    :param path: Destination of the CSV file.
    :param args: Extra positional arguments for ``pd.DataFrame``.
    :param kwargs: Extra keyword arguments for ``pd.DataFrame`` (e.g. ``columns``).
    """
    frame = pd.DataFrame(objects, *args, **kwargs)
    frame.to_csv(path, index=False, escapechar='\\')

In [32]:

# Convert one bucket of parsed wiki JSON files into flat node/relationship
# tables and write them as CSV files to csv_dir/<bucket name>/.
path = Path('data/wiki_json/000/')
csv_dir = Path('data/csv')

articles = []
persons = set()
article_links = []
author_links = []
redirects = []
categories = []
article_to_category = []
other_nodes = []  # not filled in this cell
sections = []
section_links = []
article_to_section_links = []

# Iterate through all JSON files in the directory
for file in path.rglob('*.json'):
    try:
        # Explicit encoding so the result does not depend on the platform default.
        data = json.loads(file.read_text(encoding='utf-8'))
    except json.decoder.JSONDecodeError as e:
        logger.error(f"Error decoding JSON {file}: {e}")
        # Skip the broken file; falling through would re-process whatever
        # `data` still holds from the previous iteration and duplicate its rows.
        continue

    info = data["info"]

    # Special case for redirect pages: only the target page is kept, any
    # '#section' suffix is dropped.
    if data["type"] == "redirect":
        redirects.append({
            "from": info["title"],
            "to": data["target"].split('#')[0]
        })
        continue

    if info["info"]["namespace"] == 14:
        # Category page: stored separately, with the 'Kategorie:' prefix that
        # the article-to-category links below also use.
        categories.append({
            "id": info["info"]["id"],
            "title": 'Kategorie:' + info["title"],
            "type": data["type"],
            "namespace_id": info["info"]["namespace"],
            "namespace_name": info["namespace"]["name"],
            "namespace_type": info["namespace"]["type"],
            "parent_id": info["parent_id"],
            "timestamp": info["timestamp"],
            "sha1": info["sha1"],
            "path": info["bucket"] + '/' + info["file_name"]
        })
        continue

    # Collect section data; a section is identified as '<article title>#<section title>'.
    for section in data.get('sections', []):
        # Work on a copy so the parsed JSON document itself is not modified.
        s = dict(section['section'])
        s['title'] = s['title'].strip()
        s['id'] = f"{info['title']}#{s['title']}"

        sections.append(s)

        section_links.append({
            "from": s['id'],
            "to": info['title']
        })

    # Collect article data
    articles.append({
        "id": info["info"]["id"],
        "title": info["title"],
        "type": data["type"],
        "namespace_id": info["info"]["namespace"],
        "namespace_name": info["namespace"]["name"],
        "namespace_type": info["namespace"]["type"],
        "parent_id": info["parent_id"],
        "timestamp": info["timestamp"],
        "sha1": info["sha1"],
        "path": info["bucket"] + '/' + info["file_name"]
    })

    # Collect author data
    authors = get_authors(info)
    persons.update(authors)
    for author in authors:
        author_links.append({
            "article": info["title"],
            "person": author[0]
        })

    # Collect article links
    for link_target, _ in data["links"]:

        # Special case for section links
        if '#' in link_target:
            # NOTE(review): a same-page link such as '#Kritik' is stored as-is
            # and therefore cannot match the '<article title>#<section title>'
            # ids built above - confirm whether it should be prefixed with the
            # article title.
            article_to_section_links.append({
                "article": info["title"],
                "section": link_target
            })

            link_target = link_target.split('#')[0]
            if not link_target:
                # Pure same-page link: there is no other article to link to.
                continue

        article_links.append({
            "from": info["title"],
            "to": link_target
        })

    for category in data.get('categories', []):
        category_title = f'Kategorie:{category}'
        article_to_category.append({
            "article": info["title"],
            "category": category_title
        })


# Save the collected data as CSV files
target = csv_dir / path.name
target.mkdir(exist_ok=True, parents=True)

save_csv(articles, target / 'articles.csv')
save_csv(list(persons), target / 'persons.csv', columns=["id", "name"])
save_csv(article_links, target / 'article_links.csv')
save_csv(author_links, target / 'author_links.csv')
save_csv(redirects, target / 'redirects.csv')
save_csv(categories, target / 'categories.csv')
save_csv(article_to_category, target / 'article_to_category.csv')
save_csv(sections, target / 'sections.csv')
save_csv(section_links, target / 'section_links.csv')
save_csv(article_to_section_links, target / 'article_to_section_links.csv')