In [2]:
from pathlib import Path

# Bucket directory holding the extracted wiki JSON files that this notebook reads.
path = Path('data/wiki_json/000')
# Stop early with a readable message when the input data is not where it is expected.
assert path.exists(), f'{path} does not exist'


In [19]:
import logging
# Root logger at INFO level with timestamp, level and message.
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
# Logger used by the helper functions defined in the following cells.
logger = logging.getLogger(__name__)

In [14]:
import os

# Connection settings come from the environment so credentials do not have to live in the
# notebook. The fallbacks are the previous hard-coded values for a local development instance.
driver = GraphDatabase.driver(
    os.environ.get("NEO4J_URI", "bolt://localhost:7687"),
    auth=(
        os.environ.get("NEO4J_USER", "neo4j"),
        os.environ.get("NEO4J_PASSWORD", "password"),
    ),
)

In [15]:
def run_cypher_query(driver: Driver, query: str) -> None:
    """
    Run a Cypher query on the provided Neo4j driver and wait for it to finish.

    The result is consumed inside the session so that an error raised while the
    server is still executing the statement reaches the caller. Closing a session
    with an unconsumed result discards it, and errors hit at that point may not
    propagate.

    :param driver: Neo4j driver instance.
    :param query: Cypher query string.
    :raises Exception: whatever the driver raises when the query fails.
    """
    with driver.session() as session:
        # consume() drains the stream and surfaces any server-side failure here.
        session.run(query).consume()

def clear_database(driver: Driver) -> None:
    """
    Clear all nodes and relationships in the Neo4j database.
    This may not work with too many nodes as Neo4j may run out of memory.

    The delete is consumed before success is logged, so a failure on the server
    (for example running out of memory) raises instead of being followed by a
    misleading success message.

    :param driver: Neo4j driver instance.
    :raises Exception: whatever the driver raises when the delete fails.
    """
    with driver.session() as session:
        # Wait for the statement to complete; errors surface here rather than being discarded.
        session.run("MATCH (n) DETACH DELETE n").consume()
    logger.info("All nodes and relationships deleted from the database.")

In [20]:
# Start from an empty graph: removes every node and relationship via the shared driver.
clear_database(driver)

2024-09-06 07:05:56,348 - INFO - All nodes and relationships deleted from the database.


In [22]:
def clear():
    """
    Empty the database and make sure the lookup indexes used by the imports exist.

    Uses the notebook-level ``driver``. Index creation is best effort: a failing
    statement is logged as a warning and the remaining statements still run.
    """
    clear_database(driver)

    # One statement per list entry. The property must be parenthesised after ON, and
    # the label must match the nodes the import cells create (:Category, not :CATEGORY).
    # IF NOT EXISTS keeps the call repeatable, because deleting nodes does not drop indexes.
    queries = [
        "CREATE INDEX IF NOT EXISTS FOR (a:Article) ON (a.title);",
        "CREATE INDEX IF NOT EXISTS FOR (a:Author) ON (a.id);",
        "CREATE INDEX IF NOT EXISTS FOR (s:Section) ON (s.id);",
        "CREATE INDEX IF NOT EXISTS FOR (c:Category) ON (c.title);",
    ]
    with driver.session() as session:
        for query in queries:
            try:
                # consume() makes a server-side failure raise here instead of being dropped.
                session.run(query).consume()
            except Exception as e:
                # Keep going, but leave a trace so a missing index is not a silent slowdown.
                logger.warning(f"Could not create index with {query!r}: {e}")

# Reset the graph and attempt to create the lookup indexes.
clear()

2024-09-06 07:06:13,740 - INFO - All nodes and relationships deleted from the database.


In [23]:
def import_data(driver: Driver, csv_dir: str, csv_files:list[Path], query_template: str) -> None:
    """
    Generic function to import data from CSV files using a Cypher query.

    Each file is imported on its own: a failure is logged and the remaining
    files are still processed.

    :param driver: Neo4j driver instance.
    :param csv_dir: Directory containing CSV files; only used to label the log
        message and the progress bar.
    :param csv_files: CSV files to import (any iterable of paths, e.g. the result
        of ``Path.rglob``).
    :param query_template: Cypher query template with a ``{csv_file_path}``
        placeholder for the file path.
    """
    # Materialise first: a generator is always truthy, so the emptiness check
    # below would never fire for the result of rglob().
    csv_files = list(csv_files)
    if not csv_files:
        logger.warning(f"No CSV files to import from {csv_dir}")
        return

    for csv_file in tqdm(csv_files, desc=f"Importing from {csv_dir}"):
        csv_file_path = csv_file.as_posix()
        query = query_template.format(csv_file_path=csv_file_path)
        try:
            run_cypher_query(driver, query)
            logger.info(f"Successfully imported {csv_file_path}")
        except Exception as e:
            logger.error(f"Error importing {csv_file_path}: {e}")

In [24]:
def get_authors(info):
    """
    Build (author_id, author_name) pairs from the 'authors' list of an 'info' dict.

    A missing id falls back to the name and a missing name falls back to the id.
    Entries that have neither are left out.

    :param info: JSON dictionary containing 'authors' data.
    :return: List of tuples (author_id, author_name).
    """
    pairs = []
    for entry in info["authors"]:
        ident = entry.get('id')
        name = entry.get('name')
        key = ident if ident else name
        if not key:
            # Neither an id nor a name: nothing to identify the author by.
            continue
        label = name if name else ident
        pairs.append((key, label))
    return pairs

def save_csv(objects, path, *args, **kwargs):
    """
    Write ``objects`` to ``path`` as CSV without the index column.

    :param objects: Data accepted by ``pd.DataFrame`` (e.g. a list of dicts or tuples).
    :param path: Destination file.
    :param args: Extra positional arguments forwarded to ``pd.DataFrame``.
    :param kwargs: Extra keyword arguments forwarded to ``pd.DataFrame`` (e.g. ``columns``).
    """
    frame = pd.DataFrame(objects, *args, **kwargs)
    frame.to_csv(path, index=False, escapechar='\\')

In [49]:

# Output root for the generated CSVs and the JSON bucket to read from.
csv_dir = Path('data/csv')
path = Path('data/wiki_json/000/')

# Node records.
articles = []
categories = []
sections = []
persons = set()  # a set, so each (id, name) pair is kept once
other_nodes = []  # not filled or written anywhere in this notebook

# Relationship records.
redirects = []
article_links = []
author_links = []
article_to_category = []
section_links = []
article_to_section_links = []

# Iterate through all JSON files in the directory
for file in path.rglob('*.json'):
    try:
        # JSON is UTF-8 by specification; do not depend on the platform default encoding.
        data = json.loads(file.read_text(encoding='utf-8'))
    except json.decoder.JSONDecodeError as e:
        logger.error(f"Error decoding JSON {file}: {e}")
        # Skip the broken file. Without this, `data` would still hold the previous
        # file's content (or be undefined on the first iteration).
        continue

    info = data["info"]

    # Special case for redirect pages: record source -> target and nothing else.
    if data["type"] == "redirect":
        # Drop a section anchor so the target is a plain page title.
        target = data["target"].split('#')[0]
        redirects.append({
            "from": data["info"]["title"],
            "to": target
        })
        continue

    if info["info"]["namespace"] == 14:
        # handle category
        categories.append({
            "id": info["info"]["id"],
            "title": 'Kategorie:' + info["title"],
            "type": data["type"],
            "namespace_id": info["info"]["namespace"],
            "namespace_name": info["namespace"]["name"],
            "namespace_type": info["namespace"]["type"],
            "parent_id": info["parent_id"],
            "timestamp": info["timestamp"],
            "sha1": info["sha1"],
            "path": info["bucket"] + '/' + info["file_name"]
        })
        continue

    # Collect the sections of the page and link each one back to its article.
    for section in data.get('sections', []):
        s = section['section']
        s['title'] = s['title'].strip()
        # Section key is "<article title>#<section title>", capped at 400 characters.
        s['id'] = f"{info['title']}#{s['title']}"[:400]

        sections.append(s)

        section_links.append({
            "from": s['id'],
            "to": info['title']
        })

    # Collect article data
    articles.append({
        "id": info["info"]["id"],
        "title": info["title"],
        "type": data["type"],
        "namespace_id": info["info"]["namespace"],
        "namespace_name": info["namespace"]["name"],
        "namespace_type": info["namespace"]["type"],
        "parent_id": info["parent_id"],
        "timestamp": info["timestamp"],
        "sha1": info["sha1"],
        "path": info["bucket"] + '/' + info["file_name"]
    })

    # Collect author data
    authors = get_authors(info)
    persons.update(authors)
    for author in authors:
        author_links.append({
            "article": info["title"],
            "person": author[0]
        })

    # Collect article links
    for target, _ in data["links"]:

        # Special case for section links
        if '#' in target:
            # Same 400-character cap as the section ids built above.
            article_to_section_links.append({
                "article": info["title"],
                "section": target[:400]
            })

            target = target.split('#')[0]
            if not target:
                # Anchor within the same page ("#Section"): no article-level link to record.
                continue

        article_links.append({
            "from": info["title"],
            "to": target
        })

    # Collect category memberships, using the same "Kategorie:" prefix as category titles.
    for category in data.get('categories', []):
        category_title = f'Kategorie:{category}'
        article_to_category.append({
            "article": info["title"],
            "category": category_title
        })


# Persist each collected record list as its own CSV under csv_dir/<bucket name>.
target = csv_dir / path.name
target.mkdir(parents=True, exist_ok=True)

save_csv(articles, target.joinpath('articles.csv'))
# persons holds (id, name) tuples, so the column names have to be supplied.
save_csv(list(persons), target.joinpath('persons.csv'), columns=["id", "name"])
save_csv(article_links, target.joinpath('article_links.csv'))
save_csv(author_links, target.joinpath('author_links.csv'))
save_csv(redirects, target.joinpath('redirects.csv'))
save_csv(categories, target.joinpath('categories.csv'))
save_csv(article_to_category, target.joinpath('article_to_category.csv'))
save_csv(sections, target.joinpath('sections.csv'))
save_csv(section_links, target.joinpath('section_links.csv'))
save_csv(article_to_section_links, target.joinpath('article_to_section_links.csv'))

In [72]:
# Rebinds `path` (previously the JSON bucket) to the CSV bucket searched by the import cells below.
# The matched paths are interpolated as-is into the file:/// URLs of the LOAD CSV templates.
# NOTE(review): presumably resolved relative to the Neo4j import directory — confirm the server setup.
path = Path('data/csv/000')

In [68]:
def run_cypher_query(driver: Driver, query: str) -> "neo4j.Result":
    """
    Run a Cypher query on the provided Neo4j driver and wait for it to finish.

    The result is consumed while the session is still open, so an error raised
    during execution reaches the caller instead of being discarded when the
    session closes.

    :param driver: Neo4j driver instance.
    :param query: Cypher query string.
    :return: The already consumed result object returned by ``session.run``.
        Its records are no longer available once this function returns.
    :raises Exception: whatever the driver raises when the query fails.
    """
    with driver.session() as session:
        result = session.run(query)
        # Drain the stream now; after the session is closed the result can no longer be read.
        result.consume()
        return result

def clear_database(driver: Driver) -> None:
    """
    Clear all nodes and relationships in the Neo4j database.
    This may not work with too many nodes as Neo4j may run out of memory.

    The delete is consumed before success is logged, so a failure on the server
    (for example running out of memory) raises instead of being followed by a
    misleading success message.

    :param driver: Neo4j driver instance.
    :raises Exception: whatever the driver raises when the delete fails.
    """
    with driver.session() as session:
        # Wait for the statement to complete; errors surface here rather than being discarded.
        session.run("MATCH (n) DETACH DELETE n").consume()
    logger.info("All nodes and relationships deleted from the database.")

def import_data(driver: Driver, csv_files: list[Path], query_template: str) -> "neo4j.Result | None":
    """
    Generic function to import data from CSV files using a Cypher query.

    Every file is processed. A failure for one file is logged and does not stop
    the remaining files from being imported.

    :param driver: Neo4j driver instance.
    :param csv_files: CSV files to import (any iterable of paths, e.g. the result
        of ``Path.rglob``).
    :param query_template: Cypher query template with a ``{csv_file_path}``
        placeholder for the file path.
    :return: Whatever ``run_cypher_query`` returned for the first file that was
        imported successfully (kept for backward compatibility), or ``None``
        when no file was imported.
    """
    results = []
    for csv_file in tqdm(list(csv_files)):
        csv_file_path = csv_file.as_posix()
        query = query_template.format(csv_file_path=csv_file_path)
        try:
            res = run_cypher_query(driver, query)
        except Exception as e:
            logger.error(f"Error importing {csv_file_path}: {e}")
            continue
        logger.info(f"Successfully imported {csv_file_path}")
        # Do not return here: returning inside the loop skipped every file after the first.
        results.append(res)
    return results[0] if results else None

In [39]:
# Reset the graph and attempt to create the lookup indexes before the imports below.
clear()

2024-09-06 08:20:43,625 - INFO - All nodes and relationships deleted from the database.


In [42]:
from tqdm.notebook import tqdm

In [43]:
# Create one :Article node per row of articles.csv, keyed by title.
# The properties are only written when MERGE creates the node (ON CREATE SET).
# Doubled braces are literal braces for str.format; {csv_file_path} is filled in by import_data.
# NOTE(review): the `id` and `type` columns written to articles.csv are not copied onto the node —
# confirm this is intended.
article_query_template = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MERGE (a:Article {{title: row.title}})
ON CREATE SET 
    a.title = row.title,
    a.namespace_id = row.namespace_id,
    a.namespace_name = row.namespace_name,
    a.namespace_type = row.namespace_type,
    a.parent_id = toInteger(row.parent_id),
    a.timestamp = row.timestamp,
    a.sha1 = row.sha1,
    a.path = row.path;
"""
import_data(driver, path.rglob('**/articles.csv'), article_query_template)

  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 08:21:16,229 - INFO - Successfully imported data/csv/000/articles.csv


In [45]:
# Create one :Author node per row of persons.csv, keyed by id; the name is set on creation only.
# persons.csv is written with the columns "id" and "name" by the extraction cell.
t = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MERGE (p:Author {{id: row.id}})
ON CREATE SET
    p.id = row.id,
    p.name = row.name;
"""
import_data(driver, path.rglob('**/persons.csv'), t)

  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 08:30:05,972 - INFO - Successfully imported data/csv/000/persons.csv


In [46]:
click.echo("Importing author links...")
# Connect existing authors to existing articles with an AUTHORED relationship.
# Both ends are MATCHed, so rows that reference a missing article or author add nothing.
author_links_query_template = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MATCH (article:Article {{title: row.article}})
MATCH (author:Author {{id: row.person}})
MERGE (author)-[:AUTHORED]->(article)
"""
import_data(driver, path.rglob('**/author_links.csv'), author_links_query_template)

Importing author links...


  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 08:30:31,832 - INFO - Successfully imported data/csv/000/author_links.csv


In [48]:
click.echo("Importing article links...")
# Connect articles that link to each other with a LINKS_TO relationship.
# Both ends are MATCHed, so links to pages that were not imported as :Article add nothing.
article_links_query_template = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MATCH (a:Article {{title: row.from}})
MATCH (b:Article {{title: row.to}})
MERGE (a)-[:LINKS_TO]->(b)
"""
import_data(driver, path.rglob('**/article_links.csv'), article_links_query_template)

Importing article links...


  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 08:43:48,282 - INFO - Successfully imported data/csv/000/article_links.csv


In [76]:
click.echo("Importing redirects...")
# Redirect pages are skipped before `articles` is filled in the extraction cell, so the
# source title of a redirect never exists as an :Article node. MATCHing it would import
# nothing, therefore the source node is MERGEd here.
# The target is MATCHed first so that a source node is only created when its target exists.
# NOTE(review): the server may report an Eager operator for this plan on large files.
article_links_query_template = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MATCH (b:Article {{title: row.to}})
MERGE (a:Article {{title: row.from}})
MERGE (a)-[:REDIRECTS_TO]->(b)
"""
import_data(driver, path.rglob('**/redirects.csv'), article_links_query_template);

Importing redirects...
[PosixPath('data/csv/000/redirects.csv')]


  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 09:10:51,810 - INFO - Received notification from DBMS server: {severity: INFORMATION} {code: Neo.ClientNotification.Statement.EagerOperator} {category: PERFORMANCE} {title: The execution plan for this query contains the Eager operator, which forces all dependent data to be materialized in main memory before proceeding} {description: Using LOAD CSV with a large data set in a query where the execution plan contains the Eager operator could potentially consume a lot of memory and is likely to not perform well. See the Neo4j Manual entry on the Eager operator for more information and hints on how problems could be avoided.} {position: None} for query: "\nLOAD CSV WITH HEADERS FROM 'file:///data/csv/000/redirects.csv' AS row\nMERGE (a:Article {title: row.from})\nMERGE (b:Article {title: row.to})\nMERGE (a)-[:REDIRECTS_TO]->(b)\n"
2024-09-06 09:10:51,811 - INFO - Successfully imported data/csv/000/redirects.csv


In [70]:
click.echo("Importing sections...")
# Create one :Section node per row of sections.csv, keyed by id. The extraction cell builds
# that id as "<article title>#<section title>", truncated to 400 characters.
# NOTE(review): assumes sections.csv has `level` and `idx` columns; that depends on the keys
# of the section dicts in the source JSON — confirm.
article_links_query_template = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MERGE (s:Section {{id: row.id}})
ON CREATE SET
    s.id = row.id,
    s.title = row.title,
    s.level = row.level,
    s.idx = row.idx;
"""
import_data(driver, path.rglob('**/sections.csv'), article_links_query_template);

Importing sections...


  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 09:01:58,024 - INFO - Successfully imported data/csv/000/sections.csv


In [78]:
click.echo("Importing categories...")
# Create one :Category node per row of categories.csv, keyed by id.
# The properties are only written when MERGE creates the node (ON CREATE SET).
# Unlike the article import, parent_id is stored as read from the CSV (no toInteger).
article_links_query_template = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MERGE (c:Category {{id: row.id}})
ON CREATE SET
    c.id = row.id,
    c.title = row.title,
    c.type = row.type,
    c.namespace_id = row.namespace_id,
    c.namespace_name = row.namespace_name,
    c.namespace_type = row.namespace_type,
    c.parent_id = row.parent_id,
    c.timestamp = row.timestamp,
    c.sha1 = row.sha1,
    c.path = row.path;
"""
import_data(driver, path.rglob('**/categories.csv'), article_links_query_template);

Importing categories...


  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 09:16:20,885 - INFO - Successfully imported data/csv/000/categories.csv


In [79]:
click.echo("Importing category/article relationships...")
# Categories are created with the label :Category by the category import, so the category
# end must be matched under that label. Matching it as :Article finds nothing.
# Both CSVs use the same "Kategorie:<name>" title format.
article_links_query_template = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MATCH (a:Article {{title: row.article}})
MATCH (c:Category {{title: row.category}})
MERGE (a)-[:IS_IN_CATEGORY]->(c)
"""
import_data(driver, path.rglob('**/article_to_category.csv'), article_links_query_template);

Importing category/article relationships...


  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 09:19:19,646 - INFO - Successfully imported data/csv/000/article_to_category.csv


In [83]:
click.echo("Importing article to section links...")
# Connect articles to the sections they link to with a LINKS_TO relationship.
# row.section is the raw link target truncated to 400 characters, the same cap used for
# Section ids in the extraction cell.
# NOTE(review): a link only matches when its text equals "<article title>#<section title>"
# exactly as the section id was built — confirm this holds for the source data.
article_links_query_template = """
LOAD CSV WITH HEADERS FROM 'file:///{csv_file_path}' AS row
MATCH (a:Article {{title: row.article}})
MATCH (s:Section {{id: row.section}})
MERGE (a)-[:LINKS_TO]->(s)
"""
import_data(driver, path.rglob('**/article_to_section_links.csv'), article_links_query_template);

Importing category/article relationships...


  0%|          | 0/1 [00:00<?, ?it/s]

2024-09-06 09:52:51,322 - INFO - Successfully imported data/csv/000/article_to_section_links.csv
