# KB population

Adding labeled entities from third-party datasets

In [2]:
%cd ..

/home/max/process/dist/app_noisemon


In [3]:
import json
from pathlib import Path

import pandas as pd
from tqdm import tqdm

In [4]:
from noisemon.database.database import SessionLocal
from noisemon.models.entity import EntityModel, get_insert_many_statement
from noisemon.models.document import DocumentData, DocumentModel, DocumentOrigin
from noisemon.models.document import dataclass_to_model as document_dataclass_to_model
from noisemon.models.mention import *

In [5]:
archive_dir = Path("./assets/entity_linking/archive/").resolve()

In [6]:
list(archive_dir.iterdir())

[PosixPath('/home/max/process/dist/app_noisemon/assets/entity_linking/archive/item_aliases.csv'),
 PosixPath('/home/max/process/dist/app_noisemon/assets/entity_linking/archive/page.csv'),
 PosixPath('/home/max/process/dist/app_noisemon/assets/entity_linking/archive/item.csv'),
 PosixPath('/home/max/process/dist/app_noisemon/assets/entity_linking/archive/property.csv'),
 PosixPath('/home/max/process/dist/app_noisemon/assets/entity_linking/archive/property_aliases.csv'),
 PosixPath('/home/max/process/dist/app_noisemon/assets/entity_linking/archive/statements.csv'),
 PosixPath('/home/max/process/dist/app_noisemon/assets/entity_linking/archive/link_annotated_text.jsonl')]

In [7]:
page_df = pd.read_csv(archive_dir / "page.csv")
page_df.set_index("page_id", inplace=True)

In [8]:
item_df = pd.read_csv(archive_dir / "item.csv")
item_df.set_index("item_id", inplace=True)

## Populate with entities

KB does not have the entities to link with later; I currently do not do upserts on the run, as I assume entities already known at the runtime.

In [19]:
db = SessionLocal()
with db.begin():
    buffer = []
    for idx, row in tqdm(item_df.iterrows(), total=len(item_df)):
        new_entity = EntityModel(
            qid=id_to_qid(idx),
            name=row.en_label,
            type=None,
        )
        buffer.append(new_entity)
        
        if idx % 5000 == 1:
            statement = get_insert_many_statement(buffer)
            db.execute(statement)
            buffer = []

    else:
        statement = get_insert_many_statement(buffer)
        db.execute(statement)
       
db.commit()
db.close()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 51450316/51450316 [2:33:30<00:00, 5586.03it/s]


## Populate with mentions

In [9]:
def id_to_qid(id: str):
    return f"http://www.wikidata.org/entity/Q{id}"

In [11]:
def parse_section(section, page_df):
    for offset, length, target_page_id in zip(section["link_offsets"], section["link_lengths"], section["target_page_ids"]):
        span_start = offset
        span_end = offset+length
        span = section["text"][span_start:span_end]
        row = page_df.loc[target_page_id]
        title = row.title
        qid = row.item_id
        yield span_start, span_end, span, title, qid

In [12]:
!wc -l "./assets/entity_linking/archive/link_annotated_text.jsonl"

5343564 ./assets/entity_linking/archive/link_annotated_text.jsonl


In [13]:
db = SessionLocal()
with open(archive_dir / "link_annotated_text.jsonl", "r") as fin:
    for line_number, line in tqdm(enumerate(fin), total=5343564):
        if line_number <= 2902312:
            continue
        with db.begin():
            chunk = json.loads(line)
            page_id = chunk["page_id"]
            page_title = page_df.loc[page_id].title
            origin = DocumentOrigin(resource=f"WikiPedia Page [{page_title}]", from_process="kb_population")

            for section in chunk.get("sections", []):
                origin = DocumentOrigin(
                    resource=f"WikiPedia Page [{page_title}][section: {section['name']}]", 
                    from_process="kb_population"
                )
                document = DocumentData(
                    origin=origin,
                    text=section["text"],
                    raw_text=section["text"],
                )
                new_document = document_dataclass_to_model(document)
                db.add(new_document)
                db.flush()
                
                for span_start, span_end, span, title, qid in parse_section(section, page_df):
                    new_mention = MentionModel(
                        origin=new_document,
                        span=span,
                        span_start=span_start,
                        span_end=span_end,
                        entity_qid=id_to_qid(qid),
                        vector_index=0
                    )
                    db.add(new_mention)
                db.flush()

 90%|███████████████████████████████████████████████████████████████████████████████████████████████▊          | 4830378/5343564 [36:05:22<3:50:03, 37.18it/s]


OperationalError: (psycopg2.OperationalError) server closed the connection unexpectedly
	This probably means the server terminated abnormally
	before or while processing the request.

[SQL: INSERT INTO documents (id, origin, text, raw_text) VALUES (%(id)s, %(origin)s, %(text)s, %(raw_text)s)]
[parameters: {'id': '9b28f443-78c1-4e9d-a875-516815cc3890', 'origin': '{"from_process": "kb_population", "link": null, "resource": "WikiPedia Page [Anna constituency][section: Introduction]", "timestamp": null}', 'text': 'The Anna Constituency (No. 89) is a Russian legislative constituency in Voronezh Oblast.', 'raw_text': 'The Anna Constituency (No. 89) is a Russian legislative constituency in Voronezh Oblast.'}]
(Background on this error at: https://sqlalche.me/e/20/e3q8)

In [37]:
db.commit()

In [20]:
list(parse_section(section, page_df))

IndexError: single positional indexer is out-of-bounds

In [12]:
section = chunk["sections"][0]

In [14]:
section["text"][16:16+18]

'anti-authoritarian'

In [27]:
parse_section(section, page_df)

anti-authoritarian Anti-authoritarianism 1030234
political Political philosophy 179805
social philosophy Social philosophy 180592
hierarchies Hierarchy 188619
self-managed Organizational Self-management 15981562
self-governed Self-governance 417633
cooperative Cooperative 4539
stateless societies Stateless society 1708603
free associations Free association (Marxism and anarchism) 5500187
state Sovereign state 3624078
far-left Far-left politics 1129409
political spectrum Political spectrum 210918
economics Anarchist economics 4751666
legal philosophy Anarchist law 2738779
communism Anarcho-communism 188670
collectivism Collectivist anarchism 223178
syndicalism Anarcho-syndicalism 188993
mutualism Mutualism (economic theory) 844212
participatory economics Participatory economics 650953
anarchist types and traditions History of anarchism 2746978
Anarchist schools of thought Anarchist schools of thought 1278791
individualism Individualism 171995
collectivism Collectivism 237789
social Soci

In [15]:
section

{'name': 'Introduction',
 'text': "Anarchism is an anti-authoritarian political and social philosophy that rejects hierarchies deemed unjust and advocates their replacement with self-managed, self-governed societies based on voluntary, cooperative institutions. These institutions are often described as stateless societies, although several authors have defined them more specifically as distinct institutions based on non-hierarchical or free associations. Anarchism's central disagreement with other ideologies is that it holds the state to be undesirable, unnecessary, and harmful. Anarchism is usually placed on the far-left of the political spectrum, and much of its economics and legal philosophy reflect anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism, or participatory economics. As anarchism does not offer a fixed body of doctrine from a single particular worldview, many anarchist types and traditions exist and varieties of anarchy diverge widely. A

In [18]:
page_df[page_df.page_id == 867979]

Unnamed: 0,page_id,item_id,title,views
260552,867979,1030234,Anti-authoritarianism,1914
