In [1]:
%cd ..

/home/max/process/dist/app_noisemon


In [2]:
import pandas as pd
from tqdm import tqdm
import json

from pathlib import Path

In [3]:
from noisemon.database.database import SessionLocal

from noisemon.models.entity import EntityModel, get_insert_many_statement
from noisemon.models.document import DocumentData, DocumentModel, DocumentOrigin
from noisemon.models.document import dataclass_to_model as document_dataclass_to_model
from noisemon.models.mention import *


In [4]:
archive_dir = Path("./assets/entity_linking/archive/").resolve()

In [5]:
for f in archive_dir.iterdir():
    print(f)

/home/max/process/dist/app_noisemon/assets/entity_linking/archive/item_aliases.csv
/home/max/process/dist/app_noisemon/assets/entity_linking/archive/page.csv
/home/max/process/dist/app_noisemon/assets/entity_linking/archive/item.csv
/home/max/process/dist/app_noisemon/assets/entity_linking/archive/property.csv
/home/max/process/dist/app_noisemon/assets/entity_linking/archive/property_aliases.csv
/home/max/process/dist/app_noisemon/assets/entity_linking/archive/statements.csv
/home/max/process/dist/app_noisemon/assets/entity_linking/archive/link_annotated_text.jsonl


In [6]:
page_df = pd.read_csv("./assets/entity_linking/archive/page.csv")

In [7]:
page_df.set_index('page_id', inplace=True)

In [8]:
page_df

Unnamed: 0_level_0,item_id,title,views
page_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
12,6199,Anarchism,31335
25,38404,Autism,49693
39,101038,Albedo,14573
290,9659,A,25859
303,173,Alabama,52765
...,...,...,...
62470350,76894635,Daming Zhu,16
62470423,76894633,Tony Dews,7
62470432,76896959,Samsung PL20,9
62470465,6034153,Nils-Fredrik Palmstierna,8


In [9]:
def id_to_qid(id: str):
    return f"http://www.wikidata.org/entity/Q{id}"

In [10]:
item_df = pd.read_csv("./assets/entity_linking/archive/item.csv")

In [11]:
item_df.set_index("item_id", inplace=True)

In [12]:
item_df

Unnamed: 0_level_0,en_label,en_description
item_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Universe,totality of space and all contents
2,Earth,third planet from the Sun in the Solar System
3,life,matter capable of extracting energy from the e...
4,death,permanent cessation of vital functions
5,human,"common name of Homo sapiens, unique extant spe..."
...,...,...
77257472,2dFGRS TGN256Z026,
77257483,2dFGRS TGS171Z171,
77257484,2dFGRS TGS373Z078,
77257491,2dFGRS TGS374Z114,


In [19]:
db = SessionLocal()
with db.begin():
    buffer = []
    for idx, row in tqdm(item_df.iterrows(), total=len(item_df)):
        new_entity = EntityModel(
            qid=id_to_qid(idx),
            name=row.en_label,
            type=None,
        )
        buffer.append(new_entity)
        
        if idx % 5000 == 1:
            statement = get_insert_many_statement(buffer)
            db.execute(statement)
            buffer = []

    else:
        statement = get_insert_many_statement(buffer)
        db.execute(statement)
       
db.commit()
db.close()

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 51450316/51450316 [2:33:30<00:00, 5586.03it/s]


In [32]:
def parse_section(section, page_df):
    for offset, length, target_page_id in zip(section["link_offsets"], section["link_lengths"], section["target_page_ids"]):
        span_start = offset
        span_end = offset+length
        span = section["text"][span_start:span_end]
        row = page_df.loc[target_page_id]
        title = row.title
        qid = row.item_id
        yield span_start, span_end, span, title, qid

In [35]:
!wc -l "./assets/entity_linking/archive/link_annotated_text.jsonl"

5343564 ./assets/entity_linking/archive/link_annotated_text.jsonl


In [None]:
db = SessionLocal()
with open("./assets/entity_linking/archive/link_annotated_text.jsonl", "r") as fin:
    for line in tqdm(fin, total=5343564):
        with db.begin():
            chunk = json.loads(line)
            page_id = chunk["page_id"]
            page_title = page_df.iloc[page_id].title
            origin = DocumentOrigin(resource=f"WikiPedia Page [{page_title}]", from_process="kb_population")

            for section in chunk.get("sections", []):
                origin = DocumentOrigin(
                    resource=f"WikiPedia Page [{page_title}][section: {section['name']}]", 
                    from_process="kb_population"
                )
                document = DocumentData(
                    origin=origin,
                    text=section["text"],
                    raw_text=section["text"],
                )
                new_document = document_dataclass_to_model(document)
                db.add(new_document)
                db.flush()
                
                for span_start, span_end, span, title, qid in parse_section(section, page_df):
                    new_mention = MentionModel(
                        origin=new_document,
                        span=span,
                        span_start=span_start,
                        span_end=span_end,
                        entity_qid=id_to_qid(qid),
                        vector_index=0
                    )
                    db.add(new_mention)
                db.flush()

  0%|                                                                                                              | 2452/5343564 [05:59<249:26:09,  5.95it/s]

In [20]:
list(parse_section(section, page_df))

IndexError: single positional indexer is out-of-bounds

In [12]:
section = chunk["sections"][0]

In [14]:
section["text"][16:16+18]

'anti-authoritarian'

In [27]:
parse_section(section, page_df)

anti-authoritarian Anti-authoritarianism 1030234
political Political philosophy 179805
social philosophy Social philosophy 180592
hierarchies Hierarchy 188619
self-managed Organizational Self-management 15981562
self-governed Self-governance 417633
cooperative Cooperative 4539
stateless societies Stateless society 1708603
free associations Free association (Marxism and anarchism) 5500187
state Sovereign state 3624078
far-left Far-left politics 1129409
political spectrum Political spectrum 210918
economics Anarchist economics 4751666
legal philosophy Anarchist law 2738779
communism Anarcho-communism 188670
collectivism Collectivist anarchism 223178
syndicalism Anarcho-syndicalism 188993
mutualism Mutualism (economic theory) 844212
participatory economics Participatory economics 650953
anarchist types and traditions History of anarchism 2746978
Anarchist schools of thought Anarchist schools of thought 1278791
individualism Individualism 171995
collectivism Collectivism 237789
social Soci

In [15]:
section

{'name': 'Introduction',
 'text': "Anarchism is an anti-authoritarian political and social philosophy that rejects hierarchies deemed unjust and advocates their replacement with self-managed, self-governed societies based on voluntary, cooperative institutions. These institutions are often described as stateless societies, although several authors have defined them more specifically as distinct institutions based on non-hierarchical or free associations. Anarchism's central disagreement with other ideologies is that it holds the state to be undesirable, unnecessary, and harmful. Anarchism is usually placed on the far-left of the political spectrum, and much of its economics and legal philosophy reflect anti-authoritarian interpretations of communism, collectivism, syndicalism, mutualism, or participatory economics. As anarchism does not offer a fixed body of doctrine from a single particular worldview, many anarchist types and traditions exist and varieties of anarchy diverge widely. A

In [18]:
page_df[page_df.page_id == 867979]

Unnamed: 0,page_id,item_id,title,views
260552,867979,1030234,Anti-authoritarianism,1914
