In [1]:
%load_ext autoreload
%autoreload 2

In [70]:
import itertools
import tempfile
import pipelines
from db import WikilanguageDB
import time
import os
import graph_tool
import graph_tool.search
from collections import defaultdict
from wikidata_parser import WikiDataParser, WikiData, WikiDataInheritanceGraph

In [22]:
inheritance_graph = WikiDataInheritanceGraph.load("data/wikidata_inheritance.pickle")

In [27]:
set(r_inheritance.label_for_id(e) for e in r_inheritance.descendent_ids("Q515"))

{'abstract object',
 'artificial entity',
 'artificial geographic entity',
 'artificial physical object',
 'city/town',
 'community',
 'concrete object',
 'entity',
 'geographic entity',
 'geographic location',
 'geographic region',
 'geographical object',
 'group',
 'group of humans',
 'group of living things',
 'group of physical objects',
 'human settlement',
 'human-geographic territorial entity',
 'locality',
 'location',
 'object',
 'object of science',
 'physical object',
 'physical system',
 'product',
 'research object',
 'social group',
 'spacio-temporal entity',
 'spatial entity',
 'statistical territorial entity',
 'structure',
 'system',
 'territorial entity',
 'unit',
 'unit of analysis',
 'urban area',
 'work'}

In [67]:
[inheritance_graph.label_for_id(p) for p in tb.parents("Q515")]

['city', 'community', 'structure', 'entity', 'system', 'social group']

TypeError: __init__() takes 1 positional argument but 2 were given

In [43]:
next(iter({}.values()))

StopIteration: 

In [99]:
def grouper(n, iterable):
    args = [iter(iterable)] * n
    return ([e for e in t if e != None] for t in itertools.zip_longest(*args))

def write_wikidata_into_db(db, input_path, parent_finder, whitelisted_wikis=None, limit=None, db_batch_size=10000):
    start = time.time()
    with pipelines._buffered_stream(input_path) as f:
        for i, entries in enumerate(grouper(db_batch_size, WikiDataParser.parse_dump(
            f, whitelisted_wikis=whitelisted_wikis
        ))):
            page_num = i * db_batch_size
            if limit and page_num > limit:
                break
                
            with db.con as cur:
                # Insert concept record
                cur.executemany(
                    """
                    INSERT INTO concepts(
                        concept_id, sample_title, coord_latitude, coord_longitude, coord_altitude, coord_precision
                    ) 
                    VALUES(?, ?, ?, ?, ?, ?)
                    """,
                    (
                        (
                            e.id,
                            e.sample_title,
                            e.sample_coord and e.sample_coord.latitude,
                            e.sample_coord and e.sample_coord.longitude,
                            e.sample_coord and e.sample_coord.altitude,
                            e.sample_coord and e.sample_coord.precision,
                        )
                        for e in entries
                    )
                )
                
                print("Done concepts")
                
                # Insert article records
                cur.executemany(
                    """
                    INSERT INTO concept_articles(
                        concept_id, wiki, article_title
                    ) 
                    VALUES(?, ?, ?)
                    """,
                    itertools.chain.from_iterable(
                        (
                            (
                                e.id,
                                wiki,
                                title,
                            ) 
                            for wiki, title in e.titles_by_wiki.items()
                        ) for e in entries
                    ),
                )
                
                print("Done records")
                
                # Insert instance of records
                instances = []
                for e in entries:
                    parent_concepts = set()
                    for c in e.direct_instance_of:
                        parent_finder.all_parents(c, parent_concepts)
                            
                    for parent_concept in parent_concepts:
                        instances.append((e.id, parent_concept))
                    
                cur.executemany(
                    """
                    INSERT INTO concept_instance_of (
                        concept_id,
                        instance_of_concept_id
                    )
                    VALUES (?, ?)
                    """, 
                    instances,
                )
                
                print("Done instances")

                
                
                if page_num % 100000 == 0:
                    delta = time.time() - start
                    pps = page_num / delta
                    print(f"DB Write: made it to page {page_num} in {delta:.2f}s {pps:.2f}pps") 


In [97]:
parent_finder = inheritance_graph.parent_finder()

In [None]:
try:
    os.remove("test.db")
except OSError:
    pass

with WikilanguageDB.con("test.db") as db:
    db.create_tables()
    #pipelines.write_articles_into_db(db, "wikis/jawiki-20190920-pages-articles-multistream.xml.bz2", "jawiki")
    write_wikidata_into_db(db, "wikis/wikidata-latest-all.json.gz", parent_finder)

Reached 10000 in 3.262936592102051s [58.71600149293995% in json](3064.7239741664102 lines per second)
Done concepts
Done records
Done instances
DB Write: made it to page 0 in 5.71s 0.00pps
Reached 20000 in 8.074604511260986s [40.69738738902068% in json](2476.901496798716 lines per second)
Done concepts
Done records
Done instances
Reached 30000 in 13.432533025741577s [41.45871201177803% in json](2233.3836769661525 lines per second)
Done concepts
Done records
Done instances
Reached 40000 in 18.49702548980713s [37.00012147124362% in json](2162.5098598713716 lines per second)
Done concepts
Done records
Done instances
Reached 50000 in 22.80553698539734s [35.072333845477765% in json](2192.4500191341954 lines per second)
Done concepts
Done records
Done instances
Reached 60000 in 27.362179040908813s [33.751848348373116% in json](2192.807813672107 lines per second)
Done concepts
Done records
Done instances
Reached 70000 in 32.07599139213562s [32.33125453376174% in json](2182.31758277509 lines p