In [1]:
import ujson as json
import time
import pickle
import sys
from itertools import zip_longest 
from collections import namedtuple

In [2]:
# Path of the Wikidata JSON dump that is handed to WikiData.parse_dump further down.
# NOTE(review): absolute, machine-specific location -- adjust before running elsewhere.
JSON_DATA_PATH = "/mnt/cold/Projects/data/latest-all.json"

In [18]:
# Coordinate record built by WikiData.parse_globe_coordinate from the "value"
# dict of a globecoordinate datavalue (the four keys below are copied as-is).
GlobeCoordinate = namedtuple(
    "GlobeCoordinate",
    "latitude longitude altitude precision",
)

# One parsed entity: its id, a representative page title (may be None), the
# retained {wiki: title} sitelinks and the parsed coordinate (may be None).
WikiDataEntry = namedtuple(
    "WikiDataEntry",
    "id sample_name sitelinks sample_coord",
)

def grouper(n, iterable, padvalue=None):
    """Yield consecutive ``n``-tuples from ``iterable``, padding the last one.

    Example: ``grouper(3, 'abcdefg', 'x')`` produces
    ``('a','b','c'), ('d','e','f'), ('g','x','x')``.
    """
    # Every slot of the zip pulls from the SAME iterator, so each output tuple
    # consumes the next n items of the input.
    shared = iter(iterable)
    slots = [shared for _ in range(n)]
    return zip_longest(*slots, fillvalue=padvalue)


class WikiData:
    """In-memory index built from a Wikidata JSON dump.

    Instances hold two dictionaries:

    * ``wiki_title_to_id`` -- ``{wiki: {page title: entity id}}``
    * ``id_to_entry`` -- ``{entity id: WikiDataEntry}``
    """

    @classmethod
    def parse_globe_coordinate(cls, claims, title="", line_id=""):
        """Return the first ``P625`` claim of an entity as a ``GlobeCoordinate``.

        Args:
            claims: the entity's ``claims`` mapping (property id -> statements).
            title: page title, used only in error messages.
            line_id: entity id, used only in error messages.

        Returns:
            A ``GlobeCoordinate``, or ``None`` when there is no ``P625`` claim
            or its main snak's ``snaktype`` is not ``"value"``.

        Raises:
            RuntimeError: if the statement lacks ``mainsnak``, ``snaktype`` or
                ``datavalue``, or the datavalue is not a ``globecoordinate``.
        """
        if "P625" not in claims:
            return None

        # Entities may carry several P625 statements; only the first is used.
        coordinate = claims["P625"][0]
        if "mainsnak" not in coordinate:
            print(coordinate)
            raise RuntimeError(f"({line_id} {title}) Main snak not found in co-ordinate")

        mainsnak = coordinate["mainsnak"]
        if "snaktype" not in mainsnak:
            print(mainsnak)
            raise RuntimeError(f"({line_id} {title}) Main snak snaktype")

        # Any snaktype other than "value" carries nothing to extract.
        if mainsnak["snaktype"] != "value":
            return None

        if "datavalue" not in mainsnak:
            print(mainsnak)
            raise RuntimeError(f"({line_id} {title}) Main snak without data value")

        datavalue = mainsnak["datavalue"]
        if datavalue.get("type") != "globecoordinate":
            print(datavalue)
            raise RuntimeError(f"({line_id} {title}) Bad data type for globe coordinate")

        value = datavalue["value"]
        return GlobeCoordinate(
            value["latitude"],
            value["longitude"],
            value["altitude"],
            value["precision"],
        )

    @classmethod
    def parse_line(cls, line):
        """Placeholder that is not implemented; it does nothing and returns ``None``."""
        pass

    @classmethod
    def parse_dump(cls, path, whitelisted_wikis=None):
        """Build a ``WikiData`` index by streaming a JSON dump line by line.

        Only lines that start with ``{`` are parsed (one JSON entity per line,
        an optional trailing comma is stripped); every other line is skipped.
        Entities of type ``"property"`` are ignored.

        Args:
            path: path of the dump file; it is read as UTF-8 text.
            whitelisted_wikis: optional iterable of wiki names (a single name
                may also be given as a plain string). When given, only
                sitelinks of these wikis are kept. ``None`` keeps all wikis.

        Returns:
            A new instance holding the title -> id and id -> entry mappings.
            Entities are recorded even when none of their sitelinks survive
            the whitelist (their ``sample_name`` is then ``None``).

        Raises:
            RuntimeError: on an entity that is neither ``item`` nor
                ``property``, on an item without ``sitelinks``, or when
                ``parse_globe_coordinate`` rejects the coordinate claim.
        """
        wiki_title_to_id = {}
        id_to_entry = {}
        start = time.time()
        all_json_time = 0

        if whitelisted_wikis is not None:
            # A bare string would otherwise be split into single characters.
            if isinstance(whitelisted_wikis, str):
                whitelisted_wikis = [whitelisted_wikis]
            whitelisted_wikis = set(whitelisted_wikis)

        # Explicit UTF-8 so parsing does not depend on the platform's default
        # encoding; the context manager closes the file even when a line raises.
        with open(path, encoding="utf-8") as dump_file:
            for i, line in enumerate(dump_file):
                if not line.startswith("{"):
                    continue

                s_json = time.time()
                loaded = json.loads(line.rstrip(",\n"))
                all_json_time += (time.time() - s_json)

                line_type = loaded["type"]
                line_id = loaded["id"]

                if line_type == "property":
                    continue

                if line_type != "item":
                    print(json.dumps(loaded, indent=2))
                    print(line_type)
                    raise RuntimeError("Found non-item line")

                if "sitelinks" not in loaded:
                    print(loaded)
                    raise RuntimeError("No sitelinks found in entry")

                sitelinks = {}
                for wiki, v in loaded["sitelinks"].items():
                    if whitelisted_wikis is not None and wiki not in whitelisted_wikis:
                        continue

                    title = v["title"]
                    sitelinks[wiki] = title
                    wiki_title_to_id.setdefault(wiki, {})[title] = line_id

                # Prefer the enwiki title; otherwise fall back to the first
                # retained sitelink, or None when nothing was retained.
                if "enwiki" in sitelinks:
                    sample_title = sitelinks["enwiki"]
                else:
                    sample_title = next(iter(sitelinks.values()), None)

                sample_coord = cls.parse_globe_coordinate(loaded["claims"], sample_title, line_id)

                id_to_entry[line_id] = WikiDataEntry(line_id, sample_title, sitelinks, sample_coord)

                if i % 10000 == 0:
                    # Clamp so a zero-length interval cannot divide by zero.
                    elapsed = max(time.time() - start, 1e-9)
                    print(f"Reached {i} in {elapsed}s [{100 * all_json_time / elapsed}% in json] ({i / elapsed} lines per second)")

        return cls(wiki_title_to_id, id_to_entry)

    @classmethod
    def load(cls, path):
        """Unpickle and return the object stored at ``path``.

        NOTE: unpickling can execute arbitrary code -- only load files that
        were produced by ``dump`` from a trusted source.
        """
        with open(path, 'rb') as pickle_file:
            return pickle.load(pickle_file)

    def __init__(self, wiki_title_to_id, id_to_entry):
        """Store the two lookup dictionaries without copying them."""
        self.wiki_title_to_id = wiki_title_to_id
        self.id_to_entry = id_to_entry

    def dump(self, path):
        """Pickle this instance to ``path`` using the highest pickle protocol."""
        with open(path, 'wb') as pickle_file:
            pickle.dump(self, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Parse the dump at JSON_DATA_PATH, keeping only sitelinks that belong to the
# four wikis listed below. parse_dump streams the file line by line and prints
# periodic progress, so this cell is the long-running step of the notebook.
wikidata = WikiData.parse_dump(JSON_DATA_PATH, whitelisted_wikis=[
    "enwiki",
    "zhwiki",
    "frwiki",
    "jawiki",
])

Reached 10000 in 3.2068278789520264s [93.17530891806949% in json] (3118.346346442499 lines per second)
Reached 20000 in 4.996929883956909s [92.30216860891232% in json] (4002.4576018590515 lines per second)
Reached 30000 in 6.870878219604492s [91.8546254509942% in json] (4366.254071335715 lines per second)
Reached 40000 in 8.562961101531982s [91.62013775494378% in json] (4671.281292267424 lines per second)
Reached 50000 in 10.000905752182007s [91.35768490207332% in json] (4999.547164924632 lines per second)
Reached 60000 in 11.56925630569458s [91.19844629295865% in json] (5186.158765491867 lines per second)
Reached 70000 in 12.997875690460205s [91.0307019225291% in json] (5385.495419945932 lines per second)
Reached 80000 in 14.271809816360474s [90.86333149399546% in json] (5605.455862247553 lines per second)
Reached 90000 in 15.631716251373291s [90.71067627333649% in json] (5757.525184868504 lines per second)
Reached 100000 in 16.58874797821045s [90.55714708095857% in json] (6028.182484

In [26]:
# Number of distinct zhwiki page titles that were mapped to an entity id.
len(wikidata.wiki_title_to_id['zhwiki'])

1460001

In [22]:
# Spot check: resolve an enwiki page title to its entity id, then print the stored entry.
print(wikidata.id_to_entry[wikidata.wiki_title_to_id['enwiki']['Douglas Adams']])

WikiDataEntry(id='Q42', sample_name='Douglas Adams', sitelinks={'frwiki': 'Douglas Adams', 'jawiki': 'ダグラス・アダムズ', 'zhwiki': '道格拉斯·亚当斯', 'enwiki': 'Douglas Adams'}, sample_coord=None)


In [24]:
# Persist the parsed index as a pickle so later runs can use WikiData.load instead of re-parsing.
# NOTE(review): the filename tag "en_zh_ja_zh" does not match the wikis parsed above
# (enwiki, zhwiki, frwiki, jawiki) -- confirm the intended name before relying on it.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
wikidata.dump("/home/tdimson/projects/wikipedia-language-rank/wikidata_en_zh_ja_zh.pickle")