In [2]:
import gc
import msgpack
import json
import pickle
import xml.sax
import wikitextparser as wtp
import multiprocessing 
import sys
import time
import io
import queue
from sys import getsizeof
from collections import namedtuple, Counter, defaultdict, OrderedDict

In [3]:
class ParsedRawPage(namedtuple("ParsedRawPage", ["id", "title", "redirect", "links"])):
    @classmethod
    def dump_pages(cls, pages, path):
        with open(path, 'wb') as f:
            for page in pages:
                f.write(msgpack.packb(page, use_bin_type=True))

    @classmethod
    def read_pages(cls, path):
        with open(path, 'rb') as f:
            unpacker = msgpack.Unpacker(f, raw=False, use_list=False, max_map_len=1024**2)
            for (id, title, redirect, links) in unpacker:
                yield ParsedRawPage(id, title, redirect, Counter(links))
    
UnparsedRawPage = namedtuple("UnparsedRawPage", ["id", "title", "redirect", "text"])

In [4]:
class WikipediaPage(namedtuple("WikipediaPage", ["id", "title", "aliases", "links", "inlinks"])):
    @classmethod
    def dump_pages(cls, pages, path):
        with open(path, 'wb') as f:
            for (id, title, aliases, links, inlinks) in pages:
                f.write(msgpack.packb((
                    id, title, list(aliases), links, inlinks
                ), use_bin_type=True))

    @classmethod
    def read_pages(cls, path):
        with open(path, 'rb') as f:
            unpacker = msgpack.Unpacker(f, raw=False, use_list=False, max_map_len=1024**2)
            for (id, title, aliases, links, inlinks) in unpacker:
                yield WikipediaPage(id, title, set(aliases), Counter(links), Counter(inlinks))
                
    @classmethod
    def resolve_parsed_pages(cls, parsed_pages):
        title_to_wikipedia_page = {}
        redirects = {}
        print("Parsing pages")
        for p in parsed_pages:
            if p.redirect:
                redirects[p.title] = p.redirect
            else:
                title_to_wikipedia_page[p.title] = WikipediaPage(
                    id=p.id,
                    title=p.title,
                    aliases=set(),
                    links=p.links.copy(),
                    inlinks=Counter(),
                )
        
        # Making redirect chainer
        print("Creating aliases")
        circle_count = 0
        unresolvable_count = 0
        resolved_count = 0
        for title, redirect in redirects.items():
            resolution_pointer = redirect
            seen_pages = set()
            while True:
                if resolution_pointer in seen_pages:
                    # print(f"WARN: Circular loop with {seen_pages}")
                    redirects[title] = None
                    circle_count += 1
                    break
                
                seen_pages.add(resolution_pointer)
                
                if resolution_pointer in title_to_wikipedia_page:
                    redirects[title] = resolution_pointer
                    title_to_wikipedia_page[resolution_pointer].aliases.add(title)
                    resolved_count += 1
                    break
                elif resolution_pointer in redirects:
                    resolution_pointer = redirects[resolution_pointer]
                else:
                    # print(f"WARN: '{title}' contains unresolvable redirect '{redirect}'")
                    redirects[title] = None
                    unresolvable_count += 1
                    break
        
        t = circle_count + unresolvable_count + resolved_count
        assert t == len(redirects), "Not tautology with all redirects"
        print(f"Resolved {resolved_count} ({resolved_count / t}) redirects with {circle_count} ({circle_count / t}) cycles and {unresolvable_count} ({unresolvable_count / t}) unresolvables")
                   
        print("Resolving deepest links")
        bad_link_count = 0
        good_link_count = 0
        file_count = 0
        # Resolve links to deepest page
        for p in title_to_wikipedia_page.values():
            resolved_links = Counter()
            for raw_link, count in p.links.items():
                resolved = False
                # Wikipedia links occasionally upper case first letter
                for link in (raw_link, raw_link.capitalize()):    
                    if link in redirects and redirects[link] is not None:
                        resolved_link = redirects[link]
                        assert resolved_link in title_to_wikipedia_page, f"Bad redirect was formed from '{link}' to '{resolved_link}'!"
                        resolved_links[resolved_link] += count
                        title_to_wikipedia_page[resolved_link].inlinks[p.title] += count
                        resolved = True
                        break
                    elif link in title_to_wikipedia_page:
                        resolved_links[link] += count
                        title_to_wikipedia_page[link].inlinks[p.title] += count
                        resolved = True
                        break
                        
                if not resolved:
                    # print(f"WARN: '{p.title}' contains unresolved link '{link}'")
                    if raw_link.startswith("File:") or raw_link.startswith("Image:"):
                        file_count += count
                    else:
                        bad_link_count += count
                        if bad_link_count % 100000 == 0:
                            print(f"Sample bad link: '{p.title}' contains unresolved link '{link}'")
                    continue        
                else:
                    good_link_count += 1
             
            p.links.clear()
            p.links.update(resolved_links)
            
        t = good_link_count + bad_link_count
        print(f"Found {good_link_count} ({good_link_count / t}) good links and {bad_link_count} ({bad_link_count / t}) bad links and {file_count} ({file_count / t}) file links")
                

        while True:
            try:
                _, val = title_to_wikipedia_page.popitem()
            except KeyError:
                break
                
            yield val

In [5]:
class WikiHandler(xml.sax.ContentHandler):
    def __init__(self, queue, limit=None):
        super().__init__()
        
        self.queue = queue
        self.element_count = 0
        self.page_count = 0
        self.in_page = False
        self.limit = limit
        
        # Per page
        self.in_title = False
        self.title_buffer = None
        self.page_title = None
        self.page_text = None
        self.page_redirect = None
        
        self.seen_page_revision = False
        self.in_revision = False
        self.in_revision_text = False
        
        self.in_id = False
        self.id_buffer = None
        
        self.page_id = None 
        self.start_time = time.time()

         
    def startElement(self, name, attrs):
        self.element_count += 1
        
            
        if self.in_title:
            raise RuntimeException(f"Encountered element {name} within a title")
        if self.in_revision_text:
            raise RuntimeException(f"Encountered element {name} within revision text")
            
        if self.in_id:
            raise RuntimeException(f"Encountered element {name} within id")
        
        if name == "page":
            if self.in_page:
                raise RuntimeException("Recursive page")

            self.in_page = True
            
        if self.in_page:
            
            if name == "title":
                if self.page_title:
                    raise RuntimeException("Encountered a second title for the page!")

                self.in_title = True
                self.title_buffer = io.StringIO()

            elif name == "redirect":
                if self.page_redirect:
                    raise RuntimeException(f"Already had a redirect for {self.page_title}")
                if attrs.getLength() != 1:
                    raise RuntimeException(f"More than one redirect attribute for {self.page_title}")
                self.page_redirect = attrs.getValue("title")
                

            elif name == "revision":
                if self.seen_page_revision:
                    raise RuntimeException(f"Saw a second page revision for {self.page_title}")

                self.in_revision = True
            elif self.in_revision and name == "text":
                self.in_revision_text = True
                self.revision_text_buffer = io.StringIO()
            elif name == "id":
                self.in_id = True
                self.id_buffer = io.StringIO()

    def endElement(self, name):
        if name == "page":
            self.handle_page()
            
            self.in_page = False
            self.page_title = None
            self.page_text = None
            self.seen_page_revision = False
            self.page_redirect = None
            
        if self.in_page:
            if name == "title":
                self.in_title = False
                self.page_title = self.title_buffer.getvalue()
                self.title_buffer = None
            elif name == "redirect":
                pass
            elif name == "revision":
                self.in_revision = False
                self.seen_page_revision = True
            elif self.in_revision and name == "text":
                self.in_revision_text = False
                self.page_text = self.revision_text_buffer.getvalue()
                self.revision_text_buffer = None
            elif name == "id":
                self.in_id = False
                self.page_id = self.id_buffer.getvalue().strip()
                self.id_buffer = None
            

        
    def characters(self, data):
        if self.in_title:
            self.title_buffer.write(data)
        elif self.in_revision_text:
            self.revision_text_buffer.write(data)
        elif self.in_id:
            self.id_buffer.write(data)
            
            
    def handle_page(self):
        self.page_count += 1
        
        self.queue.put(
            UnparsedRawPage(self.page_id, self.page_title, self.page_redirect, self.page_text)
        )
        
    
        if self.limit and self.page_count >= self.limit:
            raise StopIteration("Stopping")
        elif self.page_count % 10000 == 0:
            delta = time.time() - self.start_time
            print(f"Made it to {self.page_title} ({self.page_count}) in {delta}s ({self.page_count / delta})pps")

In [6]:
def parsed_wikipedia_pages(filename, limit=None):
    def unparsed2parsed_worker(reader_queue, writer_queue): 
        while True:
            unparsed_page = reader_queue.get()
            if unparsed_page is None:
                return

            parsed = wtp.parse(unparsed_page.text)    
            page = ParsedRawPage(unparsed_page.id, unparsed_page.title, unparsed_page.redirect, Counter(
                e.title.strip() for e in parsed.wikilinks
            ))
            writer_queue.put(page)

    reader_queue = multiprocessing.Queue(multiprocessing.cpu_count() * 10)
    writer_queue = multiprocessing.Queue()
    processes = []
    try:
        for i in range(multiprocessing.cpu_count() * 2):  
            p = multiprocessing.Process(
                target=unparsed2parsed_worker, 
                args=(reader_queue, writer_queue), 
                daemon=True
            )
            p.start()
            processes.append(p)

        handler = WikiHandler(reader_queue, limit=limit)
        try:
            xml.sax.parse(filename, handler)
        except StopIteration:
            pass

        for p in processes:
            reader_queue.put(None)


        pages = []
        while True:
            try:
                pages.append(writer_queue.get(False))
            except queue.Empty:
                if any(p.is_alive() for p in processes):
                    continue
                else:
                    break

        return pages
    except:
        print("Exception raised, terminating subprocesses")
        for p in processes:
            p.terminate()
        raise

In [None]:
filename = "/mnt/cold/Projects/data/enwiki-20190901-pages-articles-multistream.xml"
pages = parsed_wikipedia_pages(filename)
WikipediaPage.dump_collection(pages, "enwiki-parsed-pages.pickle")

In [None]:
WikipediaPage.dump_collection(pages, "enwiki-parsed-pages.pickle")

In [None]:
ParsedRawPage.dump_pages(pages, "enwiki-parsed-pages.msgpack")

In [None]:
dump_parsed_pages(pages, "enwiki-parsed-pages.msgpack")

In [None]:
pages = list(ParsedRawPage.read_pages("enwiki-parsed-pages.msgpack"))

In [7]:
wiki_pages = list(
    WikipediaPage.resolve_parsed_pages(ParsedRawPage.read_pages("enwiki-parsed-pages.msgpack"))
)
wiki_pages.sort(key=lambda x: x.title)

Parsing pages
Creating aliases
Resolved 9219426 (0.999524491609326) redirects with 4 (4.336601830132704e-07) cycles and 4382 (0.00047507473049103775) unresolvables
Resolving deepest links
Sample bad link: 'Airliner' contains unresolved link 'De havilland dragon'
Sample bad link: 'Oakdale, New York' contains unresolved link 'Francis gow-smith'
Sample bad link: 'Supporter' contains unresolved link 'Kazimierz raczyński'
Sample bad link: 'BNSF Railway' contains unresolved link 'Laurel subdivision'
Sample bad link: 'List of California state parks' contains unresolved link 'Standish-hickey state recreation area'
Sample bad link: 'Church of Jesus Christ (Cutlerite)' contains unresolved link 'Deacon (latter day saints)'
Sample bad link: 'Wikipedia:Upload log archive/February 2004 (3)' contains unresolved link ':image:usq-la.jpg'
Sample bad link: 'Abitibi River' contains unresolved link 'Island falls, ontario'
Sample bad link: 'List of populated places in the Netherlands' contains unresolved li

Sample bad link: 'Wikipedia:Featured article candidates/Shahbag' contains unresolved link 'User:splot'
Sample bad link: 'Nikolay Anichkov' contains unresolved link 'Autogenic infection'
Sample bad link: 'Wikipedia:Featured picture candidates/Soyuz Launch' contains unresolved link 'User:thegreenj'
Sample bad link: 'Wikipedia:Articles for deletion/TV IV (3rd nomination)' contains unresolved link 'Special:contributions/loldramalulz'
Sample bad link: 'Franz Kafka bibliography' contains unresolved link 'S:de:eine kreuzung'
Sample bad link: 'El Dorado (football)' contains unresolved link 'Lauro rodríguez'
Sample bad link: 'Wikipedia:Articles for deletion/List Of Project Lead They Way Schools' contains unresolved link 'User:jayvdb'
Sample bad link: 'Wikipedia:Requests for adminship/E' contains unresolved link 'User:acalamari'
Sample bad link: 'Wikipedia:WikiProject Spam/LinkReports/kqed.org' contains unresolved link ':en:special:contributions/wikidemo'
Sample bad link: 'Wikipedia:WikiProject 

Sample bad link: 'Al-Kompars' contains unresolved link 'Ahmed attia'
Sample bad link: 'The Set-Up (1995 film)' contains unresolved link 'Strathford hamilton'
Sample bad link: 'Wikipedia:Featured picture candidates/File:Pharyngeal jaws of moray eels.jpg' contains unresolved link 'User talk:makeemlighter'
Sample bad link: 'Wikipedia:WikiProject Military history/Early Modern warfare task force/Popular pages' contains unresolved link 'Three hundred and thirty five years&'
Sample bad link: 'The Hands' contains unresolved link 'Belén blanco'
Sample bad link: 'List of members of the Virginia House of Burgesses' contains unresolved link 'George jordan (burgess)'
Sample bad link: 'Wikipedia:Database reports/User categories/4' contains unresolved link ':category:wikipedians_by_alma_mater:_avondale_college'
Sample bad link: 'May 2009 in sports' contains unresolved link ':category:2009 in rugby union'
Sample bad link: '1996 Indian general election in Tamil Nadu' contains unresolved link 'K. thulas

Sample bad link: 'List of caves in Brazil' contains unresolved link 'Poço azul'
Sample bad link: 'Wikipedia:Templates for discussion/Log/2011 February 27' contains unresolved link 'Special:contributions/floydian'
Sample bad link: 'Wikipedia:WikiProject Military history/Maritime warfare task force/Article alerts/Archive' contains unresolved link 'User:rave'
Sample bad link: 'Wikipedia:Deletion review/Log/2011 March 17' contains unresolved link 'User talk:binksternet'
Sample bad link: 'List of township-level divisions of Liaoning' contains unresolved link 'Hongqi subdistrict, panjin'
Sample bad link: 'Mystery Writers of Japan Award' contains unresolved link 'Makoto usami(writer)'
Sample bad link: '1920 New Zealand rugby league season' contains unresolved link 'Neville st george'
Sample bad link: 'Wikipedia:Administrators' noticeboard/IncidentArchive689' contains unresolved link 'Wp:dr'
Sample bad link: 'Serbia national under-17 football team' contains unresolved link 'Nikola komljenović'

Sample bad link: 'Wikipedia:Articles for deletion/Chloe Khan' contains unresolved link 'User talk:malcolmxl5'
Sample bad link: 'Wikipedia:WikiProject Articles for creation/Help desk/Archives/2013 February 14' contains unresolved link 'Wp:bio'
Sample bad link: 'Wikipedia:Tambayan Philippines/Get started/Geography' contains unresolved link ':pongon'
Sample bad link: 'Template:Did you know nominations/Max von Widnmann' contains unresolved link 'User talk:johnbod'
Sample bad link: 'Template:Did you know nominations/Elephant racing' contains unresolved link 'Talk:elephant racing'
Sample bad link: 'Wikipedia:Miscellany for deletion/Talk:Albert Ghiorso/Comments' contains unresolved link 'User talk:mercurywoodrose'
Sample bad link: 'Easterners (Korean political faction)' contains unresolved link 'Injo banjeong'
Sample bad link: 'Wikipedia:WikiProject North Macedonia/Article alerts/Archive' contains unresolved link ':category:terrorism in the republic of macedonia'
Sample bad link: 'Template:Di

Sample bad link: '2016 U-20 Copa Libertadores' contains unresolved link 'Víctor davila'
Sample bad link: 'Wikipedia:New Zealand Wikipedians' notice board/Archive 20' contains unresolved link 'User:kiwikikiwi'
Sample bad link: 'Wikipedia:WikiProject Women in Red/Missing articles by nationality/Cuba' contains unresolved link 'Interior designer (q2133309)'
Sample bad link: 'Wikipedia:Wikipedia Signpost/2016-02-24/Blog' contains unresolved link 'C:file:colors_of_africa.jpg'
Sample bad link: 'Wikipedia:Articles for deletion/Tsutae Yuzu' contains unresolved link 'User talk:narutolovehinata5'
Sample bad link: 'Wikipedia:WikiProject Stub sorting/Proposals/Archive/2016' contains unresolved link ':category:1970s italian comedy film stubs'
Sample bad link: 'Wikipedia:Files for discussion/2016 March 23' contains unresolved link 'Special:contributions/finnusertop'
Sample bad link: 'Wikipedia:Reliable sources/Noticeboard/Archive 205' contains unresolved link 'Wp:iar'
Sample bad link: 'Wikipedia:Poss

Sample bad link: '1954 Davis Cup America Zone' contains unresolved link 'Orlando garrido (tennis)'
Sample bad link: 'Wikipedia:WikiProject Norse history and culture/List of articles' contains unresolved link 'Talk:battle of the neva'
Sample bad link: 'Wikipedia:WikiProject Yorkshire/Article Talk List' contains unresolved link ':talk:hms fowey (1744)'
Sample bad link: 'List of Hero of the Soviet Union forfeitures' contains unresolved link ':ru:сысоев, михаил андреевич'
Sample bad link: 'Wikipedia:WikiProject Women in Red/Missing articles by occupation/Mayors' contains unresolved link ':d:q65592875'
Sample bad link: 'Wikipedia:Requests for history merge/Archive 34' contains unresolved link 'User:philipterrygraham/articles'
Sample bad link: 'Deno (software)' contains unresolved link 'Chrome_v8'
Sample bad link: '2019–20 Butler Bulldogs men's basketball team' contains unresolved link '2019–20 wofford terriers men's basketball team'
Sample bad link: 'Wikipedia:Contributor copyright investig

In [13]:
wiki_pages[1000001]

WikipediaPage(id='355698', title='Brungle, New South Wales', aliases={'Brungle'}, links=Counter({'Gundagai': 2, 'Buccleuch County': 1, 'Electoral district of Wagga Wagga': 1, 'Division of Eden-Monaro': 1, 'Riverina': 1, 'New South Wales': 1, 'Australia': 1, 'Tumut': 1, 'Wiradjuri': 1, 'Indigenous Australians': 1, 'Category:Towns in the Riverina': 1, 'Category:Towns in New South Wales': 1, 'Category:Parishes of Buccleuch County': 1, 'Category:Geography of New South Wales': 1, 'Category:Snowy Valleys Council': 1}), inlinks=Counter({'Warangesda Aboriginal Mission': 2, 'Tumut Shire': 1, 'Template:Riverina': 1, 'Buccleuch County': 1, 'Jimmy Clements': 1, 'List of government schools in New South Wales: A–F': 1, 'Gundagai lore': 1, 'Shane Mortimer': 1, 'Snowy Valleys Council': 1, 'Template:Localities in Snowy Valleys Council': 1, 'Montreal Community Theatre': 1, 'Queanbeyan Showground': 1, 'Wikipedia:WikiProject Women in Red/Women by ethnicity': 1}))

In [14]:
WikipediaPage.dump_pages(wiki_pages, "enwiki-wikipedia-pages.msgpack")