In [1]:
import xml.etree.ElementTree as etree
import codecs
import csv
import time
import os

In [2]:
# Directory (relative to the working directory) that holds the Wikipedia dump
# and receives the generated CSV files.
PATH_WIKI_XML = "azwiki_analysis"

# Input: the MediaWiki XML dump that gets parsed.
FILENAME_WIKI = "azwiki-latest-pages-articles.xml"

# Outputs: one CSV per page category.
FILENAME_ARTICLES = "articles.csv"
FILENAME_REDIRECT = "articles_redirect.csv"
FILENAME_TEMPLATE = "articles_template.csv"

# Text encoding used when writing the CSV files; "utf-8-sig" is UTF-8 with a
# leading byte-order-mark signature.
ENCODING = "utf-8-sig"

In [3]:
def hms_string(sec_elapsed):
    """Render a duration given in seconds as ``H:MM:SS.ss``.

    Args:
        sec_elapsed: Elapsed time in seconds (int or float).

    Returns:
        A string with unpadded hours, two-digit minutes and seconds shown
        with two decimals, e.g. ``"0:01:41.21"``.
    """
    seconds_per_hour = 60 * 60
    hours = int(sec_elapsed / seconds_per_hour)
    minutes = int((sec_elapsed % seconds_per_hour) / 60)
    seconds = sec_elapsed % 60
    return f"{hours}:{minutes:>02}:{seconds:>05.2f}"

In [4]:
def strip_tag_name(t):
    """Return an XML tag name without its ``{namespace}`` prefix.

    ElementTree reports namespaced tags as ``"{uri}local"``; this keeps only
    the part after the last ``}``. Tags without a namespace are returned
    unchanged.

    Args:
        t: The tag string, typically ``elem.tag``.

    Returns:
        The local tag name.
    """
    # Operate on the argument itself. The previous version overwrote it with
    # the global ``elem.tag``, which only worked while a global ``elem``
    # happened to exist in the kernel.
    idx = t.rfind("}")
    if idx != -1:
        t = t[idx + 1:]
    return t

In [5]:
# Resolve every file name against the data directory in one pass.
# Order matters: dump, articles CSV, redirects CSV, templates CSV
# (pathTemplateRedirect is the templates CSV despite its name).
pathWikiXML, pathArticles, pathArticlesRedirect, pathTemplateRedirect = (
    os.path.join(PATH_WIKI_XML, file_name)
    for file_name in (
        FILENAME_WIKI,
        FILENAME_ARTICLES,
        FILENAME_REDIRECT,
        FILENAME_TEMPLATE,
    )
)

In [6]:
# Running tallies: all pages seen, plus one counter per output category.
totalCount = articleCount = redirectCount = templateCount = 0

# Title of the page currently being parsed (none yet).
title = None

# Wall-clock reference used to report the total runtime.
start_time = time.time()

In [7]:
# Stream the MediaWiki XML dump and split its pages into three CSV files:
# templates (namespace 10), redirects, and everything else ("articles").

# Counters and the timer are (re)initialised here so that re-running this cell
# on its own neither accumulates counts from a previous run nor reports a
# stale runtime.
totalCount = 0
articleCount = 0
redirectCount = 0
templateCount = 0
start_time = time.time()

# Per-page state, defined up front so that no branch below can reference an
# unset name if an element shows up before the first <page>.
title = ''
page_id = -1  # named page_id (not id) to avoid shadowing the builtin
text = ''
redirect = ''
inrevision = False
ns = 0

# newline="" is the file mode the csv module documents for writer targets; it
# prevents any newline translation of the rows csv.writer produces.
with open(pathArticles, "w", encoding=ENCODING, newline="") as articlesFH, \
        open(pathArticlesRedirect, "w", encoding=ENCODING, newline="") as redirectFH, \
        open(pathTemplateRedirect, "w", encoding=ENCODING, newline="") as templateFH:
    articlesWriter = csv.writer(articlesFH, quoting=csv.QUOTE_MINIMAL)
    redirectWriter = csv.writer(redirectFH, quoting=csv.QUOTE_MINIMAL)
    templateWriter = csv.writer(templateFH, quoting=csv.QUOTE_MINIMAL)

    articlesWriter.writerow(['id', 'title', 'text'])
    redirectWriter.writerow(['id', 'title', 'redirect'])
    templateWriter.writerow(['id', 'title'])

    for event, elem in etree.iterparse(pathWikiXML, events=('start', 'end')):
        tname = strip_tag_name(elem.tag)

        if event == 'start':
            # On a 'start' event only the tag and its attributes are
            # guaranteed; the text content may not have been parsed yet.
            # So 'start' is used purely for structural bookkeeping.
            if tname == 'page':
                title = ''
                page_id = -1
                text = ''
                redirect = ''
                inrevision = False
                ns = 0
            elif tname == 'revision':
                # <id> elements from here on belong to the revision (or its
                # contributor), not to the page itself.
                inrevision = True
            continue

        # event == 'end': the element is complete, so its text is reliable.
        if tname == 'title':
            title = elem.text
        elif tname == 'id':
            if not inrevision and elem.text is not None:
                page_id = int(elem.text)
        elif tname == 'text':
            text = elem.text
        elif tname == 'redirect':
            redirect = elem.get('title', '')
        elif tname == 'ns':
            if elem.text is not None:
                ns = int(elem.text)
        elif tname == 'page':
            totalCount += 1

            if ns == 10:
                templateCount += 1
                templateWriter.writerow([page_id, title])
            elif redirect:
                redirectCount += 1
                redirectWriter.writerow([page_id, title, redirect])
            else:
                articleCount += 1
                articlesWriter.writerow([page_id, title, text])

            if totalCount > 1 and (totalCount % 100000) == 0:
                print("{:,}".format(totalCount))

        # Release the finished element's content; everything needed from it
        # has already been copied into the per-page variables above.
        elem.clear()

time_took = time.time() - start_time
print(f"Total runtime: {hms_string(time_took)}")

100,000
200,000
300,000
400,000
Total runtime: 0:01:41.21
