In [1]:
import xml.etree.cElementTree as ET
from IPython.display import clear_output
from ipywidgets import FloatProgress, IntText
from IPython.display import display
from nltk.tokenize.stanford import StanfordTokenizer
import time
import re

In [7]:
class ArticleProcessor:
    """Sort article texts into per-category files by their ``[[Category:...]]`` tags.

    One output file is kept for every configured category, plus a combined
    ``all-articles`` file that receives every processed article.  An article
    belongs to a category when one of its ``[[Category:...]]`` tags contains
    the category string (case-insensitive).

    The instance can be used as a context manager; the output files are then
    closed when the ``with`` block exits.
    """

    # Pieces of the tag-matching pattern.  They are raw strings so that ``\[``
    # and ``\]`` reach the regex engine verbatim instead of being parsed as
    # (invalid) string escape sequences.
    _TAG_PREFIX = r"\[\[Category:[^\]]*"
    _TAG_SUFFIX = r"[^\]]*"

    def __init__(self, categories, output_dir="data"):
        """Compile the category matchers and open all output files for writing.

        Args:
            categories: Iterable of category strings.  Each string is inserted
                into a regular expression unescaped, so it is interpreted as a
                regex fragment and is also used verbatim in the file name
                ``<output_dir>/<category>-articles``.
            output_dir: Existing directory that receives the output files.
                Defaults to ``"data"``.

        Raises:
            re.error: If a category is not a valid regex fragment.
            OSError: If an output file cannot be opened; files opened before
                the failure are closed again.
        """
        # Materialise once so a one-shot iterable can be traversed repeatedly.
        self.categories = list(categories)
        # Matches an article carrying at least one category of interest.
        self.global_matcher = self._compile_tag_pattern(
            "(" + "|".join(self.categories) + ")")
        self.category_matcher = {}
        self.article_writer = {}
        self.global_writer = None
        try:
            # Explicit UTF-8: article text is arbitrary Unicode and must not
            # depend on the platform's default encoding.
            self.global_writer = open(output_dir + "/all-articles", "w",
                                      encoding="utf-8")
            for category in self.categories:
                self.category_matcher[category] = self._compile_tag_pattern(category)
                self.article_writer[category] = open(
                    output_dir + "/" + category + "-articles", "w",
                    encoding="utf-8")
        except BaseException:
            # Do not leak the files that were opened before the failure.
            self.close_all()
            raise

    @classmethod
    def _compile_tag_pattern(cls, fragment):
        """Return a case-insensitive regex for a category tag containing ``fragment``."""
        return re.compile(cls._TAG_PREFIX + fragment + cls._TAG_SUFFIX,
                          re.IGNORECASE)

    def __enter__(self):
        """Return the processor itself for use in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close all output files; exceptions from the block are not suppressed."""
        self.close_all()
        return False

    def is_article_of_interest(self, article_text):
        """Return a match object if the text has a tag for any configured category, else ``None``."""
        return self.global_matcher.search(article_text)

    def process_article(self, article_text):
        """Append the article, followed by a newline, to the matching output files.

        The text is always written to the combined file, and additionally to
        the file of every category whose matcher finds a tag in it.
        """
        self.global_writer.write(article_text)
        self.global_writer.write("\n")
        for category in self.categories:
            if self.category_matcher[category].search(article_text):
                writer = self.article_writer[category]
                writer.write(article_text)
                writer.write("\n")

    def close_all(self):
        """Close the combined file and every per-category file; safe to call repeatedly."""
        if self.global_writer is not None:
            self.global_writer.close()
        for writer in self.article_writer.values():
            writer.close()
    

In [None]:
# Stream the Wikipedia XML dump and copy every article tagged with one of the
# categories below into the output files managed by ArticleProcessor.
#
# NOTE(review): ``ET`` is imported at the top of the notebook from
# ``xml.etree.cElementTree``, which was removed in Python 3.9; on newer
# interpreters import ``xml.etree.ElementTree`` instead.

DUMP_PATH = "data/enwiki-20170820-pages-articles.xml"
# Namespace-qualified tag of the elements whose text is examined.
TEXT_TAG = '{http://www.mediawiki.org/xml/export-0.10/}text'
# Upper bound of the progress bar; presumably the number of <text> elements
# in this particular dump -- TODO confirm.
PROGRESS_MAX = 17773690
# Number of <text> elements between two refreshes of the progress widgets.
PROGRESS_EVERY = 1000

# Open the dump explicitly so the file handle is released even when the run
# fails or is interrupted (iterparse does not close file objects it is given).
with open(DUMP_PATH, "rb") as dump_file:
    p = ET.iterparse(dump_file, events=("start", "end"))

    start = time.time()
    article_count = 0
    root = None
    f = FloatProgress(min=0, max=PROGRESS_MAX)
    t = IntText(value=0, description="Articles")
    m = IntText(value=0, description="Matching Articles")
    display(t, m, f)

    # Categories are regex fragments (ArticleProcessor inserts them into its
    # patterns unescaped) and are also used verbatim in the output file names.
    processor = ArticleProcessor(["sportspeople",
                                  "artists",
                                  "politicians",
                                  "military personnel",
                                  "scientist",
                                  # Disabled candidate categories:
                                  #sportmanager
                                  #cleric
                                  "monarch",
                                  # Raw string: the value is unchanged
                                  # (backslash + space, which the regex reads
                                  # as a literal space) but no longer relies
                                  # on an invalid string escape sequence.
                                  r"Fictional\ characters",
                                  "nobility",
                                  "criminals",
                                  "judges"
                                  ])
    try:
        for event, elem in p:
            if root is None:
                # The first event is the "start" of the document root; keep
                # a reference so the parsed tree can be pruned below.
                root = elem
            if event == "end" and elem.tag == TEXT_TAG:
                article_count += 1
                if article_count % PROGRESS_EVERY == 0:
                    f.value = article_count
                    t.value = article_count
                if elem.text and processor.is_article_of_interest(elem.text):
                    m.value += 1
                    processor.process_article(elem.text)
                # Discard everything parsed so far so memory use stays
                # bounded while streaming the dump.
                root.clear()
    finally:
        processor.close_all()
        # Show the exact final count; the loop only refreshes the widgets
        # every PROGRESS_EVERY elements.
        f.value = article_count
        t.value = article_count
        print("Articles:", article_count, "Time:", (time.time() - start), "seconds")

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget