In [18]:
import xml.etree.cElementTree as ET
from IPython.display import clear_output
from ipywidgets import FloatProgress, IntText
from IPython.display import display
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize import word_tokenize
import nltk
import time
import re
import pandas as pd
import collections

In [None]:
# One-time setup: uncomment to launch the NLTK downloader if the NLTK data is not installed yet.
#nltk.download()

# Extracting Articles from Specific Categories

This is my first attempt at processing the Wikipedia dump.  It stream-parses the Wikipedia XML and processes any article with a category tag that contains one of the specified strings.  Currently, it just writes the body of each matching article to both a single combined file and a category-specific file.  It probably should also do the tokenization, but doesn't yet.

In [7]:
class ArticleProcessor:
    """Write article text to per-category files based on its ``[[Category:...]]`` tags.

    Every entry of ``categories`` is spliced verbatim into a case-insensitive
    regular expression, so entries are regex fragments rather than literal
    strings (special characters must already be escaped by the caller).

    Construction opens ``<output_dir>/all-articles`` plus one
    ``<output_dir>/<category>-articles`` file per category, all in write mode
    (existing files are truncated).  Call :meth:`close_all` when finished, or
    use the instance as a context manager.
    """

    def __init__(self, categories, output_dir="data"):
        """Compile the category patterns and open every output file.

        Args:
            categories: iterable of regex fragments searched for inside a
                category tag.
            output_dir: existing directory the output files are created in.

        Raises:
            re.error: if a category is not a valid regex fragment.
            OSError: if an output file cannot be opened.
            In both cases any file opened so far is closed before re-raising.
        """
        # Materialise once so a generator argument is not exhausted by the join below.
        self.categories = list(categories)
        self.global_matcher = re.compile(r"\[\[Category:[^\]]*(" +
                                         "|".join(self.categories) +
                                         r")[^\]]*", re.IGNORECASE)
        self.category_matcher = {}
        self.article_writer = {}
        self.global_writer = None
        try:
            # Explicit UTF-8: article text is not ASCII and the platform default may not cope.
            self.global_writer = open("{}/all-articles".format(output_dir),
                                      "w", encoding="utf-8")
            for category in self.categories:
                self.category_matcher[category] = re.compile(r"\[\[Category:[^\]]*" +
                                                             category +
                                                             r"[^\]]*", re.IGNORECASE)
                self.article_writer[category] = open(
                    "{}/{}-articles".format(output_dir, category),
                    "w", encoding="utf-8")
        except Exception:
            # Do not leak the handles that were opened before the failure.
            self.close_all()
            raise

    def __enter__(self):
        """Return the processor itself so it can be used in a ``with`` statement."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close every output file; exceptions from the ``with`` body propagate."""
        self.close_all()

    def is_article_of_interest(self, article_text):
        """Return a match object if any configured category tag occurs in the text, else None."""
        return self.global_matcher.search(article_text)

    def process_article(self, article_text):
        """Append the article (plus a newline) to the combined file and to each matching category file."""
        record = article_text + "\n"
        self.global_writer.write(record)
        for category in self.categories:
            if self.category_matcher[category].search(article_text):
                self.article_writer[category].write(record)

    def close_all(self):
        """Close the combined file and every per-category file; safe to call more than once."""
        if self.global_writer is not None:
            self.global_writer.close()
        for writer in self.article_writer.values():
            writer.close()
    

In [10]:
# Stream the dump element by element instead of building the whole XML tree.
p = ET.iterparse("data/enwiki-20170820-pages-articles.xml",
                 events=("start", "end"))

# Fully-qualified tag of the element that holds an article body.
TEXT_TAG = '{http://www.mediawiki.org/xml/export-0.10/}text'

start = time.time()
article_count = 0
root = None
# 17773690 is the <text> element count a full pass over this dump reports.
f = FloatProgress(min=0, max=17773690)
t = IntText(value=0, description="Articles")
m = IntText(value=0, description="Matching Articles")
display(t, m, f)

# Each entry is inserted into a regex by ArticleProcessor, hence the escaped
# space; the raw string keeps the backslash without an invalid-escape warning.
processor = ArticleProcessor(["sportspeople",
                              "artists",
                              "politicians",
                              "military personnel",
                              "scientist",
                              #sportmanager
                              #cleric
                              "monarch",
                              r"Fictional\ characters",
                              "nobility",
                              "criminals",
                              "judges"
                             ])
try:
    for event, elem in p:
        if root is None:
            # The first element yielded is the document root; it is kept so
            # that already-processed children can be discarded below.
            root = elem
        if event == "end" and elem.tag == TEXT_TAG:
            article_count += 1
            if article_count % 1000 == 0:
                # Refresh the widgets only occasionally.
                f.value = article_count
                t.value = article_count
            if elem.text and processor.is_article_of_interest(elem.text):
                m.value += 1
                processor.process_article(elem.text)
            # Drop everything parsed so far to keep memory usage flat.
            root.clear()
finally:
    processor.close_all()
    # Show the exact final count rather than the last multiple of 1000.
    f.value = article_count
    t.value = article_count
    print("Articles:", article_count, "Time:", (time.time() - start), "seconds")

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

Articles: 17773690 Time: 5338.332955360413 seconds


# Extracting Articles with Gender Labels

In [7]:
# Load the tab-delimited gender label file and preview its first ten rows.
gender_label_table = pd.read_csv("data/wiki.genders.txt", delimiter='\t')
gender_label_table.head(10)

Unnamed: 0,wiki id,gender,name
0,307,MALE,Abraham Lincoln
1,339,FEMALE,Ayn Rand
2,340,MALE,Alain Connes
3,344,MALE,Allan Dwan
4,595,MALE,Andre Agassi
5,628,MALE,Aldous Huxley
6,676,MALE,Andrei Tarkovsky
7,700,MALE,Arthur Schopenhauer
8,711,MALE,Albert Sidney Johnston
9,736,MALE,Albert Einstein


In [10]:
# Keep the labelled page ids as strings so they can be compared with XML element text.
wiki_ids_with_gender = {str(wiki_id) for wiki_id in gender_label_table["wiki id"]}
len(gender_label_table)

862171

In [61]:
# Stream the dump element by element instead of building the whole XML tree.
p = ET.iterparse("data/enwiki-20170820-pages-articles.xml",
                 events=("start", "end"))

# Fully-qualified tags used by this export schema version.
MEDIAWIKI_NS = '{http://www.mediawiki.org/xml/export-0.10/}'
PAGE_TAG = MEDIAWIKI_NS + 'page'
ID_TAG = MEDIAWIKI_NS + 'id'
TEXT_TAG = MEDIAWIKI_NS + 'text'

start = time.time()
article_count = 0
root = None
# 17773690 is the <text> element count a full pass over this dump reports.
f = FloatProgress(min=0, max=17773690)
t = IntText(value=0, description="Articles")
m = IntText(value=0, description="Matching Articles")
display(t, m, f)

article = None
# Named page_id (not id) so the builtin id() is not shadowed in the notebook namespace.
page_id = None
is_current_article_labeled = False

try:
    # Explicit UTF-8: article text is not ASCII and the platform default may not cope.
    with open("data/gendered-labeled-articles", "w", encoding="utf-8") as article_writer:
        for event, elem in p:
            if root is None:
                # The first element yielded is the document root; it is kept so
                # that already-processed children can be discarded below.
                root = elem
            if event == "start" and elem.tag == PAGE_TAG:
                # New page: forget the previous id so the next <id> seen is recorded.
                page_id = None
            if page_id is None and event == "end" and elem.tag == ID_TAG:
                # Only the first <id> completed after a page starts is used;
                # later <id> elements in the same page are ignored.
                is_current_article_labeled = (elem.text in wiki_ids_with_gender)
                page_id = elem.text
            if event == "end" and elem.tag == TEXT_TAG:
                article_count += 1
                if article_count % 1000 == 0:
                    # Refresh the widgets only occasionally.
                    f.value = article_count
                    t.value = article_count
                if is_current_article_labeled and elem.text:
                    m.value += 1
                    article = elem.text
                    # One article per line: "<id> <text with newlines flattened to spaces>".
                    article_writer.write(page_id + ' ' + article.replace('\n', ' ') + '\n')
                # Drop everything parsed so far to keep memory usage flat.
                root.clear()
finally:
    # Show the exact final count rather than the last multiple of 1000.
    f.value = article_count
    t.value = article_count
    print("Articles:", article_count, "Time:", (time.time() - start), "seconds")

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

Articles: 17773690 Time: 4292.169456958771 seconds


Once this completes, we have a file where each line starts with an article ID and is followed by the full wiki markup text of the article.  For speed, I'm cleaning it up with this command-line chain of `sed` and `tr`:

```
# Stage 1: replace every character outside a-z, A-Z, 0-9 with a space.
# Stage 2: collapse runs of two or more whitespace characters into one space.
# Stage 3: lowercase everything and write gendered-labeled-articles.stripped.
cat gendered-labeled-articles | sed 's/[^a-zA-Z0-9]/ /g' | \
sed 's/\s\{2,\}/ /g' | \
tr '[:upper:]' '[:lower:]' > \
gendered-labeled-articles.stripped
```

That takes about 17.5 minutes to run.  This reduces the file from 5.8GB to 5.1GB.

# Creating a Vocabulary

In [28]:
# 850399 is the line count a full pass over the stripped file reports.
fp = FloatProgress(min=0, max=850399)
uw = IntText(value=0, description="Unique")
display(fp, uw)
article_count = 0
word_count = 0

article_ids = set()
# Maps each distinct token to an integer id, assigned from 1 in first-seen order.
vocab_mapping = {}

start = time.time()
try:
    with open("data/gendered-labeled-articles.stripped", 'r', encoding="utf-8") as stripped_file:
        for line in stripped_file:
            article_count += 1
            if article_count % 1000 == 0:
                # Refresh the widgets only occasionally.
                fp.value = article_count
                uw.value = word_count
            words = line.split()
            if not words:
                # Blank line: there is no article id to read, so skip it
                # instead of failing on words[0].
                continue
            # The first token on each line is the article id, not vocabulary.
            article_ids.add(words[0])
            for word in words[1:]:
                if word not in vocab_mapping:
                    word_count += 1
                    vocab_mapping[word] = word_count
finally:
    # Show the exact final counts rather than the last multiple of 1000.
    fp.value = article_count
    uw.value = word_count
    print("Articles:", article_count, "Time:", (time.time() - start), "seconds")

A Jupyter Widget

A Jupyter Widget

Articles: 850399 Time: 404.5375759601593 seconds


In [31]:
# Vocabulary size: the number of distinct tokens that were assigned an id.
len(vocab_mapping)

7869208

In [42]:
# Spot-check: look up the integer id assigned to a single token.
vocab_mapping["death"]

102