In [22]:
import xml.etree.cElementTree as ET
from IPython.display import clear_output
from ipywidgets import FloatProgress, IntText
from IPython.display import display
from nltk.tokenize.stanford import StanfordTokenizer
from nltk.tokenize import word_tokenize
import nltk
import time
import re
import pandas as pd

In [28]:
nltk.download()

NLTK Downloader
---------------------------------------------------------------------------
    d) Download   l) List    u) Update   c) Config   h) Help   q) Quit
---------------------------------------------------------------------------
Downloader> l

Packages:
  [ ] abc................. Australian Broadcasting Commission 2006
  [ ] alpino.............. Alpino Dutch Treebank
  [ ] averaged_perceptron_tagger Averaged Perceptron Tagger
  [ ] averaged_perceptron_tagger_ru Averaged Perceptron Tagger (Russian)
  [ ] basque_grammars..... Grammars for Basque
  [ ] biocreative_ppi..... BioCreAtIvE (Critical Assessment of Information
                           Extraction Systems in Biology)
  [ ] bllip_wsj_no_aux.... BLLIP Parser: WSJ Model
  [ ] book_grammars....... Grammars from NLTK Book
  [*] brown............... Brown Corpus
  [ ] brown_tei........... Brown Corpus (TEI XML Version)
  [ ] cess_cat............ CESS-CAT Treebank
  [ ] cess_esp............ CESS-ESP Treebank
  [ ] chat80.....

KeyboardInterrupt: 

# Extracting Articles from Specific Categories

This is my first attempt at processing the Wikipedia dump.  It streamingly parses the Wikipedia XML and processes any article with a category tag that contains the specified strings.  Currently, it just writes the body of the articles to both a single file and a category specific file.  It probably should also do the tokenization, but doesn't yet.

In [7]:
class ArticleProcessor:
    
    def __init__(self, categories):
        self.categories = categories        
        self.global_matcher = re.compile("\[\[Category:[^\]]*(" + 
                                         "|".join(categories) + 
                                         ")[^\]]*", re.IGNORECASE)
        self.category_matcher = {}
        self.article_writer = {}
        self.global_writer = open("data/all-articles", "w")
        for category in self.categories:
            self.category_matcher[category] = re.compile("\[\[Category:[^\]]*" + 
                                                         category + 
                                                         "[^\]]*", re.IGNORECASE)
            self.article_writer[category] = open("data/" + category + "-articles", "w")
            
    def is_article_of_interest(self, article_text):
        return self.global_matcher.search(article_text)

    def process_article(self, article_text):
        self.global_writer.write(article_text)
        self.global_writer.write("\n")
        for category in self.categories:
            if self.category_matcher[category].search(article_text):
                self.article_writer[category].write(article_text)
                self.article_writer[category].write("\n")
    
    def close_all(self):
        self.global_writer.close()
        for writer in self.article_writer.values():
            writer.close()
    

In [10]:
p = ET.iterparse("data/enwiki-20170820-pages-articles.xml", 
                 events=("start", "end"))

start = time.time()
article_count = 0
root = None
f = FloatProgress(min=0, max=17773690)
t = IntText(value=0, description="Articles")
m = IntText(value=0, description="Matching Articles")
display(t, m, f)

processor = ArticleProcessor(["sportspeople",
                              "artists",
                              "politicians",
                              "military personnel",
                              "scientist",
                              #sportmanager
                              #cleric
                              "monarch",
                              "Fictional\ characters",
                              "nobility",
                              "criminals",
                              "judges"
                              
                             ])
try:
    
    for event, elem in p:
        if root == None:
            root = elem
        if event == "end" and elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}text':
            article_count += 1
            if article_count % 1000 == 0:
                f.value = article_count
                t.value = article_count
            if elem.text and processor.is_article_of_interest(elem.text):
                m.value += 1 
                processor.process_article(elem.text)
            root.clear()
finally:
    processor.close_all()
    print("Articles:", article_count, "Time:", (time.time() - start), "seconds")

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

Articles: 17773690 Time: 5338.332955360413 seconds


# Extracting Articles with Gender Labels

In [7]:
gender_label_table = pd.read_csv("data/wiki.genders.txt", sep='\t')
gender_label_table.head(10)

Unnamed: 0,wiki id,gender,name
0,307,MALE,Abraham Lincoln
1,339,FEMALE,Ayn Rand
2,340,MALE,Alain Connes
3,344,MALE,Allan Dwan
4,595,MALE,Andre Agassi
5,628,MALE,Aldous Huxley
6,676,MALE,Andrei Tarkovsky
7,700,MALE,Arthur Schopenhauer
8,711,MALE,Albert Sidney Johnston
9,736,MALE,Albert Einstein


In [10]:
wiki_ids_with_gender = set([str(x) for x in gender_label_table["wiki id"]])
len(gender_label_table)

862171

In [None]:
p = ET.iterparse("data/enwiki-20170820-pages-articles.xml", 
                 events=("start", "end"))

start = time.time()
article_count = 0
root = None
f = FloatProgress(min=0, max=17773690)
t = IntText(value=0, description="Articles")
m = IntText(value=0, description="Matching Articles")
display(t, m, f)

article = None
id = None

article_writer = open("data/gendered-labeled-articles", "w")

try:
    is_current_article_labeled = False
    for event, elem in p:
        if root == None:
            root = elem
        if event == "start" and elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}page':
            id = None
        if id == None and event == "end" and elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}id':
            is_current_article_labeled = (elem.text in wiki_ids_with_gender)
            id = elem.text
        if event == "end" and elem.tag == '{http://www.mediawiki.org/xml/export-0.10/}text':
            article_count += 1
            if article_count % 1000 == 0:
                f.value = article_count
                t.value = article_count
            if is_current_article_labeled and elem.text:
                m.value += 1 
                article = elem.text
                article_writer.write(id)
                article_writer.write(' ')
                article_writer.write(article.replace('\n', ' '))
                article_writer.write('\n')
            root.clear()
finally:
    article_writer.close()
    print("Articles:", article_count, "Time:", (time.time() - start), "seconds")

A Jupyter Widget

A Jupyter Widget

A Jupyter Widget

In [27]:
word_tokenize(article)

['{',
 '{',
 'Infobox',
 'Martial',
 'art',
 '|',
 'logosize',
 '=',
 '40px',
 '|',
 'image',
 '=',
 'Shihonage.jpg',
 '|',
 'imagecaption',
 '=',
 'A',
 'version',
 'of',
 'the',
 '``',
 'four-direction',
 'throw',
 "''",
 '(',
 "''shihōnage",
 "''",
 ')',
 'with',
 'standing',
 'attacker',
 'and',
 'seated',
 'defender',
 '.',
 '|',
 'imagesize',
 '=',
 '300px',
 '|',
 'alt',
 '=',
 'A',
 'man',
 'kneeling',
 'throws',
 'another',
 'man',
 'from',
 'a',
 'standing',
 'position',
 ';',
 'both',
 'are',
 'wearing',
 'robes',
 '|',
 'name',
 '=',
 'Aikido',
 '<',
 'br',
 '>',
 '(',
 '{',
 '{',
 'lang|ja|合気道',
 '}',
 '}',
 ')',
 '|',
 'aka',
 '=',
 '|',
 'focus',
 '=',
 '[',
 '[',
 'Grappling',
 ']',
 ']',
 'and',
 '[',
 '[',
 'Soft',
 'style|softness',
 ']',
 ']',
 '<',
 '!',
 '--',
 'see',
 'the',
 'many',
 'discussions',
 'at',
 'Talk',
 ':',
 'Aikido',
 '--',
 '>',
 '|',
 'hardness',
 '=',
 '|',
 'country',
 '=',
 '[',
 '[',
 'Japan',
 ']',
 ']',
 '|',
 'creator',
 '=',
 '[',
 '[',
 