# Downloading and preprocessing Wikipedia data

Download Wikipedia XML dump, decompress and filter articles to only include those with a geotag in Allegheny County.
This notebook was heavily inspired by [this notebook](https://github.com/WillKoehrsen/wikipedia-data-science/blob/master/notebooks/Downloading%20and%20Parsing%20Wikipedia%20Articles.ipynb).

Link to dumps: https://dumps.wikimedia.org/enwiki/

### Import packages

In [1]:
import os
import sys
import bz2
import subprocess
import io
import re
import gc
import json
import hashlib
from multiprocessing import Pool, set_start_method
from itertools import chain
from functools import partial
from timeit import default_timer as timer
import warnings
warnings.filterwarnings(action="ignore")

import requests                    # make http requests
import xml.sax                     # parse xml
import mwparserfromhell            # parse wikimedia
import pandas as pd                # data processing
from bs4 import BeautifulSoup      # parsing HTML
from tqdm.notebook import tqdm     # progress bars
from keras.utils import get_file   # downloading files

import multiprocessor_wiki         # mulitprocessing
from data_processor import *       # parsing coordinates

Define constants.

- ``PATH``: Path to the base data folder
- ``COORD_RANGE_LAT``: latitude coordinate range of Allegheny County
- ``COORD_RANGE_LONG``: longitude coordinate range of Allegheny County
- ``CPU_CORES``: how many cpu cores to use, default = all

In [2]:
PATH = "C:\\Users\\Tim\\.keras\\datasets\\wikipedia_real_estate\\"
COORD_RANGE_LAT = (40.000000, 40.870000)
COORD_RANGE_LONG = (-80.550000, -79.500000)
# CPU_CORES = os.cpu_count()
CPU_CORES = 8

Show list of dumps.

In [3]:
base_url = 'https://dumps.wikimedia.org/enwiki/'
index = requests.get(base_url).text
soup_index = BeautifulSoup(index, 'html.parser')

# Find the links that are dates of dumps
dumps = [a['href'] for a in soup_index.find_all('a') if 
         a.has_attr('href')]
dumps

['../',
 '20201120/',
 '20201201/',
 '20201220/',
 '20210101/',
 '20210120/',
 '20210201/',
 '20210220/',
 'latest/']

In [4]:
dump_url = base_url + '20201201/'

# Retrieve the html
dump_html = requests.get(dump_url).text
dump_html[:10]

'<!DOCTYPE '

In [5]:
# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

# Find li elements with the class file
soup_dump.find_all('li', {'class': 'file'}, limit = 10)[:4]

[<li class="file"><a href="/enwiki/20201201/enwiki-20201201-pages-articles-multistream.xml.bz2">enwiki-20201201-pages-articles-multistream.xml.bz2</a> 17.7 GB</li>,
 <li class="file"><a href="/enwiki/20201201/enwiki-20201201-pages-articles-multistream-index.txt.bz2">enwiki-20201201-pages-articles-multistream-index.txt.bz2</a> 217.3 MB</li>,
 <li class="file"><a href="/enwiki/20201201/enwiki-20201201-pages-articles-multistream1.xml-p1p41242.bz2">enwiki-20201201-pages-articles-multistream1.xml-p1p41242.bz2</a> 233.1 MB</li>,
 <li class="file"><a href="/enwiki/20201201/enwiki-20201201-pages-articles-multistream-index1.txt-p1p41242.bz2">enwiki-20201201-pages-articles-multistream-index1.txt-p1p41242.bz2</a> 221 KB</li>]

Iterate through files to find all downloadable files and show first 5.

In [6]:
files = []

# Search through all files
for file in soup_dump.find_all('li', {'class': 'file'}):
    text = file.text
    # Select the relevant files
    if 'pages-articles' in text:
        files.append((text.split()[0], text.split()[1:]))
        
files[:5]

[('enwiki-20201201-pages-articles-multistream.xml.bz2', ['17.7', 'GB']),
 ('enwiki-20201201-pages-articles-multistream-index.txt.bz2', ['217.3', 'MB']),
 ('enwiki-20201201-pages-articles-multistream1.xml-p1p41242.bz2',
  ['233.1', 'MB']),
 ('enwiki-20201201-pages-articles-multistream-index1.txt-p1p41242.bz2',
  ['221', 'KB']),
 ('enwiki-20201201-pages-articles-multistream2.xml-p41243p151573.bz2',
  ['315.2', 'MB'])]

Select all compressed xml files.

In [7]:
files_to_download = [file[0] for file in files if '.xml-p' in file[0]]
files_to_download[-5:]

['enwiki-20201201-pages-articles25.xml-p60025656p61525655.bz2',
 'enwiki-20201201-pages-articles25.xml-p61525656p62585850.bz2',
 'enwiki-20201201-pages-articles26.xml-p62585851p63975909.bz2',
 'enwiki-20201201-pages-articles27.xml-p63975910p65475909.bz2',
 'enwiki-20201201-pages-articles27.xml-p65475910p65998774.bz2']

Disregard multistream files.

In [8]:
files_to_download = [x for x in files_to_download if "multistream" not in x]
files_to_download[-5:]

['enwiki-20201201-pages-articles25.xml-p60025656p61525655.bz2',
 'enwiki-20201201-pages-articles25.xml-p61525656p62585850.bz2',
 'enwiki-20201201-pages-articles26.xml-p62585851p63975909.bz2',
 'enwiki-20201201-pages-articles27.xml-p63975910p65475909.bz2',
 'enwiki-20201201-pages-articles27.xml-p65475910p65998774.bz2']

Download all relevant files. If file is already downloaded, display size of file.

In [9]:
data_paths = []
file_info = []

# create project dirs
if not os.path.exists(PATH):
        os.mkdir(PATH)
wiki_path = PATH + "wikipedia\\"
if not os.path.exists(wiki_path):
        os.mkdir(wiki_path)
compressed_path = wiki_path + "compressed\\"
if not os.path.exists(compressed_path):
        os.mkdir(compressed_path)

# Iterate through each file
for file in files_to_download:
    path = compressed_path + file
    
    
    # Check to see if the path exists (if the file is already downloaded)
    if not os.path.exists(path):
        # If not, download the file
        data_paths.append(get_file(file, dump_url + file))
        # Find the file size in MB
        file_size = os.stat(path).st_size / 1e6
        
        # Find the number of articles
        file_articles = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
        file_info.append((file, file_size, file_articles))
        
    # If the file is already downloaded find some information
    else:
        data_paths.append(path)
        # Find the file size in MB
        file_size = os.stat(path).st_size / 1e6
        
        print(f"Found File {file}, size: {round(file_size, 2)} MB")
        
        # Find the number of articles
        file_number = int(file.split('p')[-1].split('.')[-2]) - int(file.split('p')[-2])
        file_info.append((file.split('-')[-1], file_size, file_number))

Found File enwiki-20201201-pages-articles1.xml-p1p41242.bz2, size: 243.53 MB
Found File enwiki-20201201-pages-articles2.xml-p41243p151573.bz2, size: 326.97 MB
Found File enwiki-20201201-pages-articles3.xml-p151574p311329.bz2, size: 354.32 MB
Found File enwiki-20201201-pages-articles4.xml-p311330p558391.bz2, size: 392.69 MB
Found File enwiki-20201201-pages-articles5.xml-p558392p958045.bz2, size: 423.18 MB
Found File enwiki-20201201-pages-articles6.xml-p958046p1483661.bz2, size: 453.09 MB
Found File enwiki-20201201-pages-articles7.xml-p1483662p2134111.bz2, size: 465.23 MB
Found File enwiki-20201201-pages-articles8.xml-p2134112p2936260.bz2, size: 474.32 MB
Found File enwiki-20201201-pages-articles9.xml-p2936261p4045402.bz2, size: 515.06 MB
Found File enwiki-20201201-pages-articles10.xml-p4045403p5399366.bz2, size: 505.11 MB
Found File enwiki-20201201-pages-articles11.xml-p5399367p6899366.bz2, size: 489.51 MB
Found File enwiki-20201201-pages-articles11.xml-p6899367p7054859.bz2, size: 47.05

Download md5 checksums.

In [10]:
checksums = get_file(compressed_path + "md5_checksums", dump_url + "enwiki-20201201-md5sums.txt")

In [11]:
def md5(fname):
    """Returns md5 hash of file with name fname"""
    hash_md5 = hashlib.md5()
    with open(compressed_path + fname, "rb") as f:
        for chunk in iter(lambda: f.read(4096), b""):
            hash_md5.update(chunk)
    return hash_md5.hexdigest()

Check if data was downloaded correctly.

In [12]:
with open(checksums) as file:
   data = file.read()

md5_lst = data.split("\n")
md5_lst = [x.split("  ") for x in md5_lst][:-1]
md5_lst = [file for file in md5_lst if ".xml-p" in file[1] and not "multistream" in file[1] and not "meta" in file[1]]

downloaded_md5 = []
if [x[1] for x in md5_lst] == files_to_download:
    
    for file in tqdm(files_to_download):
        downloaded_md5.append(md5(file))
        
    if [x[0] for x in md5_lst] == downloaded_md5:
        print("Downloads verified by MD5 checksums")
    else:
        print("Download was faulty, the following files could not be verified:")
        
        for file in [x for x in downloaded_md5 if x not in md5_lst]:
            print(file)
else:
    print("Files to downloaded are not equal to md5 checksum files")

HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=59.0), HTML(value='')))


Downloads verified by MD5 checksums


Display total size and article count of downloaded dump.

In [13]:
file_sizes = [file[1] for file in file_info]
article_count = [file[2] for file in file_info]

print(f"The total size of files on disk is {round(sum(file_sizes) / 1e3, 2)} GB")
print(f"The total number of articles is {sum(article_count)}")

The total size of files on disk is 17.99 GB
The total number of articles is 65998715


Let's take a peek at the data.

In [14]:
data_path = data_paths[15]
data_path

lines = []
for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    lines.append(line)
    if i > 5e5:
        break

lines[27190:27210]

[b'        <id>2278355</id>\n',
 b'      </contributor>\n',
 b'      <minor />\n',
 b'      <comment>/* top */Remove empty invalid ref parameter</comment>\n',
 b'      <model>wikitext</model>\n',
 b'      <format>text/x-wiki</format>\n',
 b'      <text bytes="3419" xml:space="preserve">{{short description|English cricketer}}\n',
 b'{{Use dmy dates|date=March 2016}}\n',
 b'{{Use British English|date=March 2016}}\n',
 b"'''Philip Hodgson''' (21 September 1935 \xe2\x80\x93 30 March 2015) was an English [[first-class cricket]]er.&lt;ref name=&quot;YB&quot;&gt;{{cite book |title=The Yorkshire County Cricket Club: 2011 Yearbook |last=Warner |first=David |year=2011 |edition=113th |publisher=Great Northern Books |location=Ilkley, Yorkshire |isbn=978-1-905080-85-4 |page=371|url= }}&lt;/ref&gt;\n",
 b'\n',
 b'==Biography==\n',
 b"Hodgson was born in  [[Todmorden]], [[Yorkshire]], and educated at [[Woodhouse Grammar School]] in London.&lt;ref&gt;{{cite web |title=Philip Hodgson |url=https://www.e

Define content handler to parse XML.

In [15]:
class SimpleWikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))

Parse compressed files to find first 500 articles and display title for 10 of those.

In [16]:
# Object for handling xml
handler = SimpleWikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    
    parser.feed(line)
    
    # Stop when 500 articles have been found
    if len(handler._pages) > 500:
        break
        
print([x[0] for x in handler._pages][190:200])

['Live at Thee Mardi Gras', 'Northfield Chateau', 'Video games in China', 'Branch Closing (The Office episode)', 'Brooklyn Tip Tops', 'Category:Low-importance Shopping center articles', 'Chalet Schell', 'Wikipedia:Wikiquette assistance/Archive/2005', 'Category:Unknown-importance Shopping center articles', 'Wikipedia:Wikiquette alerts/archive']


Select one to take a closer look.

In [17]:
print(handler._pages[191][0])
page_text = handler._pages[191][1]
wiki = mwparserfromhell.parse(page_text)
wiki[:1000]

Northfield Chateau


"[[Image:Northfield Chateau (Northfield, MA) - exterior.jpg|thumb|right|250px|The Northfield Chateau]] \n [[Image:Northfield Chateau (Northfield, MA) - interior.jpg|thumb|right|250px|Interior]] \n \n The '''Northfield Chateau''', also variously known as '''Chalet Schell''' and '''Birnam House''', was a large mansion on Birnham Road in [[Northfield, Massachusetts]]. It no longer exists. \n \n The chateau was designed by noted architect [[Bruce Price]] (of the [[Château Frontenac]]) for Francis Robert Schell, a New York capitalist attracted by his interest in [[Dwight Lyman Moody]]'s work at the nearby [[Northfield Seminary]] and [[Northfield Mount Hermon School|Mount Hermon School]]. It was completed in 1903 on grounds of {{convert|125|acre}}. \n \n The building was loosely patterned upon a French [[chateau]] but fanciful in style, with 99 rooms in a compact, three-story structure ornamented with prominent turrets. Contrary to popular rumors that Mrs. Schell despised the Chateau and ref

In [18]:
links = [x.title for x in wiki.filter_wikilinks()]
print(f"There are {len(links)} wikilinks in this article:")
links[:5]

There are 18 wikilinks in this article:


['Image:Northfield Chateau (Northfield, MA) - exterior.jpg',
 'Image:Northfield Chateau (Northfield, MA) - interior.jpg',
 'Northfield, Massachusetts',
 'Bruce Price',
 'Château Frontenac']

Comments were not downloaded, so the following will always be empty.

In [19]:
print(wiki.filter_arguments())
print(wiki.filter_comments())

[]
[]


In [20]:
external_links = [x.url for x in wiki.filter_external_links()]
print(f'There are {len(external_links)} external links:')
external_links[:5]

There are 4 external links:


['http://www.nmhschool.org/alumni/history/chateauhistory.php',
 'http://www.nmhschool.org/alumni/history/chateau.php',
 'http://hdl.loc.gov/loc.pnp/hhh.ma0192',
 'http://www.eric-goldscheider.com/id121.html']

In [21]:
templates = wiki.filter_templates()
print(f'There are {len(templates)} templates:')
templates[:5]

There are 2 templates:


['{{convert|125|acre}}', '{{Coord|42|42|10.31|N|72|26|47.40|W|display=title}}']

Look for coordinates.

In [22]:
infobox = wiki.filter_templates(matches="coord")[0]
print(infobox)
print(infobox.name.strip_code().strip().lower())

{{Coord|42|42|10.31|N|72|26|47.40|W|display=title}}
coord


Test the extract coordinates method.

In [23]:
extract_coordinates(str(infobox))

(42.70286388888889, -72.4465)

Display main text of article.

In [24]:
wiki.strip_code().strip()

'thumb|right|250px|The Northfield Chateau \n thumb|right|250px|Interior \n \n The Northfield Chateau, also variously known as Chalet Schell and Birnam House, was a large mansion on Birnham Road in Northfield, Massachusetts. It no longer exists. \n \n The chateau was designed by noted architect Bruce Price (of the Château Frontenac) for Francis Robert Schell, a New York capitalist attracted by his interest in Dwight Lyman Moody\'s work at the nearby Northfield Seminary and Mount Hermon School. It was completed in 1903 on grounds of . \n \n The building was loosely patterned upon a French chateau but fanciful in style, with 99 rooms in a compact, three-story structure ornamented with prominent turrets. Contrary to popular rumors that Mrs. Schell despised the Chateau and refused to live in it, the Schells summered at their beautiful home for 25 years. It was only after the death of her beloved husband in 1928 that Mrs. Schell refused to set foot in the house again, insisting when she stay

Define a method to extract all selected templates from an article and return them.

In [25]:
def process_article(title, text, timestamp, coord_range_lat, coord_range_long, template="coord"):
    """Process wikipedia article looking for template"""
    
    # Create a parsing object
    wikicode = mwparserfromhell.parse(text)
    
    # Search through templates for the template
    coord_matches = wikicode.filter_templates(matches=template)
    
    # Filter out errant matches
    coord_matches = [x for x in coord_matches if x.name.strip_code().strip().lower() == template.lower()]
    
    # check if match contains coordinates
    if len(coord_matches) >= 1:
        
        # extract coordinates
        coords = extract_coordinates(str(coord_matches[0]))
        
        # coords have wrong format
        if not coords:
            return None
        
        # check if coordinates are in Western Pennsylvania region
        if coord_range_lat[0] < coords[0] < coord_range_lat[1] and coord_range_long[0] < coords[1] < coord_range_long[1]:
            
            # Extract all templates
            all_templates = wikicode.filter_templates()
            
            infobox = [x for x in all_templates if "infobox" in x.name.strip_code().strip().lower()]
            
            if len(infobox) >= 1:
                # Extract information from infobox if existing
                properties = {param.name.strip_code().strip(): param.value.strip_code().strip() 
                              for param in infobox[0].params
                              if param.value.strip_code().strip()}
            else:
                properties = None

            # Extract internal wikilinks
            wikilinks = [x.title.strip_code().strip() for x in wikicode.filter_wikilinks()]
    
            # Extract external links
            exlinks = [x.url.strip_code().strip() for x in wikicode.filter_external_links()]
        
            # Find approximate length of article
            text_length = len(wikicode.strip_code().strip())

            return [title, coords, properties, wikilinks, exlinks, text_length]
        else:
            # object not in Western Pennsylvania region, disregard
            return None

Define a more complex content handler which can filter for articles from Allegheny County.

In [26]:
class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Parse through XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._articles = []
        self._article_count = 0
        self._non_matches = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._article_count += 1
            # Search through the page to see if the geocoordinate is in Allegheny County
            article = process_article(**self._values, coord_range_lat=COORD_RANGE_LAT, coord_range_long=COORD_RANGE_LONG)
            # Append to the list of articles
            if article:
                self._articles.append(article)

Search for objects in Western Pennsylvania region in the 16th Wikipedia file and stop if 3 are found.

In [27]:
# Object for handling xml
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

print(f"Searching for articles in {data_path}...")
for i, line in enumerate(bz2.BZ2File(data_path, 'r')):
    
    parser.feed(line)
    
    # Stop when 3 objects have been found
    if len(handler._articles) > 2:
        break
        
print(f'Searched through {handler._article_count} articles to find {len(handler._articles)} objects in Western Pennsylvania region.')

Searching for articles in C:\Users\Tim\.keras\datasets\wikipedia_real_estate\wikipedia\compressed\enwiki-20201201-pages-articles13.xml-p10672789p11659682.bz2...
Searched through 40915 articles to find 3 objects in Western Pennsylvania region.


Let's see what articles have been identified to be in Allegheny County.

In [28]:
print(*[article[0] for article in handler._articles], sep=", ")

Washington County Courthouse (Pennsylvania), Wild Things Park, Thackeray Hall


## Process all articles with multiprocessing

Check if all files are correctly in the compressed path and display an exemplary data path.

In [29]:
partitions = [compressed_path + file for file in os.listdir(compressed_path) if 'xml-p' in file]
len(partitions), partitions[-1]

(59,
 'C:\\Users\\Tim\\.keras\\datasets\\wikipedia_real_estate\\wikipedia\\compressed\\enwiki-20201201-pages-articles9.xml-p2936261p4045402.bz2')

Run the script to process all compressed files and look for articles in Allegheny County.

In [30]:
multiprocessor_wiki.process(compressed_path, CPU_CORES)

Articles already processed.


## Joining the data back together

Read all json files containing information about locations from each partition.

In [31]:
def read_data(file_path):
    """Read in json data from `file_path`"""
    
    data = []
    # Open the file and load in json
    with open(file_path, 'r') as fin:
        for l in fin.readlines():
            data.append(json.loads(l))
            
    return data

In [32]:
uncompressed_path = os.path.dirname(os.path.dirname(compressed_path)) + "\\uncompressed\\"

saved_files = [uncompressed_path + x for x in os.listdir(uncompressed_path)]  # find all data to read

articles = []
for file in saved_files:
    articles.extend(read_data(file))

Save all articles in one file.

In [33]:
f_path = os.path.dirname(os.path.dirname(uncompressed_path)) + "\\wikipedia_selected.ndjson"  # create path to new file

if not os.path.exists(f_path):
    with open(f_path, 'wt') as fout:
        json.dump(articles, fout)
    print('Articles saved.')
else:
    print('File already saved.')

File already saved.


Assert whether data was successfully exported.

In [34]:
with open(f_path) as fin:
    data_loaded = json.load(fin)

assert data_loaded == articles