In [2]:
import requests

# Parsing HTML
from bs4 import BeautifulSoup

# File system management
import os

In [3]:
# Make a request to the dump of Wikipedia
base_url = 'https://dumps.wikimedia.org/enwiki/'
index = requests.get(base_url).text
soup_index = BeautifulSoup(index, 'html.parser')

In [4]:
# Find the links that are dates of dumps
dumps = [a['href'] for a in soup_index.find_all('a') if 
         a.has_attr('href')]
dumps

['../',
 '20220301/',
 '20220320/',
 '20220401/',
 '20220420/',
 '20220501/',
 '20220520/',
 '20220601/',
 'latest/']

In [5]:
# The url for the most recent date
dump_url = base_url + '20220601/'

# Retrieve the html
dump_html = requests.get(dump_url).text

In [6]:
# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

# Find li elements with the class file
soup_dump.find_all('li', {'class': 'file'}, limit = 10)[:4]

[<li class="file"><a href="/enwiki/20220601/enwiki-20220601-pages-articles-multistream.xml.bz2">enwiki-20220601-pages-articles-multistream.xml.bz2</a> 19.4 GB</li>,
 <li class="file"><a href="/enwiki/20220601/enwiki-20220601-pages-articles-multistream-index.txt.bz2">enwiki-20220601-pages-articles-multistream-index.txt.bz2</a> 231.1 MB</li>,
 <li class="file"><a href="/enwiki/20220601/enwiki-20220601-pages-articles-multistream1.xml-p1p41242.bz2">enwiki-20220601-pages-articles-multistream1.xml-p1p41242.bz2</a> 248.7 MB</li>,
 <li class="file"><a href="/enwiki/20220601/enwiki-20220601-pages-articles-multistream-index1.txt-p1p41242.bz2">enwiki-20220601-pages-articles-multistream-index1.txt-p1p41242.bz2</a> 221 KB</li>]

In [7]:
files = []

# Search through all files
for file in soup_dump.find_all('li', {'class': 'file'}):
    text = file.text
    # Select the relevant files
    if 'pages-articles' in text:
        files.append((text.split()[0], text.split()[1:]))
        
files[:5]

[('enwiki-20220601-pages-articles-multistream.xml.bz2', ['19.4', 'GB']),
 ('enwiki-20220601-pages-articles-multistream-index.txt.bz2', ['231.1', 'MB']),
 ('enwiki-20220601-pages-articles-multistream1.xml-p1p41242.bz2',
  ['248.7', 'MB']),
 ('enwiki-20220601-pages-articles-multistream-index1.txt-p1p41242.bz2',
  ['221', 'KB']),
 ('enwiki-20220601-pages-articles-multistream2.xml-p41243p151573.bz2',
  ['333.4', 'MB'])]

In [8]:
files_to_download = [file[0] for file in files if '.xml-p' in file[0] and 'multistream' in file[0]]
files_to_download

['enwiki-20220601-pages-articles-multistream1.xml-p1p41242.bz2',
 'enwiki-20220601-pages-articles-multistream2.xml-p41243p151573.bz2',
 'enwiki-20220601-pages-articles-multistream3.xml-p151574p311329.bz2',
 'enwiki-20220601-pages-articles-multistream4.xml-p311330p558391.bz2',
 'enwiki-20220601-pages-articles-multistream5.xml-p558392p958045.bz2',
 'enwiki-20220601-pages-articles-multistream6.xml-p958046p1483661.bz2',
 'enwiki-20220601-pages-articles-multistream7.xml-p1483662p2134111.bz2',
 'enwiki-20220601-pages-articles-multistream8.xml-p2134112p2936260.bz2',
 'enwiki-20220601-pages-articles-multistream9.xml-p2936261p4045402.bz2',
 'enwiki-20220601-pages-articles-multistream10.xml-p4045403p5399366.bz2',
 'enwiki-20220601-pages-articles-multistream11.xml-p5399367p6899366.bz2',
 'enwiki-20220601-pages-articles-multistream11.xml-p6899367p7054859.bz2',
 'enwiki-20220601-pages-articles-multistream12.xml-p7054860p8554859.bz2',
 'enwiki-20220601-pages-articles-multistream12.xml-p8554860p91727

In [9]:
data_path = files_to_download[0]

In [12]:
import subprocess

In [29]:
# Get XML in the bz2 file
lines = []

for i, line in enumerate(subprocess.Popen(['bzcat'], 
                         stdin = open(data_path), 
                         stdout = subprocess.PIPE).stdout):
    lines.append(line)
    if i > 5e5:
        break
        
lines[:100]

[b'<mediawiki xmlns="http://www.mediawiki.org/xml/export-0.10/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.mediawiki.org/xml/export-0.10/ http://www.mediawiki.org/xml/export-0.10.xsd" version="0.10" xml:lang="en">\n',
 b'  <siteinfo>\n',
 b'    <sitename>Wikipedia</sitename>\n',
 b'    <dbname>enwiki</dbname>\n',
 b'    <base>https://en.wikipedia.org/wiki/Main_Page</base>\n',
 b'    <generator>MediaWiki 1.39.0-wmf.13</generator>\n',
 b'    <case>first-letter</case>\n',
 b'    <namespaces>\n',
 b'      <namespace key="-2" case="first-letter">Media</namespace>\n',
 b'      <namespace key="-1" case="first-letter">Special</namespace>\n',
 b'      <namespace key="0" case="first-letter" />\n',
 b'      <namespace key="1" case="first-letter">Talk</namespace>\n',
 b'      <namespace key="2" case="first-letter">User</namespace>\n',
 b'      <namespace key="3" case="first-letter">User talk</namespace>\n',
 b'      <namespace key="4" case="first-letter">W

In [32]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """Content handler for Wiki XML data using SAX"""
    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None
        self._values = {}
        self._current_tag = None
        self._pages = []

    def characters(self, content):
        """Characters between opening and closing tags"""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Opening tag of element"""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Closing tag of element"""
        if name == self._current_tag:
            self._values[name] = ' '.join(self._buffer)

        if name == 'page':
            self._pages.append((self._values['title'], self._values['text']))

In [40]:
# Content handler for Wiki XML
handler = WikiXmlHandler()

# Parsing object
parser = xml.sax.make_parser()
parser.setContentHandler(handler)

handler._pages

[]

In [47]:
for l in lines:
    parser.feed(l)
    if len(handler._pages) > 50:
        break

In [48]:
handler._pages

[('AccessibleComputing',
  '#REDIRECT [[Computer accessibility]] \n \n {{rcat shell| \n {{R from move}} \n {{R from CamelCase}} \n {{R unprintworthy}} \n }}'),
 ('Anarchism',
  '{{short description|Political philosophy and movement}} \n {{other uses}} \n {{redirect2|Anarchist|Anarchists|other uses|Anarchist (disambiguation)}} \n {{distinguish|Anarchy}} \n {{pp-semi-indef}} \n {{good article}} \n {{use British English|date=August 2021}} \n {{use dmy dates|date=August 2021}} \n {{anarchism sidebar}} \n {{basic forms of government}} \n \'\'\'Anarchism\'\'\' is a [[political philosophy]] and [[Political movement|movement]] that is sceptical of [[authority]] and rejects all involuntary, coercive forms of [[Social hierarchy|hierarchy]].{{sfn|Suissa|2019b|ps=:  " ...as many anarchists have stressed, it is not government as such that they find objectionable, but the hierarchical forms of government associated with the nation state. " }} Anarchism calls for the abolition of the [[State (polity)

In [49]:
print(len(handler._pages))

51


In [53]:
for page in handler._pages:
    print(page[0])

AccessibleComputing
Anarchism
AfghanistanHistory
AfghanistanGeography
AccessibleComputing
AccessibleComputing
Anarchism
AfghanistanHistory
AfghanistanGeography
AfghanistanPeople
AfghanistanCommunications
AfghanistanTransportations
AfghanistanMilitary
AfghanistanTransnationalIssues
AssistiveTechnology
AmoeboidTaxa
Autism
AlbaniaHistory
AlbaniaPeople
AsWeMayThink
AlbaniaGovernment
AlbaniaEconomy
Albedo
AfroAsiaticLanguages
ArtificalLanguages
AbacuS
AbalonE
AbbadideS
AbbesS
AbbevilleFrance
AbbeY
AbboT
Abbreviations
AtlasShrugged
ArtificialLanguages
AtlasShruggedCharacters
AtlasShruggedCompanies
AyersMusicPublishingCompany
AfricanAmericanPeople
AdolfHitler
AbeceDarians
AbeL
AbensbergGermany
AberdeenSouthDakota
ArthurKoestler
AynRand
AlexanderTheGreat
AnchorageAlaska
ArgumentForms
ArgumentsForTheExistenceOfGod
AnarchY


In [52]:
# Process actual text
import mwparserfromhell 

print(handler._pages[6][0])

# Create the wiki article
wiki = mwparserfromhell.parse(handler._pages[6][1])

Anarchism


In [54]:
print(type(wiki))
wiki[:100]

<class 'mwparserfromhell.wikicode.Wikicode'>


'{{short description|Political philosophy and movement}} \n {{other uses}} \n {{redirect2|Anarchist|Ana'

In [59]:
# Internal links
wikilinks = [x.title for x in wiki.filter_wikilinks()]
print(f'There are {len(wikilinks)} wikilinks.')
wikilinks[:5]

There are 473 wikilinks.


['political philosophy',
 'Political movement',
 'authority',
 'Social hierarchy',
 'State (polity)']

In [62]:
# Get the text of the page
plain_text = wiki.strip_code().strip()