## This code downloads Wikipedia dump files, parses them, and extracts the title and text of each article

### This code is a shortened and modified version of the project "Wikipedia Data Science: Working with the World’s Largest Encyclopedia" by Will Koehrsen. The full code for his project, with a very clear description, can be retrieved from: https://github.com/WillKoehrsen/wikipedia-data-science/blob/master/notebooks/Downloading%20and%20Parsing%20Wikipedia%20Articles.ipynb


###### Step 1: download raw data files from Wikimedia

In [3]:
import requests
from timeit import default_timer as timer

# Parsing HTML
from bs4 import BeautifulSoup

# File system management
import os

# Libraries we will use as we go
import pandas as pd
import bz2
import subprocess



In [None]:
# Fetch the enwiki dump index and the listing page of one specific dump.
# Both requests fail loudly (HTTP error or timeout) instead of handing an
# error page to the HTML parser, which would silently yield no files later on.

# Source of the Wiki data
base_url = 'https://dumps.wikimedia.org/enwiki/'
index_response = requests.get(base_url, timeout=30)
index_response.raise_for_status()
index = index_response.text
soup_index = BeautifulSoup(index, 'html.parser')

# Every link target found on the index page (no filtering is applied here)
dumps = [a['href'] for a in soup_index.find_all('a') if
         a.has_attr('href')]

dump_url = base_url + '20190920/' # The date selects the version of the files

# Retrieve the html of the chosen dump's listing page
dump_response = requests.get(dump_url, timeout=30)
dump_response.raise_for_status()
dump_html = dump_response.text

# Convert to a soup
soup_dump = BeautifulSoup(dump_html, 'html.parser')

# Preview the first few <li> elements with the class "file"
soup_dump.find_all('li', {'class': 'file'}, limit = 10)[:4]



In [None]:
# Build the list of article dump files to download from the dump listing.
# `files` holds one (file name, remaining listing tokens) pair per entry
# whose listing text mentions 'pages-articles'.
files = []

for file in soup_dump.find_all('li', {'class': 'file'}):
    text = file.text
    # Skip everything that is not an article dump
    if 'pages-articles' not in text:
        continue
    tokens = text.split()
    files.append((tokens[0], tokens[1:]))

# Keep only the partitioned xml files (no meta data) and cap the selection
# at the first 57 entries so that no irrelevant xml files are downloaded
files_to_download = [name for name, _ in files if '.xml-p' in name][:57]

In [None]:
# Helper used to download the dump files in bulk
import sys
from keras.utils import get_file
keras_home = '/Users/yulia_zhestkova/Wikidata/' # directory in which the dump files are looked up; adjust to your machine
# You can also download the files you need manually

In [None]:
# Make sure every selected dump file is available locally and record, per
# file, its local path (`data_paths`) and a
# (label, size in MB, id span) tuple (`file_info`).
data_paths = []
file_info = []

# Iterate through each file
for file in files_to_download:
    path = keras_home + file

    if os.path.exists(path):
        # Already downloaded: reuse the file on disk
        local_path = path
    else:
        print('Downloading')
        # `dump_url` is only the dump directory, so the file name has to be
        # appended to obtain the file's own URL. The download is directed
        # into `keras_home` so the existence check above finds it next time.
        local_path = get_file(file, dump_url + file,
                              cache_dir=keras_home, cache_subdir='')

    data_paths.append(local_path)

    # File size in MB; measured on the path that was actually written, in
    # case the download helper had to fall back to another directory
    file_size = os.stat(local_path).st_size / 1e6

    # Span between the two numbers at the end of the file name, used as the
    # number of articles.
    # Assumes names end in 'p<first>p<last>.bz2' - TODO confirm for other dumps.
    pieces = file.split('p')
    file_articles = int(pieces[-1].split('.')[-2]) - int(pieces[-2])

    # Same short label (text after the last '-') whether or not the file
    # had to be downloaded first
    file_info.append((file.split('-')[-1], file_size, file_articles))

 

##### Step 2: Extracting relevant information using XML parser

In [None]:
import xml.sax

class WikiXmlHandler(xml.sax.handler.ContentHandler):
    """SAX content handler that collects a (title, text) pair per <page>.

    The character data of <title>, <text> and <timestamp> elements is
    captured while the element is open. When a <page> element closes, the
    title and text captured for it are appended to ``self._pages``.
    """

    def __init__(self):
        xml.sax.handler.ContentHandler.__init__(self)
        self._buffer = None        # chunks of the element being captured
        self._values = {}          # tag name -> text, for the current page
        self._current_tag = None   # name of the element being captured
        self._pages = []           # finished (title, text) tuples

    def characters(self, content):
        """Store character data that belongs to the element being captured."""
        if self._current_tag:
            self._buffer.append(content)

    def startElement(self, name, attrs):
        """Start capturing when one of the tracked elements opens."""
        if name in ('title', 'text', 'timestamp'):
            self._current_tag = name
            self._buffer = []

    def endElement(self, name):
        """Finish the captured element and emit the page when it closes."""
        if name == self._current_tag:
            # The parser may deliver one element's text in several chunks
            # (e.g. around entities and line breaks), so the chunks are
            # concatenated without a separator to reproduce the text exactly.
            self._values[name] = ''.join(self._buffer)
            # Stop capturing so character data outside the tracked elements
            # is not accumulated.
            self._current_tag = None
            self._buffer = None

        if name == 'page':
            self._pages.append((self._values.get('title', ''),
                                self._values.get('text', '')))
            # Start the next page from a clean slate so it cannot inherit
            # values from this one.
            self._values = {}

In [None]:
# Decompress one partition and keep its lines for a look at the XML layout.
# The first downloaded file is used so that this cell also works when the
# notebook is run from top to bottom.
# NOTE: with the early exit below disabled, the whole decompressed file is
# held in memory.
data_path = data_paths[0]

lines = []

# `bzcat` does the decompression; the `with` block closes the input file and
# the pipe and waits for the child process when the loop is done.
with open(data_path, 'rb') as compressed, \
        subprocess.Popen(['bzcat'],
                         stdin = compressed,
                         stdout = subprocess.PIPE) as bzcat:
    for i, line in enumerate(bzcat.stdout):
        lines.append(line)

        #if i > 5e5:
         #   break

#Check the structure of the XML, see if there is anything specific about this version of the files
lines[-112:-32]

In [None]:
import gc
import mwparserfromhell 

# Parse every downloaded partition with the SAX handler and write one CSV
# (columns: title, text) per partition as output<j>.csv.
start = timer()

# NOTE(review): not used below - the CSV files are written relative to the
# current working directory. Confirm whether they were meant to go here.
partition_dir = '/Users/yulia_zhestkova/Wikipedia parsing/'

# NOTE(review): the loop starts at 1, so data_paths[0] is never processed -
# confirm that this is intended. The upper bound follows the number of
# available files instead of a fixed 57.
for j in range(1, len(data_paths)):
    print("Processing partition {}".format(j))
    data_path = data_paths[j]
    handler = WikiXmlHandler()

    # Parsing object
    parser = xml.sax.make_parser()
    parser.setContentHandler(handler)

    # Stream the decompressed XML from `bzcat` into the parser; the `with`
    # block closes the input file and the pipe and waits for the child.
    with open(data_path, 'rb') as compressed, \
            subprocess.Popen(['bzcat'],
                             stdin = compressed,
                             stdout = subprocess.PIPE) as bzcat:
        for line in bzcat.stdout:
            parser.feed(line)

    # Naming the columns up front also works for a partition without pages
    df = pd.DataFrame(handler._pages, columns=['title', 'text'])
    # NOTE(review): the parsed objects are only sliced below, not stripped of
    # wiki markup - confirm that this is intended.
    df['text'] = df['text'].apply(mwparserfromhell.parse)
    # Save only the first n symbols of the text for memory management
    df['text'] = df['text'].str[:15000]

    # Drop abnormally short articles and articles doubled by a redirect
    long_enough = df['text'].str.len() > 300
    is_redirect = df['text'].str[:9] == "#REDIRECT"
    df = df[long_enough & ~is_redirect]

    df.to_csv(r'output'+str(j)+'.csv', index = None)

    # Memory management
    del handler
    del parser
    del df

end = timer()




####### The outputN.csv files now contain the processed dataframes with the title and text of the articles. Note that the text data still needs a lot of cleaning.