In [1]:
import xml.etree.ElementTree as ET
import codecs
import re
import csv 


In [6]:
def clean_article_text(article_txt):
    # remove text written between double curly braces
    article_txt = re.sub(r"{{.*}}","",article_txt)

    # remove file attachments
    article_txt = re.sub(r"\[\[File:.*\]\]","",article_txt)

    # remove Image attachments
    article_txt = re.sub(r"\[\[Image:.*\]\]","",article_txt)

    # remove unwanted lines starting from special characters
    article_txt = re.sub(r"\n: \'\'.*","",article_txt)
    article_txt = re.sub(r"\n!.*","",article_txt)
    article_txt = re.sub(r"^:\'\'.*","",article_txt)

    # remove non-breaking space symbols
    article_txt = re.sub(r"&nbsp","",article_txt)

    # remove URLs link
    article_txt = re.sub(r"http\S+","",article_txt)

    # remove digits from text
    article_txt = re.sub(r"\d+","",article_txt)

    # remove text written between small braces   
    article_txt = re.sub(r"\(.*\)","",article_txt)

    # remove sentence which tells category of article
    article_txt = re.sub(r"Category:.*","",article_txt)

    # remove the sentences inside infobox or taxobox
    article_txt = re.sub(r"\| .*","",article_txt)
    article_txt = re.sub(r"\n\|.*","",article_txt)
    article_txt = re.sub(r"\n \|.*","",article_txt)
    article_txt = re.sub(r".* \|\n","",article_txt)
    article_txt = re.sub(r".*\|\n","",article_txt)

    # remove infobox or taxobox
    article_txt = re.sub(r"{{Infobox.*","",article_txt)
    article_txt = re.sub(r"{{infobox.*","",article_txt)
    article_txt = re.sub(r"{{taxobox.*","",article_txt)
    article_txt = re.sub(r"{{Taxobox.*","",article_txt)
    article_txt = re.sub(r"{{ Infobox.*","",article_txt)
    article_txt = re.sub(r"{{ infobox.*","",article_txt)
    article_txt = re.sub(r"{{ taxobox.*","",article_txt)
    article_txt = re.sub(r"{{ Taxobox.*","",article_txt)

    # remove lines starting from *
    article_txt = re.sub(r"\* .*","",article_txt)

    # remove text written between angle bracket
    article_txt = re.sub("[\<].*?[\>]", "", article_txt)

    # Remove all headings
    article_txt = re.sub("[\==].*?[\==]", "", article_txt)
    
    # remove new line character
    article_txt = re.sub(r"\n","",article_txt)  

    # replace all punctuations with space 
    article_txt = re.sub(r"\!|\"|\#|\$|\%|\&|\'|\(|\)|\*|\+|\,|\-|\.|\/|\:|\;|\|\?|\@|\[|\\|\]|\^|\_|\`|\{|\||\}|\~"," ",article_txt)

    # replace consecutive multiple space with single space
    article_txt = re.sub(r" +"," ",article_txt)

    # replace non-breaking space with regular space 
    article_txt = article_txt.replace(u'\xa0', u' ')
    
    return article_txt
     

In [3]:
tree = ET.parse('data/wikipedia/simplewiki-20180901-pages-meta-current.xml')  
root = tree.getroot()  
path = 'articles-corpus//' 
url  = '{http://www.mediawiki.org/xml/export-0.10/}page'

In [7]:
csv_file_name = 'data/wikipedia/processed_data.csv'
headers = ['wiki_title', 'text']

def append_row(csv_file_path, row):
    """This function was written to deal with having files left open"""
    with open(csv_file_path, encoding='UTF-16', mode='a', newline='') as file:
        writer = csv.writer(file)
        writer.writerow(row)

# intialize file
with open(csv_file_name, 'w', encoding='UTF-16') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)

In [8]:
for i,page in enumerate(root.findall(url)):
    
    for p in page:
        if p.tag == '{http://www.mediawiki.org/xml/export-0.10/}title':
            title = p.text
        r_tag = "{http://www.mediawiki.org/xml/export-0.10/}revision"                 
        if p.tag == r_tag:  
            for x in p:
#                 print(x.tag)
                tag = "{http://www.mediawiki.org/xml/export-0.10/}text"
                
                if x.tag == tag:                                                              
                    text = x.text                                          
                    if not text == None:  
                        # Extracting the text portion from the article                                                 
#                         text = text[:text.find("==")]
#                         print(title)
#                         print(text)
                        # <em><strong>Cleaning of Text (described in Section 2)</strong></em>                                                     
                        # Printing the article 
#                         print(title)
#                         print(text)
#                         print('\n====================================\n')
                        if 'User:' in title:
                            continue
                        elif 'Talk:' in title:
                            continue
                        elif 'Wikipedia' in title:
                            continue
                        elif 'talk:' in title:
                            continue
                        elif 'Category:' in title:
                            continue
                        
                        text = clean_article_text(text)
                        append_row(csv_file_name, [title, text])
#                         input()

In [9]:
import pandas as pd

In [10]:
data_frame = pd.read_csv('data/wikipedia/processed_data.csv', encoding='UTF-16')

In [11]:
data_frame = data_frame[-data_frame.text.apply(lambda x: 'REDIRECT' in str(x))]

In [12]:
data_frame = data_frame[-data_frame.wiki_title.apply(lambda x: 'Template:' in str(x))]

In [13]:
data_frame = data_frame[data_frame.text.apply(lambda x: len(str(x))>200)]

In [14]:
print('Total articles: {}'.format(len(data_frame)))

Total articles: 98939


Let's subset these articles to train on. Let's consider 50,000.

In [15]:
import random
random.seed('computer what?????')

wikipedia_subset = data_frame.sample(n=70000, random_state=12)

In [16]:
data_frame.to_csv('data/wikipedia/cleaned_wiki_data_full_text.csv')
wikipedia_subset.to_csv('data/wikipedia/subsetted_wiki_data_full_text.csv')