In [1]:
import xml.etree.ElementTree as ET
import codecs
import re
import csv 
import wikidata


In [2]:
def clean_article_text(article_txt):
    """Strip wiki markup, digits and punctuation from raw article text.

    The cleaning runs in three stages:

    1. A fixed sequence of regular expressions deletes markup (templates,
       file/image embeds, table and infobox rows, list items, URLs, digits,
       parenthesised text, category lines, angle-bracket tags) and finally
       every newline, so the result is a single line.
    2. Punctuation characters are replaced by spaces. ``=``, ``<`` and ``>``
       are left untouched; ``split_text_into_headings`` later splits the
       cleaned text on ``=`` heading markers.
    3. Non-breaking spaces are converted to regular spaces and runs of spaces
       are collapsed into one.

    Parameters
    ----------
    article_txt : str
        Raw wikitext of one article.

    Returns
    -------
    str
        The cleaned, single-line text.
    """
    # Patterns are applied in this exact order; later patterns see the output
    # of earlier ones. `.` never matches a newline here, so every `.*` is
    # confined to one line of the article.
    # NOTE(review): `{{.*}}`, `\[\[File:.*\]\]` and `\(.*\)` are greedy, so
    # two separate groups on the same line also remove the text between them.
    removal_patterns = (
        r"{{.*}}",                # text between double curly braces
        r"\[\[File:.*\]\]",       # file attachments
        r"\[\[Image:.*\]\]",      # image attachments
        r"\n: \'\'.*",            # lines starting with ": ''"
        r"\n!.*",                 # lines starting with "!"
        r"^:\'\'.*",              # text starting with ":''"
        r"&nbsp",                 # non-breaking space entity (";" goes with punctuation)
        r"http\S+",               # URLs
        r"\d+",                   # digits
        r"\(.*\)",                # text between parentheses
        r"Category:.*",           # category declarations
        r"\| .*",                 # infobox / taxobox rows
        r"\n\|.*",
        r"\n \|.*",
        r".* \|\n",
        r".*\|\n",
        # Infobox / taxobox openers, either capitalisation, optional space
        # after the braces (replaces eight separate substitutions).
        r"\{\{ ?(?:[Ii]nfobox|[Tt]axobox).*",
        r"\* .*",                 # list items
        r"<.*?>",                 # text between angle brackets
        r"\n",                    # newlines: the text becomes one line
    )
    for pattern in removal_patterns:
        article_txt = re.sub(pattern, "", article_txt)

    # Replace punctuation with a space. A lone "?" is included here; the
    # previous alternation only matched the two-character sequence "|?".
    article_txt = re.sub(r"[!\"#$%&'()*+,\-./:;?@\[\\\]^_`{|}~]", " ", article_txt)

    # Convert non-breaking spaces BEFORE collapsing, otherwise "\xa0 " would
    # leave two consecutive spaces in the result.
    article_txt = article_txt.replace(u'\xa0', u' ')

    # Collapse consecutive spaces into a single space.
    article_txt = re.sub(r" +", " ", article_txt)

    return article_txt
     

In [26]:
def split_text_into_headings(text, min_length=150):
    """Clean ``text`` and split it into the chunks found between headings.

    The text is passed through ``clean_article_text`` and then split on every
    ``=...=`` run (shortest match). Chunks that are not longer than
    ``min_length`` characters are dropped; these are likely the headings
    themselves.

    Parameters
    ----------
    text : str
        Article text to clean and split.
    min_length : int, optional
        A chunk is kept only if ``len(chunk) > min_length``. Defaults to 150.

    Returns
    -------
    list of str
        The retained chunks, in document order.
    """
    # Raw string: the former non-raw "[\==]" literal relied on an invalid
    # escape sequence. "[\==]" is a character class holding only "=".
    heading_marker = r"=.*?="
    cleaned = clean_article_text(text)
    return [chunk for chunk in re.split(heading_marker, cleaned) if len(chunk) > min_length]

In [4]:
# Load the whole Simple English Wikipedia dump into memory and keep its root.
tree = ET.parse('data/wikipedia/simplewiki-20180901-pages-meta-current.xml')
root = tree.getroot()

# Namespace-qualified tag used to select the <page> elements of the dump.
url = '{http://www.mediawiki.org/xml/export-0.10/}page'
path = 'articles-corpus//'

In [30]:
# Header row and destination path of the chunked-article CSV.
headers = ["wiki_title", "text"]
csv_file_name = "data/wikipedia/processed_data_split.csv"

def append_row(csv_file_path, row):
    """Append one ``row`` to the UTF-16 CSV file at ``csv_file_path``.

    The file is opened in append mode and closed again on every call, so no
    handle is left open between rows.
    """
    handle = open(csv_file_path, mode='a', encoding='UTF-16', newline='')
    with handle:
        csv.writer(handle).writerow(row)

# Initialize the output file: truncate it and write the header row.
# newline='' hands line-ending control to the csv module, matching how
# append_row opens the same file; without it extra "\r" characters can be
# written on platforms that translate newlines.
with open(csv_file_name, 'w', encoding='UTF-16', newline='') as csv_file:
    writer = csv.writer(csv_file)
    writer.writerow(headers)

In [31]:
# Walk every <page> of the dump, split its text into heading-delimited chunks
# and append one CSV row per chunk, titled "<page title>--<chunk number>".
MEDIAWIKI_NS = '{http://www.mediawiki.org/xml/export-0.10/}'

# Pages whose title contains any of these markers are skipped.
SKIPPED_TITLE_MARKERS = ('User:', 'Talk:', 'Wikipedia', 'talk:', 'Category:')

for page in root.findall(url):
    # Read the title from this page only, so a page without a <title> can
    # never inherit the title of the previous page.
    title = page.findtext(MEDIAWIKI_NS + 'title')
    if title is None or any(marker in title for marker in SKIPPED_TITLE_MARKERS):
        continue

    for revision in page.findall(MEDIAWIKI_NS + 'revision'):
        for text_node in revision.findall(MEDIAWIKI_NS + 'text'):
            text = text_node.text
            if text is None:
                continue

            # split_text_into_headings cleans the text itself, so the text is
            # passed in raw instead of being cleaned a second time here.
            for count, textbit in enumerate(split_text_into_headings(text)):
                append_row(csv_file_name, [title + '--' + str(count), textbit])
                        

In [24]:
# NOTE(review): this cell repeats the definition of split_text_into_headings
# given earlier in the notebook; whichever cell runs last wins.
def split_text_into_headings(text, min_length=150):
    """Clean ``text`` and split it into the chunks found between headings.

    The text is passed through ``clean_article_text`` and then split on every
    ``=...=`` run (shortest match). Chunks that are not longer than
    ``min_length`` characters are dropped; these are likely the headings
    themselves.

    Parameters
    ----------
    text : str
        Article text to clean and split.
    min_length : int, optional
        A chunk is kept only if ``len(chunk) > min_length``. Defaults to 150.

    Returns
    -------
    list of str
        The retained chunks, in document order.
    """
    # Raw string: the former non-raw "[\==]" literal relied on an invalid
    # escape sequence. "[\==]" is a character class holding only "=".
    heading_marker = r"=.*?="
    cleaned = clean_article_text(text)
    return [chunk for chunk in re.split(heading_marker, cleaned) if len(chunk) > min_length]

In [1]:
import pandas as pd

In [2]:
# Reload the chunked articles written by the extraction step (UTF-16 CSV).
data_frame = pd.read_csv(
    filepath_or_buffer='data/wikipedia/processed_data_split.csv',
    encoding='UTF-16',
)

In [3]:
# Drop chunks whose title contains 'Template:'. `~` is the boolean-mask
# negation operator; the plain-substring test is done vectorised.
data_frame = data_frame[~data_frame.wiki_title.astype(str).str.contains('Template:', regex=False)]

In [4]:
# Drop chunks whose text contains 'REDIRECT'. `~` is the boolean-mask
# negation operator; the plain-substring test is done vectorised.
data_frame = data_frame[~data_frame.text.astype(str).str.contains('REDIRECT', regex=False)]

In [5]:
# Keep only chunks whose text is longer than 200 characters.
data_frame = data_frame[data_frame.text.astype(str).str.len() > 200]

In [6]:
# Drop chunks whose title contains 'Module:'. `~` is the boolean-mask
# negation operator; the plain-substring test is done vectorised.
data_frame = data_frame[~data_frame.wiki_title.astype(str).str.contains('Module:', regex=False)]

In [7]:
# Number of rows left after all filters.
print('Total articles: {}'.format(data_frame.shape[0]))

Total articles: 152895


Let's subset these articles to train on. We sample 120,000 of them; the remaining rows are saved separately as a left-out set.

In [8]:
import random

# Seeds Python's `random` module. The pandas draw below is made reproducible
# by its own `random_state` argument.
random.seed('computer what?????')

wikipedia_subset = data_frame.sample(random_state=12, n=120000)

In [9]:
# Persist the full filtered frame and the sampled subset side by side.
for frame, destination in (
    (data_frame, 'data/wikipedia/cleaned_wiki_data_full_text_chunks.csv'),
    (wikipedia_subset, 'data/wikipedia/subsetted_wiki_data_full_text_text_chunks.csv'),
):
    frame.to_csv(destination)

In [16]:
# Rows of the filtered frame whose index was drawn into the sampled subset.
data_frame.loc[data_frame.index.isin(wikipedia_subset.index)]

KeyError: '[-1 -1 -1 ...  0 -1 -1] not in index'

In [27]:
# Left-out set: every filtered row whose index is NOT in the sampled subset.
test = data_frame.loc[~data_frame.index.isin(wikipedia_subset.index)]

In [29]:
# Persist the left-out rows next to the other CSV exports.
test.to_csv(path_or_buf='data/wikipedia/subsetted_wiki_data_full_text_text_chunks_leftout.csv')

In [26]:
# Display the left-out rows (those not sampled into `wikipedia_subset`).
test

array([ True,  True,  True, ..., False,  True,  True])