In [1]:
import csv
import os
import xml.etree.ElementTree as ET

import mwparserfromhell

In [2]:
# Directory that holds the datasets. It is combined with file names by plain
# string concatenation (DATA_FOLDER + WIKI_XML), so the trailing "/" is required.
DATA_FOLDER = "/net/projects/veitch/datasets/"
# File name of the Wikipedia XML dump inside DATA_FOLDER.
# NOTE(review): the name suggests the 2023-10-01 English Wikipedia articles dump — confirm.
WIKI_XML = "enwiki-20231001-pages-articles-multistream.xml"

In [3]:
# Incremental parser over the dump: iterating `context` yields (event, element)
# pairs for both "start" and "end" events.
# The iterator is single-use: re-running a cell that loops over `context`
# without re-running this cell resumes wherever the previous loop stopped.
context = ET.iterparse(DATA_FOLDER + WIKI_XML, events=("start", "end"))

In [4]:
N_ARTICLES_TO_LOAD = 100  # stop once this many (title, text) pairs are collected

title, text = None, None
titles = []    # page titles, index-aligned with `articles`
articles = []  # raw wikitext of each page

# Walk the XML event stream, pairing each <title> with the <text> that follows it.
for event, elem in context:
    # Only "end" events guarantee a fully populated element. On "start" the
    # parser has merely seen the opening tag, so elem.text is undefined there
    # (it may be None or incomplete).
    if event != "end":
        continue

    # Tags carry an XML namespace prefix, hence the suffix match.
    if elem.tag.endswith("title"):
        title = elem.text
    elif elem.tag.endswith("text"):
        text = elem.text

    # Pages with an empty title or empty text are skipped; a title left over
    # from such a page is overwritten by the next page's <title>.
    if title and text:
        titles.append(title)
        articles.append(text)
        title, text = None, None

    elem.clear()  # the element is complete and already consumed; free its contents

    if len(titles) == N_ARTICLES_TO_LOAD:
        break

In [9]:
def remove_wikitext(parsed_text, filter_method):
    """Remove from ``parsed_text`` every node produced by ``filter_method()``.

    Args:
        parsed_text: Object exposing ``remove(node)``; it is modified in place.
        filter_method: Zero-argument callable returning the nodes to remove
            (e.g. a bound ``filter_templates`` / ``filter_headings`` method).

    Returns:
        None.

    A ``ValueError`` raised by ``remove`` is not propagated: the node is
    reported on stdout and processing continues with the next one.
    NOTE(review): presumably this happens for nodes nested inside a node that
    was already removed — verify.
    """
    # Snapshot the nodes up front so that removals cannot disturb the iteration.
    for node in tuple(filter_method()):
        try:
            parsed_text.remove(node)
        except ValueError as err:
            print(f"Failed to remove: {node}. Error: {err}")

def extract_categories(parsed):
    """Return the names of the categories the page is filed under.

    A wikilink counts as a category assignment only when its title *starts
    with* the ``Category:`` namespace prefix (matched case-insensitively,
    surrounding whitespace ignored). Links that merely mention a category,
    such as ``[[:Category:Foo]]`` or ``[[commons:Category:Foo]]``, are
    skipped instead of being returned as mangled names like ``:Foo`` or
    ``commons:Foo``.

    Args:
        parsed: Object exposing ``filter_wikilinks()``, whose items have a
            ``title`` attribute (e.g. a parsed mwparserfromhell document).

    Returns:
        list[str]: Category names without the namespace prefix, in document
        order. Sort keys (``[[Category:Foo|key]]``) are not part of the
        title and are therefore dropped. Empty names are omitted.
    """
    prefix = "category:"
    categories = []
    for link in parsed.filter_wikilinks():
        link_title = str(link.title).strip()
        if not link_title.lower().startswith(prefix):
            continue
        # Slice the prefix off rather than using str.replace, which would also
        # delete any later "Category:" occurrence inside the name itself.
        name = link_title[len(prefix):].strip()
        if name:
            categories.append(name)
    return categories

In [10]:
MAX_ARTICLES_TO_WRITE = 6         # same cut-off as the former `if i == 5: break`
MAX_PARAGRAPHS_PER_ARTICLE = 11   # same cut-off as the former `if j == 10: break`
CATEGORY_DELIMITER = "|"          # MediaWiki page titles cannot contain "|"

# Make sure the output directory exists so the cell also works on a fresh checkout.
os.makedirs('data', exist_ok=True)

with open('data/parsed-paragraphs.csv', 'w', newline='', encoding='utf-8') as f:
    fieldnames = ['text', 'label']
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()

    for article in articles[:MAX_ARTICLES_TO_WRITE]:
        parsed = mwparserfromhell.parse(article)

        # Drop templates and headings before converting to plain text.
        remove_wikitext(parsed, parsed.filter_templates)
        remove_wikitext(parsed, parsed.filter_headings)
        categories = extract_categories(parsed)

        # Write the label as one delimited string. Passing the list itself
        # would store its Python repr ("['a', 'b']") in the CSV column.
        label = CATEGORY_DELIMITER.join(categories)

        plain_text = parsed.strip_code()
        # One row per non-empty line of the stripped text.
        paragraphs = [para.strip() for para in plain_text.split('\n') if para.strip()]

        for paragraph in paragraphs[:MAX_PARAGRAPHS_PER_ARTICLE]:
            writer.writerow({'text': paragraph, 'label': label})

Failed to remove: {{sfn|Guérin|1970|p=12}}. Error: {{sfn|Guérin|1970|p=12}}
Failed to remove: {{sfn|Arvidsson|2017}}. Error: {{sfn|Arvidsson|2017}}
Failed to remove: {{sfn|Chomsky|2005|p=123}}. Error: {{sfn|Chomsky|2005|p=123}}
Failed to remove: {{sfn|Otero|1994|p=617}}. Error: {{sfn|Otero|1994|p=617}}
Failed to remove: {{sfn|Osgood|1889|p=1}}. Error: {{sfn|Osgood|1889|p=1}}
Failed to remove: {{sfn|Marshall|1992|p=641}}. Error: {{sfn|Marshall|1992|p=641}}
Failed to remove: {{sfn|Jennings|1999|p=147}}. Error: {{sfn|Jennings|1999|p=147}}
Failed to remove: {{sfn|Walter|2002|p=44}}. Error: {{sfn|Walter|2002|p=44}}
Failed to remove: {{sfn|Newman|2005|p=15}}. Error: {{sfn|Newman|2005|p=15}}
Failed to remove: {{sfn|Morris|2015|p=64}}. Error: {{sfn|Morris|2015|p=64}}
Failed to remove: {{sfnm|1a1=McLaughlin|1y=2007|1p=166|2a1=Jun|2y=2009|2p=507|3a1=Franks|3y=2013|3pp=386–388}}. Error: {{sfnm|1a1=McLaughlin|1y=2007|1p=166|2a1=Jun|2y=2009|2p=507|3a1=Franks|3y=2013|3pp=386–388}}
Failed to remove: 

In [11]:
# `categories` still holds the value assigned for the last article processed
# by the CSV-writing loop.
categories

['Abraham Lincoln',
 '1809 births',
 '1865 deaths',
 '1865 murders in the United States',
 '1860s assassinated politicians',
 '19th-century American politicians',
 '19th-century presidents of the United States',
 'American abolitionists',
 'American colonization movement',
 'American lawyers admitted to the practice of law by reading law',
 'American military personnel of the Indian Wars',
 'American militia officers',
 'American nationalists',
 'American political party founders',
 'Illinois postmasters',
 'American surveyors',
 'Assassinated presidents of the United States',
 'Burials at Oak Ridge Cemetery',
 'Candidates in the 1860 United States presidential election',
 'Candidates in the 1864 United States presidential election',
 'Hall of Fame for Great Americans inductees',
 'Illinois Central Railroad people',
 'Illinois Republicans',
 'Illinois lawyers',
 'Lincoln family',
 'Male murder victims',
 'Members of the Illinois House of Representatives',
 'People associated with the a

In [None]:
# use | as delimiter in categories

#### Old Code

In [24]:
# NOTE: `parsed` is whatever the kernel last bound to that name; this cell
# does not create it.
templates = parsed.filter_templates()

# Look for the {{Short description|...}} template and keep its text.
# `short_desc` stays None when the page has no such template or the template
# carries no first parameter.
short_desc = None
for template in templates:
    if template.name.matches("Short description"):
        print(template)
        # Guard with has(): get(1) raises ValueError when parameter 1 is absent.
        if template.has(1):
            short_desc = str(template.get(1).value)
        break

{{Short description|Ratio of how much light is reflected back from a body}}


In [32]:
# Value of parameter 1 of `template`, the loop variable left bound by the
# template search (the matching template when that loop ended via `break`).
str(template.get(1).value)

'Ratio of how much light is reflected back from a body'

In [17]:
# Plain-text rendering of the parsed wikitext via strip_code().
# NOTE: this rebinds `text`, the name the dump-parsing loop uses for raw page text.
text = parsed.strip_code()

In [19]:
# Print the stripped text in full to inspect what strip_code() leaves behind.
print(text)

thumb|Albedo change in Greenland: The map shows the difference between the amount of sunlight Greenland reflected in the summer of 2011 versus the average percent it reflected between 2000 and 2006. Some areas reflecting close to 20 percent less light than a decade ago.
Albedo (; ) is the fraction of sunlight that is diffusely reflected by a body. It is measured on a scale from 0 (corresponding to a black body that absorbs all incident radiation) to 1 (corresponding to a body that reflects all incident radiation).

Surface albedo is defined as the ratio of radiosity Je to the irradiance Ee (flux per unit area) received by a surface. The proportion reflected is not only determined by properties of the surface itself, but also by the spectral and angular distribution of solar radiation reaching the Earth's surface. These factors vary with atmospheric composition, geographic location, and time (see position of the Sun). While bi-hemispherical reflectance is calculated for a single angle o

In [35]:
# Display the page titles collected by the dump-parsing loop.
titles

['Anarchism',
 'Albedo',
 'A',
 'Alabama',
 'Achilles',
 'Abraham Lincoln',
 'Aristotle',
 'An American in Paris',
 'Academy Award for Best Production Design',
 'Academy Awards',
 'Animalia (book)',
 'International Atomic Time',
 'Altruism',
 'Ayn Rand',
 'Alain Connes',
 'Allan Dwan',
 'Algeria',
 'List of Atlas Shrugged characters',
 'Anthropology',
 'Agricultural science',
 'Alchemy',
 'Astronomer',
 'ASCII',
 'Animation',
 'Apollo',
 'Andre Agassi',
 'Austroasiatic languages',
 'Afroasiatic languages',
 'Andorra',
 'American Football Conference',
 'Animal Farm',
 'Amphibian',
 'Alaska',
 'Agriculture',
 'Aldous Huxley',
 'Algae',
 'Analysis of variance',
 'Alkane',
 'Appellate procedure in the United States',
 'Appellate court',
 'Arraignment',
 'America the Beautiful',
 'Assistive technology',
 'Abacus',
 'Acid',
 'Bitumen',
 'American National Standards Institute',
 'Apollo 11',
 'Apollo 8',
 'Astronaut',
 'A Modest Proposal',
 'Alkali metal',
 'Alphabet',
 'Atomic number',
 'Ana