In [1]:
import xml.etree.ElementTree as ET
import mwparserfromhell
import csv

In [2]:
DATA_FOLDER = "/net/projects/veitch/datasets/"
WIKI_XML = "enwiki-20231001-pages-articles-multistream.xml"

In [16]:
context = ET.iterparse(DATA_FOLDER + WIKI_XML, events=("start", "end"))

In [17]:
title, text = None, None
titles = []
articles = []

# Iterate through XML tags
for event, elem in context:
    if event == "start" and elem.tag.endswith("title"):
        title = elem.text  # Grab the title when the <title> start tag is found
    if event == "end" and elem.tag.endswith("text"):
        text = elem.text  # Grab the text content when the </text> end tag is found

    if title and text:
        parsed_text = mwparserfromhell.parse(text)
        titles.append(title)
        articles.append(text)
        title, text = None, None

    elem.clear()  # Free up memory by clearing the element from the tree
    
    if len(titles) == 40:
        break

In [18]:
def remove_wikitext(parsed, filter_method):
    elements_to_remove = list(filter_method())  # Create a copy of elements to avoid in-place modification issues
    for elem in elements_to_remove:
        try:
             if elem in parsed:  # Handle nested templates
                parsed.remove(elem)
        except ValueError as e:
            print(f"Failed to remove: {elem}. Error: {e}")

def extract_categories(parsed):
    categories = []
    elements_to_remove = []
    for link in parsed.filter_wikilinks():
            try:
                elements_to_remove.append(link)
                link_title = str(link.title)
                if "Category:" in link_title:
                    categories.append(link_title.replace("Category:", ""))
            except ValueError as e:
                print(f"Failed to extract category: {link}. Error: {e}")
    # TODO: need to figure out which links to actually remove
    # for elem in elements_to_remove:
    #     try:
    #         parsed.remove(elem)
    #     except ValueError as e:
    #         print(f"Failed to remove link: {elem}. Error: {e}")
    return categories

In [19]:
titles

['Anarchism',
 'Albedo',
 'A',
 'Alabama',
 'Achilles',
 'Abraham Lincoln',
 'Aristotle',
 'An American in Paris',
 'Academy Award for Best Production Design',
 'Academy Awards',
 'Animalia (book)',
 'International Atomic Time',
 'Altruism',
 'Ayn Rand',
 'Alain Connes',
 'Allan Dwan',
 'Algeria',
 'List of Atlas Shrugged characters',
 'Anthropology',
 'Agricultural science',
 'Alchemy',
 'Astronomer',
 'ASCII',
 'Animation',
 'Apollo',
 'Andre Agassi',
 'Austroasiatic languages',
 'Afroasiatic languages',
 'Andorra',
 'American Football Conference',
 'Animal Farm',
 'Amphibian',
 'Alaska',
 'Agriculture',
 'Aldous Huxley',
 'Algae',
 'Analysis of variance',
 'Alkane',
 'Appellate procedure in the United States',
 'Appellate court']

In [21]:
with open('data/parsed-paragraphs-train-anarchy-anthropology.csv', 'w', newline='', encoding='utf-8') as f_train, open('data/parsed-paragraphs-test.csv', 'w', newline='', encoding='utf-8') as f_test:
    fieldnames = ['text', 'title', 'categories']
    writer_train = csv.DictWriter(f_train, fieldnames=fieldnames)
    writer_train.writeheader()

    for i, article in enumerate(articles):
        print(f"Title: {titles[i]}")
        if titles[i] not in ["Anarchism", "Anthropology"]:
            continue
        parsed = mwparserfromhell.parse(article)

        remove_wikitext(parsed, parsed.filter_templates)
        remove_wikitext(parsed, parsed.filter_headings)
        categories = extract_categories(parsed) 

        plain_text = parsed.strip_code()
        paragraphs = [para.strip() for para in plain_text.split('\n') if para.strip()]

        for j, para in enumerate(paragraphs):
            # writer.writerow({'text': p, 'label': titles[i]})
            writer_train.writerow({'text': para, 'title': titles[i], 'categories': categories})



Title: Anarchism
Failed to remove: {{sfn|Guérin|1970|p=12}}. Error: {{sfn|Guérin|1970|p=12}}
Title: Albedo
Title: A
Title: Alabama
Title: Achilles
Title: Abraham Lincoln
Title: Aristotle
Title: An American in Paris
Title: Academy Award for Best Production Design
Title: Academy Awards
Title: Animalia (book)
Title: International Atomic Time
Title: Altruism
Title: Ayn Rand
Title: Alain Connes
Title: Allan Dwan
Title: Algeria
Title: List of Atlas Shrugged characters
Title: Anthropology
Title: Agricultural science
Title: Alchemy
Title: Astronomer
Title: ASCII
Title: Animation
Title: Apollo
Title: Andre Agassi
Title: Austroasiatic languages
Title: Afroasiatic languages
Title: Andorra
Title: American Football Conference
Title: Animal Farm
Title: Amphibian
Title: Alaska
Title: Agriculture
Title: Aldous Huxley
Title: Algae
Title: Analysis of variance
Title: Alkane
Title: Appellate procedure in the United States
Title: Appellate court


In [12]:
titles

['Alchemy',
 'Astronomer',
 'ASCII',
 'Animation',
 'Apollo',
 'Andre Agassi',
 'Austroasiatic languages',
 'Afroasiatic languages',
 'Andorra',
 'American Football Conference',
 'Animal Farm',
 'Amphibian',
 'Alaska',
 'Agriculture',
 'Aldous Huxley',
 'Algae',
 'Analysis of variance',
 'Alkane',
 'Appellate procedure in the United States',
 'Appellate court']

In [8]:
with open('data/parsed-paragraphs-train-anarchy-anthropology.csv', 'w', newline='', encoding='utf-8') as f_train, open('data/parsed-paragraphs-test.csv', 'w', newline='', encoding='utf-8') as f_test:
    fieldnames = ['text', 'title', 'categories']
    writer_train = csv.DictWriter(f_train, fieldnames=fieldnames)
    writer_train.writeheader()

    writer_test = csv.DictWriter(f_test, fieldnames=fieldnames)
    writer_test.writeheader()

    for i, article in enumerate(articles):
        if titles[i] not in ["Anarchism", "Anthropology"]:
            break
        parsed = mwparserfromhell.parse(article)

        remove_wikitext(parsed, parsed.filter_templates)
        remove_wikitext(parsed, parsed.filter_headings)
        categories = extract_categories(parsed) 

        plain_text = parsed.strip_code()
        paragraphs = [para.strip() for para in plain_text.split('\n') if para.strip()]

        for j, para in enumerate(paragraphs):
            # writer.writerow({'text': p, 'label': titles[i]})
            if j < 2:
                writer_train.writerow({'text': para, 'title': titles[i], 'categories': categories})
            elif j < 4:
                writer_test.writerow({'text': para, 'title': titles[i], 'categories': categories})
            else:
                break



Failed to remove: {{sfn|Guérin|1970|p=12}}. Error: {{sfn|Guérin|1970|p=12}}


In [38]:
categories

['Agricultural science']

In [17]:
# use | as delimiter in categories

#### Old Code

In [18]:
templates = parsed.filter_templates()

short_desc = None
for template in templates:
    if template.name.matches("Short description"):
        print(template)
        break

In [19]:
str(template.get(1).value)

NameError: name 'template' is not defined

In [None]:
text = parsed.strip_code()

In [None]:
print(text)

thumb|Albedo change in Greenland: The map shows the difference between the amount of sunlight Greenland reflected in the summer of 2011 versus the average percent it reflected between 2000 and 2006. Some areas reflecting close to 20 percent less light than a decade ago.
Albedo (; ) is the fraction of sunlight that is diffusely reflected by a body. It is measured on a scale from 0 (corresponding to a black body that absorbs all incident radiation) to 1 (corresponding to a body that reflects all incident radiation).

Surface albedo is defined as the ratio of radiosity Je to the irradiance Ee (flux per unit area) received by a surface. The proportion reflected is not only determined by properties of the surface itself, but also by the spectral and angular distribution of solar radiation reaching the Earth's surface. These factors vary with atmospheric composition, geographic location, and time (see position of the Sun). While bi-hemispherical reflectance is calculated for a single angle o

In [None]:
titles

['Anarchism',
 'Albedo',
 'A',
 'Alabama',
 'Achilles',
 'Abraham Lincoln',
 'Aristotle',
 'An American in Paris',
 'Academy Award for Best Production Design',
 'Academy Awards',
 'Animalia (book)',
 'International Atomic Time',
 'Altruism',
 'Ayn Rand',
 'Alain Connes',
 'Allan Dwan',
 'Algeria',
 'List of Atlas Shrugged characters',
 'Anthropology',
 'Agricultural science',
 'Alchemy',
 'Astronomer',
 'ASCII',
 'Animation',
 'Apollo',
 'Andre Agassi',
 'Austroasiatic languages',
 'Afroasiatic languages',
 'Andorra',
 'American Football Conference',
 'Animal Farm',
 'Amphibian',
 'Alaska',
 'Agriculture',
 'Aldous Huxley',
 'Algae',
 'Analysis of variance',
 'Alkane',
 'Appellate procedure in the United States',
 'Appellate court',
 'Arraignment',
 'America the Beautiful',
 'Assistive technology',
 'Abacus',
 'Acid',
 'Bitumen',
 'American National Standards Institute',
 'Apollo 11',
 'Apollo 8',
 'Astronaut',
 'A Modest Proposal',
 'Alkali metal',
 'Alphabet',
 'Atomic number',
 'Ana