In [1]:
import xml.etree.ElementTree as ET
import mwparserfromhell
import csv

In [2]:
# Folder holding the Wikipedia dump, and the dump's file name. The two strings
# are concatenated to build the path that is handed to the XML parser.
WIKI_FOLDER = "/net/projects/veitch/datasets/"
WIKI_XML = "enwiki-20231001-pages-articles-multistream.xml"

In [3]:
# Stop reading the dump once this many articles have been collected.
# A falsy value (0 or None) disables the limit.
ARTICLES_TO_PARSE = 40

In [None]:
# Incremental parser over the dump: yields (event, element) pairs, with a
# "start" event when a tag opens and an "end" event when it closes.
context = ET.iterparse(WIKI_FOLDER + WIKI_XML, events=("start", "end"))

In [None]:
# Collect (title, wikitext) pairs from the dump, in document order.
title, text = None, None
titles = []
articles = []

# Iterate through XML tags
for event, elem in context:
    # Only "end" events are acted on. ElementTree does not guarantee that
    # elem.text is populated when a "start" event fires, so reading <title> on
    # "start" can yield None; the page's text would then stay pending and be
    # paired with the *next* page's title.
    if event != "end":
        continue

    if elem.tag.endswith("title"):
        title = elem.text
    elif elem.tag.endswith("text"):
        text = elem.text

    # A page is recorded once both its title and a non-empty text were seen.
    if title and text:
        titles.append(title)
        articles.append(text)
        title, text = None, None

    elem.clear()  # Free up memory by clearing the finished element

    # A falsy ARTICLES_TO_PARSE means "no limit".
    if ARTICLES_TO_PARSE and len(titles) >= ARTICLES_TO_PARSE:
        break

In [None]:
def remove_wikitext(parsed, filter_method):
    """Remove from ``parsed`` every element returned by ``filter_method``.

    Args:
        parsed: Wikicode-like object, modified in place. It must provide
            ``contains(node)`` and ``remove(node)``.
        filter_method: Zero-argument callable returning the elements to
            remove, e.g. ``parsed.filter_templates``.

    Returns:
        None. A removal that still fails with ``ValueError`` is reported on
        stdout and the remaining elements are processed.
    """
    # Snapshot first so that removals do not disturb the sequence being walked.
    elements_to_remove = list(filter_method())
    for elem in elements_to_remove:
        try:
            # An element nested inside one that was removed earlier is already
            # gone. contains() looks for this exact node, whereas the `in`
            # operator compares rendered text and would also match a different
            # element that merely renders the same way.
            if parsed.contains(elem):
                parsed.remove(elem)
        except ValueError as e:
            print(f"Failed to remove: {elem}. Error: {e}")

def extract_categories(parsed):
    """Collect the names of the categories that ``parsed`` is filed under.

    Args:
        parsed: Wikicode-like object providing ``filter_wikilinks()``, whose
            items expose the link target as ``title``.

    Returns:
        list[str]: Category names without the ``Category:`` prefix and without
        surrounding whitespace, in the order the links appear.
    """
    categories = []
    for link in parsed.filter_wikilinks():
        namespace, separator, name = str(link.title).partition(":")
        # Only a link whose *namespace* is "Category" counts. A substring test
        # would also accept "[[:Category:X]]" (a visible link to the category
        # page, not a membership) and targets such as "File:Category:x.png".
        # The namespace is compared case-insensitively and ignoring padding.
        if separator and namespace.strip().lower() == "category":
            categories.append(name.strip())
    # TODO: need to figure out which links to actually remove from ``parsed``
    return categories

In [None]:
titles

In [None]:
# Write one CSV row per paragraph of the "Anarchism" and "Anthropology" articles.
# Only the training file is opened: opening the test CSV in 'w' mode without
# ever writing to it would truncate an existing file to an empty, header-less one.
with open('data/parsed-paragraphs-train-anarchy-anthropology.csv', 'w', newline='', encoding='utf-8') as f_train:
    fieldnames = ['text', 'title', 'categories']
    writer_train = csv.DictWriter(f_train, fieldnames=fieldnames)
    writer_train.writeheader()

    for i, article in enumerate(articles):
        print(f"Title: {titles[i]}")
        if titles[i] not in ["Anarchism", "Anthropology"]:
            continue
        parsed = mwparserfromhell.parse(article)

        # Drop templates and section headings before extracting plain text.
        remove_wikitext(parsed, parsed.filter_templates)
        remove_wikitext(parsed, parsed.filter_headings)
        categories = extract_categories(parsed)

        # Every non-blank line of the stripped text is treated as a paragraph.
        plain_text = parsed.strip_code()
        paragraphs = [para.strip() for para in plain_text.split('\n') if para.strip()]

        for para in paragraphs:
            # `categories` is a list; csv writes its str() form into the cell.
            writer_train.writerow({'text': para, 'title': titles[i], 'categories': categories})



In [None]:
titles

In [None]:
# Titles of the articles to export.
ARTICLES_TO_SAVE = ["Anarchism", "Anthropology", "Animation", "Amphibian", "Appellate court", "Algae", "Alchemy", "Alaska", "Astronomer", "Agriculture"]
# Output folder and file paths for the exported CSV files.
DATA_FOLDER = "data/"
TRAIN_FILE = DATA_FOLDER + "train-10-articles.csv"
# NOTE(review): "val-10_articles.csv" mixes a hyphen and an underscore, unlike
# the train file name above — confirm whether that is intentional.
VAL_FILE = DATA_FOLDER + "val-10_articles.csv"

In [None]:
# Export every paragraph of the articles listed in ARTICLES_TO_SAVE to TRAIN_FILE,
# one CSV row per paragraph.
with open(TRAIN_FILE, 'w', newline='', encoding='utf-8') as f_train:
    fieldnames = ['text', 'title', 'categories']
    writer_train = csv.DictWriter(f_train, fieldnames=fieldnames)
    writer_train.writeheader()

    for article_title, article in zip(titles, articles):
        if article_title in ARTICLES_TO_SAVE:
            # `parsed` and `categories` keep their names: later cells read them.
            parsed = mwparserfromhell.parse(article)
            for unwanted in (parsed.filter_templates, parsed.filter_headings):
                remove_wikitext(parsed, unwanted)
            categories = extract_categories(parsed)

            # Each non-blank line of the stripped text becomes one paragraph.
            plain_text = parsed.strip_code()
            stripped_lines = (line.strip() for line in plain_text.split('\n'))
            paragraphs = [line for line in stripped_lines if line]

            writer_train.writerows(
                {'text': para, 'title': article_title, 'categories': categories}
                for para in paragraphs
            )
            # TODO: split within the same wiki article instead, using a second
            # DictWriter for a test file: first 2 paragraphs to train, the
            # next 2 to test, the rest dropped.



In [None]:
categories

#### Old Code

In [None]:
templates = parsed.filter_templates()

# Look for the article's {{Short description}} template. The match is kept in
# `short_desc` (None when the article has none); previously the variable was
# initialised but never assigned. The loop variable `template` is left bound on
# purpose because the following cell reads it.
short_desc = None
for template in templates:
    if template.name.matches("Short description"):
        short_desc = template
        print(template)
        break

In [None]:
# Value of parameter 1 of `template`, the loop variable left over from the
# search loop above. If that loop found no match, this is simply the last
# template it visited.
str(template.get(1).value)

In [None]:
# Plain-text rendering of whatever `parsed` currently holds.
# NOTE: this rebinds the global `text` that the dump-parsing loop also uses.
text = parsed.strip_code()

In [None]:
print(text)

In [None]:
titles

### Use API to get specific articles

In [1]:
import requests

In [2]:
# Fetch the current wikitext of the "basketball" article from the MediaWiki API.
# NOTE(review): 'rvslots' is not sent, so the API replies in its legacy revision
# format (content under revisions[0]['*']) and emits a deprecation warning.
# Sending 'rvslots': 'main' would move the content, so the code that reads the
# response would have to change with it.
api_reply = requests.get(
    'https://en.wikipedia.org/w/api.php',
    params={
        'action': 'query',
        'format': 'json',
        'titles': 'basketball',
        'prop': 'revisions',
        'rvprop': 'content',
    },
    timeout=30,  # requests waits indefinitely on a stalled connection otherwise
)
api_reply.raise_for_status()  # fail loudly on an HTTP error instead of parsing its body
response = api_reply.json()

In [3]:
response

{'batchcomplete': '',
  'revisions': {'*': 'Because "rvslots" was not specified, a legacy format has been used for the output. This format is deprecated, and in the future the new format will always be used.'}},
 'query': {'normalized': [{'from': 'basketball', 'to': 'Basketball'}],
  'pages': {'3921': {'pageid': 3921,
    'ns': 0,
    'title': 'Basketball',
    'revisions': [{'contentformat': 'text/x-wiki',
      'contentmodel': 'wikitext',
      '*': '{{Short description|Team sport}}\n{{About|the sport|the ball used in the sport|Basketball (ball)|other uses}}\n{{Pp-semi-indef}}\n{{Pp-move-indef}}\n{{Use mdy dates|date=November 2022}}\n{{Infobox sport\n| name       = Basketball\n| image      = Chicago Bulls and New Jersey Nets, March 28, 1991.jpg\n| imagesize  = 275px\n| caption    = [[Chris Dudley]] (#22), playing for the [[Brooklyn Nets|New Jersey Nets]], squares off with [[Michael Jordan]] (#23), of the [[Chicago Bulls]] on March 28, 1991. Other players including Chicago\'s [[Bill C

In [None]:

# Take the first page object from the query result and parse its wikitext.
# The doctest-style ">>>" prompts were removed so the cell is plain Python.
page = next(iter(response['query']['pages'].values()))
wikicode = page['revisions'][0]['*']  # legacy revision format: content is under '*'
parsed_wikicode = mwparserfromhell.parse(wikicode)