### Purpose and use

This Jupyter notebook processes the scraped revisions to construct a network-ready dataframe.

**The output**: Each record in the output **nodeEdge.csv** represents a one-to-one relationship between two articles. Each node represents an information source, and an edge between any two nodes (a row in nodeEdge) represents the interlinked relationship, which cues a potential trace of information creation.

**Next step**: The output **nodeEdge** will be imported into Gephi for network visualisation and modularity computation.

**Output data schema:**
- Id: Revision ID, string
- ParentId: Parent Revision ID, string
- ArticleName: Article title, string
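- TimeStamp: Revision time, string
- Year: Year of the revision time, integer
- Month: Month of the revision time, integer
- Day: Day of the month of the revision time, integer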
- Link: A hyperlink in the revision text, string
- LinkTitle: Hyperlink title, string
- LinkType: `internal` for wiki links, or the citation template name (e.g. `cite web`) for external hyperlinks, string

In [1]:
import os
import re
import csv
import glob
from bs4 import BeautifulSoup
from datetime import datetime
from pathlib import Path
from lxml import etree


In [51]:
# Wiki links: [[Target]] or [[Target|Label]].
_INTERNAL_LINK_RE = re.compile(r'\[\[([^\]|]+)(?:\|([^\]]+))?\]\]')

# One template parameter value. It ends at the next "|" or at the "}}" that
# closes the citation; a pipe-free nested template such as {{!}} is kept whole.
_TEMPLATE_VALUE = r'((?:\{\{[^{}|]*\}\}|[^|}]|\}(?!\}))+)'

# Citation templates: {{cite <type> ... url=<url> ... title=<title> ...}}.
# NOTE(review): matching is case-sensitive and requires url= before title=,
# so "{{Cite web ...}}" and title-first citations are not captured.
_EXTERNAL_LINK_RE = re.compile(
    r'\{\{(cite\s\w+)\s.*?url\s*=\s*' + _TEMPLATE_VALUE
    + r'.*?title\s*=\s*' + _TEMPLATE_VALUE
)


def _child_text(element, tag):
    """Return the text of the first ``tag`` child of ``element``, or None if absent."""
    child = element.find(tag)
    return child.text if child is not None else None


def parse_revision(revision, article_name):
    """Extract one output row per hyperlink found in a revision's wikitext.

    Args:
        revision: XML element exposing ``find(tag)``; its ``id``, ``parentid``,
            ``timestamp`` and ``text`` children are read when present.
        article_name: Article title copied into every returned row.

    Returns:
        A list of dicts with keys Id, ParentId, ArticleName, TimeStamp, Year,
        Month, Day, Link, LinkTitle and LinkType. Internal wiki links come
        first, followed by citation-template links. The list is empty when the
        revision has no text.

    Raises:
        ValueError: If the timestamp text is not a valid ISO-8601 string.
    """
    # metadata
    rev_id = _child_text(revision, 'id')
    parent_id = _child_text(revision, 'parentid')
    timestamp_str = _child_text(revision, 'timestamp')
    # The trailing 'Z' is dropped before parsing, so the datetime is naive.
    timestamp = datetime.fromisoformat(timestamp_str.replace('Z', '')) if timestamp_str else None
    year, month, day = (timestamp.year, timestamp.month, timestamp.day) if timestamp else (None, None, None)

    text = _child_text(revision, 'text')
    if not text:
        return []

    links = []

    # Process internal links
    for match in _INTERNAL_LINK_RE.finditer(text):
        # Strip BEFORE replacing spaces: otherwise the padding in "[[ Foo | bar ]]"
        # turns into leading/trailing underscores in the URL.
        target = match.group(1).strip()
        link = "https://en.wikipedia.org/wiki/" + target.replace(' ', '_')
        title = match.group(2) if match.group(2) else match.group(1)
        links.append((link, title.strip(), "internal"))

    # Process external links
    for match in _EXTERNAL_LINK_RE.finditer(text):
        link_type = match.group(1).strip()  # Template name, e.g. "cite web"
        link = match.group(2).strip()
        title = match.group(3).strip()
        links.append((link, title, link_type))

    return [
        {
            "Id": rev_id,
            "ParentId": parent_id,
            "ArticleName": article_name,
            "TimeStamp": timestamp,
            "Year": year,
            "Month": month,
            "Day": day,
            "Link": link,
            "LinkTitle": title,
            "LinkType": link_type
        }
        for link, title, link_type in links
    ]

In [52]:
def crawl_all_xml_files(root_folder, article_name, file_name='nodeEdge.csv'):
    """Write the link rows of every revision XML of one article to a CSV file.

    XML files are searched recursively under ``root_folder/article_name``.
    Rows go to ``root_folder/file_name``: the file is created with a header
    row if it does not exist yet, otherwise rows are appended without a header.

    Args:
        root_folder: Directory holding one sub-folder per article and the CSV.
        article_name: Sub-folder name; also passed to ``parse_revision``.
        file_name: Name of the CSV file inside ``root_folder``.

    Any exception raised while processing a single XML file is printed and
    the crawl continues with the next file.
    """
    folder = os.path.join(root_folder, article_name)
    output_file = os.path.join(root_folder, file_name)
    print("Crawling folder:", folder)

    columns = [
        "Id", "ParentId", "ArticleName", "TimeStamp", "Year", "Month", "Day", "Link", "LinkTitle", "LinkType"
    ]
    # Only a freshly created file gets a header row.
    is_new_file = not os.path.exists(output_file)

    with open(output_file, mode='w' if is_new_file else 'a', newline='', encoding='utf-8') as csv_file:
        writer = csv.DictWriter(csv_file, fieldnames=columns)
        if is_new_file:
            writer.writeheader()

        xml_files = glob.glob(os.path.join(folder, "**", "*.xml"), recursive=True)
        print("Found XML files:", len(xml_files))

        for xml_file in xml_files:
            try:
                revisions = etree.iterparse(xml_file, tag='revision', events=('end',))
                for _, elem in revisions:
                    writer.writerows(parse_revision(elem, article_name))
                    # Release the finished element, then drop the siblings
                    # that precede it from the parent.
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
            except Exception as e:
                print(f"Error processing {xml_file}: {e}")

In [53]:
# The data folder sits two directory levels above the working directory;
# each article's revisions are read from <data>/<article name>/.
folder_path = os.sep.join([os.path.dirname(os.path.dirname(os.getcwd())), "data"])
kanye = "Kanye West"
crawl_all_xml_files(root_folder=folder_path, article_name=kanye)

Crawling folder: /Users/Administrator/Desktop/OII/FSDS24/Groupwork/wiki_project/data/Kanye West
Found XML files: 10343


In [54]:
# Second article: its rows are appended to nodeEdge.csv when that file already exists.
taylor = "Taylor Swift"
crawl_all_xml_files(root_folder=folder_path, article_name=taylor)

Crawling folder: /Users/Administrator/Desktop/OII/FSDS24/Groupwork/wiki_project/data/Taylor Swift
Found XML files: 7796
