### Purpose and use

This Jupyter notebook processes the scraped Wikipedia revisions to construct network-ready dataframes.

**The output**: Each record in the output **nodeEdge.csv** represents a one-to-one relationship between two articles. Each node represents an information source, and an edge between any two nodes (a row in nodeEdge) represents a hyperlink from one to the other, which indicates a potential trace of information creation.

**Next step**: The output **nodeEdge** will be imported into Gephi for network visualisation and modularity computation.

**Output data schema:**
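- edgeId: Edge ID (one per extracted hyperlink), integer
- revId: Revision ID, string
- ParentId: Parent Revision ID, string
- ArticleName: Article title, string
- TimeStamp: Revision time, string
- Year, Month, Day: Date components of the revision time, integer
- Link: A hyperlink in the revision text, string
- LinkTitle: Hyperlink title, string
- LinkType: `internal` for wiki links, or the citation template type (e.g. `cite web`) for external links, string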

In [42]:
import os
import re
import csv
import glob
import pandas as pd
from bs4 import BeautifulSoup
from datetime import datetime, timedelta
from pathlib import Path
from lxml import etree

In [2]:
# Shared registry used by the cells below: node name -> node id.
# The two seed articles always own ids 1 and 2.
node_index = dict(Taylor_Swift=1, Kanye_West=2)
# Next free ids: 1-2 are taken by the seed nodes, and no edge exists yet.
node_id_counter, edge_id_counter = 3, 1

In [3]:
def load_existing_data(nodes_file, edges_file):
    """Restore the node registry and id counters from previously written CSVs.

    Updates the module-level ``node_index``, ``node_id_counter`` and
    ``edge_id_counter`` so that a new crawl continues numbering after the ids
    already on disk. A file that does not exist is skipped.

    The node file may carry either the crawler's header (``nodeId``, ``name``)
    or the Gephi header (``Id``, ``Label``) that the column-renaming cell
    writes back to the same path; both are accepted.

    Args:
        nodes_file: Path to the node CSV.
        edges_file: Path to the edge CSV; only its ``edgeId`` column is read.

    Raises:
        KeyError: If a non-empty file lacks the expected id/name columns.
        ValueError: If an id value is not an integer.
    """
    global node_index, node_id_counter, edge_id_counter
    if os.path.exists(nodes_file):
        # newline='' lets the csv module handle newlines inside quoted names.
        with open(nodes_file, mode='r', encoding='utf-8', newline='') as nodes_f:
            reader = csv.DictReader(nodes_f)
            header = reader.fieldnames or []
            id_column = "nodeId" if "nodeId" in header else "Id"
            name_column = "name" if "name" in header else "Label"
            for row in reader:
                node_id = int(row[id_column])
                node_index[row[name_column]] = node_id
                node_id_counter = max(node_id_counter, node_id + 1)

    if os.path.exists(edges_file):
        with open(edges_file, mode='r', encoding='utf-8', newline='') as edges_f:
            for row in csv.DictReader(edges_f):
                edge_id_counter = max(edge_id_counter, int(row["edgeId"]) + 1)


# Load existing data if files already exist
load_existing_data('node.csv', 'edge.csv')


In [4]:
# Compiled once instead of on every revision.
# [[Target]] or [[Target|display text]]
_INTERNAL_LINK_RE = re.compile(r'\[\[([^\]|]+)(?:\|([^\]]+))?\]\]')
# {{cite <kind> ... url=<url> | ... title=<title> | ...
_EXTERNAL_LINK_RE = re.compile(r'\{\{(cite\s\w+)\s.*?url\s*=\s*([^|]+).*?title\s*=\s*([^|]+)')


def _child_text(revision, tag):
    """Return the text of the first ``tag`` child of ``revision``, or None."""
    node = revision.find(tag)
    return node.text if node is not None else None


def parse_revision(revision, article_name):
    """Extract every hyperlink of one revision as a list of flat records.

    Args:
        revision: A ``<revision>`` element exposing ``find(tag)``; its ``id``,
            ``parentid``, ``timestamp`` and ``text`` children are read.
        article_name: Title of the article the revision belongs to; copied
            into every record.

    Returns:
        A list of dicts with keys ``revId``, ``ParentId``, ``ArticleName``,
        ``TimeStamp``, ``Year``, ``Month``, ``Day``, ``Link``, ``LinkTitle``
        and ``LinkType``. Internal wiki links come first (``LinkType`` is
        ``"internal"``), followed by citation templates (``LinkType`` is the
        template name, e.g. ``"cite web"``). Empty when the revision has no
        text.
    """
    # metadata
    rev_id = _child_text(revision, 'id')
    parent_id = _child_text(revision, 'parentid')
    timestamp_str = _child_text(revision, 'timestamp')
    # The trailing "Z" is dropped, so the parsed datetime is naive.
    timestamp = datetime.fromisoformat(timestamp_str.replace('Z', '')) if timestamp_str else None
    year, month, day = (timestamp.year, timestamp.month, timestamp.day) if timestamp else (None, None, None)

    text = _child_text(revision, 'text')
    if not text:
        return []

    links = []

    # Process internal links
    for match in _INTERNAL_LINK_RE.finditer(text):
        # Strip BEFORE turning spaces into underscores; otherwise padding such
        # as "[[ Foo |x]]" ends up as "_Foo_" in the URL.
        target = match.group(1).strip()
        title = (match.group(2) or match.group(1)).strip()
        link = "https://en.wikipedia.org/wiki/" + target.replace(' ', '_')
        links.append((link, title, "internal"))

    # Process external links
    for match in _EXTERNAL_LINK_RE.finditer(text):
        link_type = match.group(1).strip()  # template name, e.g. "cite web"
        links.append((match.group(2).strip(), match.group(3).strip(), link_type))

    return [
        {
            "revId": rev_id,
            "ParentId": parent_id,
            "ArticleName": article_name,
            "TimeStamp": timestamp,
            "Year": year,
            "Month": month,
            "Day": day,
            "Link": link,
            "LinkTitle": title,
            "LinkType": link_type,
        }
        for link, title, link_type in links
    ]

In [46]:
# Load the Id/Label lookup table that the "nodes_indexed.csv" export cell writes.
# NOTE(review): node_df is not referenced again in the visible cells — confirm it is still needed.
node_df = pd.read_csv("nodes_indexed.csv")

In [15]:
def crawl_all_xml_files(root_folder, article_name, nodes_file='node.csv', edges_file='edge.csv', nodeEdge_file='nodeEdge.csv'):
    """Parse every revision XML of one article and append nodes/edges to CSV.

    Recursively finds ``*.xml`` files under ``root_folder/article_name``,
    streams their ``<revision>`` elements through ``parse_revision`` and, for
    every extracted hyperlink, appends:

    * a row to ``nodes_file`` for each node name seen for the first time,
    * a row to ``edges_file`` (numeric ``from``/``to`` node ids),
    * a row to ``nodeEdge_file`` (the full hyperlink record).

    The source node of every edge is ``article_name`` itself. It used to be
    guessed from whether the folder path contained "aylor", which mislabelled
    edges whenever the root path contained that substring and attributed any
    other article to Kanye_West.

    The module-level ``node_index``, ``node_id_counter`` and
    ``edge_id_counter`` are read and updated, so consecutive calls share one
    id space.

    Args:
        root_folder: Directory that contains one sub-folder per article.
        article_name: Sub-folder to crawl; also the source node name.
        nodes_file: Node CSV to append to (header written if the file is empty).
        edges_file: Edge CSV to append to (header written if the file is empty).
        nodeEdge_file: Detailed link CSV to append to (header written if empty).
    """
    global node_index, node_id_counter, edge_id_counter
    folder = os.path.join(root_folder, article_name)
    print("Crawling folder:", folder)
    source_name = article_name

    edge_fields = ["edgeId", "revId", "TimeStamp", "from", "to", "Year", "Month", "Day", "LinkType"]
    node_edge_fields = ["edgeId", "revId", "ParentId", "ArticleName", "TimeStamp", "Year", "Month", "Day", "Link", "LinkTitle", "LinkType"]

    # Open node, edge, and nodeEdge files for appending
    with open(nodes_file, mode='a', newline='', encoding='utf-8') as nodes_f, \
         open(edges_file, mode='a', newline='', encoding='utf-8') as edges_f, \
         open(nodeEdge_file, mode='a', newline='', encoding='utf-8') as nodeEdges_f:

        node_writer = csv.DictWriter(nodes_f, fieldnames=["nodeId", "name"])
        edge_writer = csv.DictWriter(edges_f, fieldnames=edge_fields)
        nodeEdge_writer = csv.DictWriter(nodeEdges_f, fieldnames=node_edge_fields)

        # A position of 0 in append mode means the file is empty, so this run
        # must write the header (and, for nodes, the two seed articles).
        if nodes_f.tell() == 0:
            node_writer.writeheader()
            node_writer.writerow({"nodeId": 1, "name": "Taylor_Swift"})
            node_writer.writerow({"nodeId": 2, "name": "Kanye_West"})
        if edges_f.tell() == 0:
            edge_writer.writeheader()
        if nodeEdges_f.tell() == 0:
            nodeEdge_writer.writeheader()

        pattern = os.path.join(folder, "**", "*.xml")
        xml_files = glob.glob(pattern, recursive=True)
        print("Found XML files:", len(xml_files))

        for xml_file in xml_files:
            # Best effort per file: a failing file is reported and skipped.
            # Rows written before the failure stay in the output files.
            try:
                for _event, elem in etree.iterparse(xml_file, tag='revision', events=('end',)):
                    for data in parse_revision(elem, article_name):
                        target_name = data["LinkTitle"]

                        # Register unseen nodes, source first, then target.
                        for name in (source_name, target_name):
                            if name not in node_index:
                                node_index[name] = node_id_counter
                                node_writer.writerow({"nodeId": node_id_counter, "name": name})
                                node_id_counter += 1

                        edge_writer.writerow({
                            "edgeId": edge_id_counter,
                            "revId": data["revId"],
                            "TimeStamp": data["TimeStamp"],
                            "from": node_index[source_name],
                            "to": node_index[target_name],
                            "Year": data["Year"],
                            "Month": data["Month"],
                            "Day": data["Day"],
                            "LinkType": data["LinkType"],
                        })

                        # Same edge id, but with the full hyperlink record.
                        node_edge_row = {key: data[key] for key in node_edge_fields if key != "edgeId"}
                        node_edge_row["edgeId"] = edge_id_counter
                        nodeEdge_writer.writerow(node_edge_row)
                        edge_id_counter += 1

                    # Free the processed element and its already-handled
                    # siblings so memory does not grow with the file size.
                    elem.clear()
                    while elem.getprevious() is not None:
                        del elem.getparent()[0]
            except Exception as e:
                print(f"Error processing {xml_file}: {e}")

In [17]:
# The revision dumps are read from "<two directories above the CWD>/data/<article_name>".
folder_path = os.path.dirname(os.path.dirname(os.getcwd())) + os.sep + "data"
# Both calls use the default output paths, so they append to the same
# node.csv, edge.csv and nodeEdge.csv and share one id space.
crawl_all_xml_files(folder_path, "Kanye_West")
crawl_all_xml_files(folder_path, "Taylor_Swift")

Crawling folder: /Users/Administrator/Desktop/OII/FSDS24/Groupwork/wiki_project/data/Kanye_West
Found XML files: 9642
Crawling folder: /Users/Administrator/Desktop/OII/FSDS24/Groupwork/wiki_project/data/Taylor_Swift
Found XML files: 19301


## Helpers

### Transforming columns name
Rename the columns to match the Gephi-ready schema.

In [19]:
# Rewrite node.csv in place with the Gephi-ready column names:
# nodeId -> Id, name -> Label.
df = pd.read_csv("node.csv").rename(columns={"nodeId": "Id", "name": "Label"})
df.to_csv("node.csv", index=False)


In [18]:
# Rewrite edge.csv in place with the Gephi-ready column names:
# from -> source, to -> target.
df = pd.read_csv("edge.csv").rename(columns={"to": "target", "from": "source"})
df.to_csv("edge.csv", index=False)


In [24]:
# Export the in-memory name -> id registry as an Id/Label lookup table.
# Kept for the cells that still reference the intermediate one-row frame.
node_index_df = pd.DataFrame(node_index, index=[0])
transformed_df = pd.DataFrame({
    "Id": list(node_index.values()),
    "Label": list(node_index.keys()),
})
# index=False matches every other export in this notebook and keeps a stray
# unnamed index column out of the file.
transformed_df.to_csv("nodes_indexed.csv", index=False)

In [26]:
def filter_and_aggregate_edges(edges_file, filterOffsetDate):
    """Collapse all edges up to a cutoff date into one weighted row per pair.

    Reads ``edges_file``, keeps rows whose ``TimeStamp`` is on or before
    midnight of ``filterOffsetDate``, and groups them by ``(source, target)``.
    ``weight`` is the number of rows in the group. ``edgeId``, ``Year``,
    ``Month``, ``Day`` and ``LinkType`` are taken from the chronologically
    first row of the group; the rows are sorted by time first because the
    file order follows the crawl order, not the revision order.

    Args:
        edges_file: Path to the edge CSV with ``source``/``target`` columns.
        filterOffsetDate: Inclusive cutoff date, formatted ``YYYY-MM-DD``.

    Writes:
        ``TK_Edge_<filterOffsetDate>.csv`` in the current working directory.
    """
    cutoff_date = datetime.strptime(filterOffsetDate, '%Y-%m-%d')
    df = pd.read_csv(edges_file)

    df['TimeStamp'] = pd.to_datetime(df['TimeStamp'])
    filtered_df = df[df['TimeStamp'] <= cutoff_date]
    # Stable sort: rows with identical timestamps keep their file order.
    filtered_df = filtered_df.sort_values(by='TimeStamp', kind='mergesort')

    aggregated_df = filtered_df.groupby(['source', 'target'], as_index=False).agg({
        'edgeId': 'first',
        'revId': 'count',
        'Year': 'first',
        'Month': 'first',
        'Day': 'first',
        'LinkType': 'first',
    })

    aggregated_df.rename(columns={'revId': 'weight'}, inplace=True)

    output_file = f"TK_Edge_{filterOffsetDate}.csv"
    aggregated_df.to_csv(output_file, index=False)
    print(f"Aggregated edges saved to {output_file}")


# Aggregate every edge recorded up to 2012-07-01 into TK_Edge_2012-07-01.csv.
filter_and_aggregate_edges(edges_file="edge.csv", filterOffsetDate="2012-07-01")


Aggregated edges saved to TK_Edge_2012-07-01.csv


### Helper function for filtering common (newly connected) nodes

    Filters nodes that have edges connecting to both "Taylor Swift" and "Kanye West" for the first time after the specified date and aggregates the edges by adding a weight.

    Args:
    - edges_file (str): The path to the edge.csv file.
    - filterOffsetDate (str): The cutoff date in the format 'YYYY-MM-DD'.

    Outputs:
    - A CSV file named TK_NewlyConnectedNodes_[filterOffsetDate].csv containing the filtered and aggregated edges.

    

In [40]:

def filter_newly_connected_nodes(edges_file, filterOffsetDate, taylor_id=1, kanye_id=2):
    """Export edges to targets that both articles first linked after a date.

    A target qualifies when the first appearance of its edge from
    ``taylor_id`` AND the first appearance of its edge from ``kanye_id`` are
    both later than ``filterOffsetDate``. All edges to qualifying targets that
    occur after the cutoff are then grouped by ``(source, target)``:
    ``weight`` counts the rows, and the remaining columns come from the
    chronologically first row of each group.

    Args:
        edges_file: Path to the edge CSV with ``source``/``target`` columns.
        filterOffsetDate: Exclusive cutoff date, formatted ``YYYY-MM-DD``.
        taylor_id: Node id of the first source article (default 1).
        kanye_id: Node id of the second source article (default 2).

    Writes:
        ``TK_NewlyConnectedNodes_<filterOffsetDate>.csv`` in the current
        working directory, with columns ``source``, ``target``, ``TimeStamp``,
        ``first_appeared_in_year``, ``first_appeared_in_month``, ``edgeId``,
        ``LinkType`` and ``weight``.
    """
    cutoff_date = datetime.strptime(filterOffsetDate, '%Y-%m-%d')
    df = pd.read_csv(edges_file)
    df['TimeStamp'] = pd.to_datetime(df['TimeStamp'])

    required_sources = {taylor_id, kanye_id}

    # Chronological order makes every 'first' below mean "first appearance";
    # the stable sort keeps file order for identical timestamps.
    df = df.sort_values(by='TimeStamp', kind='mergesort')
    first_appearance_df = df.groupby(['source', 'target'], as_index=False).first()
    filtered_df = first_appearance_df[first_appearance_df['TimeStamp'] > cutoff_date]

    def linked_from_both(group):
        return required_sources.issubset(group['source'].values)

    relevant_nodes = filtered_df.groupby('target').filter(linked_from_both)
    relevant_edges = df[df['target'].isin(relevant_nodes['target']) & (df['TimeStamp'] > cutoff_date)]
    aggregated_edges = relevant_edges.groupby(['source', 'target'], as_index=False).agg({
        'TimeStamp': 'first',
        'Year': 'first',
        'Month': 'first',
        'edgeId': 'first',
        'LinkType': 'first',
        'revId': 'count',
    }).rename(columns={
        'revId': 'weight',
        # The keys must match the capitalised column names; the lowercase
        # "year"/"month" used before never matched, so the rename was a no-op.
        'Year': 'first_appeared_in_year',
        'Month': 'first_appeared_in_month',
    })

    output_file = f"TK_NewlyConnectedNodes_{filterOffsetDate}.csv"
    aggregated_edges.to_csv(output_file, index=False)
    print(f"Filtered and aggregated edges saved to {output_file}")

# Export the targets that both articles first linked to after 2006-12-30.
# NOTE(review): the recorded output below names TK_NewlyConnectedNodes_2012-05-01.csv,
# so it appears to come from an earlier run with a different cutoff — re-run to refresh it.
filter_newly_connected_nodes(edges_file="edge.csv", filterOffsetDate="2006-12-30")


Filtered and aggregated edges saved to TK_NewlyConnectedNodes_2012-05-01.csv


### Helper function: Getting comparative datasets and unique newly emerged dataset

    Generates two CSV files:
    1. All unique edges with weights before the cutoff date, including target names.
    2. New edges formed within the specified day range after the cutoff date, excluding edges that appeared before.

    Args:
    - edges_file (str): The path to the edge.csv file.
    - cutoff_date (str): The cutoff date in the format 'YYYY-MM-DD'.
    - day_range (int): Number of days after the cutoff date to capture new edges.

    Outputs:
    - CSV file `UniqueEdgesBefore_[cutoff_date].csv` for edges before the cutoff date.
    - CSV file `NewEdgesWithinRange_[cutoff_date]_Range_[day_range].csv` for new edges in the specified range.

In [48]:
# Node id -> label lookup, used below to attach readable target names to edges.
nodes_df = pd.read_csv("nodes_indexed.csv")
id_to_label = {
    node_id: label
    for node_id, label in zip(nodes_df['Id'], nodes_df['Label'])
}


def generate_edge_csvs(edges_file, cutoff_date, day_range):
    """Write the edges known before a cutoff and the new ones right after it.

    Two CSV files are written to the current working directory:

    1. ``UniqueEdgesBefore_<cutoff_date>.csv``: one row per ``(source,
       target)`` pair seen strictly before the cutoff.
    2. ``NewEdgesWithinRange_<cutoff_date>_Range_<day_range>.csv``: one row
       per pair seen from the cutoff up to ``day_range`` days later that was
       NOT seen before the cutoff.

    In both files ``weight`` counts the rows of the pair, and ``TimeStamp``,
    ``edgeId`` and ``LinkType`` come from its chronologically first row.
    ``target_name`` is looked up in the module-level ``id_to_label`` mapping;
    a pair whose target has no label is kept with an empty name instead of
    being silently dropped by the grouping.

    Args:
        edges_file: Path to the edge CSV with ``source``/``target`` columns.
        cutoff_date: Cutoff date, formatted ``YYYY-MM-DD``.
        day_range: Number of days after the cutoff to scan for new edges.
    """
    cutoff_datetime = datetime.strptime(cutoff_date, '%Y-%m-%d')
    range_end_date = cutoff_datetime + timedelta(days=day_range)

    df = pd.read_csv(edges_file)
    df['TimeStamp'] = pd.to_datetime(df['TimeStamp'])
    # The file follows crawl order, so sort to make 'first' mean "earliest".
    # The stable sort keeps file order for identical timestamps.
    df = df.sort_values(by='TimeStamp', kind='mergesort')

    def summarise(frame):
        """Collapse ``frame`` to one weighted row per (source, target)."""
        summary = frame.groupby(['source', 'target'], as_index=False).agg({
            'TimeStamp': 'first',
            'edgeId': 'first',
            'LinkType': 'first',
            'revId': 'count',
        }).rename(columns={'revId': 'weight'})
        # target_name is a function of target, so it is attached after the
        # grouping; grouping on it would discard pairs with a missing label.
        summary.insert(2, 'target_name', summary['target'].map(id_to_label))
        return summary

    edges_before_cutoff = df[df['TimeStamp'] < cutoff_datetime]
    unique_edges_before = summarise(edges_before_cutoff)

    # Save unique edges before the cutoff date to CSV
    output_file_before = f"UniqueEdgesBefore_{cutoff_date}.csv"
    unique_edges_before.to_csv(output_file_before, index=False)
    print(f"Unique edges before {cutoff_date} saved to {output_file_before}")

    edges_after_cutoff = df[(df['TimeStamp'] >= cutoff_datetime) & (df['TimeStamp'] <= range_end_date)]

    # Exclude any edge that appeared in the prior dataset: a left merge with
    # an indicator column acts as an anti-join and keeps the left row order.
    known_pairs = edges_before_cutoff[['source', 'target']].drop_duplicates()
    flagged = edges_after_cutoff.merge(known_pairs, on=['source', 'target'], how='left', indicator=True)
    new_edges_in_range = flagged[flagged['_merge'] == 'left_only']
    aggregated_new_edges = summarise(new_edges_in_range)

    # Save new edges within the specified day range to CSV
    output_file_after = f"NewEdgesWithinRange_{cutoff_date}_Range_{day_range}.csv"
    aggregated_new_edges.to_csv(output_file_after, index=False)
    print(f"New edges within {day_range} days after {cutoff_date} saved to {output_file_after}")

# Compare the edges known before 2009-09-11 with the edges that first appear in the 30 days after it.
generate_edge_csvs(edges_file="edge.csv", cutoff_date="2009-09-11", day_range=30)


Unique edges before 2009-09-11 saved to UniqueEdgesBefore_2009-09-11.csv
New edges within 30 days after 2009-09-11 saved to NewEdgesWithinRange_2009-09-11_Range_30.csv
