In [43]:
import json, time, requests
from dotenv import load_dotenv
import os
import time
import pandas as pd
import math

In [9]:
################################################################################################ 
# The code and comments below are adapted, with light modifications, from Dr. David McDonald,
# who provided them for use in DATA 512, a course in the University of Washington MS in Data
# Science program. The code is provided and used here
# under the Creative Commons CC-BY license.
################################################################################################

#########
#
#    CONSTANTS
#

# LiftWing ORES inference endpoint; `{model_name}` is filled in when a request is built
API_ORES_LIFTWING_ENDPOINT = "https://api.wikimedia.org/service/lw/inference/v1/models/{model_name}:predict"
# Prediction model used for English Wikipedia article quality
API_ORES_EN_QUALITY_MODEL = "enwiki-articlequality"

#
#    Throttling: the allowed request rate depends on the access token that was granted.
#    The key authorizes 5000 requests per hour, so the wait between requests is one
#    hour divided by 5000, minus an assumed ~2ms of API/network latency.
#
API_LATENCY_ASSUMED = 0.002
API_THROTTLE_WAIT = (60.0 * 60.0) / 5000.0 - API_LATENCY_ASSUMED

#
#    Header template for ORES requests. The `{email_address}` and `{access_token}`
#    placeholders are substituted with str.format() when a request is made.
#
REQUEST_HEADER_TEMPLATE = {
    'User-Agent':    "<{email_address}>, University of Washington, MSDS DATA 512 - AUTUMN 2024",
    'Content-Type':  'application/json',
    'Authorization': "Bearer {access_token}",
}

#
#    Default values substituted into the header template above.
#    The access token is intentionally blank here and must be supplied at call time.
#
REQUEST_HEADER_PARAMS_TEMPLATE = dict(
    email_address="dvogler@uw.edu",
    access_token="",
)

#
#    Payload template for a scoring request to the ORES model.
#    `rev_id` is blank here because every request needs its own revision id.
#
ORES_REQUEST_DATA_TEMPLATE = dict(
    lang="en",        # English Wikipedia revisions are being scored
    rev_id="",
    features=True,
)

#
#    Placeholders that are assigned real values later in the notebook
#
USERNAME = ACCESS_TOKEN = ""

Aside from the constants above, one more input is needed to make the API requests to ORES for article quality scores: a mapping of article names to their latest revision IDs. For this, I use the ordinary page info APIs to define `ARTICLE_REVISIONS`.

In [10]:
######### CONSTANTS

# Endpoint of the English Wikipedia Action API
API_ENWIKIPEDIA_ENDPOINT = "https://en.wikipedia.org/w/api.php"
# Name of the request header that identifies the caller
API_HEADER_AGENT = 'User-Agent'

# Extra page-info properties requested through `inprop`, joined with "|"
PAGEINFO_EXTENDED_PROPERTIES = "|".join(["talkid", "url", "watched", "watchers"])

# Headers sent with page-info requests; the agent string should include a contact email
REQUEST_HEADERS = {
    API_HEADER_AGENT: '<dvogler@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2024',
}

# Query parameters for a page-info request; `titles` is filled in per request
PAGEINFO_PARAMS_TEMPLATE = dict(
    action="query",
    format="json",
    titles="",
    prop="info",
    inprop=PAGEINFO_EXTENDED_PROPERTIES,
)

#########
#
#    PROCEDURES/FUNCTIONS
#

def request_pageinfo_per_article(article_title = None,
                                 endpoint_url = API_ENWIKIPEDIA_ENDPOINT,
                                 request_template = PAGEINFO_PARAMS_TEMPLATE,
                                 headers = REQUEST_HEADERS,
                                 timeout = 30.0):
    """Request page-info metadata for one title (or a '|'-separated batch of titles).

    Parameters
    ----------
    article_title : str, optional
        Title(s) to look up. When omitted, the 'titles' value already present
        in `request_template` is used.
    endpoint_url : str
        URL the GET request is sent to.
    request_template : dict
        Query parameters for the request. It is copied, never modified.
    headers : dict
        HTTP headers; must contain a 'User-Agent' entry with a real email address.
    timeout : float
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    dict or None
        The decoded JSON response, or None when the request or the JSON
        decoding raised (the error is printed).

    Raises
    ------
    Exception
        If no title is available, or the headers lack a usable 'User-Agent'.
    """
    # Work on a copy: writing the title into the shared module-level template
    # would make a later call without a title silently reuse this one.
    request_params = dict(request_template)

    # article title can be a parameter to the call or come from the request_template
    if article_title:
        request_params['titles'] = article_title

    if not request_params.get('titles'):
        raise Exception("Must supply an article title to make a pageinfo request.")

    if API_HEADER_AGENT not in headers:
        raise Exception(f"The header data should include a '{API_HEADER_AGENT}' field that contains your UW email address.")

    # Reject the placeholder address that ships with the course template
    if 'uwnetid@uw' in headers[API_HEADER_AGENT]:
        raise Exception(f"Use your UW email address in the '{API_HEADER_AGENT}' field.")

    # make the request
    try:
        # Wait before the request rather than after it, so the rate limit is
        # respected even when the request itself raises.
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # The timeout keeps a stalled connection from hanging the notebook forever
        response = requests.get(endpoint_url, headers=headers, params=request_params, timeout=timeout)
        json_response = response.json()
    except Exception as e:
        # Best effort: report the problem and let the caller handle a None result
        print(e)
        json_response = None
    return json_response


In [24]:
# Smoke test: fetch the page info for a single, well-known article and pretty-print it.
info = request_pageinfo_per_article(article_title="Donald Trump")
print(json.dumps(info["query"]["pages"], indent=4))

{
    "4848272": {
        "pageid": 4848272,
        "ns": 0,
        "title": "Donald Trump",
        "contentmodel": "wikitext",
        "pagelanguage": "en",
        "pagelanguagehtmlcode": "en",
        "pagelanguagedir": "ltr",
        "touched": "2024-10-13T20:17:17Z",
        "lastrevid": 1251002499,
        "length": 415035,
        "watchers": 3912,
        "talkid": 570793,
        "fullurl": "https://en.wikipedia.org/wiki/Donald_Trump",
        "editurl": "https://en.wikipedia.org/w/index.php?title=Donald_Trump&action=edit",
        "canonicalurl": "https://en.wikipedia.org/wiki/Donald_Trump"
    }
}


In [52]:
def make_batch(names):
    """Combine an iterable of article names into one ' | '-delimited string.

    The result is the text passed as a batched title lookup to the page-info
    request; an empty iterable yields an empty string.
    """
    return " | ".join(names)

In [66]:
cleaned_politicians_path = "../cleaned_data/politicians_by_country_AUG_2024_clean.csv"

politicians = pd.read_csv(cleaned_politicians_path)

# Number of article titles sent in a single page-info request
# (presumably the API's per-request title limit - confirm against the API docs)
BATCH_SIZE_LIMIT = 50

iterations = math.ceil(len(politicians) / BATCH_SIZE_LIMIT)

# Maps article title (as returned by the API) -> latest revision id,
# or None when the API reported no revision for that title
ARTICLE_REVISIONS = {}

for iteration in range(iterations):

    print(f"Starting iteration {iteration} out of {iterations}")
    start = iteration * BATCH_SIZE_LIMIT
    # .loc slicing is label-based and includes the end label, hence the -1
    end = start + BATCH_SIZE_LIMIT - 1

    names = politicians.loc[start:end, "name"]

    batch_key = make_batch(names)

    batch_info = request_pageinfo_per_article(batch_key)

    # The request helper returns None on failure; report it and move on instead
    # of crashing with an opaque "'NoneType' is not subscriptable" error.
    payload = (batch_info or {}).get("query", {}).get("pages")
    if not payload:
        print(f"Warning: no page info returned for batch {iteration}; skipping {len(names)} names")
        continue

    for page in payload.values():

        # Read the title first so a page without a revision id is reported and
        # stored under its own title, not under the previous page's title.
        title = page.get("title")
        if title is None:
            print(f"Warning: page entry without a title skipped: {page}")
            continue

        revid = page.get("lastrevid")
        if revid is None:
            print(f"Warning: no revid found for article with title {title}")

        ARTICLE_REVISIONS[title] = revid
    





Starting iteration 0 out of 144
Starting iteration 1 out of 144
Starting iteration 2 out of 144
Starting iteration 3 out of 144
Starting iteration 4 out of 144
Starting iteration 5 out of 144
Starting iteration 6 out of 144
Starting iteration 7 out of 144
Starting iteration 8 out of 144
Starting iteration 9 out of 144
Starting iteration 10 out of 144
Starting iteration 11 out of 144
Starting iteration 12 out of 144
Starting iteration 13 out of 144
Starting iteration 14 out of 144
Starting iteration 15 out of 144
Starting iteration 16 out of 144
Starting iteration 17 out of 144
Starting iteration 18 out of 144
Starting iteration 19 out of 144
Starting iteration 20 out of 144
Starting iteration 21 out of 144
Starting iteration 22 out of 144
Starting iteration 23 out of 144
Starting iteration 24 out of 144
Starting iteration 25 out of 144
Starting iteration 26 out of 144
Starting iteration 27 out of 144
Starting iteration 28 out of 144
Starting iteration 29 out of 144
Starting iteration 3

In [68]:
# Save the title -> revision-id mapping to disk as indented JSON.
with open("../cleaned_data/article_revisions.json", mode="w") as file:
    json.dump(ARTICLE_REVISIONS, file, indent=4)

In [69]:
USERNAME = "voglerdaniel"

# The access token is read from the environment (populated from a local .env
# file by load_dotenv) so it is never hardcoded in the notebook.
load_dotenv()
ACCESS_TOKEN = os.environ.get("ACCESS_TOKEN")

In [70]:
#########
#
#    PROCEDURES/FUNCTIONS
#

def request_ores_score_per_article(article_revid = None, email_address=None, access_token=None,
                                   endpoint_url = API_ORES_LIFTWING_ENDPOINT,
                                   model_name = API_ORES_EN_QUALITY_MODEL,
                                   request_data = ORES_REQUEST_DATA_TEMPLATE,
                                   header_format = REQUEST_HEADER_TEMPLATE,
                                   header_params = REQUEST_HEADER_PARAMS_TEMPLATE,
                                   timeout = 30.0):
    """Request an ORES (LiftWing) score for a single article revision.

    Values passed directly (`article_revid`, `email_address`, `access_token`)
    take priority over those found in `request_data` / `header_params`.

    Parameters
    ----------
    article_revid : int or str, optional
        Revision id to score.
    email_address : str, optional
        Contact email substituted into the header template.
    access_token : str, optional
        Bearer token substituted into the header template.
    endpoint_url : str
        Endpoint URL containing a `{model_name}` placeholder.
    model_name : str
        Model name substituted into `endpoint_url`.
    request_data : dict
        Payload template. It is copied, never modified.
    header_format : dict
        Header template whose values are str.format() strings.
    header_params : dict
        Values for the header template. It is copied, never modified.
    timeout : float
        Seconds to wait for the server before giving up on the request.

    Returns
    -------
    dict or None
        The decoded JSON response, or None when the request or the JSON
        decoding raised (the error is printed).

    Raises
    ------
    Exception
        If the revision id, the email address or the access token is missing.
    """
    # Work on copies of the shared module-level templates. Mutating them would
    # (a) make a later call with an empty/None revision id silently re-score the
    # previous call's revision, and (b) leave the access token stored in a
    # module-level dict.
    payload = dict(request_data)
    params = dict(header_params)

    #    This approach prioritizes the parameters passed in when making the call
    if article_revid:
        payload['rev_id'] = article_revid
    if email_address:
        params['email_address'] = email_address
    if access_token:
        params['access_token'] = access_token

    #   Making a request requires a revision id - an email address - and the access token
    if not payload.get('rev_id'):
        raise Exception("Must provide an article revision id (rev_id) to score articles")
    if not params.get('email_address'):
        raise Exception("Must provide an 'email_address' value")
    if not params.get('access_token'):
        raise Exception("Must provide an 'access_token' value")

    # Create the request URL with the specified model parameter - default is an article quality score request
    request_url = endpoint_url.format(model_name=model_name)

    # Create a compliant request header from the template and the supplied parameters
    headers = {str(key): template.format(**params) for key, template in header_format.items()}

    # make the request
    try:
        # Wait before the request rather than after it, so the rate limit is
        # respected even when the request itself raises.
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        # The timeout keeps a stalled connection from hanging the notebook forever
        response = requests.post(request_url, headers=headers, data=json.dumps(payload), timeout=timeout)
        json_response = response.json()
    except Exception as e:
        # Best effort: report the problem and let the caller handle a None result
        print(e)
        json_response = None
    return json_response


In [74]:
# Spot-check: score the stored revision of the politician in the row labeled 1.
score = request_ores_score_per_article(
    article_revid=ARTICLE_REVISIONS[politicians.loc[1, "name"]],
    email_address="dvogler@uw.edu",
    access_token=ACCESS_TOKEN,
)

In [75]:
# Display the raw ORES response from the spot-check request for inspection.
score

{'enwiki': {'models': {'articlequality': {'version': '0.9.2'}},
  'scores': {'1230459615': {'articlequality': {'score': {'prediction': 'B',
      'probability': {'B': 0.41680778937422464,
       'C': 0.3779375158384741,
       'FA': 0.057958708381212594,
       'GA': 0.08901185936511881,
       'Start': 0.052924970659449413,
       'Stub': 0.0053591563815205195}}}}}}}