Import necessary libraries

In [1]:
import json 
import time
import pandas as pd
import requests
import urllib.parse

Import dataset with movie titles

In [2]:
df = pd.read_excel('thank_the_academy.AUG.2023.csv.xlsx')

In [141]:
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'
API_LATENCY_ASSUMED = 0.002       # Assuming roughly 2ms latency on the API and network
API_THROTTLE_WAIT = (1.0/100.0)-API_LATENCY_ASSUMED

REQUEST_HEADERS = {
    'User-Agent': '<vaibhav1@uw.edu>, University of Washington, MSDS DATA 512 - AUTUMN 2023',
}

def collect_data(access='desktop'):
    records = dict()
    for title in df.name:
        ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE = {
        "project":     "en.wikipedia.org",
        "access":      access,      # this should be changed for the different access types
        "agent":       "user",
        "article":     title,
        "granularity": "monthly",
        "start":       "20150701",   # start and end dates need to be set
        "end":         "20231030"    # this is likely the wrong end date
        }
        views = request_pageviews_per_article(request_template=ARTICLE_PAGEVIEWS_PARAMS_TEMPLATE)
        if 'items' not in views.keys():
            print(f"No data found for {title}")
            print(f"Return message {views}")
            continue
        records[title] = views['items']
    return records

This code example was developed by Vaibhav Mehrotra and largely reuses code written by Dr. David W. McDonald for use in DATA 512, a course in the UW MS Data Science degree program. This code is provided under the Creative Commons CC-BY license. Revision 1.2 - October 8, 2023

In [142]:
API_REQUEST_PER_ARTICLE_PARAMS = 'per-article/{project}/{access}/{agent}/{article}/{granularity}/{start}/{end}'
API_REQUEST_PAGEVIEWS_ENDPOINT = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'

def request_pageviews_per_article(article_title = None, 
                                  endpoint_url = API_REQUEST_PAGEVIEWS_ENDPOINT, 
                                  endpoint_params = API_REQUEST_PER_ARTICLE_PARAMS, 
                                  request_template = None,
                                  headers = REQUEST_HEADERS):

    if article_title:
        request_template['article'] = article_title

    if not request_template['article']:
        raise Exception("Must supply an article title to make a pageviews request.")

    article_title_encoded = urllib.parse.quote(request_template['article'].replace(' ','_'))
    request_template['article'] = article_title_encoded
    
    request_url = endpoint_url+endpoint_params.format(**request_template)
    
    
    try:
        if API_THROTTLE_WAIT > 0.0:
            time.sleep(API_THROTTLE_WAIT)
        response = requests.get(request_url, headers=headers)
        json_response = response.json()
    except Exception as e:
        print(e)
        json_response = None
    return json_response

Run data collection functions

In [144]:
desktop_records = collect_data('desktop')
mobile_app_records = collect_data('mobile-app')

No data found for Victor/Victoria
Return message {'type': 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found#route', 'title': 'Not found.', 'method': 'get', 'uri': '/wikimedia.org/v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-app/user/Victor/Victoria/monthly/20150701/20231030'}


In [145]:
mobile_web_records = collect_data('mobile-web')
mobile_records = mobile_app_records.copy()

No data found for Victor/Victoria
Return message {'type': 'https://mediawiki.org/wiki/HyperSwitch/errors/not_found#route', 'title': 'Not found.', 'method': 'get', 'uri': '/wikimedia.org/v1/metrics/pageviews/per-article/en.wikipedia.org/mobile-web/user/Victor/Victoria/monthly/20150701/20231030'}


> The title Victor/Victoria was not found on Wikipedia for desktop, mobile-app and mobile-web

Remove access key from the data entries

In [146]:
for title in desktop_records.keys():
    for entry in desktop_records[title]:
        entry.pop('access', None)
        
for title in mobile_app_records.keys():
    for entry in mobile_app_records[title]:
        entry.pop('access', None)
        
for title in mobile_web_records.keys():
    for entry in mobile_web_records[title]:
        entry.pop('access', None)

Combine mobile-app and mobile-web entries into a mobile dict

In [147]:
for title in mobile_app_records.keys():
    for idx, entry in enumerate(mobile_app_records[title]):
        mobile_records[title][idx]['views'] = entry['views'] + mobile_web_records[title][idx]['views']

Save monthly data

In [148]:
import json
with open('academy_monthly_mobile_201507-202310.json', 'w') as f:
    json.dump(mobile_records, f)
with open('academy_monthly_desktop_201507-202310.json', 'w') as f:
    json.dump(desktop_records, f)

Calculate cumilative data for both desktop and mobile views and save the data

In [150]:
cumulative_records = desktop_records.copy()

for title in desktop_records.keys():
    for idx, entry in enumerate(desktop_records[title]):
        cumulative_records[title][idx]['views'] = entry['views'] + mobile_records[title][idx]['views']

In [151]:
with open('academy_monthly_cumulative_201507-202310.json', 'w') as f:
    json.dump(cumulative_records, f)