## Standard Imports

In [3]:
import datetime as dt
import json
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

## Step 1: Data acquisition

In [68]:
# URL templates for the two Wikimedia REST APIs used to acquire the
# data: the legacy Pagecounts API and the newer Pageviews API. The
# {placeholders} are filled in with str.format(**params).
#
# Each template is built from adjacent string literals inside
# parentheses, which Python concatenates with nothing in between. A
# backslash continuation *inside* a single string literal would keep
# the next line's leading indentation as part of the URL.

endpoint_legacy = (
    'https://wikimedia.org/api/rest_v1/'
    'metrics/legacy/pagecounts/aggregate/'
    '{project}/{access-site}/{granularity}/{start}/{end}'
)

endpoint_pageviews = (
    'https://wikimedia.org/api/rest_v1/'
    'metrics/pageviews/aggregate/'
    '{project}/{access}/{agent}/{granularity}/{start}/{end}'
)

We first access the data from the legacy pagecounts API and save it as two separate JSON files.

In [39]:
# Boundaries of the legacy pagecounts period, kept as timestamp
# strings in the same form the API parameters use.

# Start of the period: the date the API was created.
start_date = '2008010100'

# End of the period: the last date for which the pagecounts API
# provides data, after which the pageviews API replaced it.
end_date = '2016073100'

We need two requests: one for the legacy pagecounts desktop-site traffic and one for the mobile-site traffic.

In [48]:
# Request parameters for the legacy pagecounts API. Every key matches
# a {placeholder} in the legacy endpoint template and is substituted
# into the URL with str.format(**params).
params = {
    'project': 'en.wikipedia.org',
    # Desktop traffic first; a later cell switches this to 'mobile-site'.
    'access-site': 'desktop-site',
    'granularity': 'monthly',
    'start': '2008070100',
    'end': '2016080100',
}

In [50]:
# Fetch desktop-site traffic from the legacy pagecounts API.
# The URL template is `endpoint_legacy`; the bare name `endpoint` is
# never assigned in the cells above, so using it raises NameError.
api_call = requests.get(endpoint_legacy.format(**params))

# Stop on an HTTP error status rather than saving the error payload
# to disk as if it were traffic data.
api_call.raise_for_status()
response = api_call.json()

# Write the JSON response to a file. The context manager closes the
# file even if json.dump raises part-way through.
with open('pagecounts_desktop-site_200801-201607.json', 'w') as json_file:
    json.dump(response, json_file, indent=4)

In [51]:
# Fetch mobile-site traffic from the legacy pagecounts API by reusing
# the existing parameters with only the access site changed.
params['access-site'] = 'mobile-site'

# The URL template is `endpoint_legacy`; the bare name `endpoint` is
# never assigned in the cells above, so using it raises NameError.
api_call = requests.get(endpoint_legacy.format(**params))

# Stop on an HTTP error status rather than saving the error payload
# to disk as if it were traffic data.
api_call.raise_for_status()
response = api_call.json()

# Write the JSON response to a file. The context manager closes the
# file even if json.dump raises part-way through.
with open('pagecounts_mobile-site_200801-201607.json', 'w') as json_file:
    json.dump(response, json_file, indent=4)

The next step is getting the data from the newer, more versatile Pageviews API. The pageviews API allows us to filter out data from spiders and only get traffic from actual users. 

In [70]:
# Request parameters for the Pageviews API. Every key matches a
# {placeholder} in the pageviews endpoint template.
params = {
    'project': 'en.wikipedia.org',
    # Desktop first; later cells switch this to the mobile access types.
    'access': 'desktop',
    # Per the notes in this notebook, this keeps traffic from actual
    # users and leaves out spiders.
    'agent': 'user',
    'granularity': 'monthly',
    'start': '2015070100',
    'end': '2018100100',
}

# Headers that identify the author of the requests to the API.
headers = {'User-Agent': 'https://github.com/tejasmhos', 'From': 'tejash@uw.edu'}

In [71]:
# Fetch desktop traffic from the Pageviews API.
# The identifying `headers` defined with the request parameters are
# sent along; without `headers=` they are built but never used.
api_call = requests.get(endpoint_pageviews.format(**params), headers=headers)

# Stop on an HTTP error status rather than saving the error payload
# to disk as if it were traffic data.
api_call.raise_for_status()
response = api_call.json()

# Write the JSON response to a file. The context manager closes the
# file even if json.dump raises part-way through.
with open('pageviews_desktop_201507-201810.json', 'w') as json_file:
    json.dump(response, json_file, indent=4)

In [72]:
# Fetch mobile-app traffic from the Pageviews API by reusing the
# existing parameters with only the access type changed.
params['access'] = 'mobile-app'

# The identifying `headers` defined with the request parameters are
# sent along; without `headers=` they are built but never used.
api_call = requests.get(endpoint_pageviews.format(**params), headers=headers)

# Stop on an HTTP error status rather than saving the error payload
# to disk as if it were traffic data.
api_call.raise_for_status()
response = api_call.json()

# Write the JSON response to a file. The context manager closes the
# file even if json.dump raises part-way through.
with open('pageviews_mobile-app_201507-201810.json', 'w') as json_file:
    json.dump(response, json_file, indent=4)

In [73]:
# Fetch mobile-web traffic from the Pageviews API by reusing the
# existing parameters with only the access type changed.
params['access'] = 'mobile-web'

# The identifying `headers` defined with the request parameters are
# sent along; without `headers=` they are built but never used.
api_call = requests.get(endpoint_pageviews.format(**params), headers=headers)

# Stop on an HTTP error status rather than saving the error payload
# to disk as if it were traffic data.
api_call.raise_for_status()
response = api_call.json()

# Write the JSON response to a file. The context manager closes the
# file even if json.dump raises part-way through.
with open('pageviews_mobile-web_201507-201810.json', 'w') as json_file:
    json.dump(response, json_file, indent=4)