# A1 - Data Curation

## Part 1 - Data Acquisition

In [1]:
import json
import requests

In [4]:
def call_api(endpoint, parameters):
    """Request an API endpoint and return its decoded JSON body.

    Args:
        endpoint: URL template containing ``{name}`` placeholders.
        parameters: Mapping whose keys fill the placeholders in ``endpoint``.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error payload is never mistaken for data and written
            to disk by the calling cell.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    # The notebook-level `headers` dict is sent with every request.
    call = requests.get(endpoint.format(**parameters), headers=headers, timeout=30)
    # Fail loudly on HTTP errors instead of returning the error document.
    call.raise_for_status()
    return call.json()

# Identification headers attached to every API request made by this notebook.
headers = {
    "User-Agent": "https://github.com/whamsy",
    "From": "whamsy@uw.edu",
}

### Legacy Page Counts Data -- (December 2007 to July 2016)

First, fetch the legacy data for desktop-site usage and save it as JSON:

In [5]:
# URL template for the legacy Pagecounts API; placeholders are filled from the parameter dict.
pagecounts_api = (
    'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/'
    '{project}/{access}/{granularity}/{start}/{end}'
)

# Monthly desktop-site counts for English Wikipedia.
# `end` is the first day of the month after the last month wanted (July 2016).
# NOTE(review): the section heading mentions December 2007, but `start` (and the
# output filename) begin at January 2008 -- confirm which range is intended.
legacy_desktop_params = dict(
    project='en.wikipedia.org',
    access='desktop-site',
    granularity='monthly',
    start='2008010100',
    end='2016080100',
)

desktop_data = call_api(pagecounts_api, legacy_desktop_params)

# Save the raw API response unchanged.
with open('pagecounts_desktop-site_200801-201607.json', mode='w') as outfile:
    json.dump(desktop_data, outfile)

Next, fetch the legacy data for mobile-site usage and save it as JSON:

In [6]:
# URL template for the legacy Pagecounts API; placeholders are filled from the parameter dict.
pagecounts_api = (
    'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/'
    '{project}/{access}/{granularity}/{start}/{end}'
)

# Monthly mobile-site counts for English Wikipedia.
# `end` is the first day of the month after the last month wanted (July 2016).
# NOTE(review): the section heading mentions December 2007, but `start` (and the
# output filename) begin at January 2008 -- confirm which range is intended.
legacy_mobile_params = dict(
    project='en.wikipedia.org',
    access='mobile-site',
    granularity='monthly',
    start='2008010100',
    end='2016080100',
)

mobile_data = call_api(pagecounts_api, legacy_mobile_params)

# Save the raw API response unchanged.
with open('pagecounts_mobile-site_200801-201607.json', mode='w') as outfile:
    json.dump(mobile_data, outfile)

### Page Views Data -- (July 2015 to Sep 2018)

Note: All Pageviews requests below are filtered with `agent` set to `user`, so that only organic traffic is included.

First, fetch the Pageviews data for desktop usage and save it as JSON:

In [7]:
# URL template for the Pageviews API; placeholders are filled from the parameter dict.
pageviews_api = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
    '{project}/{access}/{agent}/{granularity}/{start}/{end}'
)

# Monthly desktop views for English Wikipedia, restricted to agent "user".
# `end` is the first day of the month after the last month wanted (September 2018).
pageviews_desktop_params = dict(
    project='en.wikipedia.org',
    access='desktop',
    agent='user',
    granularity='monthly',
    start='2015070100',
    end='2018100100',
)

desktop_data_pageviews = call_api(pageviews_api, pageviews_desktop_params)

# Save the raw API response unchanged.
with open('pageviews_desktop-site_201507-201809.json', mode='w') as outfile:
    json.dump(desktop_data_pageviews, outfile)

Next, fetch the Pageviews data for mobile-web usage and save it as JSON:

In [8]:
# URL template for the Pageviews API; placeholders are filled from the parameter dict.
pageviews_api = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
    '{project}/{access}/{agent}/{granularity}/{start}/{end}'
)

# Monthly mobile-web views for English Wikipedia, restricted to agent "user".
# `end` is the first day of the month after the last month wanted (September 2018).
pageviews_mobile_web_params = dict(
    project='en.wikipedia.org',
    access='mobile-web',
    agent='user',
    granularity='monthly',
    start='2015070100',
    end='2018100100',
)

mobile_web_data_pageviews = call_api(pageviews_api, pageviews_mobile_web_params)

# Save the raw API response unchanged.
with open('pageviews_mobile-web_201507-201809.json', mode='w') as outfile:
    json.dump(mobile_web_data_pageviews, outfile)
    

Finally, fetch the Pageviews data for mobile-app usage and save it as JSON:

In [9]:
# URL template for the Pageviews API; placeholders are filled from the parameter dict.
pageviews_api = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
    '{project}/{access}/{agent}/{granularity}/{start}/{end}'
)

# Monthly mobile-app views for English Wikipedia, restricted to agent "user".
# `end` is the first day of the month after the last month wanted (September 2018).
pageviews_mobile_app_params = dict(
    project='en.wikipedia.org',
    access='mobile-app',
    agent='user',
    granularity='monthly',
    start='2015070100',
    end='2018100100',
)

mobile_app_data_pageviews = call_api(pageviews_api, pageviews_mobile_app_params)

# Save the raw API response unchanged.
with open('pageviews_mobile-app_201507-201809.json', mode='w') as outfile:
    json.dump(mobile_app_data_pageviews, outfile)

In [2]:
# URL template for the legacy Pagecounts API (note the `access-site` placeholder).
endpoint_legacy = (
    'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/'
    '{project}/{access-site}/{granularity}/{start}/{end}'
)

# URL template for the Pageviews API, which additionally takes an `agent` segment.
endpoint_pageviews = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/'
    '{project}/{access}/{agent}/{granularity}/{start}/{end}'
)

In [4]:
 #SAMPLE parameters for getting aggregated legacy view data 
# see: https://wikimedia.org/api/rest_v1/#!/Legacy_data/get_metrics_legacy_pagecounts_aggregate_project_access_site_granularity_start_end
example_params_legacy = {
    "project": "en.wikipedia.org",
    "access-site": "desktop-site",
    "granularity": "monthly",
    "start": "2001010100",
    # `end` must be the first day of the month after the last month of data wanted
    "end": "2018100100",
}

# SAMPLE parameters for getting aggregated current standard pageview data
# see: https://wikimedia.org/api/rest_v1/#!/Pageviews_data/get_metrics_pageviews_aggregate_project_access_agent_granularity_start_end
example_params_pageviews = {
    "project": "en.wikipedia.org",
    "access": "desktop",
    "agent": "user",
    "granularity": "monthly",
    "start": "2001010100",
    # `end` must be the first day of the month after the last month of data
    # wanted. The previous value '2018101000' (10 October) had two digits
    # transposed; 1 October 2018 is the intended boundary.
    "end": "2018100100",
}

# Customize these with your own information
# Identification headers attached to every API request made by this notebook.
headers = {
    "User-Agent": "https://github.com/whamsy",
    "From": "whamsy@uw.edu",
}

In [5]:
def api_call(endpoint, parameters):
    """Request an API endpoint and return its decoded JSON body.

    Args:
        endpoint: URL template containing ``{name}`` placeholders.
        parameters: Mapping whose keys fill the placeholders in ``endpoint``.

    Returns:
        The JSON-decoded response body.

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status, so
            that an error payload is never mistaken for data.
        requests.Timeout: If the server does not respond within 30 seconds.
    """
    # The notebook-level `headers` dict is sent with every request.
    call = requests.get(endpoint.format(**parameters), headers=headers, timeout=30)
    # Fail loudly on HTTP errors instead of returning the error document.
    call.raise_for_status()
    return call.json()

In [6]:
# Fetch the sample Pageviews data using the example endpoint template and parameters.
example_monthly_pageviews = api_call(endpoint_pageviews, example_params_pageviews)

In [7]:
# Print the raw response to inspect its structure.
# NOTE(review): this dumps the entire payload; consider showing only the first few items.
print(example_monthly_pageviews)

{'items': [{'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015070100', 'views': 4376666686}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015080100', 'views': 4332482183}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015090100', 'views': 4485491704}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015100100', 'views': 4477532755}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015110100', 'views': 4287720220}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2015120100', 'views': 4100012037}, {'project': 'en.wikipedia', 'access': 'desktop', 'agent': 'user', 'granularity': 'monthly', 'timestamp': '2016010100', 'views': 4436179457}, {'

In [8]:
# Fetch the sample legacy Pagecounts data using the example endpoint template and parameters.
example_monthly_legacy = api_call(endpoint_legacy, example_params_legacy)

In [9]:
# Print the raw response to inspect its structure.
# NOTE(review): this dumps the entire payload; consider showing only the first few items.
print(example_monthly_legacy)

{'items': [{'project': 'en.wikipedia', 'access-site': 'desktop-site', 'granularity': 'monthly', 'timestamp': '2007120100', 'count': 2998331524}, {'project': 'en.wikipedia', 'access-site': 'desktop-site', 'granularity': 'monthly', 'timestamp': '2008010100', 'count': 4930902570}, {'project': 'en.wikipedia', 'access-site': 'desktop-site', 'granularity': 'monthly', 'timestamp': '2008020100', 'count': 4818393763}, {'project': 'en.wikipedia', 'access-site': 'desktop-site', 'granularity': 'monthly', 'timestamp': '2008030100', 'count': 4955405809}, {'project': 'en.wikipedia', 'access-site': 'desktop-site', 'granularity': 'monthly', 'timestamp': '2008040100', 'count': 5159162183}, {'project': 'en.wikipedia', 'access-site': 'desktop-site', 'granularity': 'monthly', 'timestamp': '2008050100', 'count': 5584691092}, {'project': 'en.wikipedia', 'access-site': 'desktop-site', 'granularity': 'monthly', 'timestamp': '2008060100', 'count': 5712104279}, {'project': 'en.wikipedia', 'access-site': 'desktop