# <center> A1 - Data Curation </center>

## Part 1 - Data Acquisition

In [1]:
import json
import requests

In [4]:
def call_api(endpoint, parameters, timeout=30):
    """Query a Wikimedia REST endpoint and return the decoded JSON body.

    Parameters
    ----------
    endpoint : str
        URL template containing ``{name}`` placeholders.
    parameters : dict
        Values substituted into the placeholders of ``endpoint``.
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    The parsed JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status. Failing here prevents
        an error payload from being written to disk as if it were data.
    """
    url = endpoint.format(**parameters)
    # `headers` is the module-level dict identifying this client to the API.
    call = requests.get(url, headers=headers, timeout=timeout)
    call.raise_for_status()
    return call.json()

# Identification headers passed along with every request issued by call_api.
headers = dict([
    ('User-Agent', 'https://github.com/whamsy'),
    ('From', 'whamsy@uw.edu'),
])

### Legacy Page Counts Data -- (January 2008 to July 2016)

First getting the legacy data for Desktop Site usage and dumping to JSON:

In [5]:
# URL template for the legacy Pagecounts aggregate endpoint; call_api fills in
# the {placeholders} from the parameter dict below.
pagecounts_api = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access}/{granularity}/{start}/{end}'

# Monthly desktop-site counts for English Wikipedia.
legacy_desktop_params = dict(
    project='en.wikipedia.org',
    access='desktop-site',
    granularity='monthly',
    start='2008010100',
    end='2016080100',
)

# Fetch the response and keep the raw JSON on disk.
desktop_data = call_api(pagecounts_api, legacy_desktop_params)
with open('pagecounts_desktop-site_200801-201607.json', mode='w') as outfile:
    json.dump(desktop_data, fp=outfile)

Next getting the legacy data for Mobile Site usage and dumping to JSON:

In [6]:
# URL template for the legacy Pagecounts aggregate endpoint; call_api fills in
# the {placeholders} from the parameter dict below.
pagecounts_api = 'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate/{project}/{access}/{granularity}/{start}/{end}'

# Monthly mobile-site counts for English Wikipedia.
legacy_mobile_params = dict(
    project='en.wikipedia.org',
    access='mobile-site',
    granularity='monthly',
    start='2008010100',
    end='2016080100',
)

# Fetch the response and keep the raw JSON on disk.
mobile_data = call_api(pagecounts_api, legacy_mobile_params)
with open('pagecounts_mobile-site_200801-201607.json', mode='w') as outfile:
    json.dump(mobile_data, fp=outfile)

### Page Views Data -- (July 2015 to Sep 2018)

Note: Requests are filtered with `agent=user` so that only organic (user-generated) traffic is included.

First getting the Pageviews data for Desktop usage and dumping to JSON:

In [7]:
# URL template for the Pageviews aggregate endpoint; call_api fills in the
# {placeholders} from the parameter dict below.
pageviews_api = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

# Monthly desktop pageviews for English Wikipedia, restricted to agent 'user'.
pageviews_desktop_params = dict(
    project='en.wikipedia.org',
    access='desktop',
    agent='user',
    granularity='monthly',
    start='2015070100',
    end='2018100100',
)

# Fetch the response and keep the raw JSON on disk.
desktop_data_pageviews = call_api(pageviews_api, pageviews_desktop_params)
with open('pageviews_desktop-site_201507-201809.json', mode='w') as outfile:
    json.dump(desktop_data_pageviews, fp=outfile)

Next getting the Pageviews data for Mobile Web usage and dumping to JSON:

In [8]:
# URL template for the Pageviews aggregate endpoint; call_api fills in the
# {placeholders} from the parameter dict below.
pageviews_api = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

# Monthly mobile-web pageviews for English Wikipedia, restricted to agent 'user'.
pageviews_mobile_web_params = dict(
    project='en.wikipedia.org',
    access='mobile-web',
    agent='user',
    granularity='monthly',
    start='2015070100',
    end='2018100100',
)

# Fetch the response and keep the raw JSON on disk.
mobile_web_data_pageviews = call_api(pageviews_api, pageviews_mobile_web_params)
with open('pageviews_mobile-web_201507-201809.json', mode='w') as outfile:
    json.dump(mobile_web_data_pageviews, fp=outfile)
    

Finally getting the Pageviews data for Mobile App usage and dumping to JSON:

In [9]:
# URL template for the Pageviews aggregate endpoint; call_api fills in the
# {placeholders} from the parameter dict below.
pageviews_api = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate/{project}/{access}/{agent}/{granularity}/{start}/{end}'

# Monthly mobile-app pageviews for English Wikipedia, restricted to agent 'user'.
pageviews_mobile_app_params = dict(
    project='en.wikipedia.org',
    access='mobile-app',
    agent='user',
    granularity='monthly',
    start='2015070100',
    end='2018100100',
)

# Fetch the response and keep the raw JSON on disk.
mobile_app_data_pageviews = call_api(pageviews_api, pageviews_mobile_app_params)
with open('pageviews_mobile-app_201507-201809.json', mode='w') as outfile:
    json.dump(mobile_app_data_pageviews, fp=outfile)

## Part 2 - Data Processing

### Combine the monthly values for mobile-app and mobile-web from PageViews

In [13]:
import json
import pandas as pd

# json_normalize lives at the top level of pandas since 1.0; the
# pandas.io.json path is deprecated and is kept here only as a fallback
# for older installations.
try:
    from pandas import json_normalize
except ImportError:
    from pandas.io.json import json_normalize

In [24]:
# Read the saved mobile-app pageviews response back from disk.
with open('pageviews_mobile-app_201507-201809.json', mode='r') as f1:
    d1 = json.loads(f1.read())

# Flatten the records stored under 'items' into a DataFrame.
pv_mobile_app_data = json_normalize(d1['items'])

In [25]:
# Read the saved mobile-web pageviews response back from disk.
with open('pageviews_mobile-web_201507-201809.json', mode='r') as f2:
    d2 = json.loads(f2.read())

# Flatten the records stored under 'items' into a DataFrame.
pv_mobile_web_data = json_normalize(d2['items'])

In [48]:
# Work on a copy: plain assignment would only alias pv_mobile_app_data, so the
# addition would overwrite the app-only figures and every re-run of this cell
# would add the mobile-web views one more time.
pv_mobile_traffic_data = pv_mobile_app_data.copy()

# The sum pairs rows by index label, so both frames must list the same months
# in the same order; fail loudly instead of silently adding mismatched rows.
assert pv_mobile_app_data['timestamp'].equals(pv_mobile_web_data['timestamp']), \
    'mobile-app and mobile-web timestamps do not line up'

# Total mobile views = app views + web views, computed from the untouched
# inputs so the cell gives the same result no matter how often it is run.
pv_mobile_traffic_data['views'] = pv_mobile_app_data['views'] + pv_mobile_web_data['views']

The `pv_mobile_traffic_data` dataframe now holds the combined views for mobile web and mobile app. Next, the remaining JSON files are converted into dataframes:

In [31]:
# Legacy pagecounts, desktop site: read the saved JSON and flatten its 'items'.
with open('pagecounts_desktop-site_200801-201607.json', mode='r') as f3:
    d3 = json.loads(f3.read())
pagecount_desktop_site_data = json_normalize(d3['items'])

# Legacy pagecounts, mobile site.
with open('pagecounts_mobile-site_200801-201607.json', mode='r') as f4:
    d4 = json.loads(f4.read())
pagecount_mobile_site_data = json_normalize(d4['items'])

# Pageviews, desktop.
with open('pageviews_desktop-site_201507-201809.json', mode='r') as f5:
    d5 = json.loads(f5.read())
pv_desktop_site_data = json_normalize(d5['items'])

Now we have 4 dataframes from the 5 JSON files created earlier.

### Splitting the timestamp into month and year

In [49]:
# Add 'year' (characters 0-3) and 'month' (characters 4-5) columns, sliced
# from the 'timestamp' string, to each of the four traffic frames in place.
for traffic_frame in (
    pagecount_desktop_site_data,
    pagecount_mobile_site_data,
    pv_desktop_site_data,
    pv_mobile_traffic_data,
):
    stamp = traffic_frame['timestamp']
    traffic_frame['year'] = stamp.str[0:4]
    traffic_frame['month'] = stamp.str[4:6]

### Combining all these dataframes into one csv file