# Data 511 A1

***TODO: expand the inline code comments into markdown cells***

# Step 1: Gathering the Data

In [2]:
import json
import requests

In [113]:
# request endpoints
# URL templates; the {placeholders} are filled in by str.format inside api_call
endpoint_legacy = (
    'https://wikimedia.org/api/rest_v1/metrics/legacy/pagecounts/aggregate'
    '/{project}/{access-site}/{granularity}/{start}/{end}'
)

endpoint_pageviews = (
    'https://wikimedia.org/api/rest_v1/metrics/pageviews/aggregate'
    '/{project}/{access}/{agent}/{granularity}/{start}/{end}'
)

In [114]:
def _legacy_params(access_site):
    """Return the URL parameters for one legacy pagecounts request."""
    return {
        "project": "en.wikipedia.org",
        "access-site": access_site,
        "granularity": "monthly",
        "start": "2008010100",
        "end": "2020090100",
    }


def _pageview_params(access):
    """Return the URL parameters for one pageviews request (agent is always "user")."""
    return {
        "project": "en.wikipedia.org",
        "access": access,
        "agent": "user",
        "granularity": "monthly",
        "start": "2008010100",
        "end": "2020090100",
    }


# params for legacy pagecount
desktop_params_legacy = _legacy_params("desktop-site")
mobile_params_legacy = _legacy_params("mobile-site")

# params for pageviews
desktop_params_pageviews = _pageview_params("desktop")
mobile_params_pageviews = _pageview_params("mobile-web")
mobile_app_params_pageviews = _pageview_params("mobile-app")

# github info
# sent as HTTP headers with every request made by api_call
headers = {
    'User-Agent': 'https://github.com/vaneshsu99',
    'From': 'vaneshsu@uw.edu',
}

In [5]:
# function for api call
# takes in endpoint and param dict
def api_call(endpoint, parameters, timeout=30):
    """Request one endpoint and return its decoded JSON body.

    Parameters
    ----------
    endpoint : str
        URL template containing ``{name}`` placeholders.
    parameters : dict
        Values substituted into the placeholders with ``str.format``.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 30).

    Returns
    -------
    The parsed JSON response.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # `headers` is the module-level dict defined with the request parameters
    call = requests.get(endpoint.format(**parameters), headers=headers, timeout=timeout)
    # fail here on an HTTP error instead of returning (and later saving) an error payload
    call.raise_for_status()

    return call.json()

In [6]:
# make all 5 api calls
# two against the legacy pagecounts endpoint ...
desktop_legacy = api_call(endpoint=endpoint_legacy, parameters=desktop_params_legacy)
mobile_legacy = api_call(endpoint=endpoint_legacy, parameters=mobile_params_legacy)
# ... and three against the pageviews endpoint
desktop_pageviews = api_call(endpoint=endpoint_pageviews, parameters=desktop_params_pageviews)
mobile_pageviews = api_call(endpoint=endpoint_pageviews, parameters=mobile_params_pageviews)
mobile_app_pageviews = api_call(endpoint=endpoint_pageviews, parameters=mobile_app_params_pageviews)

In [8]:
# function for saving data to json files
def save_json(filename, data):
    """Write ``data`` to ``filename`` as JSON, replacing any existing file."""
    with open(filename, mode='w') as handle:
        json.dump(data, fp=handle)

In [9]:
# save data to json files
# NOTE(review): the mobile-web pageviews file is named "mobile-site" although its
# access parameter is "mobile-web" -- confirm the intended filename
raw_outputs = [
    ('pagecounts_desktop-site_200801-202008.json', desktop_legacy),
    ('pagecounts_mobile-site_200801-202008.json', mobile_legacy),
    ('pageviews_desktop_200801-202008.json', desktop_pageviews),
    ('pageviews_mobile-site_200801-202008.json', mobile_pageviews),
    ('pageviews_mobile-app_200801-202008.json', mobile_app_pageviews),
]
for json_path, payload in raw_outputs:
    save_json(json_path, payload)

# Step 2: Processing the Data

In [95]:
# final columns we want
# date parts first, then the three pagecount columns, then the three pageview columns
columns = (
    ['year', 'month']
    + ['pagecount_all_views', 'pagecount_desktop_views', 'pagecount_mobile_views']
    + ['pageview_all_views', 'pageview_desktop_views', 'pageview_mobile_views']
)

In [13]:
import pandas as pd

In [115]:
# turn mobile pageview json to dataframes
# each response keeps its records under the 'items' key
mobile_pageviews_df, mobile_app_pageviews_df = (
    pd.DataFrame(response['items'])
    for response in (mobile_pageviews, mobile_app_pageviews)
)

In [116]:
# sum mobile web pageviews and mobile app pageviews
# join the two sources on timestamp (not on row position) so a month present in
# only one response cannot misalign the totals; a missing side counts as 0
mobile_views_by_month = (
    mobile_pageviews_df[['timestamp', 'views']]
    .merge(
        mobile_app_pageviews_df[['timestamp', 'views']],
        on='timestamp',
        how='outer',
        suffixes=('_web', '_app'),
    )
    .sort_values('timestamp')
    .reset_index(drop=True)
)
mobile_combined_pageviews = mobile_views_by_month['views_web'].add(
    mobile_views_by_month['views_app'], fill_value=0
)
# 'access' holds the future column name used by the later pivot
mobile_combined_pageviews_df = pd.DataFrame({
    'access': 'pageview_mobile_views',
    'timestamp': mobile_views_by_month['timestamp'],
    'views': mobile_combined_pageviews,
})

In [117]:
# desktop legacy, mobile legacy, desktop pageviews to dataframes
# keep only the timestamp and the count/views column, unify the value column
# name to 'views', and tag each frame with its future pivot column name
desktop_legacy_df = (
    pd.DataFrame(desktop_legacy['items'], columns=['timestamp', 'count'])
    .rename(columns={'count': 'views'})
    .assign(access='pagecount_desktop_views')
)
mobile_legacy_df = (
    pd.DataFrame(mobile_legacy['items'], columns=['timestamp', 'count'])
    .rename(columns={'count': 'views'})
    .assign(access='pagecount_mobile_views')
)
desktop_pageviews_df = (
    pd.DataFrame(desktop_pageviews['items'], columns=['timestamp', 'views'])
    .assign(access='pageview_desktop_views')
)


In [118]:
# concat dataframes from all sources
# result is long-format: one row per (timestamp, access) pair
wiki_traffic_df = pd.concat(
    [
        desktop_legacy_df,
        mobile_legacy_df,
        desktop_pageviews_df,
        mobile_combined_pageviews_df,
    ]
)
wiki_traffic_df.head()

Unnamed: 0,timestamp,views,access
0,2008010100,4930902570,pagecount_desktop_views
1,2008020100,4818393763,pagecount_desktop_views
2,2008030100,4955405809,pagecount_desktop_views
3,2008040100,5159162183,pagecount_desktop_views
4,2008050100,5584691092,pagecount_desktop_views


In [125]:
# pivot type of access to columns
# one row per timestamp, one column per access label;
# fill NAs with 0s
wiki_traffic_df_pivot = (
    wiki_traffic_df
    .pivot_table(values='views', index='timestamp', columns='access')
    .reset_index()
    .fillna(0)
)
# clear the leftover 'access' label on the column axis
wiki_traffic_df_pivot.columns.name = None

In [126]:
# sum total pagecount and pageviews to create two new columns with totals
# each total is desktop + mobile for the same source
wiki_traffic_df_pivot = wiki_traffic_df_pivot.assign(
    pagecount_all_views=lambda frame: (
        frame['pagecount_desktop_views'] + frame['pagecount_mobile_views']
    ),
    pageview_all_views=lambda frame: (
        frame['pageview_desktop_views'] + frame['pageview_mobile_views']
    ),
)

In [127]:
# display the pivoted table, now including the two total columns
wiki_traffic_df_pivot

Unnamed: 0,timestamp,pagecount_desktop_views,pagecount_mobile_views,pageview_desktop_views,pageview_mobile_views,pagecount_all_views,pageview_all_views
0,2008010100,4.930903e+09,0.0,0.000000e+00,0.000000e+00,4.930903e+09,0.000000e+00
1,2008020100,4.818394e+09,0.0,0.000000e+00,0.000000e+00,4.818394e+09,0.000000e+00
2,2008030100,4.955406e+09,0.0,0.000000e+00,0.000000e+00,4.955406e+09,0.000000e+00
3,2008040100,5.159162e+09,0.0,0.000000e+00,0.000000e+00,5.159162e+09,0.000000e+00
4,2008050100,5.584691e+09,0.0,0.000000e+00,0.000000e+00,5.584691e+09,0.000000e+00
...,...,...,...,...,...,...,...
147,2020040100,0.000000e+00,0.0,3.798373e+09,5.505742e+09,0.000000e+00,9.304115e+09
148,2020050100,0.000000e+00,0.0,3.078094e+09,5.231700e+09,0.000000e+00,8.309794e+09
149,2020060100,0.000000e+00,0.0,2.721329e+09,4.573975e+09,0.000000e+00,7.295304e+09
150,2020070100,0.000000e+00,0.0,2.638936e+09,4.809714e+09,0.000000e+00,7.448651e+09


In [128]:
# split timestamp for year and month columns
# characters 0-3 of the timestamp string are the year, 4-5 the month
wiki_traffic_df_pivot = wiki_traffic_df_pivot.assign(
    year=lambda frame: frame['timestamp'].str.slice(0, 4),
    month=lambda frame: frame['timestamp'].str.slice(4, 6),
)

In [129]:
# reorder final columns
# keep only the names listed in `columns`, in that order (drops 'timestamp')
wiki_traffic_final = wiki_traffic_df_pivot.loc[:, columns]

In [130]:
# display the final table before writing it to disk
wiki_traffic_final

Unnamed: 0,year,month,pagecount_all_views,pagecount_desktop_views,pagecount_mobile_views,pageview_all_views,pageview_desktop_views,pageview_mobile_views
0,2008,01,4.930903e+09,4.930903e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
1,2008,02,4.818394e+09,4.818394e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
2,2008,03,4.955406e+09,4.955406e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
3,2008,04,5.159162e+09,5.159162e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
4,2008,05,5.584691e+09,5.584691e+09,0.0,0.000000e+00,0.000000e+00,0.000000e+00
...,...,...,...,...,...,...,...,...
147,2020,04,0.000000e+00,0.000000e+00,0.0,9.304115e+09,3.798373e+09,5.505742e+09
148,2020,05,0.000000e+00,0.000000e+00,0.0,8.309794e+09,3.078094e+09,5.231700e+09
149,2020,06,0.000000e+00,0.000000e+00,0.0,7.295304e+09,2.721329e+09,4.573975e+09
150,2020,07,0.000000e+00,0.000000e+00,0.0,7.448651e+09,2.638936e+09,4.809714e+09


In [131]:
# save to csvs
# index=False: the row index is only the 0..n-1 position left by reset_index and
# would otherwise be written as an extra unnamed first column
# NOTE(review): the filename says 200712 but the request params start at 200801 -- confirm
wiki_traffic_final.to_csv('en-wikipedia_traffic_200712-202008.csv', index=False)

# Step 3