In [1]:
import urllib.request as urllib2
import time
from datetime import datetime
import json
from urllib.parse import urlencode, quote_plus

In [2]:
TOP_API_URL = 'https://wikimedia.org/api/rest_v1/metrics/pageviews/'\
              'top/{lang}.{project}/all-access/{year}/{month}/{day}'

def get_traffic(year, month, day):
    '''\
    Get the traffic report for the top 1000 articles for a given day.
    TODO: Get from local file, if available
    '''
    url = TOP_API_URL.format(lang='en',
                             project='wikipedia',
                             year=year,
                             month=month,
                             day=day)
#     if DEBUG:
#         print 'Getting %s' % url
    resp = urllib2.urlopen(url)
    resp_bytes = resp.read()
#         rec.success('Fetched {len_bytes} bytes from {url}',
#                     len_bytes=len(resp_bytes), url=url)

    data = json.loads(resp_bytes)
    articles = data['items'][0]['articles']
    return articles

In [3]:
get_traffic('2020', '02','29')

[{'article': 'Main_Page', 'views': 11011950, 'rank': 1},
 {'article': 'United_States_Senate', 'views': 5785633, 'rank': 2},
 {'article': 'Special:Search', 'views': 1379510, 'rank': 3},
 {'article': 'February_29', 'views': 449520, 'rank': 4},
 {'article': '2019–20_coronavirus_outbreak', 'views': 401916, 'rank': 5},
 {'article': 'The_Invisible_Man_(2020_film)', 'views': 275387, 'rank': 6},
 {'article': 'Coronavirus', 'views': 268392, 'rank': 7},
 {'article': 'Carlton_Tavern,_Kilburn', 'views': 244195, 'rank': 8},
 {'article': '2019–20_coronavirus_outbreak_by_country_and_territory',
  'views': 158829,
  'rank': 9},
 {'article': 'Boris_Johnson', 'views': 143594, 'rank': 10},
 {'article': 'Mila_Kunis', 'views': 142252, 'rank': 11},
 {'article': 'Leap_year', 'views': 139097, 'rank': 12},
 {'article': 'Roman_Polanski', 'views': 138395, 'rank': 13},
 {'article': 'Zhong_Nanshan', 'views': 126388, 'rank': 14},
 {'article': 'Muhyiddin_Yassin', 'views': 124799, 'rank': 15},
 {'article': 'Bible', '

In [4]:
MW_API_URL = 'https://{lang}.{project}.org/w/api.php?'
PREFIXES = ['Special', 'Template', 'Sp?cial', 'Project']

def get_wiki_info(lang, project):
    '''\
    Get the mainpage title and local namespace map.
    '''
    url = MW_API_URL.format(lang=lang, project=project)
    params = {'action': 'query',
              'meta': 'siteinfo',
              'format': 'json',
              'siprop': 'general|namespaces'}
    resp = urllib2.urlopen(url + urlencode(params))
    data = json.loads(resp.read())
    mainpage = data['query']['general']['mainpage'].replace(' ', '_')
    namespaces = [ns_info['*'].replace(' ', '_') for ns_id, ns_info in
                  data['query']['namespaces'].items() if ns_id is not 0]
    return {'mainpage': mainpage, 'namespaces': namespaces}


  data['query']['namespaces'].items() if ns_id is not 0]


In [5]:
def is_article(title, wiki_info):
    '''\
    Is it an article, or some other sort of page? We'll want to filter out the
    search page (Special:Search in English, etc) and similar pages appearing
    inconveniently in the traffic report.
    Skip xhamster. There are a few clues this Wikipedia traffic is artificial.
    See https://en.wikipedia.org/w/index.php?title=XHamster&diff=701682670&oldid=700826198
    '''
    skip = ['-', '404.php', 'XHamster'] + [wiki_info['mainpage']]
    prefixes = PREFIXES + wiki_info['namespaces']
    if title in skip:
        return False
    if title == "Media":
        return False
    if title == "Wikipedia":
        return False
    if title == "United_States_Senate":
        return False
    for prefix in prefixes:
        if title.startswith(prefix + ':'):
            return False
    return True

In [6]:
wiki_info = get_wiki_info('en', 'wikipedia')
raw_traffic = get_traffic('2020', '02', '29')
articles = [a for a in raw_traffic if is_article(a['article'], wiki_info)]

In [7]:
len(articles[:25])
articles

[{'article': 'February_29', 'views': 449520, 'rank': 4},
 {'article': '2019–20_coronavirus_outbreak', 'views': 401916, 'rank': 5},
 {'article': 'The_Invisible_Man_(2020_film)', 'views': 275387, 'rank': 6},
 {'article': 'Coronavirus', 'views': 268392, 'rank': 7},
 {'article': 'Carlton_Tavern,_Kilburn', 'views': 244195, 'rank': 8},
 {'article': '2019–20_coronavirus_outbreak_by_country_and_territory',
  'views': 158829,
  'rank': 9},
 {'article': 'Boris_Johnson', 'views': 143594, 'rank': 10},
 {'article': 'Mila_Kunis', 'views': 142252, 'rank': 11},
 {'article': 'Leap_year', 'views': 139097, 'rank': 12},
 {'article': 'Roman_Polanski', 'views': 138395, 'rank': 13},
 {'article': 'Zhong_Nanshan', 'views': 126388, 'rank': 14},
 {'article': 'Muhyiddin_Yassin', 'views': 124799, 'rank': 15},
 {'article': 'Bible', 'views': 118980, 'rank': 16},
 {'article': 'Spanish_flu', 'views': 115849, 'rank': 17},
 {'article': 'WrestleMania_36', 'views': 104952, 'rank': 18},
 {'article': 'Parasite_(2019_film)',

In [8]:
import pandas as pd
import numpy as np

In [9]:
date1 = '2020-01-01'
date2 = '2021-01-07'
dates = pd.date_range(date1, date2).tolist()

In [11]:
dates[0].strftime('%Y')

'2020'

In [12]:
dates

[Timestamp('2020-01-01 00:00:00', freq='D'),
 Timestamp('2020-01-02 00:00:00', freq='D'),
 Timestamp('2020-01-03 00:00:00', freq='D'),
 Timestamp('2020-01-04 00:00:00', freq='D'),
 Timestamp('2020-01-05 00:00:00', freq='D'),
 Timestamp('2020-01-06 00:00:00', freq='D'),
 Timestamp('2020-01-07 00:00:00', freq='D'),
 Timestamp('2020-01-08 00:00:00', freq='D'),
 Timestamp('2020-01-09 00:00:00', freq='D'),
 Timestamp('2020-01-10 00:00:00', freq='D'),
 Timestamp('2020-01-11 00:00:00', freq='D'),
 Timestamp('2020-01-12 00:00:00', freq='D'),
 Timestamp('2020-01-13 00:00:00', freq='D'),
 Timestamp('2020-01-14 00:00:00', freq='D'),
 Timestamp('2020-01-15 00:00:00', freq='D'),
 Timestamp('2020-01-16 00:00:00', freq='D'),
 Timestamp('2020-01-17 00:00:00', freq='D'),
 Timestamp('2020-01-18 00:00:00', freq='D'),
 Timestamp('2020-01-19 00:00:00', freq='D'),
 Timestamp('2020-01-20 00:00:00', freq='D'),
 Timestamp('2020-01-21 00:00:00', freq='D'),
 Timestamp('2020-01-22 00:00:00', freq='D'),
 Timestamp

In [13]:
#array of 25 most popular articles
top_array = []

for date in dates:
    year=date.strftime('%Y')
    month = date.strftime('%m')
    day = date.strftime('%d')
    print(date)
#     wiki_info = get_wiki_info('en', 'wikipedia')
    raw_traffic = get_traffic(year, month, day)
    articles = [a for a in raw_traffic if is_article(a['article'], wiki_info)]
    top_array.append(articles[:25])

2020-01-01 00:00:00
2020-01-02 00:00:00
2020-01-03 00:00:00
2020-01-04 00:00:00
2020-01-05 00:00:00
2020-01-06 00:00:00
2020-01-07 00:00:00
2020-01-08 00:00:00
2020-01-09 00:00:00
2020-01-10 00:00:00
2020-01-11 00:00:00
2020-01-12 00:00:00
2020-01-13 00:00:00
2020-01-14 00:00:00
2020-01-15 00:00:00
2020-01-16 00:00:00
2020-01-17 00:00:00
2020-01-18 00:00:00
2020-01-19 00:00:00
2020-01-20 00:00:00
2020-01-21 00:00:00
2020-01-22 00:00:00
2020-01-23 00:00:00
2020-01-24 00:00:00
2020-01-25 00:00:00
2020-01-26 00:00:00
2020-01-27 00:00:00
2020-01-28 00:00:00
2020-01-29 00:00:00
2020-01-30 00:00:00
2020-01-31 00:00:00
2020-02-01 00:00:00
2020-02-02 00:00:00
2020-02-03 00:00:00
2020-02-04 00:00:00
2020-02-05 00:00:00
2020-02-06 00:00:00
2020-02-07 00:00:00
2020-02-08 00:00:00
2020-02-09 00:00:00
2020-02-10 00:00:00
2020-02-11 00:00:00
2020-02-12 00:00:00
2020-02-13 00:00:00
2020-02-14 00:00:00
2020-02-15 00:00:00
2020-02-16 00:00:00
2020-02-17 00:00:00
2020-02-18 00:00:00
2020-02-19 00:00:00


In [14]:
top_array[0][0]

{'article': 'Post_Malone', 'views': 415705, 'rank': 3}

In [15]:
dates_array = []
views_array = []
articles_array = []
ranks_array = []

for i in range(len(top_array)):
    date = dates[i]
    array = top_array[i]
    for j in range(25):
        row = array[j]
        dates_array.append(date)
        articles_array.append(row['article'].replace('_', ' '))
#         print(articles_array)
        views_array.append(row['views'])
        ranks_array.append(row['rank'])
    

In [16]:
len(dates_array)
# articles_array

9325

In [17]:
dict = {'date': dates_array, 'article': articles_array, 'views': views_array, 'rank': ranks_array}
df = pd.DataFrame(data=dict)
df

Unnamed: 0,date,article,views,rank
0,2020-01-01,Post Malone,415705,3
1,2020-01-01,New Year's Day,277191,4
2,2020-01-01,Natašha Stanković,255088,5
3,2020-01-01,Paula Abdul,246877,6
4,2020-01-01,The Witcher (TV series),244934,7
...,...,...,...,...
9320,2021-01-07,Mike Pence,172759,23
9321,2021-01-07,Joe Biden,164591,24
9322,2021-01-07,2020 United States presidential election,160966,25
9323,2021-01-07,Bible,148171,26


In [18]:
df[df.article == "United States Senate"]

Unnamed: 0,date,article,views,rank


In [19]:
df.sort_values(by="views", ascending=False)

Unnamed: 0,date,article,views,rank
6025,2020-08-29,Chadwick Boseman,9929065,1
625,2020-01-26,Kobe Bryant,9513452,2
650,2020-01-27,Kobe Bryant,8045344,2
4125,2020-06-14,Sushant Singh Rajput,7062335,1
7800,2020-11-08,Kamala Harris,6591413,1
...,...,...,...,...
3571,2020-05-22,Candy Ken,58373,25
3572,2020-05-22,COVID-19 pandemic in India,57314,26
3573,2020-05-22,Paul Warner Powell,57105,27
5524,2020-08-08,Jerry Falwell Jr.,56988,27


In [20]:
df.groupby('article').count()

Unnamed: 0_level_0,date,views,rank
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
13 Ghosts,1,1,1
13 Reasons Why,5,5,5
1838,1,1,1
1917 (2019 film),24,24,24
1927 Liberian general election,1,1,1
...,...,...,...
Zone Rouge,1,1,1
Zoo hypothesis,1,1,1
Zooey Deschanel,3,3,3
special:search,1,1,1


In [21]:
df.groupby('article').agg({'date':'first', 
                         'views':'max', 
                         'rank':'count'}).sort_values(by=['views'], ascending=False)

Unnamed: 0_level_0,date,views,rank
article,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Chadwick Boseman,2020-08-29,9929065,12
Kobe Bryant,2020-01-26,9513452,22
Sushant Singh Rajput,2020-06-14,7062335,33
Kamala Harris,2020-07-29,6591413,40
United States Electoral College,2020-10-27,4986159,13
...,...,...,...
Dany Garcia,2020-08-03,61620,1
Brian Cage,2020-05-24,60418,1
Maurizio Sarri,2020-08-08,59633,1
Robert Sheehan,2020-08-03,59413,1


In [22]:
df.to_csv('wiki_pageviews_jan-jan.csv')