# Acquiring data - Web scraping

## 1. Download archives with Wikipedia citation records

In [1]:
import sys
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import requests
import bs4

In [3]:
# Directory index page of the mwcites-20180301 dataset.
baseurl = 'https://analytics.wikimedia.org/datasets/archive/public-datasets/all/mwrefs/mwcites-20180301'
# Bound the wait so an unresponsive server cannot hang the notebook forever.
res = requests.get(baseurl, timeout=30)

In [4]:
# Sanity check: evaluates to True when the index page request returned the "ok" status.
# NOTE: this only displays the result; it does not stop execution on failure.
res.status_code == requests.codes.ok

True

In [5]:
# Parse the index page HTML; the "lxml" parser needs the lxml package to be installed.
basepage = bs4.BeautifulSoup(res.text, "lxml")

In [6]:
# Directory for the downloaded archives, relative to the notebook's working directory.
target_dir = '../data/raw'
# exist_ok=True makes this cell safe to re-run when the directory already exists.
os.makedirs(target_dir, exist_ok=True)

Download and save gzipped tar files from the base page

In [8]:
# Every text node on the index page that ends in "gz" is treated as an archive name.
# NOTE: `text=` is kept for compatibility with older bs4 releases; newer
# releases prefer the equivalent `string=` argument.
for fname in basepage.find_all(text=re.compile('gz$')):
    url = baseurl + '/' + fname

    # stream=True defers the body download, so iter_content() really reads the
    # archive chunk by chunk instead of buffering it all in memory first.
    # A separate name keeps the index-page response `res` from being clobbered,
    # and the context manager releases the connection when the download ends.
    with requests.get(url, stream=True, timeout=60) as archive_res:
        archive_res.raise_for_status()

        # save tarball to the archive directory
        with open(os.path.join(target_dir, fname), 'wb') as fo:
            for chunk in archive_res.iter_content(100000):
                fo.write(chunk)

Untar the archives into the target directory (`../data/raw`)

In [None]:
import tarfile

for fname in basepage.find_all(text=re.compile('gz$')):
    # The context manager closes the archive even if extraction raises.
    with tarfile.open(os.path.join(target_dir, fname)) as tar:
        if hasattr(tarfile, 'data_filter'):
            # The archives were downloaded from the network, so treat them as
            # untrusted: the 'data' filter refuses members that would be
            # written outside target_dir (absolute paths, '..', unsafe links).
            tar.extractall(path=target_dir, filter='data')
        else:
            # Python builds without extraction filters: keep the plain extraction.
            tar.extractall(path=target_dir)

## 2. Categorization of Wikipedia pages based on their description

In [9]:
# Location of the raw TSV files (the same directory the archives were downloaded to).
base_path = '../data/raw'
# Directory name for processed data; it is not created or used in the cells shown here.
processed_path = '../data/processed'

In [10]:
# read TSV data
df = pd.read_csv(os.path.join(base_path,'enwiki.tsv'), sep='\t', parse_dates=['timestamp'],infer_datetime_format=True)

# Convert mistakenly converted type nan to string 'NaN' (wikipedia page name)
df.page_title = df.page_title.fillna("NaN")

df.head(5)

Unnamed: 0,page_id,page_title,rev_id,timestamp,type,id
0,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20078357
1,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0604502
2,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0003329
3,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,0708.1752
4,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20064946


Lists of unique pages and publications

In [43]:
# list of unique web page ids (web_page nodes)
wp_ids = df.page_id.unique()

# list of unique web page names
wp_names = df.page_title.unique()

# list of unique publications (publication nodes)
pub_ids = df.id.unique()

In [73]:
# Base URL for page requests; https matches `category_url` and avoids sending
# every request over plain http first.
wiki_url = 'https://en.wikipedia.org'

In [97]:
# Fetch one example article by page id (`curid`) and collect the category
# links found under `#mw-normal-catlinks`.
r = requests.get(wiki_url + '/?curid=' + str(wp_ids[1]), timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page into an empty list.
r.raise_for_status()
soup = bs4.BeautifulSoup(r.text, 'html.parser')
cats_base = soup.select('#mw-normal-catlinks ul li a')

In [95]:
def get_cats(cat_ref, timeout=30):
    """Return the category links listed on a Wikipedia page.

    Parameters
    ----------
    cat_ref : str
        Site-relative reference appended to the module-level ``wiki_url``
        (for example the ``href`` of a category link).
    timeout : float, optional
        Seconds to wait for the server before giving up.

    Returns
    -------
    list of bs4.element.Tag
        The ``<a>`` elements under ``#mw-normal-catlinks``; empty when the
        page lists none.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    r = requests.get(wiki_url + cat_ref, timeout=timeout)
    # Surface HTTP errors instead of silently returning an empty category list.
    r.raise_for_status()
    soup = bs4.BeautifulSoup(r.text, 'html.parser')
    return soup.select('#mw-normal-catlinks ul li a')

In [98]:
# For each base category, climb the category tree by always following the
# first listed parent until a page with no parent categories is reached.
for i in range(len(cats_base)):
    cats = [cats_base[i]]
    seen_refs = set()  # hrefs already requested during this round
    while len(cats) > 0:
        c = cats[0]
        print(f'Round {i}', c.get_text(), len(cats))
        cat_ref = c.get('href')
        # The category graph may contain cycles, and a link may lack an href:
        # stop this round rather than loop forever or crash on concatenation.
        if cat_ref is None or cat_ref in seen_refs:
            break
        seen_refs.add(cat_ref)
        cats = get_cats(cat_ref)

Round 0 Electronic music stubs 1
Round 0 Electronic music 4
Round 0 Popular music 5
Round 0 Mass media 3
Round 0 Communication 6
Round 0 Behavior 5
Round 0 Action (philosophy) 2
Round 0 Free will 3
Round 0 Autonomy 7
Round 0 Forms of government 4
Round 0 Constitutions 7
Round 0 Constitutional law 5
Round 0 Law 5
Round 0 Government 9
Round 0 Legal entities 5
Round 0 Corporate law 5
Round 0 Business law 4
Round 0 Business 5
Round 0 Economy 1
Round 0 Society 1
Round 0 Main topic classifications 3
Round 0 Articles 1
Round 0 Contents 1
Round 1 20th-century music genres 1
Round 1 20th century in music 2
Round 1 20th-century works 4
Round 1 20th century 4
Round 1 2nd millennium 4
Round 1 Millennia 2
Round 1 Chronology 2
Round 1 Fields of history 6
Round 1 History 2
Round 1 Main topic classifications 5
Round 1 Articles 1
Round 1 Contents 1
Round 2 Techno genres 1
Round 2 Techno 2
Round 2 20th-century music genres 2
Round 2 20th century in music 2
Round 2 20th-century works 4
Round 2 20th centu

In [67]:
# List the example page's base categories, separated by blank lines.
# `cats_base` is used because `cats` holds whatever the traversal loop fetched
# last (normally an empty list). Passing `sep` avoids the stray leading space
# that the default separator puts in front of every item after the first.
print(*(c.get_text() for c in cats_base), sep='\n\n')

Electronic music stubs

 20th-century music genres

 Techno genres

 Industrial music

 British styles of music




In [59]:
# Title at index 1, the same position as the page id used for the example request.
# NOTE(review): assumes wp_ids and wp_names are aligned (ids and titles map one-to-one) -- confirm.
wp_names[1]

'Industrial techno'

In [16]:
import wikipedia

In [40]:
# Look the example page up by title with the `wikipedia` package.
# NOTE(review): the commented-out lookup by page id refers to `wpages`, which is
# not defined in the cells shown here.
#wp = wikipedia.page(pageid=wpages[0])
wp = wikipedia.page('Aquila_(constellation)')

In [41]:
# Categories reported by the `wikipedia` package; as the output shows, these include
# maintenance categories (e.g. "All articles with unsourced statements").
wp.categories

['All articles needing additional references',
 'All articles with unsourced statements',
 'Aquila (constellation)',
 'Articles needing additional references from November 2009',
 'Articles with unsourced statements from March 2011',
 'Articles with unsourced statements from November 2015',
 'Constellations listed by Ptolemy',
 'Equatorial constellations',
 'Wikipedia articles incorporating a citation from the 1911 Encyclopaedia Britannica with Wikisource reference',
 'Wikipedia articles incorporating text from the 1911 Encyclopædia Britannica',
 'Wikipedia articles with GND identifiers']

In [None]:
#for pid, name in zip(wpages, wnames):
#    wp = wikipedia.page(pageid=pid)
    
    #print(name, wp.categories)

In [20]:
# MediaWiki API query (JSON) for a page's categories; the page title is appended after `titles=`.
category_url = 'https://en.wikipedia.org/w/api.php?action=query&format=json&prop=categories&titles='

In [36]:
# Query URL for one hard-coded example page.
# NOTE(review): the title is appended without URL-encoding; titles containing
# characters such as '&' or '+' would need quoting first.
url = category_url + 'Aquila_(constellation)'  # a title from the data, e.g. wp_names[1], could be used instead

In [37]:
# Bound the wait and fail on an HTTP error before the body is decoded as JSON.
r = requests.get(url, timeout=30)
r.raise_for_status()

In [38]:
# Decode the JSON body. As the printed response shows, the categories are nested under
# d['query']['pages'][<page id>]['categories'], not under a top-level 'categories' key.
# NOTE(review): the response also carries a 'continue' entry ('clcontinue'), which
# suggests the list is paginated and this is only the first batch -- confirm.
d = r.json()

In [39]:
# Pretty-print the decoded API response to inspect its nesting.
from pprint import pprint
pprint(d)

{'continue': {'clcontinue': '269002|Wikipedia_articles_with_GND_identifiers',
              'continue': '||'},
 'query': {'normalized': [{'from': 'Aquila_(constellation)',
                           'to': 'Aquila (constellation)'}],
           'pages': {'269002': {'categories': [{'ns': 14,
                                                'title': 'Category:All '
                                                         'articles needing '
                                                         'additional '
                                                         'references'},
                                               {'ns': 14,
                                                'title': 'Category:All '
                                                         'articles with '
                                                         'unsourced '
                                                         'statements'},
                                               {'ns': 14,
           