# Acquiring data - Web scraping

## 1. Download archives with Wikipedia citation records

In [1]:
import sys
import os
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
import requests
import bs4

In [3]:
# Base URL of the mwcites-20180301 dataset on analytics.wikimedia.org
baseurl = 'https://analytics.wikimedia.org/datasets/archive/public-datasets/all/mwrefs/mwcites-20180301'
# Fetch the base page; its HTML is parsed later to find the archive file names
res = requests.get(baseurl)

In [4]:
# Evaluates to True when the base page request returned the HTTP "OK" status
res.status_code == requests.codes.ok

True

In [5]:
# Parse the base page HTML (lxml parser) so its text nodes can be searched
basepage = bs4.BeautifulSoup(res.text, "lxml")

In [6]:
# Directory that receives the downloaded archives (relative to the notebook)
target_dir = '../data/raw'
# Create it, including missing parents; no error if it already exists
os.makedirs(target_dir, exist_ok=True)

Download and save gzipped tar files from the base page

In [8]:
# Every text node on the base page that ends in 'gz' is treated as an archive file name.
for fname in basepage.find_all(string=re.compile('gz$')):
    url = baseurl + '/' + fname

    # stream=True defers the body download so that iter_content() really reads
    # the archive in chunks instead of buffering it fully in memory first; the
    # context manager releases the connection even if writing to disk fails.
    with requests.get(url, stream=True) as resp:
        resp.raise_for_status()

        # save tarball to the archive directory
        with open(os.path.join(target_dir, fname), 'wb') as fo:
            for chunk in resp.iter_content(100000):
                fo.write(chunk)

Untar the archives into the target directory (`../data/raw`)

In [None]:
import tarfile

# Unpack every archive listed on the base page into target_dir.
for fname in basepage.find_all(string=re.compile('gz$')):
    # the context manager closes the archive even if extraction raises
    with tarfile.open(os.path.join(target_dir, fname)) as tar:
        # NOTE(review): extractall() trusts the member paths stored in the
        # archive; on Python versions that support extraction filters,
        # consider passing filter='data'.
        tar.extractall(path=target_dir)

## 2. Categorization of Wikipedia pages based on their description

In [7]:
# Directory holding the raw data files (enwiki.tsv is read from here)
base_path = '../data/raw'
# Presumably the output directory for processed data; not used in the cells shown here
processed_path = '../data/processed'

In [8]:
# Load the English-Wikipedia citation records (tab-separated), parsing timestamps
enwiki_path = os.path.join(base_path, 'enwiki.tsv')
df = pd.read_csv(
    enwiki_path,
    sep='\t',
    parse_dates=['timestamp'],
    infer_datetime_format=True,
)

# The page titled "NaN" is read as a missing value; restore it as the string 'NaN'.
# NOTE(review): any other title that pandas treats as missing (e.g. "NA", "N/A")
# would also end up as 'NaN' here - confirm none occur in the data.
df['page_title'] = df['page_title'].fillna("NaN")

df.head(5)

Unnamed: 0,page_id,page_title,rev_id,timestamp,type,id
0,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20078357
1,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0604502
2,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,astro-ph/0003329
3,2867096,Mu Aquilae,508363722,2012-08-20 22:56:21,arxiv,0708.1752
4,2867096,Mu Aquilae,503137751,2012-07-19 16:08:41,doi,10.1051/0004-6361:20064946


Lists of unique pages and publications

In [117]:
# list of unique web page ids (web_page nodes)
# NOTE: each unique() call below runs on its own column, so wp_ids and
# wp_titles are only index-aligned if ids and titles map one-to-one.
wp_ids = df.page_id.unique()

# list of unique web page names
wp_titles = df.page_title.unique()

# list of unique publications (publication nodes)
pub_ids = df.id.unique()

**Construct the category tree graph**

**Implementation:**
Perform breadth-first search of the links and add newly found nodes and edges to a NetworkX directed graph.

At each step
1. Find all categories (nodes) on a page.
2. Create links between these and the current page.
3. Check if the nodes are already present in the graph
4. If not present add to the graph and the to-be-visited list (implemented as collections.deque)

In [118]:
import networkx as nx
from collections import deque

In [152]:
# Create an empty directed graph; the crawl below adds edges that point
# from a category to the page on which that category is listed
G = nx.DiGraph()

In [153]:
# Build one (title, attributes) node tuple per distinct (page_id, page_title) pair.
# The pairs are taken from the same rows of df: zipping the separately
# de-duplicated id and title arrays would silently misalign (and truncate)
# as soon as a title or an id occurs under more than one partner.
page_pairs = df[['page_id', 'page_title']].drop_duplicates()
node_list = [
    (title, {'pid': pid, 'address': '/wiki/' + title.replace(' ', '_'), 'ptype': 'topic'})
    for pid, title in page_pairs.itertuples(index=False, name=None)
]

In [154]:
# Create leaf nodes corresponding to Wikipedia pages.
# Only the first entry of node_list is added here (note the [:1] slice),
# so the crawl starts from a single page.
G.add_nodes_from(node_list[:1])

Define a queue for pages to be explored in a breadth-first search

In [155]:
# Add all base pages to the queue as (title, attribute-dict) tuples
queue = deque(G.nodes(data=True))

Also define a set of page titles for a quick check of whether a node is already present.

In [156]:
# Titles of all nodes currently in the graph, kept as a set for fast membership tests
node_titles = {title for title, _attrs in G.nodes(data=True)}

Base URLs for wiki pages and wiki API

In [157]:
# Site root; page addresses such as '/wiki/<title>' are appended to it
wiki_url = 'http://en.wikipedia.org'
# MediaWiki API query prefix; a page title is expected to be appended after 'titles='
api_query = 'https://en.wikipedia.org/w/api.php?action=query&format=json&titles='

Define a function for scraping wiki pages for categories

In [158]:
def get_cats(address):
    """
    Fetch the wiki page at `address` and return its category link elements.

    `address` is the site-relative path of the page; it is appended to
    `wiki_url` to form the request URL. The result is the list of `<a>`
    tags matched by the CSS selector '#mw-normal-catlinks ul li a', which
    is empty when the page has no such links.
    """
    response = requests.get(wiki_url+address)
    page = bs4.BeautifulSoup(response.text, 'html.parser')
    return page.select('#mw-normal-catlinks ul li a')

Cycle over queue and append new nodes and edges to the graph

In [159]:
it = 0
while queue:

    # take the oldest queued page so the traversal stays breadth-first
    page_title, page_attrs = queue.popleft()

    # scrape the page and collect its category link tags
    cats = get_cats(page_attrs['address'])

    # progress report on every 100th page
    if it % 100 == 0:
        print('Iteration:', it, len(queue), len(G.nodes()), len(G.edges()), page_title, page_attrs['ptype'])
    it += 1

    for c in cats:

        # the category title doubles as the node identifier
        title = c.get_text()

        # edge from the category to the page that lists it
        G.add_edge(title, page_title)

        # categories seen before need no new node and no second visit
        if title in node_titles:
            continue

        # address of the category page
        cat_ref = c.get('href')

        # register the category node with its attributes
        G.add_node(title, address=cat_ref, ptype='category')

        # schedule the category page for exploration and remember its title
        queue.append((title, {'address': cat_ref, 'ptype': 'category'}))
        node_titles.add(title)

Iteration: 0 0 1 0 Mu Aquilae topic
Iteration: 100 119 220 293 Divination category
Iteration: 200 220 421 592 Theories of deduction category
Iteration: 300 321 622 938 Aerospace agencies category
Iteration: 400 371 772 1260 Form category
Iteration: 500 407 908 1595 Signal processing category
Iteration: 600 445 1046 1882 International organizations by topic category
Iteration: 700 471 1172 2178 Postmodern theory category
Iteration: 800 508 1309 2513 Imagination category
Iteration: 900 522 1423 2869 Propositions category
Iteration: 1000 533 1534 3161 Fields of history category
Iteration: 1100 552 1653 3449 Travel category
Iteration: 1200 564 1765 3765 Real-time technology category
Iteration: 1300 581 1882 4071 Modern physics category
Iteration: 1400 590 1991 4389 Ancient Roman religion category
Iteration: 1500 626 2127 4733 History of Europe category
Iteration: 1600 620 2221 5007 Civil engineering category
Iteration: 1700 642 2343 5315 Area studies by ancient history category
Iteration: 

KeyboardInterrupt: 

**Alternative 1: wikipedia library** 

In [101]:
import wikipedia
# Look the page up through the `wikipedia` package and display its category list
wp = wikipedia.page('Aquila_(constellation)')
wp.categories

['All articles needing additional references',
 'All articles with unsourced statements',
 'Aquila (constellation)',
 'Articles needing additional references from November 2009',
 'Articles with unsourced statements from March 2011',
 'Articles with unsourced statements from November 2015',
 'Constellations listed by Ptolemy',
 'Equatorial constellations',
 'Wikipedia articles incorporating a citation from the 1911 Encyclopaedia Britannica with Wikisource reference',
 'Wikipedia articles incorporating text from the 1911 Encyclopædia Britannica',
 'Wikipedia articles with GND identifiers']

**Alternative 2: wikipedia API** 

In [148]:
# MediaWiki API query prefix (same string as api_query defined earlier)
# NOTE(review): the query carries no `prop=` parameter, and the response shown
# below contains only ns/pageid/title - category data presumably needs
# something like prop=categories; confirm against the MediaWiki API docs.
category_url = 'https://en.wikipedia.org/w/api.php?action=query&format=json&titles='
url = category_url + 'Mu_Aquilae'

In [149]:
# Call the API and decode the JSON response body
r = requests.get(url)
d = r.json()

In [150]:
from pprint import pprint
# Pretty-print the decoded API response for inspection
pprint(d)

{'batchcomplete': '',
 'query': {'normalized': [{'from': 'Mu_Aquilae', 'to': 'Mu Aquilae'}],
           'pages': {'2867096': {'ns': 0,
                                 'pageid': 2867096,
                                 'title': 'Mu Aquilae'}}}}


Neither the library nor the API is helpful for distinguishing topical categories from maintenance categories.