In [1]:
import wikipedia

# List of links which may have the data I'm interested in

https://en.wikipedia.org/wiki/Category:All_stub_articles
https://en.wikipedia.org/wiki/Category:Articles_to_be_expanded


It turns out that it isn't possible to access "special:" pages on Wikipedia with the API. I should be able to get around this with a scraper.

Let's see if we can build that. 

In [2]:
from bs4 import BeautifulSoup
import requests
import html5lib

In [3]:
# Fetch the first page of the stub category. A timeout keeps the cell from
# hanging on a stalled connection, and raise_for_status() surfaces HTTP
# errors here instead of letting an error page be parsed further down.
html = requests.get(
    "https://en.wikipedia.org/wiki/Category:All_stub_articles",
    timeout=30,
)
html.raise_for_status()

In [4]:
# Parse the downloaded markup with the html5lib parser.
soup = BeautifulSoup(html.text, features='html5lib')

In [5]:
# Select every <div class="mw-category-group">. The attribute filter must be
# a dict (attribute name -> value); the previous `{'class', ...}` was a set
# literal that only happened to match.
soup('div', {'class': "mw-category-group"})

[<div class="mw-category-group"><h3>-</h3>
 <ul><li><a href="/wiki/(%E2%88%921)F" title="(−1)F">(−1)F</a></li>
 <li><a href="/wiki/-an" title="-an">-an</a></li>
 <li><a href="/wiki/-oate" title="-oate">-oate</a></li>
 <li><a href="/wiki/-ol" title="-ol">-ol</a></li>
 <li><a href="/wiki/-yne" title="-yne">-yne</a></li></ul></div>,
 <div class="mw-category-group"><h3>,</h3>
 <ul><li><a href="/wiki/Battle_of_Koporye" title="Battle of Koporye">Battle of Koporye</a></li></ul></div>,
 <div class="mw-category-group"><h3>?</h3>
 <ul><li><a href="/wiki/The_%27%3F%27_Motorist" title="The '?' Motorist">The '?' Motorist</a></li>
 <li><a href="/wiki/%3F!_(album)" title="?! (album)">?! (album)</a></li>
 <li><a href="/wiki/%3FEjere_K%27elni_Kue_Indian_Reserve_No._196I" title="?Ejere K'elni Kue Indian Reserve No. 196I">?Ejere K'elni Kue Indian Reserve No. 196I</a></li></ul></div>,
 <div class="mw-category-group"><h3>.</h3>
 <ul><li><a href="/wiki/%E0%A5%B0" title="॰">॰</a></li>
 <li><a href="/wiki/.25

In [6]:
# Keep the first category group as a sample to explore. The attribute filter
# is a dict (attribute name -> value), not the set literal used before.
test1 = soup('div', {'class': "mw-category-group"})[0]

In [7]:
# Display the sample category group to inspect its markup.
test1

<div class="mw-category-group"><h3>-</h3>
<ul><li><a href="/wiki/(%E2%88%921)F" title="(−1)F">(−1)F</a></li>
<li><a href="/wiki/-an" title="-an">-an</a></li>
<li><a href="/wiki/-oate" title="-oate">-oate</a></li>
<li><a href="/wiki/-ol" title="-ol">-ol</a></li>
<li><a href="/wiki/-yne" title="-yne">-yne</a></li></ul></div>

In [8]:
# All <li> entries inside the sample category group.
test1.find_all('li')

[<li><a href="/wiki/(%E2%88%921)F" title="(−1)F">(−1)F</a></li>,
 <li><a href="/wiki/-an" title="-an">-an</a></li>,
 <li><a href="/wiki/-oate" title="-oate">-oate</a></li>,
 <li><a href="/wiki/-ol" title="-ol">-ol</a></li>,
 <li><a href="/wiki/-yne" title="-yne">-yne</a></li>]

In [9]:
# Print each <li> entry of the sample group on its own line.
list_items = test1.find_all('li')
for thing in list_items:
    print(thing)

<li><a href="/wiki/(%E2%88%921)F" title="(−1)F">(−1)F</a></li>
<li><a href="/wiki/-an" title="-an">-an</a></li>
<li><a href="/wiki/-oate" title="-oate">-oate</a></li>
<li><a href="/wiki/-ol" title="-ol">-ol</a></li>
<li><a href="/wiki/-yne" title="-yne">-yne</a></li>


I think I've figured out what patterns I need to find.

In [10]:
def yield_links_pages_stubs(soup):
    """
    Walk a single parsed page of Wikipedia's "Category:All stub articles"
    listing and yield one entry per listed article.

    Parameters
    ----------
    soup : BeautifulSoup
        Parsed HTML of one category listing page.

    Yields
    ------
    tuple
        ``(href, title)`` taken from the first ``<a>`` inside each ``<li>``
        of every ``<div class="mw-category-group">``. ``title`` is ``None``
        when the anchor has no ``title`` attribute.
    """
    # The attribute filter is a dict (name -> value). A set literal such as
    # {'class', 'mw-category-group'} is not an attribute filter and would
    # only match by accident.
    for group in soup.find_all('div', {'class': 'mw-category-group'}):
        for item in group.find_all('li'):
            anchor = item.a
            # Skip list items that carry no usable link instead of crashing
            # part-way through a long crawl.
            if anchor is None or 'href' not in anchor.attrs:
                continue
            yield anchor.attrs['href'], anchor.attrs.get('title')


In [11]:
def next_page_link(soup):
    """
    Find the pagination link labelled 'next page'.

    Scans every 'a' tag of the given Beautiful Soup object in document
    order and returns the href of the first one whose direct children
    include the text 'next page'. Returns None when no such link exists.
    """
    matching_hrefs = (
        anchor.attrs['href']
        for anchor in soup.findAll('a')
        if 'next page' in anchor.contents
    )
    # Only the first match is evaluated; None signals "no further page".
    return next(matching_hrefs, None)
    

In [12]:
import pandas as pd

In [13]:
base_link = "https://en.wikipedia.org"
initial_link = "https://en.wikipedia.org/wiki/Category:All_stub_articles"

# Collect (href, title) tuples in a plain list and build the DataFrame once
# at the end; growing a DataFrame row by row with .loc gets slower as it grows.
stub_records = []
page_url = initial_link

try:
    while page_url is not None:
        html = requests.get(page_url, timeout=30)
        # An error page has no 'next page' link, so without this check a
        # failed request would look like the normal end of the crawl.
        html.raise_for_status()
        soup = BeautifulSoup(html.text, 'html5lib')

        # This generates the link data of the stubs for the current page.
        stub_records.extend(yield_links_pages_stubs(soup))

        # Look up the next page once per page, after all of its items have
        # been collected, so the last page is kept in full.
        next_link = next_page_link(soup)
        if next_link is None:
            # Every page has been crawled.
            page_url = None
        else:
            print(next_link)
            page_url = base_link + next_link
finally:
    # Runs on normal completion and on interruption (e.g. KeyboardInterrupt),
    # so whatever has been collected so far is still available.
    stub_dataframe = pd.DataFrame(stub_records, columns=['href', 'title'])
    dataframe_index = len(stub_dataframe)


/w/index.php?title=Category:All_stub_articles&pagefrom=1+Month+2+Live#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=1st+Division+%28Colombia%29#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=2+Piscis+Austrini#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=2D+Or+Not+2D+Animation+Festival#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=2PAR#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=3-MeO-PCMo#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=3rd+Fighter+Aviation+Division+%28People%27s+Liberation+Army+Air+Force%29#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=4-6-4-4#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=4th+Cavalry+Brigade+%28Imperial+Japanese+Army%29#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=5-0530%0AA530+road#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=5th+U-boat+Flotilla#mw-pages


KeyboardInterrupt: 

Though this code works, there is a serious bottleneck with dynamically appending data to a pandas DataFrame. To overcome this, I'll create a bunch of `pd.DataFrame`s and then dump them into CSV files.

In [15]:
import os

In [19]:
base_link = "https://en.wikipedia.org"
initial_link = "https://en.wikipedia.org/wiki/Category:All_stub_articles"

# Make sure the output directory exists before the first CSV is written.
output_dir = '../data/'
os.makedirs(output_dir, exist_ok=True)

html = requests.get(initial_link, timeout=30)
html.raise_for_status()
soup = BeautifulSoup(html.text, 'html5lib')

# Running row index shared by all CSV files, so rows keep a global numbering.
dataframe_index = 0
# dumping a csv for each page that's queried
page_count = 0

while True:
    # Build the page's DataFrame in one go from a list of (href, title)
    # tuples instead of appending row by row with .loc.
    page_records = list(yield_links_pages_stubs(soup))
    stub_dataframe = pd.DataFrame(
        page_records,
        columns=['href', 'title'],
        index=range(dataframe_index, dataframe_index + len(page_records)),
    )
    dataframe_index += len(page_records)

    stub_dataframe.to_csv(os.path.join(output_dir, 'stub_data_20180614_file_' + str(page_count) + '.csv'))
    page_count += 1

    # Look up the next page once per page, after the page has been saved in
    # full, and leave the while loop (not just an inner loop) when done.
    next_link = next_page_link(soup)
    if next_link is None:
        break

    print(next_link)

    html = requests.get(base_link + next_link, timeout=30)
    # An error page has no 'next page' link; fail loudly rather than treat
    # it as the end of the category.
    html.raise_for_status()
    soup = BeautifulSoup(html.text, 'html5lib')

/w/index.php?title=Category:All_stub_articles&pagefrom=1+Month+2+Live#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=1st+Division+%28Colombia%29#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=2+Piscis+Austrini#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=2D+Or+Not+2D+Animation+Festival#mw-pages
/w/index.php?title=Category:All_stub_articles&pagefrom=2PAR#mw-pages


KeyboardInterrupt: 