In [1]:
from bs4 import BeautifulSoup
import requests
BASE_URL = 'http://en.wikipedia.org'
# Wikipedia will reject our request unless we add
# a 'User-Agent' attribute to our http header.
HEADERS = {'User-Agent': 'Mozilla/5.0'}
def get_Nobel_soup():
    """ Return a parsed tag tree of our Nobel prize page """
    # Make a request to the Nobel page, setting valid headers
    response = requests.get(
    BASE_URL + '/wiki/List_of_Nobel_laureates',
    headers=HEADERS)
    # Return the content of the response parsed by BeautifulSoup
    return BeautifulSoup(response.content, "lxml")

In [2]:
soup = get_Nobel_soup()

In [3]:
soup.find('table', {'class':'wikitable sortable'})

<table class="wikitable sortable">\n<tbody><tr>\n<th>Year\n</th>\n<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physics" title="List of Nobel laureates in Physics">Physics</a>\n</th>\n<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Chemistry" title="List of Nobel laureates in Chemistry">Chemistry</a>\n</th>\n<th width="18%"><a href="/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine" title="List of Nobel laureates in Physiology or Medicine">Physiology<br/>or Medicine</a>\n</th>\n<th width="16%"><a href="/wiki/List_of_Nobel_laureates_in_Literature" title="List of Nobel laureates in Literature">Literature</a>\n</th>\n<th width="16%"><a href="/wiki/List_of_Nobel_Peace_Prize_laureates" title="List of Nobel Peace Prize laureates">Peace</a>\n</th>\n<th width="15%"><a class="mw-redirect" href="/wiki/List_of_Nobel_laureates_in_Economics" title="List of Nobel laureates in Economics">Economics</a>\n</th></tr>\n<tr>\n<td align="center">1901\n</td>\n<td><span data-sort-v

In [4]:
def get_column_titles(table):
    """ Get the Nobel categories from the table header """
    cols = []
    for th in table.select_one('tr').select('th')[1:]:
        link = th.select_one('a')
        # Store the category name and any Wikipedia link it has
        if link:
            cols.append({'name':link.text,\
                        'href':link.attrs['href']})
        else:
            cols.append({'name':th.text, 'href':None})
    return cols

In [5]:
table = soup.select_one('table.sortable.wikitable')
get_column_titles(table)

[{'href': '/wiki/List_of_Nobel_laureates_in_Physics', 'name': u'Physics'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Chemistry', 'name': u'Chemistry'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Physiology_or_Medicine',
  'name': u'Physiologyor Medicine'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Literature',
  'name': u'Literature'},
 {'href': '/wiki/List_of_Nobel_Peace_Prize_laureates', 'name': u'Peace'},
 {'href': '/wiki/List_of_Nobel_laureates_in_Economics', 'name': u'Economics'}]

In [18]:
def get_Nobel_winners(table):
    cols = get_column_titles(table)
    winners = []
    for row in table.select('tr')[1:-1]:
        ## year = int(row.select_one('td').text) # Gets 1st <td>
        year = int(row.select_one('td').text[0:4])
        ##year = row.select_one('td').text
        for i, td in enumerate(row.select('td')[1:]):
            for winner in td.select('a'):
                href = winner.attrs['href']
                if not href.startswith('#endnote'):
                    winners.append({
                        'year':year,
                        'category':cols[i]['name'],
                        'name':winner.text,
                        'link':winner.attrs['href']
                    })
    return winners

In [19]:
get_Nobel_winners(table)

[{'category': u'Physics',
  'link': '/wiki/Wilhelm_R%C3%B6ntgen',
  'name': u'Wilhelm R\xf6ntgen',
  'year': 1901},
 {'category': u'Chemistry',
  'link': '/wiki/Jacobus_Henricus_van_%27t_Hoff',
  'name': u"Jacobus Henricus van 't Hoff",
  'year': 1901},
 {'category': u'Physiologyor Medicine',
  'link': '/wiki/Emil_Adolf_von_Behring',
  'name': u'Emil Adolf von Behring',
  'year': 1901},
 {'category': u'Literature',
  'link': '/wiki/Sully_Prudhomme',
  'name': u'Sully Prudhomme',
  'year': 1901},
 {'category': u'Peace',
  'link': '/wiki/Henry_Dunant',
  'name': u'Henry Dunant',
  'year': 1901},
 {'category': u'Peace',
  'link': '/wiki/Fr%C3%A9d%C3%A9ric_Passy',
  'name': u'Fr\xe9d\xe9ric Passy',
  'year': 1901},
 {'category': u'Physics',
  'link': '/wiki/Hendrik_Lorentz',
  'name': u'Hendrik Lorentz',
  'year': 1902},
 {'category': u'Physics',
  'link': '/wiki/Pieter_Zeeman',
  'name': u'Pieter Zeeman',
  'year': 1902},
 {'category': u'Chemistry',
  'link': '/wiki/Hermann_Emil_Fischer',