# This tutorial will show you how to scrape the web using Python

## The task is to get information about every faculty member in sociology from their department profiles. We must begin with a base URL from which we can access the profiles

In [1]:
URL = "http://www.soc.cornell.edu/people/faculty/"

## There are a couple of packages we need to import

In [2]:
import requests
from bs4 import BeautifulSoup as BS

In [3]:
html = requests.get(URL)

In [5]:
html.content

b'<!DOCTYPE html>\n<!--[if IE 7]>\n<html class="ie ie7" lang="en-US">\n<![endif]-->\n<!--[if IE 8]>\n<html class="ie ie8" lang="en-US">\n<![endif]-->\n<!--[if IE 9]>\n<html class="ie ie9" lang="en-US">\n<![endif]-->\n<!--[if !(IE 7) | !(IE 8) | !(IE 9)  ]><!-->\n<html lang="en-US">\n<!--<![endif]-->\n    <head>\n        <meta charset="UTF-8">\n        <title>Faculty | Department of Sociology</title>\n        <meta name="viewport" content="width=device-width, initial-scale=1, maximum-scale=5">\n        <meta http-equiv="X-UA-Compatible" content="IE=Edge">\n        <link rel="profile" href="http://gmpg.org/xfn/11">\n        <link rel="pingback" href="http://www.soc.cornell.edu/xmlrpc.php">\n        \n        <!-- CU Typekit fonts -->\n        <script type="text/javascript" src="//use.typekit.net/gog6dck.js"></script>\n        <script type="text/javascript">try{Typekit.load();}catch(e){}</script>\n        \n        \n        <link rel="stylesheet" type="text/css" media="print" href="http:

In [10]:
soup = BS(html.content, "html.parser")

## We'll need to do these steps quite a lot so its useful to abstract it with a function 

In [11]:
def getSoup(url):
    html = requests.get(url)
    soup = BS(html.content, "html.parser")
    return soup

## BeautifulSoup provides some useful functions to parse the raw html

In [13]:
links = soup.findAll('a', href=True) #Finds all 'a' tags with an href object (i.e. all hyperlinks)

In [14]:
links

[<a href="#content">Skip to main content</a>,
 <a href="http://www.cornell.edu/search/" title="more options">more options</a>,
 <a href="#" title="Search This Site">Search This Site</a>,
 <a href="http://blogs.cornell.edu/socy/">Home</a>,
 <a href="http://www.soc.cornell.edu/people/">People</a>,
 <a href="http://www.soc.cornell.edu/people/faculty/">Faculty and Field</a>,
 <a href="http://www.soc.cornell.edu/people/faculty#post-docs">Postdoctoral Fellows</a>,
 <a href="http://www.soc.cornell.edu/people/gradstudents/">Graduate Students</a>,
 <a href="http://www.soc.cornell.edu/people/phds-on-the-market/">PhDs on the Market</a>,
 <a href="http://www.soc.cornell.edu/people/job-openings/">Job Openings</a>,
 <a href="http://www.soc.cornell.edu/people/staff/">Staff</a>,
 <a href="http://blogs.cornell.edu/socy/events">Events</a>,
 <a href="http://blogs.cornell.edu/socy/events/">Department Events</a>,
 <a href="http://blogs.cornell.edu/socy/events/">Department Events – Current</a>,
 <a href="ht

In [18]:
#Let's take a look at one of these items 
links[20]

<a href="http://www.soc.cornell.edu/graduate/whos-who/">Who’s Who in the Graduate Program</a>

In [19]:
type(links[20])

bs4.element.Tag

In [20]:
dir(links[20])

['HTML_FORMATTERS',
 'XML_FORMATTERS',
 '__bool__',
 '__call__',
 '__class__',
 '__contains__',
 '__copy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__unicode__',
 '__weakref__',
 '_all_strings',
 '_attr_value_as_string',
 '_attribute_checker',
 '_find_all',
 '_find_one',
 '_formatter_for_name',
 '_is_xml',
 '_lastRecursiveChild',
 '_last_descendant',
 '_select_debug',
 '_selector_combinators',
 '_should_pretty_print',
 '_tag_name_matches_and',
 'append',
 'attribselect_re',
 'attrs',
 'can_be_empty_element',
 'childGenerator',
 'children',
 'clear',
 'contents',
 'decode',
 'decode_contents',
 'decompose',
 'descendants',


In [35]:
x = links[20]

In [36]:
x.contents

['Who’s Who in the Graduate Program']

In [37]:
x['href']

'http://www.soc.cornell.edu/graduate/whos-who/'

## After experimenting with the object and determining what we want, we can then loop through all the objects returned by the query

In [38]:
profiles = []
for l in links:
    if "/people/faculty/" in l['href']:
        profiles.append(l['href'])

In [17]:
profiles

['http://www.soc.cornell.edu/people/faculty/',
 'http://www.soc.cornell.edu/people/faculty/',
 'http://www.soc.cornell.edu/people/faculty/',
 'http://www.soc.cornell.edu/people/faculty/alvarado/',
 'http://www.soc.cornell.edu/people/faculty/berezin/',
 'http://www.soc.cornell.edu/people/faculty/bischoff/',
 'http://www.soc.cornell.edu/people/faculty/burton/',
 'http://www.soc.cornell.edu/people/faculty/bcornwell/',
 'http://www.soc.cornell.edu/people/faculty/garip/',
 'http://www.soc.cornell.edu/people/faculty/haskins/',
 'http://www.soc.cornell.edu/people/faculty/heckathorn/',
 'http://www.soc.cornell.edu/people/faculty/lawler/',
 'http://www.soc.cornell.edu/people/faculty/lichter/',
 'http://www.soc.cornell.edu/people/faculty/macy/',
 'http://www.soc.cornell.edu/people/faculty/maralani/',
 'http://www.soc.cornell.edu/people/faculty/nee/',
 'http://www.soc.cornell.edu/people/faculty/pinch/',
 'http://www.soc.cornell.edu/people/faculty/strang/',
 'http://www.soc.cornell.edu/people/facu

In [39]:
##We can remove the incorrect links by applying a conditional filter to profiles
profiles = [x for x in profiles if x.endswith('faculty/') == False]

In [40]:
profiles

['http://www.soc.cornell.edu/people/faculty/alvarado/',
 'http://www.soc.cornell.edu/people/faculty/berezin/',
 'http://www.soc.cornell.edu/people/faculty/bischoff/',
 'http://www.soc.cornell.edu/people/faculty/burton/',
 'http://www.soc.cornell.edu/people/faculty/bcornwell/',
 'http://www.soc.cornell.edu/people/faculty/garip/',
 'http://www.soc.cornell.edu/people/faculty/haskins/',
 'http://www.soc.cornell.edu/people/faculty/heckathorn/',
 'http://www.soc.cornell.edu/people/faculty/lawler/',
 'http://www.soc.cornell.edu/people/faculty/lichter/',
 'http://www.soc.cornell.edu/people/faculty/macy/',
 'http://www.soc.cornell.edu/people/faculty/maralani/',
 'http://www.soc.cornell.edu/people/faculty/nee/',
 'http://www.soc.cornell.edu/people/faculty/pinch/',
 'http://www.soc.cornell.edu/people/faculty/strang/',
 'http://www.soc.cornell.edu/people/faculty/swedberg/',
 'http://www.soc.cornell.edu/people/faculty/weeden/',
 'http://www.soc.cornell.edu/people/faculty/wethington/',
 'http://www.

In [41]:
#Note that there are many duplicates in the list...
print(len(profiles))
print(len(set(profiles)))

39
20


In [42]:
profiles = list(set(profiles))

## Now we have a list of URLs we can retrieve the information from each by looping through the list and applying the function we created. The results can be saved in a dictionary.

In [43]:
profile_contents = {}
for p in profiles:
    print("Getting information from: ", p)
    soup = getSoup(p)
    name = p.split('/')[-2]
    profile_contents[name] = soup

Getting information from:  http://www.soc.cornell.edu/people/faculty/scaldwell/
Getting information from:  http://www.soc.cornell.edu/people/faculty/burton/
Getting information from:  http://www.soc.cornell.edu/people/faculty/bcornwell/
Getting information from:  http://www.soc.cornell.edu/people/faculty/nee/
Getting information from:  http://www.soc.cornell.edu/people/faculty/strang/
Getting information from:  http://www.soc.cornell.edu/people/faculty/berezin/
Getting information from:  http://www.soc.cornell.edu/people/faculty/heckathorn/
Getting information from:  http://www.soc.cornell.edu/people/faculty/swedberg/
Getting information from:  http://www.soc.cornell.edu/people/faculty/garip/
Getting information from:  http://www.soc.cornell.edu/people/faculty/weeden/
Getting information from:  http://www.soc.cornell.edu/people/faculty/alvarado/
Getting information from:  http://www.soc.cornell.edu/people/faculty/haskins/
Getting information from:  http://www.soc.cornell.edu/people/fac

In [44]:
print(profile_contents.keys())

dict_keys(['bcornwell', 'maralani', 'swedberg', 'alvarado', 'lawler', 'macy', 'lichter', 'burton', 'garip', 'haskins', 'pinch', 'weeden', 'nee', 'wethington', 'scaldwell', 'bischoff', 'eycornwell', 'berezin', 'heckathorn', 'strang'])


In [45]:
#If we want to get the information for a particular professor we can look up their dictionary entry
macy = profile_contents['macy']
macy

<!DOCTYPE html>

<!--[if IE 7]>
<html class="ie ie7" lang="en-US">
<![endif]-->
<!--[if IE 8]>
<html class="ie ie8" lang="en-US">
<![endif]-->
<!--[if IE 9]>
<html class="ie ie9" lang="en-US">
<![endif]-->
<!--[if !(IE 7) | !(IE 8) | !(IE 9)  ]><!-->
<html lang="en-US">
<!--<![endif]-->
<head>
<meta charset="utf-8">
<title>Michael Macy | Department of Sociology</title>
<meta content="width=device-width, initial-scale=1, maximum-scale=5" name="viewport">
<meta content="IE=Edge" http-equiv="X-UA-Compatible">
<link href="http://gmpg.org/xfn/11" rel="profile">
<link href="http://www.soc.cornell.edu/xmlrpc.php" rel="pingback">
<!-- CU Typekit fonts -->
<script src="//use.typekit.net/gog6dck.js" type="text/javascript"></script>
<script type="text/javascript">try{Typekit.load();}catch(e){}</script>
<link href="http://www.soc.cornell.edu/wp-content/themes/cornell_base/style/print.css" media="print" rel="stylesheet" type="text/css"/>
<!--[if lt IE 9]>
            <link rel="stylesheet" type="te

In [46]:
macy.find('div', {'class': 'entry-content'})

<div class="entry-content">
<h4>Goldwin Smith Professor of Arts and Sciences, Department of <a href="http://infosci.cornell.edu/faculty/michael-macy">Information Science</a> and <a href="http://www.soc.cornell.edu">Sociology,</a> Director of the <a href="http://sdl.soc.cornell.edu/">Social Dynamics Laboratory</a><br/>
PhD 1985, Harvard University</h4>
<h4>Areas of Interest: Human Behavior, Social Interaction, Computational Social Science</h4>
<h4 style="text-align: center;"><strong>Contact</strong></h4>
<table style="border-color: #fbfbfb; background-color: #fbfbfb;" width="422">
<tbody>
<tr>
<td style="border-color: #fbfbfb; background-color: #fbfbfb; text-align: right; vertical-align: middle;">
<h4><a href="mailto:mwm14@cornell.edu">mwm14@cornell.edu</a><br/>
(607) 255-4555/ 4269<br/>
<a href="https://sites.google.com/site/michaelmacy14/home">Personal Site</a></h4>
</td>
<td style="border-color: #fbfbfb; background-color: #fbfbfb; vertical-align: middle;">
<h4 style="text-align: left

In [47]:
content = macy.find('div', {'class': 'entry-content'})
content.text

'\nGoldwin Smith Professor of Arts and Sciences, Department of Information Science and Sociology, Director of the Social Dynamics Laboratory\nPhD 1985, Harvard University\nAreas of Interest: Human Behavior, Social Interaction, Computational Social Science\nContact\n\n\n\n\nmwm14@cornell.edu\n(607) 255-4555/ 4269\nPersonal Site\n\n\n372 Uris Hall, 225 Gates Hall\nCornell University\nIthaca, NY 14853-7601\n\n\n\n\nResearch\nMichael Macy left the farm in Tennessee where he grew up to attend Harvard, where he received his B.A. and later Ph.D, along with an M.A. from Stanford. He is currently Goldwin Smith Professor of Arts and Sciences in Sociology and Director of the Social Dynamics Laboratory at Cornell, where he has worked since 1997. With support from the National Science Foundation, the Department of Defense, and Google, his research team has used computational models, online laboratory experiments, and digital traces of device-mediated interaction to explore familiar but enigmatic so

In [48]:
content_refined = content.findAll('h4')

In [49]:
content_refined[0]

<h4>Goldwin Smith Professor of Arts and Sciences, Department of <a href="http://infosci.cornell.edu/faculty/michael-macy">Information Science</a> and <a href="http://www.soc.cornell.edu">Sociology,</a> Director of the <a href="http://sdl.soc.cornell.edu/">Social Dynamics Laboratory</a><br/>
PhD 1985, Harvard University</h4>

In [50]:
titles = content_refined[0].text

In [51]:
titles.split('PhD')

['Goldwin Smith Professor of Arts and Sciences, Department of Information Science and Sociology, Director of the Social Dynamics Laboratory\n',
 ' 1985, Harvard University']

In [52]:
title_and_education = titles.split('PhD')

In [53]:
title = title_and_education[0]
education = title_and_education[1]
education = 'PhD'+education

In [54]:
title

'Goldwin Smith Professor of Arts and Sciences, Department of Information Science and Sociology, Director of the Social Dynamics Laboratory\n'

In [55]:
education

'PhD 1985, Harvard University'

## Let's tidy that up and make some functions we can reuse

In [56]:
def getFacultyInfo(soup):
    info = soup.find('div', {'class': 'entry-content'})
    return info

In [57]:
def getTitleAndEducation(info):
    info_refined = info.findAll('h4')
    titles = info_refined[0].text
    title_and_education = titles.split('PhD')
    title = title_and_education[0]
    education = 'PhD'+title_and_education[1]
    return title, education

In [58]:
macy = getFacultyInfo(profile_contents['macy'])
macy_te = getTitleAndEducation(macy)
print(macy_te[0], macy_te[1])

Goldwin Smith Professor of Arts and Sciences, Department of Information Science and Sociology, Director of the Social Dynamics Laboratory
 PhD 1985, Harvard University


In [59]:
heckathorn = getFacultyInfo(profile_contents['heckathorn'])
heckathorn_te = getTitleAndEducation(heckathorn)
print(heckathorn_te[0], heckathorn_te[1])

Professor
 PhD 1974, University of Kansas


In [60]:
garip = getFacultyInfo(profile_contents['garip'])
garip_te = getTitleAndEducation(garip)
print(garip_te[0], garip_te[1])

IndexError: list index out of range

In [61]:
garip

<div class="entry-content">
<h4>Professor<br/>
Ph.D. 2007, Princeton University</h4>
<h4><strong>Areas of Interest</strong>: Migration, Economic Sociology, Social Networks, Inequality</h4>
<h4 style="text-align: center;"><strong>Contact</strong></h4>
<table style="border-color: #FBFBFB; height: 100px;" width="422">
<tbody>
<tr>
<td style="border-color: #fbfbfb; vertical-align: middle;">
<h4 style="text-align: right;"><a href="mailto:fgarip@cornell.edu">fgarip@cornell.edu</a><br/>
(607) 255-4266 (department)</h4>
</td>
<td style="border-color: #fbfbfb; vertical-align: middle;">
<h4 style="text-align: left;">348 Uris Hall<br/>
Cornell University<br/>
Ithaca, NY 14853-7601</h4>
</td>
</tr>
</tbody>
</table>
<h4 style="text-align: center;"><strong>Research</strong></h4>
<p>Filiz Garip’s research lies at the intersection of migration, economic sociology and inequality. Within this general area, she studies the mechanisms that enable or constrain mobility and lead to greater or lesser degree

In [62]:
import string
def getTitleAndEducation(info):
    info_refined = info.findAll('h4')
    titles = info_refined[0].text
    titles = ''.join(x for x in titles if x not in string.punctuation)
    title_and_education = titles.split('PhD')
    title = title_and_education[0]
    education = 'PhD'+title_and_education[1]
    return title, education

In [63]:
getTitleAndEducation(garip)

('Professor\n', 'PhD 2007 Princeton University')

## Now let's see if that works for all cases

In [64]:
for prof in profile_contents:
    print("Getting info for: ", prof)
    try:
        info = getFacultyInfo(profile_contents[prof])
        te = getTitleAndEducation(info)
        print(prof, te[0], te[1], '\n')
    except:
        print("ERROR: Failed to get info from", prof)

Getting info for:  bcornwell
bcornwell Associate Professor Director of Graduate Studies
 PhD 2007 University of Chicago
Curriculum Vitae 

Getting info for:  maralani
maralani Associate Professor
 PhD 2006 University of California – Los Angeles 

Getting info for:  swedberg
swedberg Professor
 PhD 1978 Boston College
 Curriculum Vitae 

Getting info for:  alvarado
alvarado Assistant Professor
 PhD 2011 University of WisconsinMadison
Curriculum Vitae 

Getting info for:  lawler
lawler Professor of Industrial and Labor Relations and Sociology
 PhD 1972 University of Wisconsin
Personal Site 

Getting info for:  macy
macy Goldwin Smith Professor of Arts and Sciences Department of Information Science and Sociology Director of the Social Dynamics Laboratory
 PhD 1985 Harvard University 

Getting info for:  lichter
lichter Ferris Family Professor of Policy Analysis and Management Robert S Harrison Director of the Institute for Social Sciences
 PhD 1981 University of WisconsinMadison
 Curricul

## OK, so it looks like we got everybody's details except Kim Weeden's. Why? Can you fix the function to get hers too.

## We should probably get some more information. Complete this function to get the correct name for each faculty member

In [65]:
def getFacultyName(soup):
    name_info = soup.findAll('h1', {'class':'entry-title'})
    name = name_info[0].text
    return name

In [66]:
for prof in profile_contents:
    name = getFacultyName(profile_contents[prof])
    print(name)

Benjamin Cornwell
Vida Maralani
Richard Swedberg
Steven E. Alvarado
Edward J. Lawler
Michael Macy
Daniel T. Lichter
Diane Burton
Filiz Garip
Anna R. Haskins
Trevor Pinch
Kim Weeden
Victor Nee
Elaine Wethington
Steven Caldwell
Kendra Bischoff
Erin York Cornwell
Mabel Berezin
Douglas Heckathorn
David Strang


## Now we can put it all together to get a Python object containing info from each page

In [67]:
faculty_info = {}
for prof in profile_contents:
    print("Getting info for: ", prof)
    try:
        name = getFacultyName(profile_contents[prof])
        info = getFacultyInfo(profile_contents[prof])
        te = getTitleAndEducation(info)
        faculty_info[name] = {'title': te[0], 'education':te[1]}
    except:
        print("ERROR: Failed to get info from", prof)
    

Getting info for:  bcornwell
Getting info for:  maralani
Getting info for:  swedberg
Getting info for:  alvarado
Getting info for:  lawler
Getting info for:  macy
Getting info for:  lichter
Getting info for:  burton
Getting info for:  garip
Getting info for:  haskins
Getting info for:  pinch
Getting info for:  weeden
ERROR: Failed to get info from weeden
Getting info for:  nee
Getting info for:  wethington
Getting info for:  scaldwell
Getting info for:  bischoff
Getting info for:  eycornwell
Getting info for:  berezin
Getting info for:  heckathorn
Getting info for:  strang


In [68]:
faculty_info

{'Anna R. Haskins': {'education': 'PhD 2013 University of WisconsinMadison\n Curriculum Vitae',
  'title': 'Assistant Professor\n'},
 'Benjamin Cornwell': {'education': 'PhD 2007 University of Chicago\nCurriculum Vitae',
  'title': 'Associate Professor Director of Graduate Studies\n'},
 'Daniel T. Lichter': {'education': 'PhD 1981 University of WisconsinMadison\n Curriculum Vitae',
  'title': 'Ferris Family Professor of Policy Analysis and Management Robert S Harrison Director of the Institute for Social Sciences\n'},
 'David Strang': {'education': 'PhD 1988 Stanford University\n Curriculum Vitae',
  'title': 'Professor\n'},
 'Diane Burton': {'education': 'PhD 1996 Stanford University\n Curriculum Vitae',
  'title': 'Associate Professor of Human Resource Studies and Sociology\n'},
 'Douglas Heckathorn': {'education': 'PhD 1974 University of Kansas',
  'title': 'Professor\n'},
 'Edward J. Lawler': {'education': 'PhD 1972 University of Wisconsin\nPersonal Site',
  'title': 'Professor of 

## OK, this looks more ore less correct. Can you see any problems?

In [69]:
import pandas as pd
df = pd.DataFrame.from_dict(faculty_info, orient='index')

In [70]:
df

Unnamed: 0,education,title
Anna R. Haskins,PhD 2013 University of WisconsinMadison\n Curr...,Assistant Professor\n
Benjamin Cornwell,PhD 2007 University of Chicago\nCurriculum Vitae,Associate Professor Director of Graduate Studi...
Daniel T. Lichter,PhD 1981 University of WisconsinMadison\n Curr...,Ferris Family Professor of Policy Analysis and...
David Strang,PhD 1988 Stanford University\n Curriculum Vitae,Professor\n
Diane Burton,PhD 1996 Stanford University\n Curriculum Vitae,Associate Professor of Human Resource Studies ...
Douglas Heckathorn,PhD 1974 University of Kansas,Professor\n
Edward J. Lawler,PhD 1972 University of Wisconsin\nPersonal Site,Professor of Industrial and Labor Relations an...
Elaine Wethington,PhD 1987 University of Michigan\n Curriculum V...,Professor of Human Development and Sociology\n
Erin York Cornwell,PhD 2008 University of Chicago\n Curriculum Vitae,Assistant Professor Sesquicentennial Faculty F...
Filiz Garip,PhD 2007 Princeton University,Professor\n
