# BeautifulSoup
- great 'screen scraping' package
- tons of interesting data on webpages
- makes it easy to extract information from complex web pages and XML documents
- can figure out what to do by playing interactively
- [doc](https://www.crummy.com/software/BeautifulSoup/bs4/doc/)

In [1]:
# another way to do hamlet

import urllib.request
import collections
import bs4

# address of the course-hosted HTML copy of Hamlet (split only to keep lines short)
url = (
    'https://courseworks.columbia.edu/access/content/group/'
    'COMSW3101_002_2015_3/data/hamlet.html'
)

def hamlet(url):
    """Count the speeches of each speaker in an HTML copy of Hamlet.

    A speech is recognised by its anchor: an ``<a>`` element that has a
    ``name`` attribute starting with ``'speech'``.  The anchor's string is
    used as the speaker's name.

    Args:
        url: address of the HTML page to download and parse.

    Returns:
        A three-element list ``[anchor_count, speech_count, counts]`` where
        ``counts`` is a ``defaultdict(int)`` mapping each speaker name to
        the number of speech anchors carrying that name, ``anchor_count``
        is the number of matching anchors and ``speech_count`` is the sum
        of the values in ``counts``.
    """
    def is_speech_anchor(t):
        # <a name="speech..."> marks the start of one speech
        return t.name == 'a' and 'name' in t.attrs and t['name'].startswith('speech')

    # the with-block closes the HTTP response once the page has been parsed
    with urllib.request.urlopen(url) as page:
        sp = bs4.BeautifulSoup(page, 'html5lib', from_encoding='utf-8')

    al = sp.find_all(is_speech_anchor)
    cd = collections.defaultdict(int)
    for a in al:
        cd[a.string] += 1
    speeches = sum(cd.values())
    # no line count
    return [len(al), speeches, cd]
  
# result is [number of speech anchors, sum of the per-speaker counts, per-speaker counts]
hamlet(url)

[1150,
 1150,
 defaultdict(int,
             {'All': 4,
              'BERNARDO': 23,
              'CORNELIUS': 1,
              'Captain': 7,
              'Danes': 3,
              'FRANCISCO': 8,
              'First Ambassador': 1,
              'First Clown': 33,
              'First Player': 8,
              'First Priest': 2,
              'First Sailor': 2,
              'GUILDENSTERN': 33,
              'Gentleman': 3,
              'Ghost': 14,
              'HAMLET': 359,
              'HORATIO': 112,
              'KING CLAUDIUS': 102,
              'LAERTES': 62,
              'LORD POLONIUS': 86,
              'LUCIANUS': 1,
              'Lord': 3,
              'MARCELLUS': 36,
              'Messenger': 2,
              'OPHELIA': 58,
              'OSRIC': 25,
              'PRINCE FORTINBRAS': 6,
              'Player King': 4,
              'Player Queen': 5,
              'Prologue': 1,
              'QUEEN GERTRUDE': 69,
              'REYNALDO': 13,
            

# Want to find all the headlines on the front page of the [New York Times](http://nyt.com)
- html structure is quite complex
- would be very difficult to do with `str.find()` or regular expressions

In [2]:
# 'lxml' selects lxml's HTML parser (pass 'xml' instead for lxml's XML parser)
# must tell soup what encoding to use

from bs4 import BeautifulSoup

# fetch the front page; the with-block closes the HTTP response
# as soon as the soup has been built from it
with urllib.request.urlopen('http://nyt.com') as nf2:
    sp = BeautifulSoup(nf2, 'lxml', from_encoding='utf-8')

In [3]:
# headlines seem to be contained in 'h2' elements
# find_all() returns every match; look at a slice of ten of them

sp.find_all('h2')[10:20]

[<h2 class="story-heading"><a href="http://www.nytimes.com/2016/04/30/us/politics/indiana-republican-transgender-rights-bathroom.html">Cruz Seizes on Transgender Issue in Attacks on Trump</a></h2>,
 <h2 class="story-heading"><a href="http://www.nytimes.com/2016/04/29/us/politics/out-of-office-ex-speaker-john-boehner-gleefully-releases-mute-button.html">Boehner, Unbound and Speaking Freely</a></h2>,
 <h2 class="story-heading"><i class="icon"></i><a href="http://www.nytimes.com/2016/04/30/us/politics/obama-puts-his-weight-behind-smart-gun-technology.html">Obama Puts Weight Behind Smart Gun Technology</a> <time class="timestamp" data-eastern-timestamp="1:19 PM" data-utc-timestamp="1461950370" datetime="2016-04-29">1:19 PM ET</time></h2>,
 <h2 class="story-heading"><i class="icon"></i><a href="http://www.nytimes.com/2016/04/30/world/asia/north-korea-kim-dong-chul-sentence.html">North Korea Sentences American to 10 Years for Spying</a> </h2>,
 <h2 class="story-heading"><a href="http://www.n

In [4]:
# first 'h2' element
# (per the BeautifulSoup docs, attribute access sp.h2 is shorthand for sp.find('h2'))

h2 = sp.h2
h2

<h2 class="branding"><a href="http://www.nytimes.com/">
<svg aria-label="The New York Times" class="nyt-logo" height="64" role="img" width="379">
<image alt="The New York Times" border="0" height="64" src="https://a1.nyt.com/assets/homepage/20160427-131901/images/foundation/logos/nyt-logo-379x64.png" width="379" xlink:href="https://a1.nyt.com/assets/homepage/20160427-131901/images/foundation/logos/nyt-logo-379x64.svg"></image>
</svg>
</a></h2>

In [5]:
# can pull the first 'a' element out of 'h2' with find()
# this 'a' element is a link wrapping a picture (the svg logo), not headline text

a=h2.find('a')
a

<a href="http://www.nytimes.com/">
<svg aria-label="The New York Times" class="nyt-logo" height="64" role="img" width="379">
<image alt="The New York Times" border="0" height="64" src="https://a1.nyt.com/assets/homepage/20160427-131901/images/foundation/logos/nyt-logo-379x64.png" width="379" xlink:href="https://a1.nyt.com/assets/homepage/20160427-131901/images/foundation/logos/nyt-logo-379x64.svg"></image>
</svg>
</a>

In [6]:
# try pulling the first 'a' out of every 'h2' element
# looks like we get mostly headlines; find() gives None for an 'h2' with no 'a' inside

al = [heading.find('a') for heading in sp.find_all('h2')]
al[:20]

[<a href="http://www.nytimes.com/">
 <svg aria-label="The New York Times" class="nyt-logo" height="64" role="img" width="379">
 <image alt="The New York Times" border="0" height="64" src="https://a1.nyt.com/assets/homepage/20160427-131901/images/foundation/logos/nyt-logo-379x64.png" width="379" xlink:href="https://a1.nyt.com/assets/homepage/20160427-131901/images/foundation/logos/nyt-logo-379x64.svg"></image>
 </svg>
 </a>,
 None,
 None,
 None,
 None,
 None,
 <a href="http://www.nytimes.com/2016/04/30/world/asia/afghanistan-doctors-without-borders-hospital-strike.html">Punishing 16, Pentagon Says Mistakes Led to Hospital Attack</a>,
 <a href="http://www.nytimes.com/interactive/2015/11/25/world/asia/errors-us-airstrike-afghan-kunduz-msf-hospital.html">A Step-by-Step Look at the Errors Behind the Strike</a>,
 <a href="http://www.nytimes.com/2016/04/29/us/politics/hillary-clinton-donald-trump-women.html">Trump and Clinton Gear Up for a Race Defined by Gender</a>,
 <a href="http://www.nyti

In [7]:
# pull out the children of each 'a' link (for a headline, just the link text),
# skipping the None entries left by 'h2' elements that had no 'a'

[a.contents for a in al if a is not None][:30]

[['\n',
  <svg aria-label="The New York Times" class="nyt-logo" height="64" role="img" width="379">
  <image alt="The New York Times" border="0" height="64" src="https://a1.nyt.com/assets/homepage/20160427-131901/images/foundation/logos/nyt-logo-379x64.png" width="379" xlink:href="https://a1.nyt.com/assets/homepage/20160427-131901/images/foundation/logos/nyt-logo-379x64.svg"></image>
  </svg>,
  '\n'],
 ['Punishing 16, Pentagon Says Mistakes Led to Hospital Attack'],
 ['A Step-by-Step Look at the Errors Behind the Strike'],
 ['Trump and Clinton Gear Up for a Race Defined by Gender'],
 ['Protest Turns Violent at Trump Rally in Southern California'],
 ['Cruz Seizes on Transgender Issue in Attacks on Trump'],
 ['Boehner, Unbound and Speaking Freely'],
 ['Obama Puts Weight Behind Smart Gun Technology'],
 ['Justices Leave Texas Voter ID Law Intact'],
 ['North Korea Sentences American to 10 Years for Spying'],
 ['Buried in Microfilm, Whitman’s Health Tips'],
 ['At Small Colleges, Harsh Lesso

In [8]:
# filter out images: keep only links with exactly one child
# (len() of a tag is the number of its children, i.e. len(a.contents))

[a.contents for a in al if a is not None and len(a) == 1][:30]

[['Punishing 16, Pentagon Says Mistakes Led to Hospital Attack'],
 ['A Step-by-Step Look at the Errors Behind the Strike'],
 ['Trump and Clinton Gear Up for a Race Defined by Gender'],
 ['Protest Turns Violent at Trump Rally in Southern California'],
 ['Cruz Seizes on Transgender Issue in Attacks on Trump'],
 ['Boehner, Unbound and Speaking Freely'],
 ['Obama Puts Weight Behind Smart Gun Technology'],
 ['Justices Leave Texas Voter ID Law Intact'],
 ['North Korea Sentences American to 10 Years for Spying'],
 ['Buried in Microfilm, Whitman’s Health Tips'],
 ['At Small Colleges, Harsh Lessons About Cash Flow'],
 ['University in Turmoil Over Scalia Tribute and Koch Role'],
 ['Money, Race and Success: How Your School District Compares'],
 ['Surge in Palestinian Youths in Prison Tests Israel'],
 ['Notes From Aleppo: Glimpses of War-Ravaged Syria'],
 ['Study Finds Fewer Homeless on Streets of New York'],
 ['The Perks of Monotasking, a.k.a. ‘Paying Attention’'],
 ['Review: In Key & Peele’s ‘Ke