# Scraping with Beautiful Soup

http://www.crummy.com/software/BeautifulSoup/

In [49]:
import urllib

# Sample page that the find()/find_all() examples below are run against.
url = "http://static.decontextualize.com/kittens.html"

# urlopen() hands back a file-like handle. Close it explicitly once the body
# has been read rather than leaving the connection for the garbage collector.
response = urllib.urlopen(url)
try:
    html_str = response.read()
finally:
    response.close()

In [50]:
# Dump the raw markup so the structure we are about to scrape is visible.
print(html_str)

<!doctype html>
<html>
	<head>
		<title>Kittens!</title>
		<style type="text/css">
			span.lastcheckup { font-family: "Courier", fixed; font-size: 11px; }
		</style>
	</head>
	<body>
		<h1>Kittens and the TV Shows They Love</h1>
		<div class="kitten">
			<h2>Fluffy</h2>
			<div><img src="http://placekitten.com/120/120"></div>
			<ul class="tvshows">
				<li>
					<a href="http://www.imdb.com/title/tt0106145/">Deep Space Nine</a>
				</li>
				<li>
					<a href="http://www.imdb.com/title/tt0088576/">Mr. Belvedere</a>
				</li>
			</ul>
			Last check-up: <span class="lastcheckup">2014-01-17</span>
		</div>
		<div class="kitten">
			<h2>Monsieur Whiskeurs</h2>
			<div><img src="http://placekitten.com/110/110"></div>
			<ul class="tvshows">
				<li>
					<a href="http://www.imdb.com/title/tt0106179/">The X-Files</a>
				</li>
				<li>
					<a href="http://www.imdb.com/title/tt0098800/">Fresh Prince</a>
				</li>
			</ul>
			Last check-up: <span class="lastcheckup">2013-11-02</span>
		</div

In [51]:
from bs4 import BeautifulSoup

# Name the parser explicitly. Without it bs4 picks whichever parser happens to
# be installed, so the resulting tree can differ from machine to machine (and
# newer bs4 releases emit a warning). "html.parser" ships with Python.
document = BeautifulSoup(html_str, "html.parser")
print(type(document))



<class 'bs4.BeautifulSoup'>


### finding an individual tag

In [52]:
# find() returns only the first element matching the given tag name.
h1_tag = document.find("h1")
print(h1_tag)

<h1>Kittens and the TV Shows They Love</h1>


In [53]:
# .string is the text inside the tag, without the surrounding <h1>...</h1> markup.
h1_tag.string

u'Kittens and the TV Shows They Love'

In [54]:
# A tag's attributes are read with dictionary-style indexing.
img_tag = document.find("img")
print(img_tag["src"])

http://placekitten.com/120/120


In [55]:
# find_all() collects every match into a list-like result, so it can be
# printed whole or indexed like any other sequence.
img_tags = document.find_all("img")
print(img_tags)
print(img_tags[0])

[<img src="http://placekitten.com/120/120"/>, <img src="http://placekitten.com/110/110"/>]
<img src="http://placekitten.com/120/120"/>


In [56]:
# Every <h2> on the page holds one kitten's name.
h2_tags = document.find_all("h2")
for tag in h2_tags:
    print(tag.string)

Fluffy
Monsieur Whiskeurs


### particular attributes

In [57]:
# Narrow the search to <span> elements whose class attribute is "lastcheckup".
checkup_tags = document.find_all(
    "span",
    attrs={"class": "lastcheckup"},
)

In [58]:
# Each matching span contains a single check-up date as its text.
for tag in checkup_tags:
    print(tag.string)

2014-01-17
2013-11-02


### finding tags within tags

what I want to end up with:

```
[
    {"name": "Fluffy", "shows" : ["Deep Space Nine", "Mr. Belvedere"]},
    {"name": "Monsieur Whiskeurs", "shows": ["The X-Files", "Fresh Prince"]}
]
```

buuut let's get *just* the television shows first.

In [59]:
# Searching the whole document returns every show title, but loses track of
# which kitten each one belongs to.
a_tags = document.find_all("a")
for tag in a_tags:
    print(tag.string)

Deep Space Nine
Mr. Belvedere
The X-Files
Fresh Prince


In [60]:
# Build one {"name": ..., "shows": [...]} record per <div class="kitten">.
kittens = []
kitten_tags = document.find_all("div", attrs={"class": "kitten"})
for tag in kitten_tags:
    # Calling find()/find_all() on `tag` instead of `document` limits the
    # search to the descendants of this particular kitten's <div>.
    kitten_name = tag.find("h2").string
    kitten_shows = []
    for tvshow in tag.find_all("a"):
        kitten_shows.append(tvshow.string)
    kittens.append({"name": kitten_name, "shows": kitten_shows})
kittens

[{'name': u'Fluffy', 'shows': [u'Deep Space Nine', u'Mr. Belvedere']},
 {'name': u'Monsieur Whiskeurs', 'shows': [u'The X-Files', u'Fresh Prince']}]

# Finding siblings

In [61]:
# Flat markup: each <h2> name is followed by a <p> description at the same
# nesting level, so the pairs can only be matched up by walking siblings.
cheese_html = """
<h2>Camebert</h2>
<p>A soft cheese made in the Camebert region.</p>

<h2>Cheddar</h2>
<p>A yellow chees made in the Cheddar region.</p>
"""

# The mapping we want to end up with. It is kept under its own name so that it
# documents the goal without pre-filling the result variable.
expected_cheese_dict = {
    "Camebert": "A soft cheese made in the Camebert region.",
    "Cheddar": "A yellow chees made in the Cheddar region.",
}

# Start empty: every entry must come from the scraping loop further down,
# otherwise a broken loop would still appear to produce the right answer.
cheese_dict = {}


In [62]:
# General form of the call used below: find_next_sibling(<tag name>) looks for
# the next sibling element with that name.
# NOTE(review): "tag name" is a placeholder and `tag` is whatever an earlier
# loop left bound — confirm this cell is meant as a syntax note, not a real query.
tag.find_next_sibling("tag name")

In [63]:
# Re-binds `document` to the cheese snippet; the kittens page is no longer
# reachable through this name after this cell runs.
# The parser is named explicitly so the tree does not depend on which optional
# parsers happen to be installed ("html.parser" ships with Python).
document = BeautifulSoup(cheese_html, "html.parser")
h2_tags = document.find_all("h2")
[tag.string for tag in h2_tags]

[u'Camebert', u'Cheddar']

In [64]:
# Pair each heading with its description and store the result by cheese name.
for tag in h2_tags:
    # The description is the first <p> that follows this <h2> at the same level.
    p_tag = tag.find_next_sibling("p")
    cheese_dict[tag.string] = p_tag.string
cheese_dict

{'Camebert': u'A soft cheese made in the Camebert region.',
 'Cheddar': u'A yellow chees made in the Cheddar region.'}

# Practice

In [68]:
import requests

url2 = "http://hashtagdigilab.tumblr.com"

# Close the urlopen() handle once the body has been read instead of leaving
# the connection open until garbage collection.
# NOTE(review): the next cell downloads the same page again with requests and
# does not read `html_str` — confirm whether this fetch is still needed.
response = urllib.urlopen(url2)
try:
    html_str = response.read()
finally:
    response.close()


In [69]:
# Fetch the practice page with requests, reusing the URL defined in the
# previous cell instead of repeating the literal.
r = requests.get(url2)
# Fail loudly on a 4xx/5xx response rather than scraping an error page.
r.raise_for_status()

data = r.text

# Explicit parser: the result no longer depends on which optional parsers are
# installed ("html.parser" ships with Python).
soup = BeautifulSoup(data, "html.parser")

for link in soup.find_all('a'):
    # .get() returns None instead of raising when an <a> has no href
    # attribute, which is why a bare "None" can show up in the output.
    print(link.get('href'))

None
/
/
/
/archive
/innovation
/ddj
/socialmedia
/toolbox
/about
http://hashtagdigilab.tumblr.com/post/116823524393/ein-bisschen-perugia-fuer-alle-highlights-des
http://www.vanessawormer.de
http://www.journalismfestival.com
https://www.youtube.com/user/festivalgiornalismo/videos
http://hashtagdigilab.tumblr.com/post/116823524393/ein-bisschen-perugia-fuer-alle-highlights-des
http://hashtagdigilab.tumblr.com/tagged/DDJ
http://hashtagdigilab.tumblr.com/tagged/Datenjournalismus
http://hashtagdigilab.tumblr.com/tagged/Journalismus
http://hashtagdigilab.tumblr.com/tagged/Perugia
http://hashtagdigilab.tumblr.com/tagged/ijf15
http://hashtagdigilab.tumblr.com/post/116823524393/ein-bisschen-perugia-fuer-alle-highlights-des#notes
http://hashtagdigilab.tumblr.com/post/116823524393/ein-bisschen-perugia-fuer-alle-highlights-des
#
#
http://facebook.com/sharer.php?u=http%3A%2F%2Fhashtagdigilab.tumblr.com%2Fpost%2F116823524393%2Fein-bisschen-perugia-fuer-alle-highlights-des&t=Ein%20bisschen%20Perugia%