## Extracting Data using Web Scraping 

In [26]:
# import 
import requests    
from bs4 import BeautifulSoup 

In [27]:
# Sample HTML document used as the scraping target in the cells below.
# It contains an <h1> title, two <p> elements (ids "author" and "description")
# and a <table id="module"> whose first row holds the <th> column headers.
html_string = """
<!doctype html>
<html lang="en">
<head>
  <title>Doing Data Science With Python</title>
</head>
<body>
  <h1 style="color:#F15B2A;">Doing Data Science With Python</h1>
  <p id="author">Author : Onur Surucu</p>
  <p id="description">This course will help you to perform various data science activities using python.</p>
  
  <h3 style="color:#404040">Modules</h3>
  <table id="module" style="width:100%">
      <tr>
        <th>Title</th>
        <th>Duration (In Minutes)</th> 
      </tr>
      <tr>
        <td>Getting Started</td>
        <td>20</td> 
      </tr>
      <tr>
        <td>Setting up the Environment</td>
        <td>40</td> 
      </tr>
      <tr>
        <td>Extracting Data</td>
        <td>35</td> 
      </tr>
      <tr>
        <td>Exploring and Processing Data - Part 1</td>
        <td>45</td> 
      </tr>
      <tr>
        <td>Exploring and Processing Data - Part 2</td>
        <td>45</td> 
      </tr>
      <tr>
        <td>Building Predictive Model</td>
        <td>30</td> 
      </tr>
  </table>
</body>
</html>
"""

In [28]:
# Render the HTML string inline in the Jupyter notebook.
# `display` and `HTML` come from the public `IPython.display` module;
# importing `display` from `IPython.core.display` is deprecated.
from IPython.display import display, HTML
display(HTML(html_string))

Title,Duration (In Minutes)
Getting Started,20
Setting up the Environment,40
Extracting Data,35
Exploring and Processing Data - Part 1,45
Exploring and Processing Data - Part 2,45
Building Predictive Model,30


In [29]:
# Parse the HTML string with Beautiful Soup.
# The parser is named explicitly: without it Beautiful Soup warns and picks
# whichever parser happens to be installed, so results can differ between
# machines. "html.parser" ships with Python and needs no extra dependency.
ps = BeautifulSoup(html_string, "html.parser")

In [30]:
# print the parsed document held in `ps`
print(ps)

<!DOCTYPE html>
<html lang="en">
<head>
<title>Doing Data Science With Python</title>
</head>
<body>
<h1 style="color:#F15B2A;">Doing Data Science With Python</h1>
<p id="author">Author : Onur Surucu</p>
<p id="description">This course will help you to perform various data science activities using python.</p>
<h3 style="color:#404040">Modules</h3>
<table id="module" style="width:100%">
<tr>
<th>Title</th>
<th>Duration (In Minutes)</th>
</tr>
<tr>
<td>Getting Started</td>
<td>20</td>
</tr>
<tr>
<td>Setting up the Environment</td>
<td>40</td>
</tr>
<tr>
<td>Extracting Data</td>
<td>35</td>
</tr>
<tr>
<td>Exploring and Processing Data - Part 1</td>
<td>45</td>
</tr>
<tr>
<td>Exploring and Processing Data - Part 2</td>
<td>45</td>
</tr>
<tr>
<td>Building Predictive Model</td>
<td>30</td>
</tr>
</table>
</body>
</html>



In [31]:
# select the first <body> tag by its tag name
# (the tag name is find()'s first argument, passed positionally here)
body = ps.find("body")

In [32]:
# print the <body> element together with everything nested inside it
print(body)

<body>
<h1 style="color:#F15B2A;">Doing Data Science With Python</h1>
<p id="author">Author : Onur Surucu</p>
<p id="description">This course will help you to perform various data science activities using python.</p>
<h3 style="color:#404040">Modules</h3>
<table id="module" style="width:100%">
<tr>
<th>Title</th>
<th>Duration (In Minutes)</th>
</tr>
<tr>
<td>Getting Started</td>
<td>20</td>
</tr>
<tr>
<td>Setting up the Environment</td>
<td>40</td>
</tr>
<tr>
<td>Extracting Data</td>
<td>35</td>
</tr>
<tr>
<td>Exploring and Processing Data - Part 1</td>
<td>45</td>
</tr>
<tr>
<td>Exploring and Processing Data - Part 2</td>
<td>45</td>
</tr>
<tr>
<td>Building Predictive Model</td>
<td>30</td>
</tr>
</table>
</body>


In [33]:
# look up the first <h1> inside the body, then read its text content
# through the tag's `text` attribute
heading = body.find("h1")
print(heading.text)

Doing Data Science With Python


In [34]:
# find() returns only the first matching element, so a single <p> is printed
first_paragraph = body.find("p")
print(first_paragraph)

<p id="author">Author : Onur Surucu</p>


In [35]:
# get all matching elements as a list
# (find_all is the current method name; findAll is a deprecated alias)
print(body.find_all(name="p"))

[<p id="author">Author : Onur Surucu</p>, <p id="description">This course will help you to perform various data science activities using python.</p>]


In [36]:
# loop over every <p> element in the body and print its text content
# (find_all is the current method name; findAll is a deprecated alias)
for p in body.find_all(name="p"):
    print(p.text)

Author : Onur Surucu
This course will help you to perform various data science activities using python.


In [12]:
# Narrow the selection with an attribute filter:
# match only the <p> element whose id attribute is "author".
print(body.find("p", id="author"))

<p id="author">Author : Onur Surucu</p>


In [13]:
# same attribute filter, this time matching the <p> whose id is "description"
print(body.find("p", id="description"))

<p id="description">This course will help you to perform various data science activities using python.</p>


In [14]:
# body
body = ps.find(name="body")
# module table
module_table = body.find(name='table', attrs={"id": "module"})
# iterate over the data rows; the first <tr> is skipped because it holds
# the <th> header cells rather than <td> data cells
for row in module_table.find_all(name='tr')[1:]:
    # fetch the row's cells once instead of searching the row per column
    cells = row.find_all(name='td')
    # module title
    title = cells[0].text
    # module duration, converted from the cell text to an integer
    duration = int(cells[1].text)
    # print() as a function: the Python 2 statement form `print a, b`
    # is a SyntaxError under Python 3, which the other cells already use
    print(title, duration)

Getting Started 20
Setting up the Environment 40
Extracting Data 35
Exploring and Processing Data - Part 1 45
Exploring and Processing Data - Part 2 45
Building Predictive Model 30
