# Web Scraping

In [1]:
from urllib.request import urlopen
# Fetch the sample page and print its raw bytes. The context manager closes
# the HTTP response as soon as the body has been read.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
    print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


In [2]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download the page and parse it; the response is closed automatically once
# its body has been handed to BeautifulSoup.
with urlopen('http://www.pythonscraping.com/pages/page1.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Attribute access on the soup returns the first matching tag (here <h1>).
print(bs.h1)

<h1>An Interesting Title</h1>


## Manejo de excepciones

In [16]:
from urllib.request import urlopen
from urllib.error import HTTPError

# NOTE(review): the saved output below also shows a line that this cell does
# not print - re-run the cell to refresh it.
try:
    html = urlopen('http://www.pythonscraping.com/pages/page1rsw.html')
except HTTPError as e:
    # The server answered with an HTTP error status (the saved output shows
    # a 404); report it instead of letting the cell fail.
    print(e)
else:
    # Only reached when the request succeeds: release the connection.
    html.close()



HTTP Error 404: Not Found
continua trabajando


In [23]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

try:
    html = urlopen('https://pythonscrapingthisurldoesnotexist.com')
except HTTPError as e:
    # The server was reached but replied with an HTTP error status.
    # HTTPError must be listed first because it is a subclass of URLError.
    print(e)
except URLError as e:
    # No usable response at all (e.g. the host name cannot be resolved).
    print('The server could not be found!')
else:
    print('It Worked')
    # The request succeeded, so release the open connection.
    html.close()


The server could not be found!


### Excepciones en la etiqueta

In [29]:
# `bs` is the soup parsed in an earlier cell.
try:
    # Accessing a tag that does not exist yields None, so chaining a second
    # attribute access on that None raises AttributeError.
    badcontent = bs.etiquetaNoExiste.Otraetiqueta
except AttributeError as e:
    print('Etiqueta no encontrada', e)
else:
    # The outer tag exists; the inner one may still be missing (None).
    if badcontent is None:
        print('Etiqueta no encontrada')
    else:
        print(badcontent)

Etiqueta no encontrada 'NoneType' object has no attribute 'Otraetiqueta'


In [32]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    """Return the first <h1> tag inside <body> of the page at ``url``.

    Returns None when the page cannot be fetched (HTTP error status or
    unreachable server/resource), when the document has no <body>, or when
    <body> contains no <h1>.
    """
    # Imported locally so the function does not depend on another cell
    # having imported URLError first.
    from urllib.error import URLError

    try:
        # The context manager closes the response once the body is read.
        with urlopen(url) as response:
            markup = response.read()
    except URLError:
        # URLError is the base class of HTTPError, so this covers both an
        # HTTP error status and a server that cannot be reached.
        return None

    try:
        bs = BeautifulSoup(markup, 'html.parser')
        # bs.body is None when there is no <body>; the chained attribute
        # access then raises AttributeError.
        title = bs.body.h1
    except AttributeError:
        return None

    return title

# getTitle returns None when the page or its heading is unavailable.
title = getTitle('http://www.pythonscraping.com/pages/page1.html')
if title is None:
    print('Title could not be found')
else:
    print(title)

<h1>An Interesting Title</h1>


In [8]:
# Print the list of every <div> tag found in `bs`, the soup object
# created in an earlier cell.
print(bs.find_all('div'))

[<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>]


## Advanced HTML Parsing

- *find_all(etiqueta, etiquetaAtributos)*: función que se utiliza para buscar a través de un archivo y retornar los elementos que coincidan con sus filtros
- *get_text()*: función que retorna solo el texto de un objeto BeautifulSoup, sin las etiquetas (tags)

In [12]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download and parse the page; the response is closed once it has been read.
# (The URL literal previously carried a stray leading space.)
with urlopen('http://www.pythonscraping.com/pages/warandpeace.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# Every <span class="green"> in the document.
nameList = bs.find_all('span', {'class':'green'})

# set() drops repeated entries; note that a set has no defined order, so the
# names are not printed in document order.
for name in set(nameList):
    print(name.get_text())
    

The prince
St. Petersburg
the Emperor
Rohans
Empress Marya
Fedorovna
Anatole
le Vicomte de Mortemart
Anna Pavlovna
Anna Pavlovna's
Baron
Funke
King of Prussia
Her Majesty
Anna
Pavlovna Scherer
Anna
Pavlovna
Abbe Morio
Prince Vasili Kuragin
the baron
Prince Vasili
the prince
Montmorencys
Wintzingerode
Dowager Empress Marya Fedorovna
the Empress


In [14]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Download and parse the page; the response is closed once it has been read.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html.read(), 'html.parser')

# .children walks only the direct children of the table with id="giftList".
for child in bs.find('table', {'id':'giftList'}).children:
    print(child)



<tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>


<tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>


<tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>


<tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gifts/img3.jpg"/>


## children y descendants
Los atributos `.children` y `.descendants` son iteradores: `.children` recorre únicamente los hijos directos de una etiqueta HTML, mientras que `.descendants` recorre todos sus descendientes (hijos, nietos, etc.).

In [21]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse straight from the response object; the context manager closes the
# connection once BeautifulSoup has consumed it.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# Number each direct child of the table with id="giftList" as it is printed.
for n, child in enumerate(bs.find('table', {'id':'giftList'}).children):
    print(n, child)



0 

1 <tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
2 

3 <tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
4 

5 <tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
6 

7 <tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gi

## next_siblings

In [29]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

# Parse straight from the response object; the context manager closes the
# connection once BeautifulSoup has consumed it.
with urlopen('http://www.pythonscraping.com/pages/page3.html') as html:
    bs = BeautifulSoup(html, 'html.parser')

# NOTE(review): iterating a tag yields its children, so this loop walks the
# nodes inside the first <tr> of the table, not its siblings. To walk the
# rows that follow the first one, iterate over `.tr.next_siblings` instead.
for n, cell in enumerate(bs.find('table', {'id':'giftList'}).tr):
    print(n, cell)

0 <th>
Item Title
</th>
1 <th>
Description
</th>
2 <th>
Cost
</th>
3 <th>
Image
</th>


In [30]:
# ejemplo
from bs4 import BeautifulSoup

html = """
<html>
  <head>
    <title>Prueba</title>
  </head>
  <body>
    <h1>Título</h1>
    <p>Este es el primer párrafo.</p>
    <p>Este es el segundo párrafo.</p>
    <p>Este es el tercer párrafo.</p>
  </body>
</html>
"""

# Build the parse tree from the inline document above.
soup = BeautifulSoup(html, 'html.parser')

# Take the first <p> tag in the document as the starting point.
primer_p = soup.find('p')

# Walk every node that follows the first <p> under the same parent and
# print each one.
siguientes = primer_p.next_siblings
for hermano in siguientes:
    print(hermano)




<p>Este es el segundo párrafo.</p>


<p>Este es el tercer párrafo.</p>




In [26]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
# Parse the local index.html; the context manager closes the file handle as
# soon as BeautifulSoup has consumed it.
with open('index.html', 'r') as html:
    bs = BeautifulSoup(html, 'html.parser')

# First <li> in the document; the nodes that follow it are inspected below.
sitiosTuristicos = bs.find('li')
print(list(sitiosTuristicos.next_siblings))

# Keep every following sibling except the bare newline text nodes.
result = [h for h in sitiosTuristicos.next_siblings if h != '\n']

for n, hijo in enumerate(result):
    print(n, hijo)


['\n', <p>Ubicacion: Centro de San Juan</p>, '\n', <li>el castrero</li>, '\n', <p>Ubicacion: Sureste</p>, '\n', <li>la puerta del llano</li>, '\n', <p>Ubicacion: Norte</p>, '\n']
0 <p>Ubicacion: Centro de San Juan</p>
1 <li>el castrero</li>
2 <p>Ubicacion: Sureste</p>
3 <li>la puerta del llano</li>
4 <p>Ubicacion: Norte</p>
