# Web Scraping

In [10]:
from urllib.request import urlopen
html = urlopen('http://pythonscraping.com/pages/page1.html')
print(html.read())

b'<html>\n<head>\n<title>A Useful Page</title>\n</head>\n<body>\n<h1>An Interesting Title</h1>\n<div>\nLorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.\n</div>\n</body>\n</html>\n'


## Ejecutar BeautifulSoup

In [5]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page1.html')
bs =BeautifulSoup(html.read(), 'html.parser')
print(bs.h1)

<h1>An Interesting Title</h1>


## find_all y get_tex()
**find_all(etiqueta, atributos)**: retorna todo los elementos que coincidan con los filtros.

**find()**: retorna solo el primer elementos que coincide con los filtros.

**get_text()**: retorna solo el texto, sin etiquetas

In [9]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://gamaenlinea.com/VIVERES/Aceites-y-aderezos/Mayonesas/MAYONESA-NATURAL-HEINZ-370-GR/p/10034430')
bs = BeautifulSoup(html.read(), 'html.parser')
namelist = bs.find_all('span', {'class':'nav-items-total'})
for name in namelist:
    print(name.get_text())


0 Artículos
0 Artículos


### Script para paginas que tienen proteccion contra el agente de usuario de python

Una posible solucion es cambiar el User-Agent para que python se parezca a un navegador como Mozilla.

In [10]:
import urllib.request
from bs4 import BeautifulSoup
url = 'https://vallearriba.elplazas.com/huevos-en-estuche-de-12und.html'
hdr = { 'User-Agent' : 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)' }

req = urllib.request.Request(url, headers=hdr)
response = urllib.request.urlopen(req)
bs = BeautifulSoup(response, 'html.parser')
print(bs.title)

<title>HUEVOS EN ESTUCHE DE 12 UNIDADES</title>


## Manejo de exepciones

In [3]:
from urllib.request import urlopen
from urllib.error import HTTPError

try:
    html = urlopen('http://www.pythonscraping.com/pages/page1x.html')
except HTTPError as e:
    print(e)



HTTP Error 404: Not Found


In [8]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

try:
    html = urlopen('https://garmaenlinea.com/BEBIDAS/Cervezas/Nacionales/CERVEZA-POLAR-TIPO-PILSEN-BOTELLA-0%2C355-LT/p/40005236')
except HTTPError as e:
    print('A ocurrido un error', e)
except URLError as e:
    print('servidor no encontrado ', e)
else:
    print('Bien')

servidor no encontrado  <urlopen error [Errno 11001] getaddrinfo failed>


In [11]:
from urllib.request import urlopen
from urllib.error import HTTPError
from urllib.error import URLError

try:
    html = urlopen('https://pythonscrapingthisurldoesnotexist.com')
except HTTPError as e:
    print(e)
except URLError as e:
    print('The server could not be found!')
else:
    print('It Worked')


The server could not be found!


### Excepciones en la etiqueta

In [11]:
try:
    badcontent = bs.etiquetaNoExiste.Otraetiqueta
except AttributeError as e:
    print('Etiqueta no encontrada', e)
else:
    if badcontent == None:
        print('Etiqueta no encontrada')
    else:
        print(badcontent)

Etiqueta no encontrada 'NoneType' object has no attribute 'Otraetiqueta'


In [32]:
from urllib.request import urlopen
from urllib.error import HTTPError
from bs4 import BeautifulSoup

def getTitle(url):
    try:
        html = urlopen(url)
    except HTTPError as e:
        return None
    
    try:
        bs = BeautifulSoup(html.read(), 'html.parser')
        title = bs.body.h1
    except AttributeError as e:
        return None
    
    return title

title = getTitle('http://www.pythonscraping.com/pages/page1.html')
if title == None:
    print('Title could not be found')
else:
    print(title)

<h1>An Interesting Title</h1>


In [8]:
print(bs.find_all('div'))

[<div>
Lorem ipsum dolor sit amet, consectetur adipisicing elit, sed do eiusmod tempor incididunt ut labore et dolore magna aliqua. Ut enim ad minim veniam, quis nostrud exercitation ullamco laboris nisi ut aliquip ex ea commodo consequat. Duis aute irure dolor in reprehenderit in voluptate velit esse cillum dolore eu fugiat nulla pariatur. Excepteur sint occaecat cupidatat non proident, sunt in culpa qui officia deserunt mollit anim id est laborum.
</div>]


## Advanced HTML Parsing

- *find_all(etiqueta, etiquetaAtributos)*: funcion que se utiliza para buscar a traves de un archivo y retornar los elementos que coincidan con sus filtros
- *find*: retorna solo el primer elemnto que conincide con los filtros
- *get_text()*: funcion que retorna solo el texto de un objeto beautifulSoup sin las etiquetas o tags

.find_all(['h1','h2','h3'])

In [None]:
# retornar una lista de varias etiquetas
#.find_all(['h1','h2','h3'])

# filtrando por etiqueta, y varios atributos
#.find_all('span', {'class': {'grenn', 'red'}})

# busqueda por texto
#.find_all(string='the text')

# filtrar por keywords
#.find_all(id='cosa', class_='algo') # class con _

In [38]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://www.pythonscraping.com/pages/warandpeace.html')
bs = BeautifulSoup(html.read(), 'html.parser')

# busqueda por etiqueta y atributo
nameList = bs.find_all('span', {'class':'green'})

for name in nameList:
    print(name.get_text())


Anna
Pavlovna Scherer
Empress Marya
Fedorovna
Prince Vasili Kuragin
Anna Pavlovna
St. Petersburg
the prince
Anna Pavlovna
Anna Pavlovna
the prince
the prince
the prince
Prince Vasili
Anna Pavlovna
Anna Pavlovna
the prince
Wintzingerode
King of Prussia
le Vicomte de Mortemart
Montmorencys
Rohans
Abbe Morio
the Emperor
the prince
Prince Vasili
Dowager Empress Marya Fedorovna
the baron
Anna Pavlovna
the Empress
the Empress
Anna Pavlovna's
Her Majesty
Baron
Funke
The prince
Anna
Pavlovna
the Empress
The prince
Anatole
the prince
The prince
Anna
Pavlovna
Anna Pavlovna


## Encontrar etiquetas basado en la ubicacion

## chidren() y descendants()
El metodo children devuelve una lista de todos los hijos directos de una etiqueta html, mientras que descendants() devuelve una lista con todos los decendiendes de la etiqueta.

In [18]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = urlopen('https://www.pythonscraping.com/pages/page3.html')

bs = BeautifulSoup(html, 'html.parser')
for e, child in enumerate(bs.find('table', {'id':'giftList'}).children):
    print(e, child)

0 

1 <tr><th>
Item Title
</th><th>
Description
</th><th>
Cost
</th><th>
Image
</th></tr>
2 

3 <tr class="gift" id="gift1"><td>
Vegetable Basket
</td><td>
This vegetable basket is the perfect gift for your health conscious (or overweight) friends!
<span class="excitingNote">Now with super-colorful bell peppers!</span>
</td><td>
$15.00
</td><td>
<img src="../img/gifts/img1.jpg"/>
</td></tr>
4 

5 <tr class="gift" id="gift2"><td>
Russian Nesting Dolls
</td><td>
Hand-painted by trained monkeys, these exquisite dolls are priceless! And by "priceless," we mean "extremely expensive"! <span class="excitingNote">8 entire dolls per set! Octuple the presents!</span>
</td><td>
$10,000.52
</td><td>
<img src="../img/gifts/img2.jpg"/>
</td></tr>
6 

7 <tr class="gift" id="gift3"><td>
Fish Painting
</td><td>
If something seems fishy about this painting, it's because it's a fish! <span class="excitingNote">Also hand-painted by trained monkeys!</span>
</td><td>
$10,005.00
</td><td>
<img src="../img/gi

## next_sibling y next_siblings()
Permite obtener el siguiente hermano(s) de un elemento del arbol de un documento HTML o XML

In [24]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('http://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')

for n, sibling in enumerate(bs.find('table', {'id':'giftList'}).tr.next_sibling):
    print(n, sibling)

0 



## previous_siblings
Permite obtener los hermanos anteriores de un elemento del documento html oxml

In [37]:
# ejemplo
from bs4 import BeautifulSoup

html = """
<html>
  <head>
    <title>Prueba</title>
  </head>
  <body>
    <h1>Título</h1>
    <p id=0>Este es el parrafo cero</p>
    <p id=1>Este es el primer párrafo.</p>
    <p id=2>Este es el segundo párrafo.</p>
    <p id=3>Este es el tercer párrafo.</p>
  </body>
</html>
"""

soup = BeautifulSoup(html, 'html.parser')

# Obtenemos la primera etiqueta p
primer_p = soup.find('p',{'id':1})

# Recorremos todos los hermanos siguientes de la primera etiqueta p
for hermano in primer_p.next_siblings:
    print(hermano)

# elementos previos
print('---------------------------')
for hermano in primer_p.previous_siblings:
    print(hermano)




<p id="2">Este es el segundo párrafo.</p>


<p id="3">Este es el tercer párrafo.</p>


---------------------------


<p id="0">Este es el parrafo cero</p>


<h1>Título</h1>




In [19]:
from urllib.request import urlopen
from bs4 import BeautifulSoup
html = open('index.html','r')
# html = urlopen('index.html')
bs = BeautifulSoup(html, 'html.parser')
sitiosTuristicos = bs.find('li')
# print(sitiosTuristicos)
print(list(sitiosTuristicos.next_siblings))

result =[]
for h in sitiosTuristicos.next_siblings:
    if h != '\n':
        result.append(h)

for n, hijo in enumerate(result):
    print(n, hijo)


['\n', <p>Ubicacion: Centro de San Juan</p>, '\n', <li>el castrero</li>, '\n', <p>Ubicacion: Sureste</p>, '\n', <li>la puerta del llano</li>, '\n', <p>Ubicacion: Norte</p>, '\n']
0 <p>Ubicacion: Centro de San Juan</p>
1 <li>el castrero</li>
2 <p>Ubicacion: Sureste</p>
3 <li>la puerta del llano</li>
4 <p>Ubicacion: Norte</p>


## parent and parents
Permiten obtener los elementos que estan por encima de otro elemnto del documento html o xml

In [47]:
html = '''
<div id="pages">
  <ul>
    <li class="active"><a href="example.com">Example</a></li>
    <li><a href="example.com">Example</a></li>
    <li><a href="example1.com">Example 1</a></li>
    <li><a href="example2.com">Example 2</a></li>
  </ul>
</div>
'''
from bs4 import BeautifulSoup
soup = BeautifulSoup(html, 'html.parser')
page = soup.find('li', {'class', 'active'})
padres = page.parents
for p in padres:
    print(p.name)

ul
div
[document]


In [53]:
from urllib.request import urlopen
from bs4 import BeautifulSoup

html = urlopen('https://www.pythonscraping.com/pages/page3.html')
bs = BeautifulSoup(html, 'html.parser')
print(bs.find('img', {'src':'../img/gifts/img1.jpg'}).parent.previous_sibling.get_text())



$15.00



## Solo Pruebas

investigar:
metodo name
metodo prettify()

In [34]:
from urllib.request import  urlopen
from bs4 import  BeautifulSoup
html = urlopen('https://gamaenlinea.com/VIVERES/Panes/Salados/PAN-DE-SANDWICH-BLANCO-HOLSUM-420-GR/p/10034381')

bs = BeautifulSoup(html.read(), 'html.parser')
precio = bs.find('div', {'class':'from-price-value'})
print(precio)

<div class="from-price-value" style="font-weight: bold;">Total Ref. 2,15</div>
