# Web Scraping com Python

## Introdução

Website: **https://pythonwebscraping.netlify.com/**

Começamos importando as bibliotecas essenciais para trabalharmos com **Web Scraping**

In [1]:
# A Biblioteca requests nos permite fazer requisições HTTP de forma a obtermos o conteúdo html
import requests 
# A biblioteca re nos permite trabalhar com expressões regulares para buscar padrões nos textos
import re

### Requisição HTTP

In [2]:
# Issue a GET request to fetch the website's HTML content.
# A timeout (in seconds) is passed explicitly: without one, requests waits
# indefinitely and a stalled server would hang the notebook kernel.
r = requests.get('https://pythonwebscraping.netlify.com/', timeout=10)
# Fail fast on 4xx/5xx responses instead of silently scraping an error page.
r.raise_for_status()
print(type(r))

<class 'requests.models.Response'>


In [3]:
# Inspect the 'requests.models.Response' object by listing its attribute names
dir(r)

['__attrs__',
 '__bool__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__enter__',
 '__eq__',
 '__exit__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__nonzero__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_content',
 '_content_consumed',
 '_next',
 'apparent_encoding',
 'close',
 'connection',
 'content',
 'cookies',
 'elapsed',
 'encoding',
 'headers',
 'history',
 'is_permanent_redirect',
 'is_redirect',
 'iter_content',
 'iter_lines',
 'json',
 'links',
 'next',
 'ok',
 'raise_for_status',
 'raw',
 'reason',
 'request',
 'status_code',
 'text',
 'url']

In [5]:
# Look at the response body; `content` exposes it as raw bytes (b'...')
r.content

b'<!DOCTYPE html>\n<html>\n<head>\n\t<meta charset="UTF-8">\n\t<title>Web Scraping Tutorial com Python</title>\n\t<link rel="icon" href="https://i.imgur.com/QOVnf5D.png">\n\t<style>\n\t.python {\n\t\tcolor: purple;\n\t}\n\t#titulo {\n\t\ttext-transform: uppercase;\n\t}\n\ttable {\n\t  border-collapse: collapse;\n\t}\n\n\ttable, th, td {\n\t  border: 1px solid black;\n\t  padding: 3px;\n\t}\n\t</style>\n</head>\n<body>\n\t<h1>Web Scraping</h1>\n\t<h2>Estrutura B\xc3\xa1sica HTML</h2>\n\t<img src="https://www.crummy.com/software/BeautifulSoup/bs4/doc/_images/6.1.jpg">\n\t<p>Aprendendo Web Scraping com <a href="https://www.python.org/">Python</a>,\n\t<a href="https://github.com/psf/requests-html">Requests-HTML</a>,\n\t<a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">Beautiful Soup</a> e\n\t<a href="https://scrapy.org/">Scrapy</a>\n\t</p>\n\n\t<p>\xe2\x80\x9cLogic will get you from A to Z; imagination will get you everywhere.\xe2\x80\x9d <b>Albert Einstein</b></p>\n\t\n\t<h

In [6]:
# Check the encoding reported for the response
r.encoding

'UTF-8'

In [7]:
# Look at the HTTP headers that came back with the response
r.headers

{'Cache-Control': 'public, max-age=0, must-revalidate', 'Content-Type': 'text/html; charset=UTF-8', 'Date': 'Tue, 22 Oct 2019 19:48:53 GMT', 'Etag': '"475162527e495373465cb3f752d1c3ff-ssl-df"', 'Strict-Transport-Security': 'max-age=31536000', 'Content-Encoding': 'gzip', 'Content-Length': '743', 'Age': '18870', 'Connection': 'keep-alive', 'Server': 'Netlify', 'Vary': 'Accept-Encoding', 'X-NF-Request-ID': 'b4580cd5-aca0-4b46-ab5a-ed9540b362d8-12694057'}

In [8]:
# Check the status code; 200 means the request succeeded
r.status_code

200

In [9]:
# Check the URL that was used for the request
r.url

'https://pythonwebscraping.netlify.com/'

In [9]:
# Finally, the 'text' attribute gives a string representing the HTML document we want to scrape
r.text

'<!DOCTYPE html>\n<html>\n<head>\n\t<meta charset="UTF-8">\n\t<title>Web Scraping Tutorial com Python</title>\n\t<link rel="icon" href="https://i.imgur.com/QOVnf5D.png">\n\t<style>\n\t.python {\n\t\tcolor: purple;\n\t}\n\t#titulo {\n\t\ttext-transform: uppercase;\n\t}\n\ttable {\n\t  border-collapse: collapse;\n\t}\n\n\ttable, th, td {\n\t  border: 1px solid black;\n\t  padding: 3px;\n\t}\n\t</style>\n</head>\n<body>\n\t<h1>Web Scraping</h1>\n\t<h2>Estrutura Básica HTML</h2>\n\t<img src="https://www.crummy.com/software/BeautifulSoup/bs4/doc/_images/6.1.jpg">\n\t<p>Aprendendo Web Scraping com <a href="https://www.python.org/">Python</a>,\n\t<a href="https://github.com/psf/requests-html">Requests-HTML</a>,\n\t<a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">Beautiful Soup</a> e\n\t<a href="https://scrapy.org/">Scrapy</a>\n\t</p>\n\n\t<p>“Logic will get you from A to Z; imagination will get you everywhere.” <b>Albert Einstein</b></p>\n\t\n\t<h3 id=\'titulo\'>Linguagens de 

In [10]:
# Keep the HTML content in a variable so the following cells can search it
html = r.text

### Buscando Dados com Expressões Regulares

#### Título da Página

In [11]:
# Capture the text between the opening and closing <title> tags.
# `.+?` is non-greedy, so each match stops at the first closing tag.
title_pattern = r"<title>(.+?)</title>"
title = re.findall(title_pattern, html)

In [12]:
# Display the list of captured page titles
title

['Web Scraping Tutorial com Python']

#### Parágrafos da Página

In [13]:
# Capture the inner content of every <p>...</p> element.
# re.DOTALL lets `.` match newlines, so paragraphs spanning several lines
# are captured; `.*?` is non-greedy so each match ends at the first </p>.
paragraph_pattern = r"<p>(.*?)</p>"
p = re.findall(paragraph_pattern, html, flags=re.DOTALL)

In [14]:
# Display the captured paragraph contents (inner HTML, tags included)
p

['Aprendendo Web Scraping com <a href="https://www.python.org/">Python</a>,\n\t<a href="https://github.com/psf/requests-html">Requests-HTML</a>,\n\t<a href="https://www.crummy.com/software/BeautifulSoup/bs4/doc/">Beautiful Soup</a> e\n\t<a href="https://scrapy.org/">Scrapy</a>\n\t',
 '“Logic will get you from A to Z; imagination will get you everywhere.” <b>Albert Einstein</b>']

#### Links da Página

In [15]:
# Capture the value of every href attribute: after `href=` and an optional
# quote, take characters up to the next quote, space or `>`.
# This matches any tag carrying an href, not only <a> elements.
href_pattern = r'href=[\'"]?([^\'" >]+)'
a = re.findall(href_pattern, html)

In [16]:
# Display the captured href values
a

['https://i.imgur.com/QOVnf5D.png',
 'https://www.python.org/',
 'https://github.com/psf/requests-html',
 'https://www.crummy.com/software/BeautifulSoup/bs4/doc/',
 'https://scrapy.org/']

#### Emails da Página

In [17]:
# Extract e-mail addresses from the page.
# The local part (before '@') accepts word characters plus '.', '+' and '-';
# without '+' and '-' an address such as first-last@host.com would be
# silently truncated to last@host.com. The domain part must end in a dot
# followed by word characters (the top-level domain).
# This is a pragmatic pattern for scraping, not a full RFC 5322 validator.
emails = re.findall(r'[\w.+-]+@[\w.-]+\.\w+', html)

In [18]:
# Display the e-mail addresses found on the page
emails

['alan@turing.com', 'john@voneumann.com', 'blaise@pascal.com']