<h1 align=center>Capítulo 1 - Começando com Scraping</h1>
<p align =center><img src=https://miro.medium.com/max/1000/1*75AH4zD0r5CNtO_Zpf_epg.jpeg width=500></p>

## Capturando dados com BeautifulSoup e Requests

In [3]:
import requests
from bs4 import BeautifulSoup

url = 'https://www.python.org/events/python-events/'

# Perform the HTTP GET. requests applies no timeout by default, so an explicit
# one keeps this cell from blocking forever if the server never answers.
req = requests.get(url, timeout=10)
req

<Response [200]>

In [6]:
# Peek at the response body as text: `.text` is sliced to its first 200 characters
req.text[:200]

'<!doctype html>\n<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->\n<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->\n<!--[if IE 8]>      <h'

In [7]:
# Same 200-unit slice taken from `.content`; the output is a bytes literal (b'...') instead of a str
req.content[:200]

b'<!doctype html>\n<!--[if lt IE 7]>   <html class="no-js ie6 lt-ie7 lt-ie8 lt-ie9">   <![endif]-->\n<!--[if IE 7]>      <html class="no-js ie7 lt-ie8 lt-ie9">          <![endif]-->\n<!--[if IE 8]>      <h'

In [14]:
# Parse the HTML downloaded in the first cell (the 'lxml' parser needs the lxml package installed).
soup = BeautifulSoup(req.text, 'lxml')

# Every upcoming event is an <li> inside <ul class="list-recent-events">.
# find_all is the current name of the legacy camelCase alias findAll.
events = soup.find('ul', {'class': 'list-recent-events'}).find_all('li')

for event in events:
    # One dict per event: title link text, location span text, <time> text.
    event_details = {
        'name': event.find('h3').find('a').text,
        'location': event.find('span', {'class': 'event-location'}).text,
        'time': event.find('time').text,
    }
    print(event_details)

{'name': 'PyCon UK 2022', 'location': 'Cardiff City Hall, UK', 'time': '16 Sept. – 18 Sept.  2022'}
{'name': 'DjangoCon Europe 2022', 'location': 'Porto, Portugal', 'time': '21 Sept. – 25 Sept.  2022'}
{'name': 'PyCon Portugal 2022', 'location': 'Porto, Portugal', 'time': '24 Sept. 2022'}
{'name': '9th Conference of Scientific Python Latinamerica', 'location': 'Salta, Argentina', 'time': '26 Sept. – 28 Sept.  2022'}
{'name': 'PyConEs - Granada', 'location': '', 'time': '30 Sept. – 02 Oct.  2022'}
{'name': 'PyCon MEA @ Global DevSlam 2022', 'location': 'Dubai, UAE', 'time': '10 Oct. – 13 Oct.  2022'}


## Capturando os dados com urllib3 e BeautifulSoup

In [16]:
import urllib3

# Criando uma Função para capturar

def get_upcoming_events(url):
    """Download ``url`` with urllib3 and print one dict per listed event.

    The page is expected to contain a ``<ul class="list-recent-events">``
    whose ``<li>`` children each hold an ``<h3><a>`` title, a
    ``<span class="event-location">`` and a ``<time>`` element.

    Args:
        url: Address of the events page to fetch.

    Returns:
        None. Each event is printed as a dict with the keys ``'name'``,
        ``'location'`` and ``'time'``.

    Raises:
        AttributeError: If any of the expected elements is missing, because
            ``find`` returns ``None`` in that case.
    """
    http = urllib3.PoolManager()
    response = http.request('GET', url)

    # response.data holds the raw body bytes; BeautifulSoup decodes them itself.
    soup = BeautifulSoup(response.data, 'html.parser')
    events = soup.find('ul', {'class': 'list-recent-events'}).find_all('li')

    for event in events:
        event_details = {
            'name': event.find('h3').find('a').text,
            # attrs must be a dict ({'class': ...}); the previous set literal
            # {'class', 'event-location'} only matched by coincidence.
            'location': event.find('span', {'class': 'event-location'}).text,
            'time': event.find('time').text,
        }
        print(event_details)

# Run the scraper against the `url` defined in the first code cell
get_upcoming_events(url)

{'name': 'PyCon UK 2022', 'location': 'Cardiff City Hall, UK', 'time': '16 Sept. – 18 Sept.  2022'}
{'name': 'DjangoCon Europe 2022', 'location': 'Porto, Portugal', 'time': '21 Sept. – 25 Sept.  2022'}
{'name': 'PyCon Portugal 2022', 'location': 'Porto, Portugal', 'time': '24 Sept. 2022'}
{'name': '9th Conference of Scientific Python Latinamerica', 'location': 'Salta, Argentina', 'time': '26 Sept. – 28 Sept.  2022'}
{'name': 'PyConEs - Granada', 'location': '', 'time': '30 Sept. – 02 Oct.  2022'}
{'name': 'PyCon MEA @ Global DevSlam 2022', 'location': 'Dubai, UAE', 'time': '10 Oct. – 13 Oct.  2022'}


In [17]:
import scrapy
from scrapy.crawler import CrawlerProcess


class PythonEventsSpider(scrapy.Spider):
    """Spider that collects the events listed on python.org's events page.

    Scraped events are appended to ``found_events`` as dicts with the keys
    ``'name'``, ``'location'`` and ``'time'``.
    """

    name = 'pythoneventsspider'
    start_urls = ['https://www.python.org/events/python-events/',]
    # Class-level list: shared by every instance of this spider.
    found_events = []

    def parse(self, response):
        """Extract one dict per ``<li>`` of the recent-events list in ``response``.

        Each XPath is evaluated relative to the ``<li>`` node; ``.get()``
        yields the first matching text node, or ``None`` when nothing matches.
        """
        for event in response.xpath('//ul[contains(@class, "list-recent-events")]/li'):
            event_details = dict()
            event_details['name'] = event.xpath('h3[@class="event-title"]/a/text()').get()
            event_details['location'] = event.xpath('p/span[@class="event-location"]/text()').get()
            event_details['time'] = event.xpath('p/time/text()').get()
            self.found_events.append(event_details)


# The driver must live at module level: nested inside parse() it could never
# run, because parse() is only invoked once a crawl has already been started.
if __name__ == "__main__":
    # Only log errors so the printed events are not buried in Scrapy's log output.
    process = CrawlerProcess({'LOG_LEVEL': 'ERROR'})
    process.crawl(PythonEventsSpider)
    # Keep a handle on the spider instance scheduled above so its results can
    # be read back after the crawl.
    spider = next(iter(process.crawlers)).spider
    # Runs the crawl; control returns here when it has finished.
    process.start()
    for event in spider.found_events:
        print(event)