# Corpus compilation
- text type and genres
- characteristics according to task

In [1]:
import urllib3
import re
from bs4 import BeautifulSoup
import os
import justext

In [2]:
urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)
user_agent = {'user-agent': 'Mozilla/5.0 (Windows NT 6.1; Win64; x64)'}

http = urllib3.PoolManager(10, headers=user_agent)


class Crawler:
    
    def __init__(self, corpus_path, max_files, seed_url, url_pattern):
        self.corpus_path = corpus_path
        self.max_files = max_files
        self.seed_url = seed_url
        self.url_pattern = url_pattern
        self.visited_links = {}
        self.to_be_visited = []
        
        if not os.path.exists(self.corpus_path):
            os.makedirs(self.corpus_path)
        
    def crawl(self):
        first_urls = self.get_page(self.seed_url)
        self.add_links(first_urls)
        next_link = self.get_next_link()
        
        file_counter = 1
        while next_link and file_counter < self.max_files:
            links = self.get_page(next_link)
            self.add_links(links)
            next_link = self.get_next_link()
            file_counter += 1
    '''def get_links(self, page_data):
        found_links = re.search("http://g1\.globo\.com/politica/\d+" '''

    def get_page(self, url):
        print("getting page {}".format(url))
        response = http.request('GET', url)

        # store text content
        paragraphs = justext.justext(response.data, justext.get_stoplist("Portuguese"))
        with open("{}/{}.txt".format(self.corpus_path, url.replace(".", "_").replace("/","-")), "w") as output_file:
            for paragraph in paragraphs:
                if not paragraph.is_boilerplate:
                    output_file.write(paragraph.text)
        
        # get links
        soup = BeautifulSoup(response.data, 'html.parser')
        links = [link.get('href') for link in soup.findAll('a', attrs={'href': re.compile(self.url_pattern)})]
        
        return links

    def add_links(self, links):
        links = list(set(links))
        self.to_be_visited.extend([link for link in links if link not in self.visited_links])

    def get_next_link(self):
        next_link = self.to_be_visited.pop(0)
        self.visited_links[next_link] = None
        return next_link

In [7]:
crawler_educacao = Crawler("../Data/corpora/educacaoG1", 100, "https://g1.globo.com/educacao/", "^https://g1\.globo\.com/educacao/noticia/\d+")
crawler_politica = Crawler("../Data/corpora/politicaG1", 100, "https://g1.globo.com/politica/", "^https://g1\.globo\.com/politica/noticia/\d+")

In [8]:
crawler_politica.crawl()
crawler_educacao.crawl()


getting page https://g1.globo.com/politica/
getting page https://g1.globo.com/politica/noticia/2019/04/23/relator-no-stj-vota-por-manter-condenacao-mas-reduzir-pena-de-lula.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/23/ccj-da-camara-da-inicio-a-reuniao-para-votar-reforma-da-previdencia.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/23/conselho-suspende-promotor-que-disse-que-contrataria-desembargadora-como-faxineira.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/23/conselho-do-mp-vai-investigar-deltan-dallagnol-por-falas-contra-o-stf.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/23/previdencia-governadores-discutirao-com-deputados-texto-viavel-para-aprovacao-diz-witzel.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/23/maioria-da-5a-turma-do-stj-mantem-condenacao-mas-vota-pela-reducao-da-pena-de-lula.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/23/ccj-da-camara-da-inicio-a-

getting page https://g1.globo.com/politica/noticia/2019/04/15/apos-inverter-pauta-ccj-da-camara-da-aval-a-pec-do-orcamento.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/17/presidente-da-ccj-diz-que-votacao-da-previdencia-ficara-para-semana-que-vem-por-falta-de-acordo.ghtml
getting page https://g1.globo.com/politica/noticia/2019/03/26/presidente-da-ccj-diz-que-reforma-da-previdencia-sera-votada-em-17-de-abril-na-comissao.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/16/apos-12-horas-ccj-encerra-debate-sobre-previdencia-sessao-sera-retomada-nesta-quarta.ghtml
getting page https://g1.globo.com/politica/noticia/2019/04/15/apos-inverter-pauta-ccj-da-camara-da-aval-a-pec-do-orcamento.ghtml
getting page https://g1.globo.com/politica/noticia/2019/03/26/presidente-da-ccj-diz-que-reforma-da-previdencia-sera-votada-em-17-de-abril-na-comissao.ghtml
getting page https://g1.globo.com/politica/noticia/2019/02/14/reforma-da-previdencia-preve-idade-minima-de-65-ano

getting page https://g1.globo.com/educacao/noticia/2018/11/29/como-uma-autora-lutou-ha-mais-de-100-anos-para-pedro-coelho-virar-heroi-eterno-veja-no-podcast-livro-falante.ghtml
getting page https://g1.globo.com/educacao/noticia/2019/04/11/bolsonaro-assina-decreto-sobre-a-alfabetizacao-no-pais.ghtml
getting page https://g1.globo.com/educacao/noticia/2018/11/29/como-uma-autora-lutou-ha-mais-de-100-anos-para-pedro-coelho-virar-heroi-eterno-veja-no-podcast-livro-falante.ghtml
getting page https://g1.globo.com/educacao/noticia/2019/04/04/mec-estuda-priorizar-a-alfabetizacao-no-1o-e-nao-mais-no-2o-ano-do-ensino-fundamental.ghtml
getting page https://g1.globo.com/educacao/noticia/2019/03/26/ex-secretaria-do-mec-diz-que-demissao-e-preco-que-paga-por-educacao-de-qualidade.ghtml
getting page https://g1.globo.com/educacao/noticia/2019/03/25/secretaria-de-educacao-basica-do-mec-pede-demissao.ghtml
getting page https://g1.globo.com/educacao/noticia/2019/03/22/anunciada-como-numero-dois-do-mec-iolen

IndexError: pop from empty list