#### Libraries

In [None]:
import os
import json

import requests
import psycopg2
from bs4 import BeautifulSoup

#### Pensador Scraper

In [None]:
class PensadorScraper:
  """Crawl pensador.com and collect the phrases found on each page.

  Attributes:
    links: maps each parsed relative URL to the internal links found on it.
    phrases: maps a phrase id to a dict describing that phrase.
    next_links: stack of relative URLs still to visit (popped from the end).
    previous_links: set of relative URLs that were already requested.
    home: base URL prepended to every relative URL.
  """

  # Seconds to wait for a response before giving up on a page.
  REQUEST_TIMEOUT = 30

  def __init__(self):
    self.links = {}
    self.phrases = {}
    self.next_links = ['/']
    self.previous_links = set()
    self.home = 'https://www.pensador.com'
    # Every URL that was ever queued, so a page is not queued over and over
    # again by each page that links to it.
    self._queued = set(self.next_links)

  @staticmethod
  def _is_crawlable(href):
    """Return True for site-relative links worth following.

    A link is followed when it starts with '/', does not contain '.php'
    and does not point to a single-phrase page ('/frase...').
    An empty href is rejected instead of raising.
    """
    return (href.startswith('/') and
            '.php' not in href and
            not href.startswith('/frase'))

  @staticmethod
  def _parse_shares(text):
    """Convert a share-count label into a number.

    Examples: '35 compartilhamentos' -> 35.0, '2 mil compartilhamentos'
    -> 2000.0. Returns 0 when the label cannot be parsed.
    """
    # Plural first: stripping the singular first would leave a stray 's'.
    for suffix in (' compartilhamentos', ' compartilhamento'):
      text = text.replace(suffix, '')
    multiplier = 1
    if ' mil' in text:
      text = text.replace(' mil', '')
      multiplier = 1000
    try:
      # NOTE(review): a comma is assumed to be a decimal separator
      # (e.g. '1,5 mil') - confirm against the live pages.
      return float(text.replace(',', '.')) * multiplier
    except ValueError:
      return 0

  def get_links(self, url, soup):
    """Record the internal links of a page and queue the unseen ones.

    Args:
      url: relative URL of the page that was parsed.
      soup: parsed page (BeautifulSoup object).
    """
    # href=True skips anchors that have no href attribute at all.
    links = [a['href']
             for a in soup.find_all('a', href=True)
             if self._is_crawlable(a['href'])]
    self.links[url] = links
    for link in links:
      if link in self.previous_links or link in self._queued:
        continue
      self._queued.add(link)
      self.next_links.append(link)

  def get_phrases(self, url, soup):
    """Extract every phrase card of a page into self.phrases.

    Cards without a phrase paragraph or without a phrase id are skipped.
    Missing author, share count or image fall back to '', 0 and ''.
    A phrase that was already stored only gets `url` appended to its
    'url' list.

    Args:
      url: relative URL of the page that was parsed.
      soup: parsed page (BeautifulSoup object).
    """
    for card in soup.find_all('div', 'thought-card'):
      phrase_tag = card.find('p', 'frase')
      if phrase_tag is None or phrase_tag.get('id') is None:
        continue
      id_ = phrase_tag['id']
      # U+2060 (word joiner) is invisible and is dropped from the text.
      phrase_ = phrase_tag.text.replace('\u2060', '')

      author_tag = card.find('span', 'autor')
      anchor = author_tag.a if author_tag is not None else None
      if anchor is not None and anchor.get('href') is not None:
        author_ = anchor.text
        author_url_ = anchor['href']
      else:
        author_ = ''
        author_url_ = ''

      shares_tag = card.find('div', 'total-shares')
      if shares_tag is not None:
        n_shares_ = self._parse_shares(shares_tag.text)
      else:
        n_shares_ = 0

      img_url_ = card.get('data-src', '')

      if id_ not in self.phrases:
        self.phrases[id_] = {
          'phrase': phrase_,
          'phrase_url': f'/frase/{id_}/',
          'author': author_,
          'author_url': author_url_,
          'n_shares': n_shares_,
          'img_url': img_url_,
          'url': [url]
        }
      else:
        self.phrases[id_]['url'].append(url)

  def work(self):
    """Visit queued URLs until the queue is empty.

    Each URL is requested at most once. A page whose request fails is
    reported and skipped so that one network error does not abort the
    whole crawl.
    """
    while self.next_links:
      url = self.next_links.pop()
      if url in self.previous_links:
        # Can happen when the queue was filled from outside get_links.
        continue
      self.previous_links.add(url)
      try:
        page = requests.get(f'{self.home}{url}',
                            timeout=self.REQUEST_TIMEOUT)
      except requests.RequestException as error:
        print(f'skipping {url}: {error}')
        continue
      soup = BeautifulSoup(page.content, 'html.parser')
      self.get_links(url, soup)
      self.get_phrases(url, soup)
      print(f'loading {len(self.previous_links)}:{len(self.next_links)}')

  def save(self):
    """Dump the collected links and phrases to JSON files in the cwd."""
    with open('./pensador-links.json', 'w', encoding='utf-8') as file:
      json.dump(self.links, file)
    with open('./pensador-phrases.json', 'w', encoding='utf-8') as file:
      json.dump(self.phrases, file)

In [None]:
# Crawl the site starting from '/'; work() returns once the link queue is empty.
scraper = PensadorScraper()
scraper.work()

In [None]:
# Show the scraped phrases (a dict keyed by phrase id) as the cell output.
scraper.phrases

In [None]:
# Write the collected data to ./pensador-links.json and ./pensador-phrases.json.
scraper.save()

#### Pensador PostgreSQL DB

In [None]:
# Set DATABASE_URL for this notebook session; the value after `=` is blank
# here and needs to be filled in with the connection URL before connecting.
%env DATABASE_URL=

In [None]:
# Open the database connection from the DATABASE_URL environment variable
# (raises KeyError if the variable is not set) and create a cursor on it.
DATABASE_URL = os.environ['DATABASE_URL']
conn = psycopg2.connect(DATABASE_URL, sslmode='require')
cursor = conn.cursor()

In [None]:
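# TODO: Create tables
# Sketch of a bulk insert (`tup` = iterable of row tuples; on Python 3
# mogrify() returns bytes, so decode each piece before joining):
# args_str = ','.join(cursor.mogrify("(%s,%s,%s,%s,%s,%s,%s,%s,%s)", x).decode() for x in tup)
# cursor.execute("INSERT INTO table VALUES " + args_str)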

In [None]:
# TODO: Add rows