#### Libraries

In [None]:
import os
import json
from datetime import date

import requests
import psycopg2
from bs4 import BeautifulSoup
from dotenv import load_dotenv
from IPython.display import clear_output

In [None]:
# Mount Google Drive at /content/drive; `google.colab` is only importable
# inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

#### Pensador Scraper

In [None]:
class PensadorScraper:
  """Crawler for pensador.com that collects page links and phrase cards.

  Attributes:
    links: maps each crawled URL to the sorted list of internal URLs
      found on that page.
    phrases: maps a phrase id to a dict holding its text, author, share
      count, image URL and the URLs of the pages where it was seen.
    next_links: stack of absolute URLs still waiting to be crawled.
    previous_links: set of absolute URLs already taken from the stack.
    home: site root, without a trailing slash.
  """

  # hrefs containing any of these substrings are not followed.
  _SKIPPED_FRAGMENTS = ('.php', '/frase', '/colecao')

  def __init__(self):
    self.links = {}
    self.phrases = {}
    self.home = 'https://www.pensador.com'
    # Seed with an absolute URL: requests rejects a schemeless '/'.
    self.next_links = [self.home + '/']
    self.previous_links = set()

  def get_links(self, url, soup):
    """Record the internal links of a page and queue the unvisited ones.

    Args:
      url: absolute URL of the page `soup` was parsed from.
      soup: parsed page; only `soup.find_all('a')` is used.
    """
    found = set()
    for anchor in soup.find_all('a'):
      href = anchor.get('href')  # None for anchors without an href
      # Keep site-relative paths only; '//host/...' is protocol-relative.
      if not href or not href.startswith('/') or href.startswith('//'):
        continue
      if any(fragment in href for fragment in self._SKIPPED_FRAGMENTS):
        continue
      # href keeps its leading '/', so it joins cleanly with `home`.
      found.add(self.home + href)

    # Sorted so the stored lists and crawl order are reproducible.
    self.links[url] = sorted(found)
    pending = found - self.previous_links - set(self.next_links)
    self.next_links.extend(sorted(pending))

  @staticmethod
  def _parse_shares(text):
    """Convert a share label such as '2 mil compartilhamentos' to a number.

    Returns 0 for an empty label; raises ValueError when the remaining
    text is not a number.
    """
    count = text.replace(' compartilhamentos', '').strip()
    if not count:
      return 0
    if count.endswith(' mil'):
      return float(count[:-len(' mil')]) * 1000
    return float(count)

  def get_phrases(self, url, soup):
    """Extract every phrase card of a page into `self.phrases`.

    Cards without a phrase paragraph or phrase id are skipped. Missing
    author, share count or image fall back to '' / 0. A phrase seen on
    several pages keeps one entry whose 'urls' list grows.

    Args:
      url: absolute URL of the page `soup` was parsed from.
      soup: parsed page searched for 'thought-card' divs.
    """
    for card in soup.find_all('div', 'thought-card'):
      phrase_tag = card.find('p', 'frase')
      try:
        id_ = phrase_tag['id']
        phrase_ = phrase_tag.text.strip().replace('\u2060', '')
      except (TypeError, KeyError, AttributeError):
        # No phrase paragraph, or it has no id: nothing to index.
        continue

      try:
        author_link = card.find('span', 'autor').a
        author_ = author_link.text
        author_url_ = author_link['href']
      except (TypeError, KeyError, AttributeError):
        author_ = ''
        author_url_ = ''

      try:
        shares_tag = card.find('div', class_='total-shares')
        n_shares_ = self._parse_shares(shares_tag.text)
      except (AttributeError, ValueError):
        n_shares_ = 0

      img_url_ = card.get('data-src', '')

      if id_ not in self.phrases:
        self.phrases[id_] = {
          'phrase': phrase_,
          'phrase_url': f'/frase/{id_}/',
          'author': author_,
          'author_url': author_url_,
          'n_shares': n_shares_,
          'img_url': img_url_,
          'urls': [url]
        }
      elif url not in self.phrases[id_]['urls']:
        self.phrases[id_]['urls'].append(url)

  def work(self, timeout=30):
    """Crawl until the queue of pending URLs is empty.

    Each URL is fetched at most once; request failures are printed and
    the crawl moves on to the next URL.

    Args:
      timeout: seconds passed to `requests.get` so a stalled server
        cannot hang the crawl.
    """
    while self.next_links:
      url = self.next_links.pop()
      if url in self.previous_links:
        continue
      # Mark before fetching so a failing URL is not retried forever.
      self.previous_links.add(url)

      try:
        page = requests.get(url, timeout=timeout)
        page.raise_for_status()
      except requests.exceptions.HTTPError as errh:
        print('HTTP Error\n', errh)
      except requests.exceptions.ConnectionError as errc:
        print('Error Connecting:\n', errc)
      except requests.exceptions.Timeout as errt:
        print('Timeout Error:\n', errt)
      except requests.exceptions.RequestException as err:
        print('An unexpected error:\n', err)
      else:
        soup = BeautifulSoup(page.content, 'html.parser')

        self.get_links(url, soup)
        self.get_phrases(url, soup)

        prev_size = len(self.previous_links)
        next_size = len(self.next_links)
        print(url, f'{prev_size}:{next_size}')
        clear_output(wait=True)

  def load(self, date_):
    """Restore `links` and `phrases` from the JSON files written by `save`.

    Args:
      date_: date suffix of the files, e.g. '2021-01-15'.
    """
    links_path = f'./pensador-links-{date_}.json'
    with open(links_path, 'r', encoding='utf-8') as file:
      self.links = json.load(file)
    phrases_path = f'./pensador-phrases-{date_}.json'
    with open(phrases_path, 'r', encoding='utf-8') as file:
      self.phrases = json.load(file)

  def save(self):
    """Write `links` and `phrases` to JSON files suffixed with today's date."""
    # Read the date once so both files always share the same suffix.
    today = date.today()
    links_path = f'./pensador-links-{today}.json'
    with open(links_path, 'w', encoding='utf-8') as file:
      json.dump(self.links, file)
    phrases_path = f'./pensador-phrases-{today}.json'
    with open(phrases_path, 'w', encoding='utf-8') as file:
      json.dump(self.phrases, file)

In [None]:
# Run a full crawl from the scraper's initial queue, then dump the collected
# links and phrases to date-stamped JSON files in the working directory.
scraper = PensadorScraper()
scraper.work()
scraper.save()

#### Pensador JSON Dict to SQL Row Converter

In [None]:
# Start from a fresh scraper and load the JSON dumps whose file names carry
# the 2021-01-15 date suffix.
scraper = PensadorScraper()
scraper.load('2021-01-15')

In [None]:
# One (page_url, 'link1|link2|...') row per crawled page.
links = [
  (page_url, '|'.join(targets))
  for page_url, targets in scraper.links.items()
]
links

In [None]:
# One row per phrase: the phrase id followed by its fields, in the order
# used by the positional INSERT further down.
phrases = [
  (
    phrase_id,
    entry['phrase'],
    entry['phrase_url'],
    entry['author'],
    entry['author_url'],
    entry['img_url'],
    entry['n_shares'],
    entry['urls'],
  )
  for phrase_id, entry in scraper.phrases.items()
]
phrases

#### Pensador PostgreSQL DB

In [None]:
# references at
# www.postgresqltutorial.com/
# www.postgresqltutorial.com/postgresql-array/
# www.postgresqltutorial.com/postgresql-char-varchar-text/

In [None]:
# load environment variables from ../.env (presumably where DATABASE_URL,
# read by the database cells below, is defined -- confirm)
load_dotenv(dotenv_path='../.env')

In [None]:
# Create the links and phrases tables.
# Column order mirrors the row tuples built in the converter cells, because
# the insert cell fills the tables positionally (INSERT ... VALUES without a
# column list).

conn = None
try:
  DATABASE_URL = os.environ['DATABASE_URL']
  conn = psycopg2.connect(DATABASE_URL, sslmode='require')
  # The cursor context manager closes the cursor even if a statement fails.
  with conn.cursor() as cur:
    # create link table: page URL plus its '|'-joined outgoing links
    cur.execute('''
      CREATE TABLE IF NOT EXISTS links (
        url TEXT PRIMARY KEY,
        links TEXT NOT NULL
      )
    ''')

    # create phrase table: urls is an array because each row carries a
    # Python list of the pages where the phrase was seen
    cur.execute('''
      CREATE TABLE IF NOT EXISTS phrases (
        id TEXT PRIMARY KEY,
        phrase TEXT NOT NULL,
        phrase_url TEXT NOT NULL,
        author TEXT NOT NULL,
        author_url TEXT NOT NULL,
        img_url TEXT NOT NULL,
        n_shares DOUBLE PRECISION NOT NULL,
        urls TEXT[] NOT NULL
      )
    ''')
  conn.commit()
except (Exception, psycopg2.DatabaseError) as err:
  print(err)
finally:
  if conn is not None:
    conn.close()

In [None]:
# Insert the converted rows into the links and phrases tables.
# Rows are inserted positionally, so each tuple must follow the column order
# of its table.

conn = None
try:
  DATABASE_URL = os.environ['DATABASE_URL']
  conn = psycopg2.connect(DATABASE_URL, sslmode='require')
  # The cursor context manager closes the cursor even if a statement fails.
  with conn.cursor() as cur:
    # (table name, per-row placeholder, rows)
    batches = (
      (b'links', '(%s, %s)', links),
      (b'phrases', '(%s, %s, %s, %s, %s, %s, %s, %s)', phrases),
    )
    for table, placeholder, rows in batches:
      if not rows:
        continue  # "INSERT ... VALUES" with no tuples is invalid SQL
      # mogrify returns bytes, so the whole statement is built as bytes;
      # it is executed without parameters, so '%' in the data is literal.
      values = b','.join(cur.mogrify(placeholder, row) for row in rows)
      cur.execute(b'INSERT INTO ' + table + b' VALUES ' + values)
  conn.commit()
except (Exception, psycopg2.DatabaseError) as err:
  print(err)
finally:
  if conn is not None:
    conn.close()

In [None]:
# TODO: Query random row from phrases table

In [None]:
# TODO: Transform random phrase