In [2]:
import time
import numpy as np
import pandas as pd

from bs4 import BeautifulSoup
import requests
import random
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.firefox.options import Options

In [60]:
def scroll(driver, timeout):
    """Keep scrolling to the bottom of the page until its height stops changing.

    Parameters
    ----------
    driver : object exposing ``execute_script(js)``
        The browser driver used to run the JavaScript snippets.
    timeout : int or float
        Seconds to sleep after each scroll so newly requested content can load.

    Returns
    -------
    None
    """
    height_script = "return document.body.scrollHeight"

    # Height of the document before any scrolling has happened
    previous_height = driver.execute_script(height_script)

    reached_bottom = False
    while not reached_bottom:
        # Jump to the current bottom of the document
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")

        # Give the page time to append more content
        time.sleep(timeout)

        # An unchanged height means the last scroll loaded nothing new
        current_height = driver.execute_script(height_script)
        reached_bottom = current_height == previous_height
        previous_height = current_height

# Shared Firefox options reused by every driver created in get_quotes().
options = Options()
# NOTE(review): these look like "do not load images" and "disable the Flash
# plugin" preferences, presumably to speed up page loads — confirm against
# the Firefox preference documentation.
for pref_name, pref_value in (
    ('permissions.default.image', 2),
    ('dom.ipc.plugins.enabled.libflashplayer.so', False),
):
    options.set_preference(pref_name, pref_value)


def get_quotes(url):
    """Load a page in Firefox, scroll it to the end, and extract quotes and authors.

    Parameters
    ----------
    url : str
        Address of the page to open.

    Returns
    -------
    pandas.DataFrame
        Two columns: ``quote`` (text wrapped in single quotes) and ``author``
        (text prefixed with one space). The two columns are collected
        independently and paired by position, so if the page yields a
        different number of quote and author anchors the shorter column is
        padded with NaN.

    Notes
    -----
    The browser is always shut down, even when loading or scrolling raises.
    """
    # Set up the driver. This one uses Firefox with the module-level options
    # and an explicit path to the geckodriver binary.
    driver = webdriver.Firefox(options=options, executable_path='/usr/local/bin/geckodriver')
    try:
        # Set a 10-second implicit wait on the driver
        driver.implicitly_wait(10)
        # driver.get(url) opens the page
        driver.get(url)
        # Scroll until the page stops growing, pausing 3 seconds per scroll
        scroll(driver, 3)
        # Once scroll returns, bs4 parses the fully loaded page source
        soup = BeautifulSoup(driver.page_source, 'lxml')
    finally:
        # quit() (rather than close()) ends the whole WebDriver session, and
        # the finally clause guarantees it runs even if a step above failed,
        # so repeated calls do not leave browser/driver processes behind.
        driver.quit()

    # Anchors are identified by their title attribute; the first child of each
    # anchor is taken as its text.
    quotes = pd.Series(
        ["'" + quote.contents[0] + "'" for quote in soup.find_all('a', {'title': 'view quote'})],
        name='quote',
    )
    authors = pd.Series(
        [" " + author.contents[0] for author in soup.find_all('a', {'title': 'view author'})],
        name='author',
    )
    return pd.concat([quotes, authors], axis=1)

In [61]:
# Build a mapping from topic name to the URL of that topic's quote page.
url = 'https://www.brainyquote.com/topics'
response = requests.get(url)
# Fail with a clear HTTP error instead of parsing an error page and hitting an
# obscure "not in list" ValueError further down.
response.raise_for_status()
soup = BeautifulSoup(response.text, 'lxml')
links = [a.get('href') for a in soup.find_all('a')]
# Keep only the run of hrefs from the 'age' topic through the 'work' topic (inclusive)
links = links[links.index('/topics/age-quotes'):(links.index('/topics/work-quotes') + 1)]
full_links = ['http://www.brainyquote.com' + link + '.html' for link in links]
# The topic name is the text between the '/topics/' prefix and the trailing
# '-quotes'. Searching for the first 'q' instead would cut names that contain
# a 'q' themselves (e.g. '/topics/equality-quotes' would give an empty name).
topics = [link[len('/topics/'):link.rindex('-quotes')] for link in links]
topics_links = dict(zip(topics, full_links))

In [65]:
# Topics to scrape; each must be a key of topics_links.
subtopics = ['age', 'alone', 'anger', 'art', 'attitude', 'beauty', 'change', 'courage', 'death', 'dreams', 'experience',
             'failure', 'faith', 'fear', 'forgiveness', 'friendship', 'funny', 'god', 'happiness', 'humor', 'inspirational',
             'knowledge', 'learning', 'life', 'love', 'marriage', 'motivational', 'music', 'nature', 'patience',
             'sad', 'success', 'teacher', 'thankful', 'time', 'wisdom']
url_fetch = [topics_links[u] for u in subtopics]

# Collect one frame per topic and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop re-copies all earlier rows on
# every iteration.
frames = []
for topic, u in zip(subtopics, url_fetch):
    new = get_quotes(u)
    # The topic is already known here, so no reverse lookup through
    # topics_links is needed to recover the label.
    new['label'] = topic
    frames.append(new)

# pd.concat rejects an empty list, so fall back to an empty frame
full = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame()

In [126]:
# Rows whose quote text already appeared in an earlier row
duplicate_mask = full.duplicated(subset=['quote'], keep='first')
print('num of duplicates:', duplicate_mask.sum())

num of duplicates: 1930


In [None]:
# Keep the first occurrence of every quote and drop later repeats. Because
# `full` is assembled topic by topic, a repeated quote keeps the label of the
# first topic it was scraped under. drop_duplicates works on row positions,
# so it stays correct even if index labels were ever repeated.
full_drop = full.drop_duplicates(subset=['quote'], keep='first')
full_drop.to_csv('full_quotes.csv', index=False)