# <center>Dynamic Web Page Scraping with Selenium</center>

References:
- http://selenium-python.readthedocs.io/getting-started.html

## 1. Why Selenium
- So far, we have learned how to scrape **static** HTML pages using **Requests + BeautifulSoup**
- However, if the web page relies on **javascript or AJAX** to build its content, this combination does not work
  - Web content is loaded **asynchronously**
  - You need to **interact with the page** to get some content loaded, e.g. click "more"
- Example: 'https://www.quora.com/topic/Machine-Learning'

In [None]:
# Exercise 1.1. Scrape quora page

# import requests package
import requests                   

# import BeautifulSoup from package bs4 (i.e. beautifulsoup4)
from bs4 import BeautifulSoup   

# Send a GET request to the web page. The timeout keeps a stalled
# connection from hanging the notebook indefinitely.
page = requests.get("https://www.quora.com/topic/Machine-Learning", timeout=30)

# status_code 200 indicates success;
# any other code (e.g. 403, 404, 5xx) means the page was not served as requested
if page.status_code == 200:

    soup = BeautifulSoup(page.content, 'html.parser')

    # select the preview text span nested inside every answer div
    spans = soup.select("div.Answer div.answer_body_preview div.ui_qtext_truncated_text span.ui_qtext_rendered_qtext")

    print(len(spans))

    for span in spans:
        print(span.get_text())
        print("\n")
else:
    # report the failure instead of silently printing nothing
    print("request failed with status code", page.status_code)

# Note that only the HTML returned by the server is parsed here; anything the
# page builds afterwards with JavaScript/AJAX is not part of page.content.

## 2. Selenium WebDriver
- Selenium WebDriver is one of the most popular tools for Web UI Automation
- Simulates users' actions performed in a web browser
  - click a button
  - fill a form
  - scroll page down or up
  - ...
- Installation:
  - Install Selenium package: 
    - pip install selenium
  - Download a webdriver based on your browser
    - Chrome:	https://sites.google.com/a/chromium.org/chromedriver/downloads
    - Firefox:	https://github.com/mozilla/geckodriver/releases
    - Safari:	https://webkit.org/blog/6900/webdriver-support-in-safari-10/
  - Here we use **Firefox**

In [None]:
# Exercise 2.1. Scrape using Selenium

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions

# Log location setting (defined here but not passed to the driver below)
service_log_path = '../driver'

# Location of the geckodriver executable used to control Firefox
executable_path = '../driver/geckodriver'

# Launch a Firefox browser session through geckodriver
driver = webdriver.Firefox(
    log_path='geckodriver.log',
    executable_path=executable_path,
)

# Attach a wait helper with a 5-second timeout to the driver object;
# creating it does not pause execution by itself
driver.wait = WebDriverWait(driver, 5)

# Load the page in the browser
driver.get('https://www.quora.com/topic/Machine-Learning')

In [None]:
# Exercise 2.2. Select truncated text using Selenium

# Print the truncated preview text of every answer on the page opened by
# the driver, then close the browser.
# The try/finally guarantees the browser is closed even if a lookup fails
# part-way through the loop.
try:
    # get all answer divs
    # (By-based locators work on both old and new Selenium releases; the
    # find_elements_by_* helpers were removed in Selenium 4.3)
    answers = driver.find_elements(By.CSS_SELECTOR, "div.Answer")
    print(len(answers))

    # loop through each answer div to get details
    for answer in answers:

        # get unique id (attribute) of each answer
        answer_id = answer.get_attribute("id")
        print(answer_id)
        print("\n")

        # build a CSS selector for the SPAN element from the answer id and class
        trancated_text_path = "div#" + answer_id + " div.ui_qtext_truncated_text span.ui_qtext_rendered_qtext"
        print(trancated_text_path)

        span = driver.find_element(By.CSS_SELECTOR, trancated_text_path)

        print("truncated text:")
        # get text of SPAN element
        print(span.text)
        print("\n")
finally:
    # always release the browser process
    driver.quit()

In [None]:
# Exercise 2.3. Click "more" to get Full text using Selenium

# WebDriverException is the base class of Selenium's errors (including the
# TimeoutException raised by WebDriverWait.until), so catching it keeps the
# best-effort behaviour without swallowing KeyboardInterrupt or real bugs.
from selenium.common.exceptions import WebDriverException

driver = webdriver.Firefox(executable_path=executable_path, log_path='geckodriver.log', timeout=500)

# The try/finally guarantees the browser is closed even if something fails
# outside the per-answer error handling below.
try:
    # Attach a wait helper with a 5-second timeout to the driver object
    driver.wait = WebDriverWait(driver, 5)

    # send a request
    driver.get('https://www.quora.com/topic/Machine-Learning')

    # get all answer divs
    # (By-based locators work on both old and new Selenium releases; the
    # find_elements_by_* helpers were removed in Selenium 4.3)
    answers = driver.find_elements(By.CSS_SELECTOR, "div.Answer")
    print(len(answers))

    # loop through each answer div to get details
    for answer in answers:

        # get unique id (attribute) of each answer
        answer_id = answer.get_attribute("id")
        print(answer_id)
        print("\n")

        # select SPAN element by CSS selector
        # CSS selector is based on answer ID and class
        trancated_text_path = "div#" + answer_id + " div.ui_qtext_truncated_text span.ui_qtext_rendered_qtext"
        print(trancated_text_path)

        span = driver.find_element(By.CSS_SELECTOR, trancated_text_path)

        print("truncated text:")
        # get text of SPAN element
        print(span.text)
        print("\n")

        # CSS selector of the "more" link inside this answer
        more_path = 'div#' + answer_id + " a.ui_qtext_more_link"

        try:
            # Wait up to 10 seconds for the link to become clickable;
            # a TimeoutException is raised if it never does.
            more_link = WebDriverWait(driver, 10).until(
                expected_conditions.element_to_be_clickable((By.CSS_SELECTOR, more_path)))

            # click the link
            more_link.click()

            # Select the SPAN element again by CSS selector.
            # Note the path is different (expanded instead of truncated);
            # the same answer id is used here.
            expanded_text_path = "div#" + answer_id + " div.ui_qtext_expanded span.ui_qtext_rendered_qtext"
            print(expanded_text_path)

            # Wait up to 10 seconds for the expanded text to appear in the DOM
            span = WebDriverWait(driver, 10).until(
                expected_conditions.presence_of_element_located((By.CSS_SELECTOR, expanded_text_path)))

            print("full text:")
            print(span.text)
            print("\n")
        except WebDriverException:
            # best effort: skip this answer and continue with the next one
            print("something wrong")
finally:
    # always release the browser process
    driver.quit()


In [None]:
# Exercise 2.4. Scroll down to get more

from selenium.webdriver.common.keys import Keys
import time

driver = webdriver.Firefox(executable_path=executable_path, log_path='geckodriver.log')

# The try/finally guarantees the browser is closed even if a step fails.
try:
    # Attach a wait helper with a 5-second timeout to the driver object
    driver.wait = WebDriverWait(driver, 5)

    # send a request
    driver.get('https://www.quora.com/topic/Machine-Learning')

    # get all answer divs
    # (By-based locators work on both old and new Selenium releases; the
    # find_elements_by_* helpers were removed in Selenium 4.3)
    answers = driver.find_elements(By.CSS_SELECTOR, "div.Answer")
    print(len(answers))

    # get body tag; key presses are sent to it to scroll the page
    body = driver.find_element(By.CSS_SELECTOR, 'body')

    for i in range(10):

        # page down
        body.send_keys(Keys.PAGE_DOWN)

        # sleep to give newly requested content time to load
        time.sleep(2)

        # count the answer divs again after scrolling
        answers = driver.find_elements(By.CSS_SELECTOR, "div.Answer")

        print(len(answers))

    # take a screenshot of the page
    driver.save_screenshot('screenshot.png')
finally:
    # always release the browser process
    driver.quit()

