In [1]:
import platform
import logging
from tqdm import tqdm 
from time import sleep
import pandas as pd

from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.action_chains import ActionChains
from selenium.common.exceptions import NoSuchElementException, ElementClickInterceptedException

In [41]:
def sleepBar(seconds):
    """Pause for ``seconds`` seconds, one second at a time, behind a tqdm progress bar."""
    ticks = tqdm(range(seconds))
    for _tick in ticks:
        sleep(1)


def hideGBar(driver=None):
    """
    Hide the Google search form so it cannot intercept clicks on the page.

    Parameters
    ----------
    driver : optional
        Object exposing ``execute_script`` (a Selenium WebDriver). When omitted,
        a module-level ``browser`` variable is used if one exists.

    This is best effort: if no driver is available or the script fails, the
    problem is logged at debug level and the function returns normally.
    """
    if driver is None:
        # Backward compatibility with the zero-argument call, which relied on
        # a global named ``browser``; look it up without raising NameError.
        driver = globals().get("browser")
    if driver is None:
        logging.debug("hideGBar: no browser available, nothing to hide")
        return
    try:
        driver.execute_script('document.getElementById("searchform").style.display = "none";')
    except Exception as exc:
        # The element may be missing on some result pages; hiding it is optional.
        logging.debug("hideGBar: could not hide search form: %s", exc)

        
def initBrowser(headless=False):
    """
    Create a Chrome WebDriver using the chromedriver binary bundled under ``driver/``.

    The binary is chosen from ``platform.system()``: the Windows build when the
    name contains "Windows", the Linux build when it contains "Linux", and the
    macOS build otherwise. Pass ``headless=True`` to add the ``headless`` flag.
    """
    system_name = platform.system()
    chrome_path = "driver/chromedriver_mac"
    for marker, candidate in (
        ("Windows", "driver/chromedriver.exe"),
        ("Linux", "driver/chromedriver_linux"),
    ):
        if marker in system_name:
            chrome_path = candidate
            break

    flags = ["--disable-features=NetworkService"]
    if headless:
        flags.append('headless')

    chrome_options = Options()
    for flag in flags:
        chrome_options.add_argument(flag)
    return webdriver.Chrome(options=chrome_options, executable_path=chrome_path)



def clickNTimes(el, n=1):
    """
    Click ``el`` ``n`` times, waiting one second after every click.
    """
    for _attempt in range(n):
        el.click()
        # Pause after each click before the next one (or before returning).
        sleep(1)
        
def getAnswerText(el):
    """
    Return the text of the second descendant ``div`` of ``el`` whose class
    contains ``mod``.

    Raises ``IndexError`` when fewer than two such divs are found.
    """
    mod_divs = el.find_elements_by_xpath(".//div[contains(@class,'mod')]")
    answer_div = mod_divs[1]
    return answer_div.text
    
    
def newSearch(browser, query, lang="en"):
    """
    Run a Google search for ``query`` and return the "People also ask" elements.

    Parameters
    ----------
    browser : Selenium WebDriver
        Driver used to load the page and locate elements.
    query : str
        Text typed into the search box.
    lang : str
        Interface language, ``"en"`` (default) or ``"es"``. It selects the
        ``hl`` URL parameter and the aria-labels used to find the controls.

    Returns
    -------
    list
        Elements with class ``related-question-pair``; empty if none are
        found or the lookup fails.

    Raises
    ------
    ValueError
        If ``lang`` is not supported. Raised before the browser is touched.
    """
    # (home URL, search-box XPath, search-button XPath) per supported language.
    locales = {
        "en": (
            "https://www.google.com?hl=en",
            "//input[@aria-label='Search']",
            "//input[@aria-label='Google Search']",
        ),
        "es": (
            "https://www.google.com?hl=es",
            "//input[@aria-label='Buscar']",
            "//input[@aria-label='Buscar con Google']",
        ),
    }
    if lang not in locales:
        raise ValueError("Only English and Spanish are supported by this script for now.")
    home_url, box_xpath, button_xpath = locales[lang]

    browser.get(home_url)
    searchbox = browser.find_element_by_xpath(box_xpath)
    searchbox.send_keys(query)
    sleep(1)

    # Try the last matching button first, then the first one.
    searchbtn = browser.find_elements_by_xpath(button_xpath)
    submitted = False
    for button in searchbtn[-1:] + searchbtn[:1]:
        try:
            button.click()
            submitted = True
            break
        except Exception as exc:
            logging.debug("newSearch: search button click failed: %s", exc)
    if not submitted:
        # No clickable button was found; submit from the search box instead.
        searchbox.send_keys(Keys.RETURN)
    sleep(1)

    try:
        paa = browser.find_elements_by_class_name("related-question-pair")
    except Exception as exc:
        logging.debug("newSearch: could not read related questions: %s", exc)
        paa = []

    # Best effort: hide the search form so it cannot intercept later clicks.
    try:
        browser.execute_script('document.getElementById("searchform").style.display = "none";')
    except Exception as exc:
        logging.debug("newSearch: could not hide search form: %s", exc)
    return paa


def crawlQuestions(start_paa, paa_list=None):
    """
    Expand each "People also ask" element and collect its question and answer.

    Parameters
    ----------
    start_paa : iterable
        Question-pair elements, as returned by ``newSearch``.
    paa_list : list, optional
        Accumulator of ``(question, answer)`` tuples. When given, it is
        extended in place and returned; when omitted, a fresh list is created
        for this call.

    Returns
    -------
    tuple
        ``(new_questions, paa_list)``: the question texts collected by this
        call, and the accumulator including them.
    """
    # A mutable default would be shared between calls and leak results from
    # one crawl into the next, so the list is created per call instead.
    if paa_list is None:
        paa_list = []
    new_questions = []
    for el in start_paa:
        q = el.find_element_by_xpath(".//div[contains(@class,'match-mod-horizontal-padding')]")
        # Entries whose text mentions "Dictionary" are skipped.
        if "Dictionary" in q.text:
            continue
        clickNTimes(q)
        answer_text = getAnswerText(el)
        # Read the question text after the click, as the stored value.
        question_text = q.text
        paa_list.append((question_text, answer_text))
        new_questions.append(question_text)
    return new_questions, paa_list


def saveToCSV(job_title, faqs):
    """
    Write ``(question, answer)`` pairs to ``<job_title>.csv`` in the working directory.

    Spaces and ``/`` in ``job_title`` are replaced with ``_`` to build the
    file name. The CSV has the columns ``rank``, ``question`` and ``answer``,
    where ``rank`` is the 0-based position of the pair in ``faqs``. An empty
    ``faqs`` produces a header-only file.
    """
    outfile = job_title.replace(" ", "_").replace("/", "_") + '.csv'
    # Naming the data columns up front keeps the frame three columns wide even
    # when ``faqs`` is empty; assigning three names to the one-column frame
    # built from an empty list would raise ValueError.
    df = (
        pd.DataFrame(faqs, columns=['question', 'answer'])
        .rename_axis('rank')
        .reset_index()
    )
    df.to_csv(outfile, index=False)
    
    
def main(job_title, threshold=20):
    """
    Collect "People also ask" question/answer pairs for ``job_title``.

    Starting from ``job_title``, each search's questions are queued as
    follow-up queries until at least ``threshold`` pairs are collected, the
    queue is empty, or a search returns no questions. The pairs are written
    with ``saveToCSV`` and returned.

    Parameters
    ----------
    job_title : str
        Initial search query; also used to name the output CSV.
    threshold : int
        Stop issuing new searches once this many pairs are collected.

    Returns
    -------
    list
        The collected ``(question, answer)`` tuples.
    """
    browser = initBrowser(False)
    query_queue = [job_title]
    faqs = []
    try:
        while query_queue and len(faqs) < threshold:
            query = query_queue.pop(0)
            start_paa = newSearch(browser, query)
            if not start_paa:
                print("No PAA found!")
                break
            new_questions, faqs = crawlQuestions(start_paa, faqs)
            query_queue.extend(new_questions)
    finally:
        # Always end the session, even if a search or crawl step raised.
        # quit() is used instead of close() so the driver process is shut
        # down too, not only the current window.
        browser.quit()

    saveToCSV(job_title, faqs)
    print(job_title, "Done")
    return faqs

In [43]:
# Scrape FAQs for every job title and combine them into one CSV.
titles = ['real estate agent', 'police officer', 'phlebotomist', 'substitute teacher', 'babysitter/nanny']
output_columns = ['job_title', 'rank', 'question', 'answer']

dfs = []
for title in titles:
    faqs = main(title)
    if not faqs:
        # Nothing collected for this title; leave it out of the combined table.
        continue
    df = (
        pd.DataFrame(faqs, columns=['question', 'answer'])
        .rename_axis('rank')
        .reset_index()
    )
    df['job_title'] = title
    dfs.append(df[output_columns])

# pd.concat raises on an empty list, so fall back to an empty frame with the
# expected columns. ignore_index gives the combined table unique row labels
# instead of repeating 0..n for every title.
if dfs:
    df_out = pd.concat(dfs, ignore_index=True)
else:
    df_out = pd.DataFrame(columns=output_columns)
df_out.to_csv('faqs_5titles.csv', index=False)

real estate agent Done
police officer Done
phlebotomist Done
substitute teacher Done
babysitter/nanny Done


In [44]:
# Preview the first rows of the combined FAQ table.
df_out.head()

Unnamed: 0,job_title,rank,question,answer
0,real estate agent,0,How do you become a real estate agent?,7 Steps Toward Getting Your Texas Sales Agent ...
1,real estate agent,1,Can you make good money as a real estate agent?,Newer real estate agents will likely sell even...
2,real estate agent,2,Is there a difference between a realtor and a ...,Not every real estate agent is a REALTOR® and ...
3,real estate agent,3,How much does it cost to go to real estate sch...,The licensing fee for a Florida Real Estate Sa...
4,real estate agent,4,How long does it take to become a real estate ...,You also don't need to have a bachelor's degre...
