In [1]:
# This script extracts the title, link, and short description of search results on Google 

#import all libraries
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from bs4 import BeautifulSoup as bs
import time, sys, requests, random


# import libs, authorize gspread  
import gspread
from oauth2client.service_account import ServiceAccountCredentials
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
google_key_file = '/Users/zacharywong/Documents/ServiceAccountKey-Secret/pelagic-tracker-338302-eaf0e0e671cb.json'
credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)


In [2]:
# Global variables
liklihoodDenom = 0
spreadsheet_id = '1vFXonFCyUlEKa1f0s5tvHCKeTek_sAv7rUPYfYss0Qo'

In [3]:
def wait():
    waitRun = 1
    maxbackOff = 120
    sleepTime = waitRun + random.uniform(0, 1)
    print('sleepTime: ', sleepTime)
    time.sleep(sleepTime)
    waitRun = waitRun*2
    print('waitRun: ', waitRun)
    if (waitRun >= maxbackOff):
        sys.exit("error: read from sheets quota exceeded")

In [4]:
# Helper function: reads in values from DigitalHealthWebscrape google sheet
# Need spreadsheet ID and the cell address where the value should be read in 
# returns the value 

def readinValue(cellLocation):
    sh = gc.open_by_key(spreadsheet_id)
    worksheet = sh.get_worksheet(0)
    try:
        value = worksheet.acell(cellLocation).value
    except:
        wait()
    return value

In [5]:
def readWebsites():
    links = []
    names = []
    row = 1;
    isDone = False
    while (isDone == False):
        cellLocationURL = 'B' + str(row)
        cellLocationName = 'D' + str(row)
        print("cellLocationURL: " + cellLocationURL, "cellLocationName: " + cellLocationName)
        try: 
            siteURL = readinValue(cellLocationURL)
        except:
            wait()
        links.append(siteURL)
        try: 
            websiteName = readinValue(cellLocationName)
        except:
            wait()
        names.append(websiteName)
        #print(siteURL, websiteName)
        if(siteURL == None and websiteName == None):
            isDone = True
            break
        else:
            row += 1 
    return links, names
    

In [6]:
def readKeyWords():
    keywords = []
    row = 1;
    isDone = False
    while (isDone == False):
        cellLocationKeyWords = 'H' + str(row)
        try:
            keyword = readinValue(cellLocationKeyWords)
        except:
            wait()
        keywords.append(keyword)
        if(keyword == None):
            isDone = True
            break
        else:
            row += 1 
    keywords = keywords[0:len(keywords)-1]
    global liklihoodDenom 
    liklihoodDenom= len(keywords) - 2
    return keywords

In [7]:
def readCalculateLiklihood():
    #print(links, names)
    cellLocationLiklihood = 'F1'
    try:
        calculateLiklihoodInput = readinValue(cellLocationLiklihood)
    except:
        wait()
    if (calculateLiklihoodInput == 'Yes'):
        calculateLiklihood = True
    else:
        calculateLiklihood = False
    return calculateLiklihood

In [8]:
def readInput():
    links, names = readWebsites()
    calculateLiklihood = readCalculateLiklihood()
    keywords = readKeyWords()
    return links, names, calculateLiklihood, keywords

In [9]:
# helper function to extract links from 1 page of results
def extractLinks(soup):
    links = []
    linksTag = 'div'
    linksClass = 'yuRUbf'
    linksAttr = 'href'
    searchLinks = soup.find_all(linksTag, class_ = linksClass)
    for h in searchLinks:
        link = h.a.get(linksAttr)
        links.append(link)
    return links 

In [10]:
def extractTitles(soup):
    titles = [] 
    titlesClass = 'LC20lb MBeuO DKV0Md'
    titlesTag = 'h3'
    searchTitles = soup.find_all(titlesTag, class_= titlesClass)
    for h in searchTitles:
        titles.append(h.text)
    return titles

In [11]:
# helper function to extract texts from 1 page of results
def extractTexts(soup):
    texts = []
    textsClass = 'VwiC3b yXK7lf MUxGbd yDYNvb lyLwlc lEBKkf'
    textsTag = 'div'
    searchText = soup.find_all(textsTag, class_= textsClass)
    for h in searchText:
        fullText = h.text
        try:
            splitText = fullText.split('— ', 1)
            text = splitText[1]
            texts.append(text);
        except:
            texts.append(fullText)
    return texts;

In [12]:
def calculateLiklihoods(links, keywords):
    #keywords = ['personalized', 'personalization', 'machine-learning', 'AI', 'Artificial Intelligence', '24/7', 'democratizing']
    liklihoods = []
    detectedWordsAll = []
    for url in links: 
        detectedWords = []
        output = ''
        count = 0
        liklihood = 0
        ignore = ['[document]', 'label','div','script', 'style', 'img', 'svg', 'ul', 'g', 'footer', 'button', 'clippath', 'nav']


        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        html_page = res.content
        soup = bs(html_page, 'html.parser')
        print("Encoded method :" + url + ": ", soup.original_encoding)
        text = soup.find_all(text=True)
       
        for t in text:
            if t.parent.name not in ignore:
                output += '{} '.format(t)

        # analyze the webpage to detect keywords 
        outputSub = output.split(' ')
        #print(outputSub)
        for word in keywords: 
            if (word in outputSub or word.capitalize() in outputSub):
                #print('"' + word +'"' + ' detected')
                detectedWords.append(word)
                count +=1 
            #else:
                #print(word + ' is not there')
        liklihood = round((count / liklihoodDenom), 2)
        liklihoods.append(liklihood)
        detectedWordsAll.append(detectedWords)
        #detectedWordsAll = ', '.join(str(keyword) for keyword in detectedWordsAll)

    return liklihoods, detectedWordsAll

In [13]:
# set the keyword you want to search for depending on whether link or name is given
# we find the search bar using its name attribute value (q)

def searchGoogle(index, linksInput, namesInput, googleurl, useURL, waitTime, driver):
    print("index: " + str(index))
    print("length of links list: " + str((len(linksInput)-1)))
    siteURL = linksInput[index]
    websiteName = namesInput[index]
    driver.get(googleurl)
    searchBar = driver.find_element(By.NAME, 'q')
    
    # Booleans
    if (siteURL != None):
        useURL = True; 
        useName = False; 
    else:
        useURL = False; 
        useName = True; 

    # first we send our keyword to the search bar followed by the enter # key depending on using URL or website name 

    if (useURL):
        query = "site: " + siteURL
        try:
            searchBar.send_keys(query)
            searchBar.send_keys('\n')
        except Exception as e : 
            WebDriverWait(driver, waitTime).until(EC.presence_of_element_located((By.NAME, 'q')))
            searchBar.send_keys(query)
            searchBar.send_keys('\n')
    else:
        query = websiteName
        searchBar.send_keys(query)
        searchBar.send_keys('\n')

In [14]:
#capture links, header, and text
#pageInfo is a list of dictionaries for each page with keys/value pairs: header, link, text
# extract and load each page of results to pageInfo 
def parseHTML(driver, calculateLiklihood, keywords, maxResult, pageInfo):
    soup = bs(driver.page_source, 'html.parser')
    links = extractLinks(soup);
    texts = extractTexts(soup);
    titles = extractTitles(soup);
    if (calculateLiklihood):
        liklihoods, detectedWordsAll = calculateLiklihoods(links[0:1], keywords)
        #print('detectedWordsAllStringForm: ' + str(detectedWordsAll))
        #detectedWordsAll = ', '.join([str(keyword) for keyword in detectedWordsAll])
        #print("detectedWordsAll String Form: " + detectedWordsAll)
        pageInfo = addToPageInfo(titles, links, texts, maxResult, pageInfo, liklihoods, detectedWordsAll)
    else:
        pageInfo = addToPageInfo(titles, links, texts, maxResult, pageInfo, liklihoods = None, detectedWordsAll = None)

    return pageInfo

In [15]:
# helper function to add 1 page of results to pageInfo list
def addToPageInfo(titles, links, texts, maxResult, pageInfo, liklihoods, detectedWordsAll):
    index = 0;
    while (index < maxResult):
        # create new dictionary of each search results' title, link, and text
        if (liklihoods == None):
            pageInfo.append({"Title": titles[index], "Link": links[index], "About": texts[index]})
        else:
            
            detectedWords = str(detectedWordsAll[index])
            print("detectedWordsString: " + detectedWords)
            pageInfo.append({"Probability": liklihoods[index], "Title": titles[index], "Link": links[index], "About": texts[index],  "Detected Words": detectedWords})
        index += 1
    return pageInfo


In [16]:
def exportCSV(df, pathtoFile):
    # convert pageInfo to pandas dataframe and export as csv 
    df.to_csv(pathtoFile + 'WebScrapeDeliverable.csv')

In [17]:
def updateSpreadSheet(df):
    sh = gc.open_by_key(spreadsheet_id)
    worksheet = sh.get_worksheet(1)
    worksheet.clear()
    worksheet.update([df.columns.values.tolist()] + df.values.tolist())
    

In [18]:
def exportDeliverables(pageInfo, pathtoFile):
    df = pd.DataFrame(pageInfo)
    #print(df.columns.values.tolist())
    #print(df.values.tolist())
    print(pageInfo)
    exportCSV(df, pathtoFile)
    updateSpreadSheet(df)

In [19]:
def run():
    maxResult = 1
    googleurl = 'https://www.google.com/'
    useURL = False
    useName = False
    waitTime = 5
    linksInput, namesInput, calculateLiklihood, keywords = readInput()
    index = 0
    waitRun = 1
    maxbackOff = 120 

  
    # Access chromedriver and determine path 
    driver = webdriver.Chrome(ChromeDriverManager().install())
    pathtoFile = '/Users/zacharywong/github/zacharywong2023/DigitalHealthWebscrape/Deliverables/'

    # list of dictionaries with key/value pairs: title, link, text
    # Contains all information for all search results 
    pageInfo = []

    while (index < len(linksInput)-1):
        
        searchGoogle(index, linksInput, namesInput, googleurl, useURL, waitTime, driver)
        for page in range(0, 1):
            pageInfo = parseHTML(driver, calculateLiklihood, keywords, maxResult, pageInfo)
    #print(pageInfo)
            index += 1
    exportDeliverables(pageInfo, pathtoFile)



In [None]:
# Run the script 
# implement exponential backoff algorithm to prevent exceeding read quota from sheets
startTime = time.time()
run()
endTime = time.time()
timeElapsed = endTime - startTime
print('timeElapsed: ' + timeElapsed)


cellLocationURL: B1 cellLocationName: D1
cellLocationURL: B2 cellLocationName: D2
cellLocationURL: B3 cellLocationName: D3
cellLocationURL: B4 cellLocationName: D4
cellLocationURL: B5 cellLocationName: D5
cellLocationURL: B6 cellLocationName: D6
cellLocationURL: B7 cellLocationName: D7
cellLocationURL: B8 cellLocationName: D8
cellLocationURL: B9 cellLocationName: D9
cellLocationURL: B10 cellLocationName: D10
cellLocationURL: B11 cellLocationName: D11
cellLocationURL: B12 cellLocationName: D12
cellLocationURL: B13 cellLocationName: D13
cellLocationURL: B14 cellLocationName: D14
sleepTime:  1.7611493363163717
waitRun:  2
sleepTime:  1.6107224218701348
waitRun:  2
sleepTime:  1.2796393459549997
waitRun:  2
cellLocationURL: B15 cellLocationName: D15
sleepTime:  1.220690356146924
waitRun:  2
sleepTime:  1.193883294877879
waitRun:  2
cellLocationURL: B16 cellLocationName: D16
sleepTime:  1.2053278217345609
waitRun:  2
sleepTime:  1.0035242093832952
waitRun:  2
cellLocationURL: B17 cellLocati