In [1]:
# This script extracts the title, link, and short description of search results on Google 

#import all libraries
import pandas as pd

from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By


from bs4 import BeautifulSoup as bs
import requests
import time


# import libs, authorize gspread  
import gspread
from oauth2client.service_account import ServiceAccountCredentials
scope = [
   'https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
google_key_file = '/Users/zacharywong/Documents/ServiceAccountKey-Secret/pelagic-tracker-338302-eaf0e0e671cb.json'
credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)

spreadsheet_id = '1vFXonFCyUlEKa1f0s5tvHCKeTek_sAv7rUPYfYss0Qo'

In [2]:
# Global variables

liklihoodDenom = 0

In [3]:
# helper function to extract links from 1 page of results
def extractLinks(soup):
    links = []
    searchLinks = soup.find_all('div', class_ = 'yuRUbf')
    for h in searchLinks:
        link = h.a.get('href')
        links.append(link)
    return links 

In [4]:
def extractTitles(soup):
    titles = [] 
    searchTitles = soup.find_all('h3', class_='LC20lb MBeuO DKV0Md')
    for h in searchTitles:
        titles.append(h.text)
    return titles

In [5]:
# helper function to extract texts from 1 page of results
def extractTexts(soup):
    texts = []
    searchText = soup.find_all('div', class_='VwiC3b yXK7lf MUxGbd yDYNvb lyLwlc lEBKkf')
    for h in searchText:
        fullText = h.text
        try:
            splitText = fullText.split('— ', 1)
            text = splitText[1]
            texts.append(text);
        except:
            texts.append(fullText)
    return texts;

In [6]:
def calculateLiklihoods(links, keywords):
    #keywords = ['personalized', 'personalization', 'machine-learning', 'AI', 'Artificial Intelligence', '24/7', 'democratizing']
    liklihoods = []
    detectedWordsAll = []
    for url in links: 
        detectedWords = []
        output = ''
        count = 0
        liklihood = 0

        res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'})
        html_page = res.content
        soup = bs(html_page, 'html.parser')
        text = soup.find_all(text=True)

        #ignore = ['[document]', 'a', 'article', 'label','div','script', 'style', 'title', 'img', 'svg', 'ul', 'g', 'footer', 'button', 'clippath', 'nav']
        ignore1 = ['[document]', 'label','div','script', 'style', 'img', 'svg', 'ul', 'g', 'footer', 'button', 'clippath', 'nav']
       
        for t in text:
            if t.parent.name not in ignore1:
                output += '{} '.format(t)

        # analyze the webpage to detect keywords 
        outputSub = output.split(' ')
        #print(outputSub)
        for word in keywords: 
            if (word in outputSub or word.capitalize() in outputSub):
                #print('"' + word +'"' + ' detected')
                detectedWords.append(word)
                count +=1 
            #else:
                #print(word + ' is not there')
        liklihood = round((count / liklihoodDenom), 2)
        liklihoods.append(liklihood)
        detectedWordsAll.append(detectedWords)
        #detectedWordsAll = ', '.join(str(keyword) for keyword in detectedWordsAll)

    return liklihoods, detectedWordsAll

In [7]:
# helper function to add 1 page of results to pageInfo list
def addToPageInfo(titles, links, texts, liklihoods, detectedWordsAll):
    index = 0;
    while (index < maxResult):
        # create new dictionary of each search results' title, link, and text
        if (liklihoods == None):
            pageInfo.append({"Title": titles[index], "Link": links[index], "About": texts[index]})
        else:
            
            detectedWords = str(detectedWordsAll[index])
            print("detectedWordsString: " + detectedWords)
            pageInfo.append({"Liklihood": liklihoods[index], "Title": titles[index], "Link": links[index], "About": texts[index],  "DetectedWords": detectedWords})
        index += 1


In [8]:
def exportCSV(df):
    # convert pageInfo to pandas dataframe and export as csv 
    df.to_csv(pathToFile + 'WebScrapeDeliverable.csv')

In [9]:
# Helper function: reads in values from DigitalHealthWebscrape google sheet
# Need spreadsheet ID and the cell address where the value should be read in 
# returns the value 

def readinValue(cellLocation):
    sh = gc.open_by_key(spreadsheet_id)
    worksheet = sh.get_worksheet(0)
    value = worksheet.acell(cellLocation).value
    return value

In [10]:
def readWebsites():
    links = []
    names = []
    row = 1;
    isDone = False
    while (isDone == False):
        cellLocationURL = 'B' + str(row)
        cellLocationName = 'D' + str(row)
        print("cellLocationURL: " + cellLocationURL, "cellLocationName: " + cellLocationName)
        siteURL = readinValue(cellLocationURL)
        links.append(siteURL)
        websiteName = readinValue(cellLocationName)
        names.append(websiteName)
        #print(siteURL, websiteName)
        if(siteURL == None and websiteName == None):
            isDone = True
            break
        else:
            row += 1 
    return links, names
    

In [11]:
def readCalculateLiklihood():
    #print(links, names)
    cellLocationLiklihood = 'F1'
    calculateLiklihoodInput = readinValue(cellLocationLiklihood)
    if (calculateLiklihoodInput == 'Yes'):
        calculateLiklihood = True
    else:
        calculateLiklihood = False
    return calculateLiklihood

In [12]:
def readKeyWords():
    keywords = []
    row = 1;
    isDone = False
    while (isDone == False):
        cellLocationKeyWords = 'H' + str(row)
        keyword = readinValue(cellLocationKeyWords)
        keywords.append(keyword)
        if(keyword == None):
            isDone = True
            break
        else:
            row += 1 
    keywords = keywords[0:len(keywords)-1]
    global liklihoodDenom 
    liklihoodDenom= len(keywords) - 2
    return keywords

In [13]:
def readInput():
    links, names = readWebsites()
    calculateLiklihood = readCalculateLiklihood()
    keywords = readKeyWords()
    return links, names, calculateLiklihood, keywords

In [14]:
def updateSpreadSheet(df):
    sh = gc.open_by_key(spreadsheet_id)
    worksheet = sh.get_worksheet(1)
    worksheet.update([df.columns.values.tolist()] + df.values.tolist())
    

In [None]:
linksInput, namesInput, calculateLiklihood, keywords = readInput()
print("links: " + str(linksInput))
print("names: " + str(namesInput))
print("calculateLiklihood: " + str(calculateLiklihood))
print("keywords: " + str(keywords))
# input for keyword, number of pages to scrape, and website URL 
# only want top result so only search for 1 result 
#nPages = 1
maxResult = 1
googleurl = 'https://www.google.com/'
useURL = False
useName = False
index = 0
waitTime = 5

# Access chromedriver and determine path 
driver = webdriver.Chrome(ChromeDriverManager().install())
pathToFile = '/Users/zacharywong/github/zacharywong2023/DigitalHealthWebscrape/Deliverables/'

# list of dictionaries with key/value pairs: title, link, text
# Contains all information for all search results 
pageInfo = []


# set the keyword you want to search for depending on whether link or name is given
# we find the search bar using it's name attribute value


while (index < len(linksInput)-1):
    print("index: " + str(index))
    print("length of links list: " + str((len(linksInput)-1)))
    siteURL = linksInput[index]
    websiteName = namesInput[index]
    driver.get(googleurl)
    searchBar = driver.find_element_by_name('q')
    
    # Booleans
    if (siteURL != None):
        useURL = True; 
        useName = False; 
    else:
        useURL = False; 
        useName = True; 

    # first we send our keyword to the search bar followed by the enter # key depending on using URL or website name 

    if (useURL):
        query = "site: " + siteURL
        try:
            searchBar.send_keys(query)
            searchBar.send_keys('\n')
        except Exception as e : 
            WebDriverWait(driver, waitTime).until(EC.presence_of_element_located((By.NAME, 'q')))
            searchBar.send_keys(query)
            searchBar.send_keys('\n')
    else:
        query = websiteName
        searchBar.send_keys(query)
        searchBar.send_keys('\n')

  

    #capture links, header, and text
    #pageInfo is a list of dictionaries for each page with keys/value pairs: header, link, text
    # extract and load each page of results to pageInfo 

    for page in range(0, 1):
        soup = bs(driver.page_source, 'html.parser')
        links = extractLinks(soup);
        texts = extractTexts(soup);
        titles = extractTitles(soup);
        if (calculateLiklihood):
            liklihoods, detectedWordsAll = calculateLiklihoods(links, keywords)
            #print('detectedWordsAllStringForm: ' + str(detectedWordsAll))
            #detectedWordsAll = ', '.join([str(keyword) for keyword in detectedWordsAll])
            #print("detectedWordsAll String Form: " + detectedWordsAll)
            addToPageInfo(titles, links, texts, liklihoods, detectedWordsAll)
        else:
            addToPageInfo(titles, links, texts, liklihoods = None, detectedWordsAll = None)
    #print(pageInfo)
    index += 1

df = pd.DataFrame(pageInfo)
#print(df.columns.values.tolist())
#print(df.values.tolist())
print(pageInfo)
exportCSV(df) 
updateSpreadSheet(df)

cellLocationURL: B1 cellLocationName: D1
cellLocationURL: B2 cellLocationName: D2
cellLocationURL: B3 cellLocationName: D3
cellLocationURL: B4 cellLocationName: D4




Current google-chrome version is 98.0.4758
Get LATEST chromedriver version for 98.0.4758 google-chrome


links: ['https://www.babylonhealth.com/en-us', 'https://www.concertai.com/predictivepatient/', 'https://kaiahealth.com/', None]
names: ['Babylon Health', 'Predictive Patient', 'Kaia Health', None]
calculateLiklihood: True
keywords: ['AI', 'personalized', 'artificial intelligence', 'democratize', '24/7', 'personalization', 'machine-learning']


Driver [/Users/zacharywong/.wdm/drivers/chromedriver/mac64/98.0.4758.102/chromedriver] found in cache
  driver = webdriver.Chrome(ChromeDriverManager().install())


index: 0
length of links list: 3


  searchBar = driver.find_element_by_name('q')


detectedWordsString: ['AI', 'personalized', '24/7']
index: 1
length of links list: 3
