In [21]:
# This script extracts the title, link, short description, and determines number of keywords detected in each webpage

#import all libraries
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.service import Service
from bs4 import BeautifulSoup as bs
import time, sys, requests, random


# import and authorize gspread  
import gspread
from oauth2client.service_account import ServiceAccountCredentials
# OAuth scopes requested for the service account (Sheets feed and Drive).
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
# Path to the service-account JSON key file.
# NOTE(review): absolute, machine-specific path; must be edited to run elsewhere.
google_key_file = '/Users/zacharywong/Documents/ServiceAccountKey-Secret/pelagic-tracker-338302-eaf0e0e671cb.json'
# Build credentials from the key file and create the gspread client `gc`,
# which readinValue() and updateSpreadSheet() use to open the spreadsheet.
credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)


In [22]:
# Global variables
# Header of the CSV column that readCompanies() loads the company names from.
columnName = 'Company Names (Leave Blank if Using LinkedIn Company Mining)'

# paths/baseurls
# NOTE(review): the filesystem paths below are absolute and machine-specific.
# Key of the Google spreadsheet opened by readinValue() and updateSpreadSheet().
spreadsheet_id = '1vFXonFCyUlEKa1f0s5tvHCKeTek_sAv7rUPYfYss0Qo'
# CSV file read by readCompanies().
companyFilePath = '/Users/zacharywong/github/zacharywong2023/DigitalHealthWebscrape/Misc/Companies/Company Web Scrape Tool - Company Names.csv'
# chromedriver executable handed to selenium's Service in runExtraction().
driver_path = '/Users/zacharywong/Documents/Work/Portfolio/DigitalHealthWebscrape/chromedriver'
# Output directory; exportCSV() concatenates it directly with the file name,
# so the trailing slash is required.
pathtoFile = '/Users/zacharywong/github/zacharywong2023/DigitalHealthWebscrape/CSV Files/'
# Page loaded by searchGoogle() before typing the query.
googleurl = 'https://www.google.com/'

# floats/ints
# Subtracted from len(keywords) to form the denominator in calculateLiklihoods().
adjustDenominator = 2
# wait() exits the program once the doubled waitRun reaches this value.
maxbackOff = 120
# Number of search results per query that addToPageInfo() records.
maxResult = 1
# Timeout passed to WebDriverWait in searchGoogle().
waitTime = 5
# Base delay used by wait(), which doubles it on every call; runExtraction()
# also sleeps this long between searches.
waitRun = 0.3

# bools
# Passed to searchGoogle() as its useURL argument.
useURL = False
# Not read by any code visible in this notebook.
useName = False


In [23]:
# exponential wait backoff algorithm 
def wait():
    """Sleep for the current backoff delay plus jitter, then double the delay.

    Reads and updates the module-level ``waitRun``. The sleep lasts
    ``waitRun`` seconds plus a random amount drawn from ``[0, 1]``. Once the
    doubled delay reaches ``maxbackOff`` the program is terminated through
    ``sys.exit``.
    """
    global waitRun
    jitter = random.uniform(0, 1)
    time.sleep(waitRun + jitter)
    waitRun *= 2
    if maxbackOff <= waitRun:
        sys.exit("error: read from sheets quota exceeded")

In [24]:
# Helper function: reads in values from DigitalHealthWebscrape google sheet
# Need spreadsheet ID and the cell address where the value should be read in 

def readinValue(cellLocation, sheetIndex):
    """Return the value of one cell of the configured Google spreadsheet.

    Args:
        cellLocation: A1-style cell address, e.g. ``'B2'``.
        sheetIndex: Index of the worksheet passed to ``get_worksheet``.

    Returns:
        Whatever ``worksheet.acell(cellLocation).value`` yields.

    A failed read is retried after ``wait()``. The previous version called
    ``wait()`` once and then returned a variable that had never been
    assigned, which raised ``UnboundLocalError``. ``wait()`` ends the
    program once its backoff ceiling is reached, so the loop terminates.
    """
    while True:
        try:
            sh = gc.open_by_key(spreadsheet_id)
            worksheet = sh.get_worksheet(sheetIndex)
            return worksheet.acell(cellLocation).value
        except Exception:
            # ``Exception`` rather than a bare ``except`` so that
            # KeyboardInterrupt and SystemExit are never swallowed.
            wait()

In [25]:
# Read Company names from csv file 
def readCompanies():
    """Load the company names from the CSV file at ``companyFilePath``.

    Only the ``columnName`` column is read. Missing entries are replaced
    with a single space, and the column is returned as a plain list.
    """
    frame = pd.read_csv(companyFilePath, usecols=[columnName])
    return frame[columnName].fillna(' ').tolist()
    

In [26]:
# Read keywords for Keyword Detection from spreadsheet
def readKeyWords():
    """Read the keyword list from column B of worksheet 1, starting at row 2.

    Cells are read top to bottom through ``readinValue`` until the first
    empty cell (``None`` or ``''``). A failed read calls ``wait()`` and
    retries the same row.

    Returns:
        list: the keywords in sheet order, without the terminating empty cell.
    """
    keywords = []
    sheetIndex = 1
    cellLocationColumn = 'B'
    row = 2
    while True:
        cellLocationKeyWords = cellLocationColumn + str(row)
        try:
            keyword = readinValue(cellLocationKeyWords, sheetIndex)
        except Exception:
            # ``Exception`` rather than a bare ``except`` so Ctrl-C and
            # sys.exit() from the backoff helper are not swallowed.
            wait()
            continue
        # An empty string is treated like None; otherwise a client that
        # reports blank cells as '' would make this loop run forever.
        if keyword is None or keyword == '':
            break
        keywords.append(keyword)
        row += 1
    print('Keywords: ', keywords)
    return keywords

In [27]:
# Read whether to activate Keyword Detection from spreadsheet
def readCalculateLiklihood():
    """Report whether keyword detection is switched on in the spreadsheet.

    Reads cell A2 of worksheet 1 through ``readinValue``.

    Returns:
        bool: True only when the cell contains exactly ``'Yes'``.

    A failed read calls ``wait()`` and is retried. Previously a single
    transient error made the function return False, which silently
    disabled keyword detection for the whole run.
    """
    sheetIndex = 1
    cellLocationLiklihood = 'A2'
    while True:
        try:
            calculateLiklihoodInput = readinValue(cellLocationLiklihood, sheetIndex)
        except Exception:
            # wait() terminates the program once the backoff ceiling is hit.
            wait()
            continue
        return calculateLiklihoodInput == 'Yes'

In [28]:
# read all user inputs from spreadsheet
def readInput():
    """Gather every user input needed for a run.

    Returns:
        tuple: ``(names, calculateLiklihood, keywords)``, obtained in that
        order from ``readCompanies``, ``readCalculateLiklihood`` and
        ``readKeyWords``.
    """
    companyNames = readCompanies()
    detectionEnabled = readCalculateLiklihood()
    return companyNames, detectionEnabled, readKeyWords()

In [29]:
# Extract links from 1 page of google search results
def extractLinks(soup):
    """Collect the ``href`` of the anchor inside every ``div.yuRUbf`` of ``soup``.

    Returns:
        list: one entry per matching div, in document order.
    """
    resultBlocks = soup.find_all('div', class_='yuRUbf')
    return [block.a.get('href') for block in resultBlocks]

In [30]:
# Extract titles from 1 page of google search results
def extractTitles(soup):
    """Collect the text of every ``h3`` carrying the result-title classes.

    Returns:
        list: the ``.text`` of each matching heading, in document order.
    """
    headings = soup.find_all('h3', class_='LC20lb MBeuO DKV0Md')
    return [heading.text for heading in headings]

In [31]:
# Extract texts from 1 page of google search results
def extractTexts(soup):
    """Collect the snippet text of every result description div in ``soup``.

    When a snippet contains ``'— '``, only the part after its first
    occurrence is kept; otherwise the full text is kept.

    Returns:
        list: one snippet string per matching div, in document order.
    """
    snippets = []
    for block in soup.find_all('div', class_='VwiC3b yXK7lf MUxGbd yDYNvb lyLwlc lEBKkf'):
        wholeText = block.text
        _, separator, remainder = wholeText.partition('— ')
        snippets.append(remainder if separator else wholeText)
    return snippets

In [32]:
# Calculate number of keywords detected in the top result from Google 
def calculateLiklihoods(links, keywords, timeout=10):
    """Score each URL by the share of ``keywords`` found in its page text.

    Args:
        links: URLs to download and scan.
        keywords: Words or phrases to look for.
        timeout: Seconds to wait for each HTTP request, so one unresponsive
            site cannot stall the whole run.

    Returns:
        tuple: ``(liklihoods, detectedWordsAll)``, two lists parallel to
        ``links``. ``liklihoods[i]`` is the number of detected keywords
        divided by ``len(keywords) - adjustDenominator``, rounded to two
        decimals. ``detectedWordsAll[i]`` lists the keywords that matched.

    Notes:
        * A keyword matches when it, or its ``str.capitalize()`` form,
          appears as a whole whitespace-delimited word or phrase. Text is
          whitespace-normalised first, so multi-word keywords such as
          ``'digital assistant'`` can match and words next to newlines are
          found.
        * A URL that cannot be fetched scores 0 with no detected words
          instead of aborting the run.
        * When the adjusted denominator is not positive, the unadjusted
          keyword count is used; with no keywords the score is 0.
    """
    # Text whose parent tag is listed here is excluded from the scan.
    ignore = ('[document]', 'a', 'article', 'label', 'script', 'style')

    liklihoodDenom = len(keywords) - adjustDenominator
    if liklihoodDenom <= 0:
        # Guards against ZeroDivisionError and negative scores.
        liklihoodDenom = len(keywords)

    liklihoods = []
    detectedWordsAll = []

    # for each url, calculate number of words detected
    for url in links:
        detectedWords = []
        try:
            res = requests.get(url, headers={'User-Agent': 'Mozilla/5.0'}, timeout=timeout)
        except requests.RequestException as error:
            print('could not fetch {}: {}'.format(url, error))
        else:
            soup = bs(res.content, 'html.parser')
            # ``string=True`` is the current spelling of the deprecated ``text=True``.
            visible = [str(t) for t in soup.find_all(string=True)
                       if t.parent.name not in ignore]
            # Collapse whitespace runs and pad with spaces so that
            # ' phrase ' containment means a whole-word match.
            haystack = ' ' + ' '.join(' '.join(visible).split()) + ' '
            for word in keywords:
                phrase = ' '.join(str(word).split())
                if not phrase:
                    continue
                variants = (phrase, phrase.capitalize())
                if any(' ' + variant + ' ' in haystack for variant in variants):
                    detectedWords.append(word)

        if liklihoodDenom:
            liklihood = round(len(detectedWords) / liklihoodDenom, 2)
        else:
            liklihood = 0.0
        liklihoods.append(liklihood)
        detectedWordsAll.append(detectedWords)
    return liklihoods, detectedWordsAll

In [33]:
# Search each company name on Google 
def searchGoogle(index, namesInput, googleurl, useURL, waitTime, driver):
    """Open Google in ``driver`` and submit a search for one input entry.

    Args:
        index: Position in ``namesInput`` of the entry to search for.
        namesInput: List of company names (or site URLs when ``useURL``).
        googleurl: Address of the search page to load.
        useURL: When true, restrict the search to the entry with the
            ``site:`` operator; otherwise search for the entry text as is.
        waitTime: Seconds to wait for the search box if the first attempt
            to use it fails.
        driver: Selenium WebDriver instance.

    The ``useURL`` argument used to be overwritten with ``False`` inside the
    function, and the then-unreachable branch referenced an undefined
    ``siteURL``. The argument is now honoured, and the site query is built
    from the entry itself.
    NOTE(review): assumes entries are domains/URLs when ``useURL`` is set.
    """
    websiteName = namesInput[index]
    driver.get(googleurl)
    if useURL:
        # No space after the colon, so the operator is not read as a plain word.
        query = "site:" + websiteName
    else:
        query = websiteName
    print('query: ' + query)
    try:
        searchBar = driver.find_element(By.NAME, 'q')
        searchBar.send_keys(query)
    except Exception:
        # The box may not be ready yet: wait for it, re-locate it instead of
        # reusing a possibly stale element, and clear any partial input.
        searchBar = WebDriverWait(driver, waitTime).until(
            EC.presence_of_element_located((By.NAME, 'q')))
        searchBar.clear()
        searchBar.send_keys(query)
    searchBar.send_keys('\n')

In [34]:
# capture links, header, and text
# pageInfo is a list of dictionaries for each page with keys/value pairs: header, link, text
# extract and load each page of results to pageInfo 
def parseHTML(driver, calculateLiklihood, keywords, maxResult, pageInfo):
    """Parse the current results page and append its entries to ``pageInfo``.

    Args:
        driver: Selenium WebDriver whose ``page_source`` is parsed.
        calculateLiklihood: When true, the recorded links are also scored
            with ``calculateLiklihoods``.
        keywords: Keywords handed to ``calculateLiklihoods``.
        maxResult: Number of results per page that are recorded.
        pageInfo: List of result dictionaries collected so far.

    Returns:
        The value returned by ``addToPageInfo``.

    Scoring now covers the first ``maxResult`` links. It used to cover only
    the first link, while ``addToPageInfo`` reads one score per recorded
    result, so any ``maxResult`` above 1 raised ``IndexError``.
    """
    soup = bs(driver.page_source, 'html.parser')
    links = extractLinks(soup)
    texts = extractTexts(soup)
    titles = extractTitles(soup)
    liklihoods = None
    detectedWordsAll = None
    if calculateLiklihood:
        liklihoods, detectedWordsAll = calculateLiklihoods(links[0:maxResult], keywords)
    return addToPageInfo(titles, links, texts, maxResult, pageInfo, liklihoods, detectedWordsAll)

In [35]:
# helper function to add 1 page of results to pageInfo list
def addToPageInfo(titles, links, texts, maxResult, pageInfo, liklihoods, detectedWordsAll):
    """Append up to ``maxResult`` search results to ``pageInfo``.

    Args:
        titles, links, texts: Parallel lists extracted from one results page.
        maxResult: Upper bound on the number of results recorded.
        pageInfo: List of result dictionaries; extended in place.
        liklihoods: Per-result keyword scores, or None when keyword
            detection is off.
        detectedWordsAll: Per-result lists of detected keywords, or None.

    Returns:
        list: the same ``pageInfo`` object.

    One row is written per available link, so a page with fewer results than
    ``maxResult`` (or none) no longer raises ``IndexError``. A missing title
    or snippet is stored as ``''``; a missing score as ``0.0`` with no
    detected keywords.
    """
    def pick(values, position, default):
        # Tolerates lists shorter than the number of links.
        if values is not None and position < len(values):
            return values[position]
        return default

    withScores = liklihoods is not None
    for index in range(min(maxResult, len(links))):
        # Key order determines the column order of the exported table.
        entry = {}
        if withScores:
            entry["Percentage of Keywords Detected"] = pick(liklihoods, index, 0.0)
        entry["Title"] = pick(titles, index, '')
        entry["Link"] = links[index]
        entry["About"] = pick(texts, index, '')
        if withScores:
            entry["Detected KeyWords"] = str(pick(detectedWordsAll, index, []))
        pageInfo.append(entry)
    return pageInfo


In [36]:
# export final result to CSV 
def exportCSV(df, pathtoFile, fileName):
    """Write ``df`` as CSV to the path formed by ``pathtoFile + fileName``.

    The two parts are concatenated as is, so ``pathtoFile`` must already
    end with a path separator.
    """
    destination = pathtoFile + fileName
    df.to_csv(destination)

In [37]:
# export final result to spreadsheet
def updateSpreadSheet(df, sheetIndex):
    """Replace the contents of one worksheet with ``df``.

    The worksheet at ``sheetIndex`` of the configured spreadsheet is cleared
    and then updated with a header row of column names followed by the
    data rows.
    """
    worksheet = gc.open_by_key(spreadsheet_id).get_worksheet(sheetIndex)
    worksheet.clear()
    headerRow = df.columns.values.tolist()
    dataRows = df.values.tolist()
    worksheet.update([headerRow] + dataRows)
    

In [38]:
# Turn the list of result dictionaries into a pandas DataFrame and export it to CSV and the spreadsheet
def exportDeliverables(pageInfo, pathtoFile, fileName, calculateLiklihood):
    """Convert the collected results to a DataFrame and export it.

    Args:
        pageInfo: List of result dictionaries.
        pathtoFile: Output directory prefix handed to ``exportCSV``.
        fileName: Output file name handed to ``exportCSV``.
        calculateLiklihood: When true, rows are sorted by the keyword score
            (highest first) and the score is converted to a percentage.

    Returns:
        pandas.DataFrame: the frame written to the CSV file and to
        worksheet 3 of the spreadsheet.

    The score column is only touched when it exists, so an empty result
    list no longer raises ``KeyError``. The percentage is rounded to two
    decimals to drop floating-point noise such as 56.00000000000001.
    """
    percentColumn = 'Percentage of Keywords Detected'
    sheetIndex = 3
    df = pd.DataFrame(pageInfo)
    if calculateLiklihood and percentColumn in df.columns:
        df = df.sort_values(by=[percentColumn], ascending=False)
        df[percentColumn] = (df[percentColumn] * 100).round(2)
    exportCSV(df, pathtoFile, fileName)
    updateSpreadSheet(df, sheetIndex)
    return df

In [39]:
# run all helper functions to scrape info from Google 
def runExtraction():
    """Run the whole scrape: read inputs, search each company, export results.

    Returns:
        The DataFrame returned by ``exportDeliverables``.

    The Chrome driver is now always shut down with ``driver.quit()``, even
    when a search or parse step raises. Previously every run left a browser
    and chromedriver process behind.
    """
    namesInput, calculateLiklihood, keywords = readInput()
    print("Number of Companies: " + str((len(namesInput))))
    fileName = 'CompanyWebScrape_CSV.csv'

    # Start Chrome through the chromedriver binary at driver_path.
    service = Service(driver_path)
    driver = webdriver.Chrome(service = service)

    # list of dictionaries with key/value pairs: title, link, text
    # Contains all information for all search results
    pageInfo = []
    try:
        for index in range(len(namesInput)):
            searchGoogle(index, namesInput, googleurl, useURL, waitTime, driver)
            # Only the first results page of each search is parsed.
            pageInfo = parseHTML(driver, calculateLiklihood, keywords, maxResult, pageInfo)
            # Pause between searches.
            time.sleep(waitRun)
    finally:
        driver.quit()
    df = exportDeliverables(pageInfo, pathtoFile, fileName, calculateLiklihood)
    return df

In [40]:
# Run program with output on how long program takes 
# Timestamp taken immediately before the scrape starts.
startTime = time.time()
# Runs the full pipeline; the returned DataFrame is kept in `df`.
df = runExtraction()
endTime = time.time()
# Elapsed seconds between the two time.time() readings.
timeElapsed = endTime - startTime
print('Time Elapsed: ', timeElapsed)
# Bare expression on the cell's last line so the notebook displays the DataFrame.
df

Keywords:  ['personalized', 'personalization', 'machine-learning', 'AI', 'Artificial Intelligence', 'A.I.', 'democratizing', '24/7', 'digital assistant', 'digital', 'match']
Number of Companies: 15
query: Babylon health
query: sword health
query: kaia health
query: Ada Health
query: Bulgarian Digital Helath
query: UCM Digital Health
query: Artexe
query: Fem Tec Health
query: Buoy
query: Gyant
query: Curai
query: Memora
query: Biofourmis Health
query: Talkspace
query: Arivale 
Time Elapsed:  63.228007316589355


Unnamed: 0,Percentage of Keywords Detected,Title,Link,About,Detected KeyWords
4,56.0,Be part of the Bulgarian digital health ecosystem,https://dhicluster.bg/?lang=en,Be part of the Bulgarian digital health ... Di...,"['personalized', 'AI', '24/7', 'digital', 'mat..."
12,56.0,Biofourmis: Personalized Predictive Care. Anyw...,https://biofourmis.com/,"Biofourmis is advancing drug development, clin...","['personalized', 'machine-learning', 'AI', '24..."
2,33.0,Kaia Health | Democratizing Healthcare,https://kaiahealth.com/,Kaia Health is the largest MSK platform worldw...,"['personalized', 'AI', 'democratizing']"
7,33.0,femtec health | a health and beauty sciences c...,https://www.femtechealth.com/,FemTec Health delivers care for every woman ac...,"['personalized', 'personalization', 'digital']"
0,22.0,About Us | Babylon Health,https://www.babylonhealth.com/en-us/about,We're creating a better model of healthcare .....,"['AI', '24/7']"
1,22.0,SWORD Health | A better way to treat MSK pain,https://swordhealth.com/,SWORD brings clinical-grade musculoskeletal ca...,"['digital', 'match']"
3,22.0,Health. Powered by Ada.,https://ada.com/,Hi. We're Ada. Our app helps people manage the...,"['AI', 'digital']"
5,22.0,UCM Digital Health: Digital Front Door Solution,https://www.ucmdigitalhealth.com/,UCM Digital Health delivers an end-to-end heal...,"['24/7', 'digital']"
6,22.0,Home - Artexe,https://artexe.mapsgroup.it/en/,Artexe offers healthcare facility managers the...,"['AI', 'digital']"
10,22.0,Curai Health,https://www.curaihealth.com/,"Curai can help with all kinds of ailments, fro...","['personalized', '24/7']"
