In [1]:
# Scrape LinkedIn companies based on user inputted search query and update into Company Web Scrape Google Sheets 
# Script for LinkedIn Company Mining Feature

#import libraries
import pandas as pd
from selenium import webdriver
from webdriver_manager.chrome import ChromeDriverManager
from selenium.webdriver.support.ui import WebDriverWait, Select
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By 
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.keys import Keys
from bs4 import BeautifulSoup as bs
import time, sys, requests, random

# import and authorize gspread  
import gspread
from oauth2client.service_account import ServiceAccountCredentials
scope = ['https://spreadsheets.google.com/feeds',
         'https://www.googleapis.com/auth/drive']
google_key_file = '/Users/zacharywong/Documents/ServiceAccountKey-Secret/pelagic-tracker-338302-eaf0e0e671cb.json'
credentials = ServiceAccountCredentials.from_json_keyfile_name(google_key_file, scope)
gc = gspread.authorize(credentials)

In [2]:
#Global variables
companies = []
queries = []
query = ''
numberofPages = 0
#cellLocationQuery = 'C2'
cellLocationPages = 'B2'
sheetIndexRead = 0
sheetIndexWrite = 2
columnName = 'Company Names (Leave Blank if Using LinkedIn Company Mining)'
password = ''
email = 'zach.jl.wong@gmail.com'

#paths/urls
spreadsheet_id = '1vFXonFCyUlEKa1f0s5tvHCKeTek_sAv7rUPYfYss0Qo'
loginurl = 'https://www.linkedin.com/uas/login'
driver_path = '/Users/zacharywong/Documents/Work/Portfolio/DigitalHealthWebscrape/chromedriver'
pathtoLinkedInFile = '/Users/zacharywong/github/zacharywong2023/DigitalHealthWebscrape/Misc/Companies/Company Web Scrape Tool - Company Names.csv'
pathtoPassword = '/Users/zacharywong/Documents/LinkedIn/LinkedIn-Password-Secret.txt'

#waitTimes
waitLinkedInTime = 10
waitCaptcha = 25
waitSearch= 1
waitLogin = 0.8


# xPaths, classes, and javascripts
searchBarXPath = '//*[@id="global-nav-typeahead"]/input'
companyFilterXPath = '//*[@id="search-reusables__filters-bar"]/ul/li[2]/button'
scrollDownScript = "window.scrollTo(0,document.body.scrollHeight)"
pageButtonClass = "button[type='button']"
companyTitleClass = 'app-aware-link'
companyTitleAttr = 'a'


In [3]:
# helper function to read in value from spreadsheet
def readinValue(cellLocation, sheetIndexRead):
    sh = gc.open_by_key(spreadsheet_id)
    worksheet = sh.get_worksheet(sheetIndexRead)
    try:
        value = worksheet.acell(cellLocation).value
    except:
        wait()
    return value

In [4]:
# read in query from spreadsheet
def readQueries():
    global queries
    row = 2;
    sheetIndex = 0
    isDone = False
    cellLocationColumn = 'C'
    while (isDone == False):
        cellLocationQueries = cellLocationColumn + str(row)
        try:
            query = readinValue(cellLocationQueries, sheetIndex)
            if(query == None):
                isDone = True
                break
            else:
                queries.append(query)
                row += 1 
        except:
            wait()
    print('Queries: ', queries)

In [5]:
# read in number of pages from spreadsheet
def readinPages():
    global numberofPages
    numberofPages = int(readinValue(cellLocationPages, sheetIndexRead))
    print('numberofPages: ', numberofPages)

In [6]:
# login to linkedin 
def login(driver):
    driver.get(loginurl)
    emailInput = driver.find_element(By.ID, 'username')
    emailInput.send_keys(email)
    time.sleep(waitLogin)
    passwordInput = driver.find_element(By.ID, 'password')
    with open (pathtoPassword, 'r') as file:
        global password
        password = file.read()
    passwordInput.send_keys(password)
    time.sleep(waitLogin)
    passwordInput.send_keys(Keys.RETURN)
    

In [7]:
# search query on LinkedIn
def searchLinkedIn(driver):
    #search query
    try:
        searchBar = driver.find_element(By.XPATH, searchBarXPath)
        searchBar.send_keys(query)
        searchBar.send_keys(Keys.RETURN)
        time.sleep(1)
    except: 
        time.sleep(waitCaptcha)
        searchBar = driver.find_element(By.XPATH, searchBarXPath)
        searchBar.send_keys(query)
        searchBar.send_keys(Keys.RETURN)
        time.sleep(1)

In [8]:
# Click Company Filter option on LinkedIn
def companyFilter(driver):
    WebDriverWait(driver, waitLinkedInTime).until(EC.presence_of_element_located((By.XPATH, companyFilterXPath)))
    companyButtons = driver.find_elements_by_css_selector(pageButtonClass)
    for button in companyButtons: 
        buttonText = button.text
        if (buttonText == 'Companies'): 
            button.click()
            return;

In [9]:
# Click onto next page 
def nextPage(currentPage, driver):
    nextPageSuccess = False; 
    driver.execute_script(scrollDownScript)
    time.sleep(1)
    #WebDriverWait(driver, waitLinkedInTime).until(EC.presence_of_element_located((By.XPATH, '//*[@id="ember329"]/button')))
    pages = driver.find_elements_by_css_selector(pageButtonClass)
    for page in pages:
        #time.sleep(1)
        buttonNumber = page.text
        #print(buttonNumber) 
        if (buttonNumber == str(currentPage)):
            #print('hello')
            page.click()
            nextPageSuccess = True; 
            break
    if (nextPageSuccess == False):
        for page in pages:
            buttonNumber = page.text
            if (buttonNumber == '…'):
                print("click more option")
                page.click()
                return 
            

In [10]:
# add companies into a pandas dataframe 
def addCompanies(driver):
    currentPage = 1
    while (currentPage <= numberofPages):
        time.sleep(2)
        soup = bs(driver.page_source, 'html.parser')
        companySearch = soup.find_all(companyTitleAttr, class_ = companyTitleClass)
        for company in companySearch:
            companyName = company.text
            global companies 
            if '\n\n\n' not in companyName and companyName not in companies:
                companies.append(companyName.replace('\n', ''))
                print("companyName: " + companyName)
            #except: 
                #print("error adding company: " + companyName)
            currentPage += 1
        nextPage(currentPage, driver)

In [11]:
# export final list of companies to CSV 
def exportCSV(companies):
    df = pd.DataFrame(companies, columns = [columnName])
    df.to_csv(pathtoLinkedInFile)
    return df

In [12]:
# Update spreadsheet with list of companies
def updateSpreadSheet(df, sheetIndexWrite):
    sh = gc.open_by_key(spreadsheet_id)
    worksheet = sh.get_worksheet(sheetIndexWrite)
    worksheet.clear()
    worksheet.update([df.columns.values.tolist()] + df.values.tolist())

In [13]:
# run all functions to scrape LinkedIn companies  
def runLinkedIn():
    service = Service(driver_path)
    driver = webdriver.Chrome(service = service)
    readinPages()
    login(driver)
    searchLinkedIn(driver)
    companyFilter(driver)
    companies = addCompanies(driver)
    return companies

In [14]:
def exportDeliverables():
    df = exportCSV(companies)
    updateSpreadSheet(df, sheetIndexWrite)

In [15]:
# run program 
startTime = time.time()
readQueries() 
for q in queries: 
    global query
    query = q
    print("Current Query: ", query)
    companies = runLinkedIn()
    print(companies)
df = exportCSV(companies)
updateSpreadSheet(df, sheetIndexWrite)
endTime = time.time()
elapsedTime = endTime - startTime
print('Time Elapsed: ', elapsedTime)
df

Queries:  ['personalized health', 'digital health', 'artificial intelligence health', '24/7 health']
Current Query:  personalized health
numberofPages:  20


  companyButtons = driver.find_elements_by_css_selector(pageButtonClass)


companyName: 
Personalized Health and Fitness

companyName: 
Personalized Health Solutions, LLC

companyName: 
Health Advocate

companyName: 
Health Alliance Plan

companyName: 
Personalized Medicine Coalition

companyName: 
PERSONALIZED HEALTH NUTRITION, PLLC

companyName: 
Personalized Health Solutions

companyName: 
PERSONALIZED HEALTH SERVICES, CORP

companyName: 
PERSONALIZED HEALTH CARE INC

companyName: 
PERSONALIZED HEALTH AND PREVENTIVE MEDICINE LLC



  pages = driver.find_elements_by_css_selector(pageButtonClass)


click more option
None
Current Query:  digital health
numberofPages:  20


KeyboardInterrupt: 