# LinkedIn Scraper

In [None]:
import pandas as pd
import numpy as np
import re
import time
import logging
from tqdm import tqdm
from decouple import config

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,NoSuchElementException

from bs4 import BeautifulSoup

In [None]:
# Environment variables, read through python-decouple's `config`.
# EMAIL / PASSWORD are typed into the LinkedIn sign-in form further down;
# PATH is handed to `webdriver.Chrome` when the browser is started.
EMAIL = config("EMAIL")
PASSWORD = config("PASSWORD")
PATH = config("WEBDRIVER_PATH")

In [None]:
# Set `filepath` to the name prefix of the output pickle file from the Crunchbase notebook,
# e.g. if your output file is called estonia_found.pkl, simply enter "estonia".
# The same prefix is reused for the Excel output written at the end (<prefix>_linkedin.xlsx).

filepath = "test"

In [None]:
# Data

# Load the pickled table (presumably a DataFrame -- its "founders" column is read below
# and it is merged with the scraping results at the end).
# NOTE(review): unpickling can execute arbitrary code; only load files you produced yourself.
data = pd.read_pickle(f"{filepath}_found.pkl")

def link_matcher(list_of_founders):
    """
    Collect the LinkedIn links found in a list of founder records.

    Input should be a list of tuples of founders' details. Every record is
    walked attribute by attribute, and each ``str`` attribute containing the
    substring "linkedin" (case-sensitive) is kept in encounter order.

    Returns the list of matching strings, or ``None`` when nothing matched.
    """
    matches = [
        field
        for record in list_of_founders
        for field in record
        if isinstance(field, str) and "linkedin" in field
    ]
    return matches if matches else None

# Skip rows without founder details, pull the LinkedIn links out of the rest,
# then discard rows for which link_matcher found nothing (it returned None).
founders_linkedin = (
    data["founders"]
    .dropna()
    .apply(link_matcher)
    .dropna()
)

In [None]:
def _mentions_year_after(texts, min_year):
    """Return True if any string in *texts* contains an integer greater than *min_year*."""
    return any(
        int(number) > min_year
        for text in texts
        for number in re.findall(r"\d+", text)
    )


def education_check(link, wd, min_year=2019):
    """
    Open a LinkedIn profile and collect education entries mentioning a recent year.

    Parameters
    ----------
    link : str
        URL of the profile to load.
    wd : selenium WebDriver
        Driver used to load the page; it must already be logged in for the
        education section to be visible (presumably -- confirm against LinkedIn).
    min_year : int, optional
        An entry qualifies when any number appearing in its text is strictly
        greater than this value. Defaults to 2019.

    Returns
    -------
    list of tuple
        One tuple of text components per qualifying education entry.
    None
        The education section was found but no entry qualified.
    "element error"
        The education section (or an expected child element) could not be located.
    "page error"
        The driver ended up on LinkedIn's 404 page.
    """
    wd.get(link)
    if wd.current_url == "https://www.linkedin.com/404/":
        logging.error(f"Error 404 occurred on {link}")
        return "page error"

    time.sleep(2)
    # Scroll to the bottom of the page before looking for the education section.
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        education_node = WebDriverWait(wd, 8).until(
            EC.presence_of_element_located((By.ID, 'education'))
        )
        wd.execute_script("arguments[0].scrollIntoView();", education_node)
        education_parent_node = education_node.find_element(By.XPATH, '..')
        education_history = education_parent_node.find_elements(
            By.CLASS_NAME,
            "artdeco-list__item.pvs-list__item--line-separated.pvs-list__item--one-column",
        )
        education_list = []
        for entry in education_history:
            component_nodes = (
                entry.find_element(By.CLASS_NAME, "display-flex.flex-row.justify-space-between")
                .find_element(By.XPATH, "./*")
                .find_elements(By.XPATH, "./*")
            )
            # Keep only the first line of each component's text.
            components = [
                node.find_element(By.XPATH, "./*").text.split("\n")[0]
                for node in component_nodes
            ]
            # Record each entry at most once, even when several of its
            # components contain a qualifying year.
            if _mentions_year_after(components, min_year):
                education_list.append(tuple(components))
    except (TimeoutException, NoSuchElementException):
        logging.warning(f"Could not find element in {link}")
        return "element error"

    return education_list if education_list else None
    

In [None]:
# Start Chrome and sign in to LinkedIn with the credentials loaded above.
# NOTE(review): newer Selenium 4 releases expect the driver location via
# `service=Service(PATH)` rather than positionally -- confirm against the installed version.
driver = webdriver.Chrome(PATH)

driver.get("https://www.linkedin.com")

# Wait for the sign-in form instead of assuming it is already present right
# after navigation (same explicit-wait approach as education_check).
login_wait = WebDriverWait(driver, 10)

username = login_wait.until(EC.presence_of_element_located((By.CLASS_NAME, 'input__input')))

username.send_keys(EMAIL)

password = login_wait.until(EC.presence_of_element_located((By.ID, 'session_password')))

password.send_keys(PASSWORD)

log_in_button = login_wait.until(
    EC.element_to_be_clickable((By.CLASS_NAME, 'sign-in-form__submit-button'))
)

log_in_button.click()

In [None]:
# Maps each index label of founders_linkedin to one of:
#   (link, education_list)  -- the first profile with a qualifying education entry
#   [(link, status), ...]   -- no qualifying profile, and at least one link needs a re-check
#   "Not eligible"          -- every profile was checked and none qualified
results = {}

# founders_linkedin.items() has no length, so pass the total explicitly
# to give the progress bar a meaningful scale.
for n, items in tqdm(founders_linkedin.items(), total=len(founders_linkedin)):
    if not items:
        continue
    link_list = []
    for link in items:
        time.sleep(2)  # fixed pause before each profile load
        education_results = education_check(link, driver)
        if isinstance(education_results, list):
            # A qualifying profile was found; no need to visit the remaining links.
            results[n] = (link, education_results)
            break
        elif education_results == "element error":
            link_list.append((link, "Check again"))
        elif education_results == "page error":
            link_list.append((link, "Error 404"))
        else:
            link_list.append((link, "Not eligible"))
    else:
        # Reached only when no link qualified: summarise once, after all
        # links have been visited, instead of on every iteration.
        if all(status == "Not eligible" for _, status in link_list):
            results[n] = "Not eligible"
        else:
            results[n] = link_list

In [None]:
def _render_outcome(outcome):
    """Turn one raw entry of ``results`` into the value stored in the "checked" column."""
    if isinstance(outcome, list):
        # Only links that need manual follow-up are reported, one per line.
        flagged = [
            ", ".join(pair)
            for pair in outcome
            if pair[1] in ("Check again", "Error 404")
        ]
        return "\n".join(flagged)
    if isinstance(outcome, tuple):
        # (link, education entries): the link first, then one entry per line.
        profile_link = outcome[0]
        schooling_text = "\n".join(str(entry) for entry in outcome[1])
        return ",\n".join([profile_link, schooling_text])
    # Anything else (e.g. the plain "Not eligible" marker) is passed through.
    return outcome


formatted = {key: _render_outcome(outcome) for key, outcome in results.items()}

# Inner join on the index: only rows that have an entry in `formatted` are kept.
final = pd.merge(
    data,
    pd.Series(formatted, name="checked").to_frame(),
    how='inner',
    left_index=True,
    right_index=True,
)

In [None]:
# Write the merged table to <prefix>_linkedin.xlsx; index=False leaves the index out of the sheet.
final.to_excel(f"{filepath}_linkedin.xlsx",index=False)