# LinkedIn Scraper

In [37]:
import pandas as pd
import numpy as np
import re
import time
import logging
from tqdm import tqdm
from decouple import config

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import TimeoutException,NoSuchElementException

from bs4 import BeautifulSoup

In [None]:
# Environment variables
# LinkedIn login credentials, looked up by name through python-decouple's
# `config` so they are not hard-coded here. EMAIL and PASSWORD are typed into
# the sign-in form by the login cell further down.
EMAIL = config("EMAIL")
PASSWORD = config("PASSWORD")

In [52]:
def _mentions_number_above(texts, threshold):
    """Return True if any run of digits found in *texts* is greater than *threshold*."""
    return any(
        int(digits) > threshold
        for text in texts
        for digits in re.findall(r"\d+", text)
    )


def education_check(link,wd):
    """
    Load a LinkedIn profile and collect its education entries that mention a
    number greater than 2019 (used here as a "studied after 2019" filter).

    link: profile URL to open.
    wd: Selenium WebDriver used to load and inspect the page.

    Returns one of:
      - a non-empty list of tuples, one per qualifying education entry, each
        tuple holding the first text line of every component of that entry;
      - None when the education section exists but no entry qualifies;
      - "element error" when the education section (or an expected child
        element) could not be located within the wait time;
      - "page error" when the browser ended up on LinkedIn's 404 URL.

    Note that any digit run counts, not only four-digit years.
    """
    wd.get(link)
    if wd.current_url == "https://www.linkedin.com/404/":
        logging.error(f"Error 404 occurred on {link}")
        return "page error"

    time.sleep(2)
    # Scroll to the bottom first so the lower sections of the page are loaded
    # before we wait for the education anchor.
    wd.execute_script("window.scrollTo(0, document.body.scrollHeight);")
    try:
        education_node = WebDriverWait(wd, 8).until(
            EC.presence_of_element_located((By.ID, 'education'))
        )
        wd.execute_script("arguments[0].scrollIntoView();", education_node)
        # The list items live under the parent of the element with id "education".
        education_parent_node = education_node.find_element(By.XPATH, '..')
        education_history = education_parent_node.find_elements(
            By.CLASS_NAME,
            "artdeco-list__item.pvs-list__item--line-separated.pvs-list__item--one-column",
        )

        education_list = []
        for entry in education_history:
            cells = (
                entry.find_element(By.CLASS_NAME, "display-flex.flex-row.justify-space-between")
                .find_element(By.XPATH, "./*")
                .find_elements(By.XPATH, "./*")
            )
            # Keep only the first line of each cell's text.
            components = [
                cell.find_element(By.XPATH, "./*").text.split("\n")[0]
                for cell in cells
            ]
            # Append each entry at most once, even when several of its
            # components contain a qualifying number (the previous per-component
            # loop produced duplicate tuples in that case).
            if _mentions_number_above(components, 2019):
                education_list.append(tuple(components))

        return education_list or None
    except (TimeoutException, NoSuchElementException):
        logging.warning(f"Could not find element in {link}")
        return "element error"
    
# A whitespace/comma-delimited token containing "linkedin". The negative
# lookbehind accepts a token preceded by whitespace, a comma, OR nothing at all,
# so a link at the very start of the string is found too. Excluding commas from
# the prefix stops the match from swallowing comma-joined text before the URL.
_LINKEDIN_TOKEN = re.compile(r"(?<![^\s,])[^\s,]+linkedin[^\s,]+")


def link_matcher(list_of_founders):
    """
    Extract the first LinkedIn link from each founder's details.

    Input should be a list of strings, one per founder (free text such as
    "Name, title, https://www.linkedin.com/in/...").

    Returns a list with the first LinkedIn-looking token of every string that
    has one, in input order, or None when no string contains such a token
    (callers rely on None so that `.dropna()` discards the row).
    """
    linkedin_list = []
    for f in list_of_founders:
        match = _LINKEDIN_TOKEN.search(f)
        if match:
            linkedin_list.append(match.group(0))
    return linkedin_list or None

In [3]:
# Load the founders spreadsheet; the "new_founders" column holds one text blob
# per row with individual founders separated by the "_x000D_" marker.
data = pd.read_excel("/Users/nathaniel/lkygbpc_scout/esd_found.xlsx")


def _split_founder_cell(cell):
    """Split a string cell on "_x000D_"; return any non-string cell unchanged."""
    if isinstance(cell, str):
        return cell.split("_x000D_")
    return cell


data["new_founders"] = data["new_founders"].apply(_split_founder_cell)
founders_list = data["new_founders"].dropna()

# One list of LinkedIn links per row; rows where link_matcher finds nothing
# come back as None and are dropped.
founders_linkedin = founders_list.apply(link_matcher).dropna()

In [55]:
# Start a Chrome session and sign in to LinkedIn with the credentials read
# into EMAIL / PASSWORD; `driver` is reused by the scraping loop below.
PATH = "/Users/nathaniel/chromedriver" 

# NOTE(review): passing the chromedriver path positionally is only accepted by
# older Selenium releases; newer ones expect a Service object instead —
# confirm against the installed Selenium version.
driver = webdriver.Chrome(PATH) 

driver.get("https://www.linkedin.com")

# First element carrying the 'input__input' class — presumably the e-mail
# field of the landing-page sign-in form; verify if the page layout changes.
username = driver.find_element(By.CLASS_NAME,'input__input')

username.send_keys(EMAIL)

password = driver.find_element(By.ID,'session_password')

password.send_keys(PASSWORD)

log_in_button = driver.find_element(By.CLASS_NAME,'sign-in-form__submit-button') 

# Submit the form; nothing here waits for the login to complete.
log_in_button.click()

In [56]:
# results[n] ends up as one of:
#   (link, education_list)  - the first link whose profile qualified
#   [(link, label), ...]    - no link qualified and at least one needs a re-check
#   "Not eligible"          - every link was checked and none qualified
results = {}

# Label stored for each of education_check's error sentinels; any other
# non-list outcome is recorded as "Not eligible".
status_labels = {"element error": "Check again", "page error": "Error 404"}

for n, links in tqdm(founders_linkedin.items()):
    statuses = []
    if not links:
        continue
    for link in links:
        time.sleep(2)
        outcome = education_check(link, driver)
        if isinstance(outcome, list):
            # A qualifying profile was found: record it and stop checking this row.
            results[n] = (link, outcome)
            break
        statuses.append((link, status_labels.get(outcome, "Not eligible")))
        # Updated after every link so partial progress is kept for this row.
        needs_review = any(label != "Not eligible" for _, label in statuses)
        results[n] = statuses if needs_review else "Not eligible"

10it [02:10, 12.51s/it]ERROR:root:Error 404 occurred on https://www.linkedin.com/in/argorannamets
ERROR:root:Error 404 occurred on https://www.linkedin.com/in/rainvaana/
11it [02:16, 10.62s/it]ERROR:root:Error 404 occurred on https://www.linkedin.com/in/andrey-vavilin-8228ab7/
29it [05:41, 11.83s/it]ERROR:root:Error 404 occurred on https://www.linkedin.com/in/aleksey-korolyov
56it [09:59,  9.31s/it]ERROR:root:Error 404 occurred on https://www.linkedin.com/in/remigiuszkoscielny
70it [12:41,  7.96s/it]ERROR:root:Error 404 occurred on https://www.linkedin.com/in/baldfuturist/
84it [14:58,  7.54s/it]ERROR:root:Error 404 occurred on https://www.linkedin.com/in/ericyangthegreatest/
86it [15:12,  7.79s/it]ERROR:root:Error 404 occurred on https://www.linkedin.com/in/donguo/
101it [17:43,  9.34s/it]ERROR:root:Error 404 occurred on https://www.linkedin.com/in/zainallarakhia
ERROR:root:Error 404 occurred on https://www.linkedin.com/in/taavi-rannamets-95367986
122it [21:11, 11.63s/it]ERROR:root:Er

In [112]:
# Flatten each scraping outcome into a single display string per row index.
formatted = {}

for row_idx, outcome in results.items():
    if isinstance(outcome, list):
        # Only links that need attention are listed, one "link, label" per line.
        flagged = [
            ", ".join(entry)
            for entry in outcome
            if entry[1] in ("Check again", "Error 404")
        ]
        formatted[row_idx] = "\n".join(flagged)
    elif isinstance(outcome, tuple):
        # Qualifying profile: the link, then one education entry per line.
        profile_link = outcome[0]
        history_str = "\n".join(str(entry) for entry in outcome[1])
        formatted[row_idx] = ",\n".join([profile_link, history_str])
    else:
        formatted[row_idx] = outcome

# Inner join on the index keeps only the rows that were actually checked.
checked_frame = pd.Series(formatted, name="checked").to_frame()
final = pd.merge(data, checked_frame, how='inner', left_index=True, right_index=True)

In [114]:
# Write the merged, formatted results to an Excel file in the working
# directory, without the DataFrame index column.
final.to_excel("esd_final_checked.xlsx",index=False)

In [60]:
# NOTE(review): this rebinds `final` to a merge with the raw `results` dict
# (tuples / lists / strings) instead of the `formatted` strings. Run top to
# bottom, it executes after the Excel export above and replaces the formatted
# frame — looks like a leftover from an earlier iteration; confirm before reuse.
final = pd.merge(data,pd.Series(results,name="checked").to_frame(),how='inner',left_index=True,right_index=True)

# European Space Agency Scraper

In [12]:
from bs4 import BeautifulSoup
import requests
import pandas as pd
import re
import warnings

# Download the ESA commercialisation start-up listing and parse it.
url = "https://commercialisation.esa.int/startups/"
# NOTE(review): verify=False turns off TLS certificate verification, so the
# response is not authenticated. Kept as-is because the reason it was disabled
# is not visible here; prefer removing it if the site's certificate validates.
# The timeout prevents the cell from hanging forever on a stalled connection.
response = requests.get(url, verify=False, timeout=30)
# Fail loudly on an HTTP error instead of parsing an error page.
response.raise_for_status()
soup = BeautifulSoup(response.content,"html.parser")



In [174]:
# Each direct child of the "startup-results" container is one start-up card.
container = soup.find("div", id="startup-results")
if container is None:
    # Without this check a changed page layout surfaces as an opaque
    # AttributeError on None.
    raise ValueError('Could not find <div id="startup-results"> in the downloaded page')
results = container.find_all(recursive=False)
print(len(results))

cols = ["title","country","bic","space_domains","industries","start_date","end_date","content","ext"]

startup_dict = {k: [] for k in cols}

for startup in results:
    for col in cols:
        if col == "ext":
            # External link: href of the first anchor in the card. A card
            # without any link gets "" like every other missing field, so all
            # columns stay the same length.
            anchor = startup.find("a", href=True)
            startup_dict[col].append(anchor["href"] if anchor is not None else "")
        else:
            # Field divs carry ids of the form "<col>-..."; take the first match.
            attr = startup.find("div", id=lambda x: x and x.startswith(f"{col}-"))
            startup_dict[col].append(attr.text.strip() if attr is not None else "")

df = pd.DataFrame(startup_dict)
        

1016


In [175]:
# Strip the leading "<label>: " text from the labelled columns; the greedy
# pattern removes everything up to the last colon that is followed by a
# whitespace character.
label_prefix = re.compile(r".+:\s")

for col in ("country", "bic", "space_domains", "industries", "start_date", "end_date"):
    df[col] = df[col].map(lambda text: label_prefix.sub("", text))

# Remove the literal "ESA BIC" from the bic column.
df['bic'] = df['bic'].map(lambda name: name.replace("ESA BIC", ""))

In [185]:
# Keep start-ups whose start_date is "2018" or later. start_date holds scraped
# text, so this is a lexicographic string comparison; rows with an empty
# start_date are excluded because "" sorts before "2018".
# NOTE(review): assumes the start_date text begins with a four-digit year —
# confirm against the scraped values.
after2018_df = df.query("start_date >= '2018'")

In [187]:
# Copy the filtered table to the system clipboard (e.g. for pasting into a
# spreadsheet).
after2018_df.to_clipboard()