# Scraping data from a dynamic website

##### Initial setup: 

In [1]:
import re
import time

import pandas as pd
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.webdriver.chrome.service import Service
from selenium.webdriver.common.by import By

# Local path to the ChromeDriver executable. A raw string keeps the Windows
# backslashes literal: in a normal string "\d", "\s" and "\c" are invalid
# escape sequences, which Python warns about.
webdriver_path = r"E:\downloads\selenium_scraper\selenium_scraper\chromedriver.exe"
chrome_options = Options()
# ChromeDriver's "detach" option keeps the browser alive after the driver
# process that started it goes away.
chrome_options.add_experimental_option("detach", True)


def scroll_full_page(driver, *, scroll_pause_time=2.1, initial_wait=1):
    '''Scroll the current page one screen at a time until its end is passed.

    The page height is re-read after every step because scrolling can make
    a dynamic page load more content and grow.

    Args:
        driver: object exposing ``execute_script(script)`` (a Selenium WebDriver).
        scroll_pause_time: seconds to wait after each scroll step so newly
            triggered content can load.
        initial_wait: seconds to wait before the first scroll so the page can open.
    '''
    time.sleep(initial_wait)  # allow the web page to open
    # Step size: the height of the screen, as reported by the browser.
    screen_height = driver.execute_script("return window.screen.height;")
    i = 1

    while True:
        # Scroll to the i-th multiple of the screen height.
        driver.execute_script(
            "window.scrollTo(0, {screen_height}*{i});".format(screen_height=screen_height, i=i)
        )
        i += 1
        time.sleep(scroll_pause_time)
        # Re-read the total height: it can change after the page was scrolled.
        scroll_height = driver.execute_script("return document.body.scrollHeight;")
        # Stop once the next target lies beyond the total scroll height.
        if screen_height * i > scroll_height:
            break


def ready_page(driver, url):
    '''Open ``url`` and expand the dynamic page fully before any scraping starts.

    Steps, in order: load the page, maximize the window, scroll to the end,
    click the element with id "load-button", then scroll to the end again.
    '''
    driver.get(url)
    driver.maximize_window()

    # First pass: scroll down so the "show more" link can be clicked.
    scroll_full_page(driver)

    # Click the "show more" link.
    driver.find_element(By.XPATH, "//a[@id='load-button']").click()

    # Second pass: scroll through whatever the click added.
    scroll_full_page(driver)


def create_csv(data_rows_list, csv_name: str):
    '''Write ``data_rows_list`` to the CSV file ``csv_name`` without the index column.

    The rows are turned into a pandas DataFrame first, so anything the
    DataFrame constructor accepts (e.g. a list of dicts) can be passed.
    '''
    pd.DataFrame(data_rows_list).to_csv(csv_name, index=False)

##### "AI trend dataset" - dynamic scraping Module 1 :

In [59]:
# Fully load the dynamic page first.
url = "https://topstartups.io/?industries=Artificial+Intelligence&sort=valuation"
# Selenium 4 takes the driver path through a Service object and the options
# through ``options=``; current releases reject the positional path and the
# ``chrome_options=`` keyword.
driver = webdriver.Chrome(service=Service(executable_path=webdriver_path), options=chrome_options)
ready_page(driver, url)

In [60]:
# Scraping: build one row per (startup, category) pair, leaving out the
# "Artificial Intelligence" category itself and empty category labels.
ai_data = []

startups = driver.find_elements(By.CSS_SELECTOR, 'div.col-12.col-md-6.col-xl-4.infinite-item')

for startup in startups:

    name_element = (
        startup.find_element(By.CSS_SELECTOR, 'div.col-8.col-md-9')
        .find_element(By.TAG_NAME, "h3")
        .find_element(By.TAG_NAME, "a")
    )
    name = name_element.text

    # The first <p> of the card filter holds the category spans.
    header_2 = startup.find_element(By.ID, 'item-card-filter').find_elements(By.TAG_NAME, "p")[0]
    categories = header_2.find_elements(By.TAG_NAME, "span")
    # finding category :
    for category in categories:
        # Read the text once: every ``.text`` access is a call to the browser.
        category_text = category.text
        if not category_text or "Artificial Intelligence" in category_text:
            continue
        row = {'Startup': name, 'AI field': category_text}
        print(row)
        ai_data.append(row)

print(ai_data)

driver.close()

create_csv(ai_data, "AI_field_trend_dataset.csv")

##### "Top Startups dataset" dynamic scraping Module 2 :

In [62]:
# Full load.
# Selenium 4 takes the driver path through a Service object and the options
# through ``options=``; current releases reject the positional path and the
# ``chrome_options=`` keyword.
driver = webdriver.Chrome(service=Service(executable_path=webdriver_path), options=chrome_options)
url = "https://topstartups.io/?sort=valuation"
ready_page(driver, url)

In [63]:
#scraping:
startup_data = []

startups = driver.find_elements(By.CSS_SELECTOR, 'div.col-12.col-md-6.col-xl-4.infinite-item')

# ``idx`` rather than ``id`` so the builtin of that name is not shadowed.
for idx, startup in enumerate(startups):
    name_element = (
        startup.find_element(By.CSS_SELECTOR, 'div.col-8.col-md-9')
        .find_element(By.TAG_NAME, "h3")
        .find_element(By.TAG_NAME, "a")
    )
    name = name_element.text
    print(idx,'### ',name)

    # scraping employees, founded year:
    card_header_2 = startup.find_element(By.ID, 'item-card-filter').find_elements(By.TAG_NAME, "p")[1]
    # Look the spans up once; both values below are read from this list.
    header_2_spans = card_header_2.find_elements(By.TAG_NAME, "span")

    # First token of the first span, split on "-": a two-part value keeps its
    # second part (presumably the upper bound of an employee range — verify
    # against the page), anything else keeps its first part.
    employees = header_2_spans[0].text.split()[0].split("-")
    employees = employees[1] if len(employees) == 2 else employees[0]

    # Text after the first ":" of the second span, without surrounding blanks.
    founded = header_2_spans[1].text.split(':')[1].strip()
    print("employees, founded serially: ",employees, "," ,founded)

    # scraping value worth information:
    card_header_3 = startup.find_element(By.ID, 'item-card-filter').find_elements(By.TAG_NAME, "p")[2]
    funding_elements = card_header_3.find_elements(By.TAG_NAME, "span")

    def process_currency(value):
        '''Convert a currency token such as "$1.5B", "$20M" or "$500K" to millions.

        Billions (B) are multiplied by 1000 and thousands (K) divided by 1000;
        any other token is taken to be in millions already. The result is
        always a float, so every row carries the same numeric type.

        Raises:
            ValueError: if the token has no parsable number once the "$" and
                the unit letter are removed.
        '''
        amount = value.strip().replace(',', '')  # tolerate thousands separators
        if 'B' in amount:
            # Round away binary floating-point noise from the multiplication.
            return round(float(amount.strip('$B')) * 1000.0, 6)
        if 'K' in amount:
            return float(amount.strip('$K')) / 1000.0
        return float(amount.strip('$M'))

    value = 0
    for element in funding_elements: #capture valuation worth
        if("valuation" in element.text):
            value = element.text.split()[0]
            value = process_currency(value)
            print("valuation: M",value)

    if(value==0): #if valuation worth not provided, then capture funding round capital
        for element in funding_elements:
            if '$' in element.text:
                #print('fund $ found')
                value = element.text
                value = value[value.find('$'):].split()[0]
                value = process_currency(value)
                print("funding round: M",value)
    

    '''# scrap locations :'''
    location = card_header_2.text.replace("\n",'').split(':')[2]#.strip('-0123456789employees Found')
    match = re.search(r"([0-9]+)", location) 
    location = location [1: match.start()]
    
    country = location.split(',')[-1]
    print(country[1:], "|" ,location)


    '''# scrap technologies: '''
    card_header_1 = startup.find_element(By.ID, 'item-card-filter').find_elements(By.TAG_NAME, "p")[0]
    categories = card_header_1.find_elements(By.TAG_NAME, "span")
    
    technologies = ""
    for category in categories:
        if len(category.text)==0 : continue
        else:   technologies = technologies + category.text + ", "
    technologies = technologies[:-2]
    print(technologies)
    
    '''# scraping URls : '''
    startup_url = startup.find_element(By.ID, 'startup-website-link').get_attribute('href')
    image_url =  startup.find_element(By.TAG_NAME, "img").get_attribute('src')

    print()

    #enter data to dictionary:
    row_data = {}
    row_data["Startup"]=name
    row_data["Valuation (in Millions)"]=value
    row_data["Employees(estimate)"]=employees
    row_data["Founded"]=founded
    row_data["Country"]=country
    row_data["HQ Location"]=location
    row_data["Technologies"]=technologies
    row_data["Website URL"]=startup_url
    row_data["image URL"]=image_url
    
    startup_data.append( row_data )

driver.close()

create_csv(startup_data, "top_startups_details.csv")


##### "Software Engineers in Startups dataset" dynamic scraping Module 3 :

In [None]:
# Selenium 4 takes the driver path through a Service object and the options
# through ``options=``; current releases reject the positional path and the
# ``chrome_options=`` keyword.
driver = webdriver.Chrome(service=Service(executable_path=webdriver_path), options=chrome_options)
url = "https://topstartups.io/startup-salary-equity-database/?title=software+engineer"
ready_page(driver, url)
# Let element lookups retry for up to 5 seconds before failing.
driver.implicitly_wait(5)

In [None]:
se_data = [] #software engineer row data collection

def fetch_table_data(driver):
    '''Append one dict per table row on the current page to ``se_data``.

    Every ``<tr>`` after the first is read. A row contributes the inner HTML
    of cells 0, 5 and 6 plus the salary found in the ``<span>`` of cell 1,
    with "$" and "," removed. Each row is printed as it is collected.
    '''
    table_rows = driver.find_elements(By.TAG_NAME, "tr")

    # The first tr doesn't contain relevant data. Slicing, unlike pop(0),
    # also copes with a page that has no rows at all.
    for record in table_rows[1:]:
        td = record.find_elements(By.TAG_NAME, "td")
        if len(td) < 7:
            # Cells up to index 6 are read below; a row without them cannot
            # be parsed, so skip it rather than fail with IndexError.
            continue
        row = {}
        row['Position title'] = td[0].get_attribute('innerHTML')
        row['Salary($)'] = td[1].find_element(By.TAG_NAME, "span").get_attribute('innerHTML').strip('$').replace(',', '')
        row['Years in Startup'] = td[6].get_attribute('innerHTML')
        row['Total Experience( Year)'] = td[5].get_attribute('innerHTML')

        print(row)
        se_data.append(row)


def button_click(xpath_string):
    '''Scroll the element matching ``xpath_string`` into view and click it.

    Both actions are executed as JavaScript on the module-level ``driver``.
    '''
    target = driver.find_element(By.XPATH, xpath_string)
    for script in ("arguments[0].scrollIntoView();", "arguments[0].click();"):
        driver.execute_script(script, target)

In [80]:
# Sort by years in startup. One click on the column header sorts low to high;
# a second click flips the order to high to low.
years_column_header = '//*[@id="mydatatable"]/thead/tr/th[7]'
for _ in range(2):
    button_click(years_column_header)

# Scrape the first page, then pages 2 to 50 by clicking "next" before each read.
fetch_table_data(driver)
for _page_number in range(2, 51):
    button_click('//*[@id="mydatatable_next"]')
    fetch_table_data(driver)

driver.close()

create_csv(se_data, 'SE_in_startups_dataset.csv')

{'Position title': 'Software engineer', 'Salary($)': '180000', 'Years in Startup': '7.0'}
{'Position title': 'Staff software engineer', 'Salary($)': '180000', 'Years in Startup': '6.0'}
{'Position title': 'Lead software engineer', 'Salary($)': '100000', 'Years in Startup': '5.0'}
{'Position title': 'Senior software engineer', 'Salary($)': '165000', 'Years in Startup': '4.0'}
{'Position title': 'Senior software engineer', 'Salary($)': '150000', 'Years in Startup': '4.0'}
{'Position title': 'Software engineer', 'Salary($)': '150000', 'Years in Startup': '4.0'}
{'Position title': 'Software engineer', 'Salary($)': '150000', 'Years in Startup': '4.0'}
{'Position title': 'Software engineer', 'Salary($)': '150000', 'Years in Startup': '4.0'}
{'Position title': 'Software engineer', 'Salary($)': '195000', 'Years in Startup': '4.0'}
{'Position title': 'Senior software engineer', 'Salary($)': '130000', 'Years in Startup': '4.0'}
{'Position title': 'Staff software engineer', 'Salary($)': '195000',

# Data processing :

In [50]:
import numpy as np
from datetime import datetime, date

# Load the startup details and print a summary of the columns and their dtypes.
# NOTE(review): the scraping cell writes "top_startups_details.csv", while this
# cell reads a differently named file — confirm the file was renamed by hand.
startups_csv_path = "Top_startups_details_dataset.csv"
df = pd.read_csv(startups_csv_path)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1207 entries, 0 to 1206
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Startup                  1207 non-null   object 
 1   Valuation (in Millions)  1207 non-null   float64
 2   Employees(estimate)      1207 non-null   object 
 3   Founded                  1207 non-null   int64  
 4   Country                  1207 non-null   object 
 5   HQ Location              1207 non-null   object 
 6   Technologies             1207 non-null   object 
 7   Website URL              1207 non-null   object 
 8   image URL                1207 non-null   object 
dtypes: float64(1), int64(1), object(7)
memory usage: 85.0+ KB


In [54]:
# Years in business = current calendar year minus the founding year.
# The year is read once and subtracted from the whole column at once, instead
# of calling date.today() for every row through apply().
current_year = date.today().year
df['Year in Business'] = current_year - df['Founded']

display(df)

df.to_csv('Top_startups_details_dataset_complete.csv', index = False)

Unnamed: 0,Startup,Valuation (in Millions),Employees(estimate),Founded,Country,HQ Location,Technologies,Website URL,image URL,Year in Business
0,Pogo,15.00,50,2020,USA,"New York, New York, USA","Consumer, Mobile App, FinTech",https://www.joinpogo.com/?utm_source=topstartu...,https://res.cloudinary.com/crunchbase-producti...,3
1,Greylock,500.00,50,1965,USA,"San Francisco Bay Area, California, USA","Enterprise Software, Consumer, Crypto",https://greylock.com/portfolio/?utm_source=top...,https://cdn.filestackcontent.com/output=f:webp...,58
2,Stripe,50000.00,5000+,2010,USA,"San Francisco Bay Area, California, USA","FinTech, Payments, SaaS",https://stripe.com/?utm_source=topstartups.io,https://res.cloudinary.com/crunchbase-producti...,13
3,Uber,48000.00,5000+,2009,USA,"San Francisco Bay Area, California, USA","Delivery, Apps, Ride Sharing",http://www.uber.com/?utm_source=topstartups.io,https://img-cdn.tnwcdn.com/image?url=https%3A%...,14
4,Snowflake,42000.00,5000,2012,USA,"Bozeman, Montana, USA","Analytics, Enterprise Software",http://www.snowflake.net/?utm_source=topstartu...,https://img-cdn.tnwcdn.com/image?url=https%3A%...,11
...,...,...,...,...,...,...,...,...,...,...
1202,Luminostics,0.12,100,2014,USA,"San Francisco Bay Area, California, USA","Mobile Healthcare, Healthcare, Medical Devices...",http://www.luminostics.com/?utm_source=topstar...,https://img-cdn.tnwcdn.com/image?url=https%3A%...,9
1203,Legalist,0.12,100,2016,USA,"San Francisco Bay Area, California, USA","LegalTech, Analytics",https://www.legalist.com/?utm_source=topstartu...,https://img-cdn.tnwcdn.com/image?url=https%3A%...,7
1204,SmartPath Financial,0.12,50,2010,USA,"Atlanta, Georgia, USA","FinTech, EdTech",http://smartpathfinancial.com/?utm_source=tops...,https://img-cdn.tnwcdn.com/image?url=https%3A%...,13
1205,Emote Education,0.12,10,2015,USA,"California City, California, USA",EdTech,http://emotenow.com/?utm_source=topstartups.io,https://res.cloudinary.com/crunchbase-producti...,8
