In [1]:
import requests 
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import re
import os
import csv
import time

In [2]:
#Initialize Page
driver = webdriver.Firefox()
resopnse = driver.get('https://www.levels.fyi/comp.html?track=Product%20Manager') 

#Display 100 items per page
dropdown = driver.find_elements(By.CLASS_NAME, 'fixed-table-pagination')[0].find_elements(By.CLASS_NAME, 'btn-group')[0]
dropdown.click()
dropdown.find_elements(By.TAG_NAME, "li")[-1].find_element(By.TAG_NAME, "a").click()

#Remove regional filter
driver.find_elements(By.CLASS_NAME, 'remove-search-filter-region-tag')[0].click()

In [3]:
def next_page():
    driver.find_element(By.CLASS_NAME, 'page-next').find_element(By.TAG_NAME, "a").click()

In [4]:
def reached_end():
    pagination_str = driver.find_element(By.CLASS_NAME, "pagination-detail").text
    _, cur, total, _ = re.findall(r'\d+', pagination_str)
    return cur == total

In [5]:
def expand_rows():
    table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    rows = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr[data-has-detail-view]")

    #Expand all rows
    for row in rows:
        row.find_element(By.CSS_SELECTOR, "td.d-none").click()

In [6]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [7]:
#Parse data
def parse():
    table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    rows = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr")
    #assert(len(rows) == 200)

    parsed_rows = []
    for row, details in chunks(rows, 2):
        data = row.text.replace('|', '\n').split('\n')
        data = list(map(str.strip, data))
        data = [d for d in data if not d.startswith('+')]

        other_details_section = details.find_elements(By.CSS_SELECTOR, ".other-details__content")
        if other_details_section:
            other_details = other_details_section[0].text
        else:
            other_details = "N/A"
        data.extend([other_details])

        gender_section = details.find_elements(By.CSS_SELECTOR, "p.small")
        if gender_section:
            gender = gender_section[0].text
        else:
            gender = "N/A"
        data.extend([gender])
        
        #assert(len(data) == len(cols))
        parsed_rows.append(data)
    return parsed_rows

In [8]:
#Create output file if it doesn't exist
cols = ['Company', 'Location', 'Date', 'Level', 'Title', 'YOE', 'TC', 'Base', 'Stock', 'Bonus', 'Details', 'Gender']
','.join(cols)

fpath = './data/pm-scraped.tsv'

if not os.path.exists(fpath):
    with open(fpath, 'w') as f:
        f.write('\t'.join(cols) + '\n')

In [9]:
def write_out(parsed_rows):
    with open(fpath, 'a', newline='', encoding='utf-8') as f:
        tsv_writer = csv.writer(f, delimiter='\t')
        tsv_writer.writerows(parsed_rows)

In [10]:
pg = 0
while not reached_end():
    start = time.time()
    
    table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    rows = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr")
    if len(rows) < 200:
        expand_rows()
    expand_time = time.time()

    
    rows = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr[data-has-detail-view]")
    parsed_rows = parse()
    parse_time = time.time()
    write_out(parsed_rows)
    try:
        next_page()
    except:
        print("Failed to click next page, retrying")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        next_page()
    pg += 1

    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.staleness_of(rows[0]))

    end = time.time()
    print(f"Pg{pg} | Expand: {expand_time - start:.2f} | Parse: {parse_time - expand_time:.2f} | Write: {end - parse_time:.2f}")


Pg1 | Expand: 23.54 | Parse: 3.91 | Write: 0.73
Pg2 | Expand: 23.50 | Parse: 3.91 | Write: 0.27
Pg3 | Expand: 23.56 | Parse: 3.74 | Write: 0.28
Pg4 | Expand: 23.60 | Parse: 3.80 | Write: 0.29
Pg5 | Expand: 23.61 | Parse: 3.78 | Write: 0.31
Pg6 | Expand: 23.67 | Parse: 3.76 | Write: 0.32
Pg7 | Expand: 23.57 | Parse: 3.81 | Write: 0.27
Pg8 | Expand: 23.67 | Parse: 3.87 | Write: 0.29
Pg9 | Expand: 23.68 | Parse: 3.71 | Write: 0.28
Pg10 | Expand: 23.73 | Parse: 3.85 | Write: 0.26
Pg11 | Expand: 23.78 | Parse: 3.90 | Write: 0.29
Pg12 | Expand: 23.62 | Parse: 3.63 | Write: 0.27
Pg13 | Expand: 23.68 | Parse: 3.83 | Write: 0.28
Pg14 | Expand: 23.65 | Parse: 3.87 | Write: 0.73
Pg15 | Expand: 23.67 | Parse: 3.72 | Write: 0.26
Pg16 | Expand: 23.74 | Parse: 3.84 | Write: 0.27
Pg17 | Expand: 23.73 | Parse: 3.80 | Write: 0.28
Pg18 | Expand: 23.74 | Parse: 3.84 | Write: 0.73
Pg19 | Expand: 23.76 | Parse: 3.86 | Write: 0.73
Pg20 | Expand: 23.77 | Parse: 3.71 | Write: 0.74
Pg21 | Expand: 23.70 | Parse:

In [11]:
expand_rows()
parsed_rows = parse()
write_out(parsed_rows)

In [29]:
next_page()

In [None]:
import pandas as pd
df = pd.read_csv('./data/scraped.tsv', sep='\t')
df

Unnamed: 0,Company,Location,Date,Level,Title,YOE,TC,Base,Stock,Bonus,Details,Gender
