In [1]:
import requests 
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import re
import os
import csv
import time

In [2]:
#Initialize Page
driver = webdriver.Firefox()
resopnse = driver.get('https://www.levels.fyi/comp.html?track=Software%20Engineer') 

#Display 100 items per page
dropdown = driver.find_elements(By.CLASS_NAME, 'fixed-table-pagination')[0].find_elements(By.CLASS_NAME, 'btn-group')[0]
dropdown.click()
dropdown.find_elements(By.TAG_NAME, "li")[-1].find_element(By.TAG_NAME, "a").click()

#Remove regional filter
driver.find_elements(By.CLASS_NAME, 'remove-search-filter-region-tag')[0].click()

In [3]:
def next_page():
    driver.find_element(By.CLASS_NAME, 'page-next').find_element(By.TAG_NAME, "a").click()

In [4]:
def reached_end():
    pagination_str = driver.find_element(By.CLASS_NAME, "pagination-detail").text
    _, cur, total, _ = re.findall(r'\d+', pagination_str)
    return cur == total

In [5]:
def expand_rows():
    table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    rows = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr[data-has-detail-view]")

    #Expand all rows
    for row in rows:
        row.find_element(By.CSS_SELECTOR, "td.d-none").click()

In [6]:
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]

In [7]:
#Parse data
def parse():
    table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    rows = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr")
    assert(len(rows) == 200)

    parsed_rows = []
    for row, details in chunks(rows, 2):
        data = row.text.replace('|', '\n').split('\n')
        data = list(map(str.strip, data))
        data = [d for d in data if not d.startswith('+')]

        data.extend([details.find_element(By.CSS_SELECTOR, ".other-details__content").text])

        gender_section = details.find_elements(By.CSS_SELECTOR, "p.small")
        if gender_section:
            gender = gender_section[0].text
        else:
            gender = "N/A"
        data.extend([gender])
        
        #assert(len(data) == len(cols))
        parsed_rows.append(data)
    return parsed_rows

In [8]:
#Create output file if it doesn't exist
cols = ['Company', 'Location', 'Date', 'Level', 'Title', 'YOE', 'TC', 'Base', 'Stock', 'Bonus', 'Details', 'Gender']
','.join(cols)

fpath = './data/scraped.tsv'

if not os.path.exists(fpath):
    with open(fpath, 'w') as f:
        f.write('\t'.join(cols) + '\n')

In [9]:
def write_out(parsed_rows):
    with open(fpath, 'a', newline='', encoding='utf-8') as f:
        tsv_writer = csv.writer(f, delimiter='\t')
        tsv_writer.writerows(parsed_rows)

In [10]:
pg = 0
while not reached_end():
    start = time.time()
    expand_rows()
    expand_time = time.time()

    table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    rows = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr[data-has-detail-view]")
    parsed_rows = parse()
    parse_time = time.time()
    write_out(parsed_rows)
    next_page()
    pg += 1

    wait = WebDriverWait(driver, 10)
    element = wait.until(EC.staleness_of(rows[0]))

    end = time.time()
    print(f"Pg{pg} | Expand: {expand_time - start:.2f} | Parse: {parse_time - expand_time:.2f} | Write: {end - parse_time:.2f}")


Pg1 | Expand: 24.90 | Parse: 4.56 | Write: 0.25
Pg2 | Expand: 24.72 | Parse: 4.41 | Write: 0.24


NoSuchElementException: Message: Web element reference not seen before: 0fd75c29-4d5d-47c1-b07a-d3a0856a5911
Stacktrace:
WebDriverError@chrome://remote/content/shared/webdriver/Errors.jsm:183:5
NoSuchElementError@chrome://remote/content/shared/webdriver/Errors.jsm:395:5
get@chrome://remote/content/marionette/element.js:209:13
evaluate.toJSON@chrome://remote/content/marionette/evaluate.js:329:20
evaluate.toJSON@chrome://remote/content/marionette/evaluate.js:353:21
evaluate.toJSON@chrome://remote/content/marionette/evaluate.js:362:27
evaluate.toJSON@chrome://remote/content/marionette/evaluate.js:362:27
sendQuery@chrome://remote/content/marionette/actors/MarionetteCommandsParent.jsm:49:37
findElement@chrome://remote/content/marionette/actors/MarionetteCommandsParent.jsm:100:17
getMarionetteCommandsActorProxy/get/<@chrome://remote/content/marionette/actors/MarionetteCommandsParent.jsm:336:53
GeckoDriver.prototype.findElement@chrome://remote/content/marionette/driver.js:1430:26


In [None]:
import pandas as pd
df = pd.read_csv('./data/scraped.tsv', sep='\t')
df

Unnamed: 0,Company,Location,Date,Level,Title,YOE,TC,Base,Stock,Bonus,Details,Gender
