In [1]:
import requests 
from bs4 import BeautifulSoup

from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

import re
import os
import csv
import time

In [2]:
from dataclasses import dataclass
from typing import Optional

@dataclass
class SalaryRow:
    """One scraped compensation entry (a summary row plus its detail row).

    Every field holds the raw text scraped from the page, or ``None`` when the
    page did not provide it. No numeric conversion is performed, so all
    fields are annotated ``Optional[str]``. The field order must match the
    ``cols`` header list, because ``write_out`` serialises ``__dict__`` in
    declaration order.
    """
    # --- summary row ---
    company: Optional[str] = None
    location: Optional[str] = None      # text before ' | ' in the company cell
    date: Optional[str] = None          # text after ' | ' in the company cell
    level_name: Optional[str] = None
    tag: Optional[str] = None
    yoe: Optional[str] = None           # left part of the "a / b" experience cell
    yoe_total: Optional[str] = None     # right part of the "a / b" experience cell
    tc: Optional[str] = None
    base: Optional[str] = None          # base/stock/bonus come from the "x | y | z" breakdown
    stock: Optional[str] = None
    bonus: Optional[str] = None
    # --- detail row (label/value table) ---
    title: Optional[str] = None
    work_arrangement: Optional[str] = None
    education: Optional[str] = None
    ethnicity: Optional[str] = None
    gender: Optional[str] = None
    other: Optional[str] = None
    # --- scrape context ---
    track: Optional[str] = None         # track selected in the page's track selector

In [3]:
#Initialize Page
# Start a Firefox session and load the compensation table page.
driver = webdriver.Firefox()
# driver.get() only navigates and returns None, so there is nothing to keep.
driver.get('https://www.levels.fyi/comp.html')

In [4]:
# Visible text of every <option> inside the element with id "track-selector".
track_selector_options = [
    option.text
    for option in driver.find_element(By.ID, "track-selector").find_elements(By.TAG_NAME, "option")
]

In [5]:
# Scrape the first track listed in the selector; parse() tags every row with it.
track = track_selector_options[0]
# The leading "." keeps the XPath relative to the selector element. A bare "//"
# searches the whole document even when called on an element, so it could
# click an <option> with the same text that belongs to a different <select>.
driver.find_element(By.ID, "track-selector").find_element(By.XPATH, f".//option[text()='{track}']").click()

In [6]:
# Show 100 items per page: open the first button group of the pagination bar
# and click the link in its last list entry.
pagination_bar = driver.find_elements(By.CLASS_NAME, 'fixed-table-pagination')[0]
dropdown = pagination_bar.find_elements(By.CLASS_NAME, 'btn-group')[0]
dropdown.click()
page_size_entries = dropdown.find_elements(By.TAG_NAME, "li")
page_size_entries[-1].find_element(By.TAG_NAME, "a").click()

# Remove the regional filter by clicking the first filter-removal tag.
region_filter_tags = driver.find_elements(By.CLASS_NAME, 'remove-search-filter-region-tag')
region_filter_tags[0].click()

# Explicit-wait helper with a 3 second timeout.
wait = WebDriverWait(driver, 3)

In [None]:
def next_page():
    """Click the link inside the pagination control with class ``page-next``."""
    next_control = driver.find_element(By.CLASS_NAME, 'page-next')
    next_link = next_control.find_element(By.TAG_NAME, "a")
    next_link.click()
def prev_page():
    """Click the link inside the pagination control with class ``page-pre``."""
    prev_control = driver.find_element(By.CLASS_NAME, 'page-pre')
    prev_link = prev_control.find_element(By.TAG_NAME, "a")
    prev_link.click()

In [8]:
def reached_end():
    """Tell whether the table is showing its final page.

    Extracts the digit runs from the ``pagination-detail`` text, which must
    contain exactly four of them, and compares the second with the third
    (presumably "last row shown" and "total rows" - confirm against the page).

    Returns:
        bool: True when the second and third numbers are identical strings.

    Raises:
        ValueError: if the text does not contain exactly four digit runs.
    """
    detail_text = driver.find_element(By.CLASS_NAME, "pagination-detail").text
    numbers = re.findall(r'\d+', detail_text)
    _, cur, total, _ = numbers
    return cur == total

In [9]:
def expand_rows():
    """Click the ``td.d-none`` cell of every expandable row in ``#compTable``.

    The click presumably toggles the row's detail view, so calling this on
    rows that are already expanded would likely collapse them again.
    """
    comp_table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    table_body = comp_table.find_element(By.CSS_SELECTOR, "tbody")
    expandable_rows = table_body.find_elements(By.CSS_SELECTOR, "tr[data-has-detail-view]")

    for summary_row in expandable_rows:
        toggle_cell = summary_row.find_element(By.CSS_SELECTOR, "td.d-none")
        toggle_cell.click()

In [10]:
def chunks(lst, n):
    """Lazily yield consecutive slices of ``lst`` of length ``n``.

    The final slice is shorter when ``len(lst)`` is not a multiple of ``n``.
    """
    yield from (lst[start:start + n] for start in range(0, len(lst), n))

In [18]:
#Parse data
def parse():
    """Parse the currently displayed, already expanded table into SalaryRow objects.

    Reads the module-level ``driver`` and ``track``. The body of ``#compTable``
    is expected to alternate between summary rows (``tr[data-has-detail-view]``)
    and their detail rows (``tr.detail-view``); rows are consumed in pairs, so
    every summary row must have been expanded beforehand.

    Returns:
        list[SalaryRow]: one entry per summary/detail pair, in page order.

    Raises:
        AssertionError: if a summary row does not have exactly five cells.
        ValueError: if the rows cannot be consumed in pairs, or if the
            location/date or experience text does not split into two parts.
        Exception: if the detail table contains a label this parser does not know.
    """
    # Local import: this name is not among the notebook's top-level imports.
    from selenium.common.exceptions import NoSuchElementException

    # Detail-table labels whose value is the text of the row's second cell,
    # mapped to the SalaryRow attribute that stores it.
    second_cell_fields = {
        "Title": "title",
        "Work Arrangement": "work_arrangement",
        "Education": "education",
        "Ethnicity": "ethnicity",
        "Gender": "gender",
    }
    # Labels that are recognised but deliberately not stored.
    ignored_labels = ("Negotiated Amount", "Details")

    table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    row_pairs = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr[data-has-detail-view], tr.detail-view")

    parsed_rows = []
    for row, details in chunks(row_pairs, 2):
        parsed_row = SalaryRow()
        parsed_row.track = track

        #Parse first row
        row_sections = row.find_elements(By.CSS_SELECTOR, "td")
        assert(len(row_sections) == 5)

        parsed_row.company = row_sections[1].find_element(By.CSS_SELECTOR, "span").text
        parsed_row.location, parsed_row.date = row_sections[1].find_element(By.CSS_SELECTOR, "p").text.split(' | ')

        parsed_row.level_name = row_sections[2].find_element(By.CSS_SELECTOR, "span").text
        parsed_row.tag = row_sections[2].find_element(By.CSS_SELECTOR, "p").text

        parsed_row.yoe, parsed_row.yoe_total = row_sections[3].find_element(By.CSS_SELECTOR, "span").text.split(' / ')

        parsed_row.tc = row_sections[4].find_element(By.CSS_SELECTOR, "span").text
        try:
            breakdown = row_sections[4].find_element(By.CSS_SELECTOR, "span.dateDetails").text
            parsed_row.base, parsed_row.stock, parsed_row.bonus = breakdown.split(' | ')
        except (NoSuchElementException, ValueError):
            # The breakdown is optional: leave base/stock/bonus as None when the
            # element is missing or its text is not "base | stock | bonus".
            # Anything else (including KeyboardInterrupt) must propagate.
            pass

        #Parse details
        details_table = details.find_elements(By.CSS_SELECTOR, ".level-name-and-details table tr")
        for details_row in details_table:
            label = details_row.find_element(By.CSS_SELECTOR, "td").text
            if label in second_cell_fields:
                value = details_row.find_element(By.CSS_SELECTOR, "td:nth-child(2)").text
                setattr(parsed_row, second_cell_fields[label], value)
            elif label.startswith("Other Details"):
                parsed_row.other = details_row.find_element(By.CSS_SELECTOR, "span").text
            elif label not in ignored_labels:
                # Fail loudly so a new field on the page is noticed, not silently dropped.
                raise Exception("Unknown label: " + label)
        parsed_rows.append(parsed_row)
    return parsed_rows

In [12]:
#Create output file if it doesn't exist
# Column order must match the field order of SalaryRow, which write_out relies on.
cols = ['company','location','date','level_name','tag','yoe','yoe_total','tc','base','stock','bonus','title','work_arrangement','education','ethnicity','gender','other','track']

fpath = './data/main-scraped.tsv'

if not os.path.exists(fpath):
    # The target directory may not exist on a fresh checkout; open() would fail.
    os.makedirs(os.path.dirname(fpath) or '.', exist_ok=True)
    # Write the header with the same encoding and csv dialect that write_out
    # uses for the data rows, so the file has a single encoding and line-ending style.
    with open(fpath, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f, delimiter='\t').writerow(cols)

In [13]:
def write_out(parsed_rows):
    """Append parsed rows to the TSV file at ``fpath``.

    Each row is serialised as the values of its ``__dict__``. Before the file
    is opened, every row is checked to have as many values as ``cols`` and to
    contain no tab character in any string value.

    Raises:
        AssertionError: if a row fails either check; nothing is written then.
    """
    records = [record.__dict__.values() for record in parsed_rows]

    # Validate everything first so a bad row never leaves a partial write.
    for record in records:
        assert(len(record) == len(cols))
        for field in record:
            if not isinstance(field, str):
                continue
            assert('\t' not in field)

    with open(fpath, 'a', newline='', encoding='utf-8') as out_file:
        csv.writer(out_file, delimiter='\t').writerows(records)

In [20]:
# Scrape page after page until the pagination text reports the end.
# The loop condition is checked before a page is processed, so the page on
# which reached_end() becomes true is NOT parsed here.
while not reached_end():
    pg = int(driver.find_element(By.CSS_SELECTOR, ".page-item.active").text)
    start = time.time()

    table = driver.find_element(By.CSS_SELECTOR, "#compTable")
    # At most 100 summary+detail rows means the detail rows are not all open
    # yet (the page size was set to 100 earlier); skip expanding otherwise,
    # e.g. when resuming on a page that was already expanded.
    if len(table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr[data-has-detail-view], tr.detail-view")) <= 100:
        expand_rows()
    expand_time = time.time()

    # Keep handles to this page's rows: the first one going stale is the
    # signal that the next page has replaced the table contents.
    rows = table.find_element(By.CSS_SELECTOR, "tbody").find_elements(By.CSS_SELECTOR, "tr[data-has-detail-view]")
    parsed_rows = parse()
    parse_time = time.time()
    write_out(parsed_rows)
    try:
        next_page()
    except Exception:
        # Only recover from ordinary errors (e.g. a failed click); a bare
        # except would also swallow KeyboardInterrupt and block stopping the run.
        print("Failed to click next page, retrying")
        driver.execute_script("window.scrollTo(0, document.body.scrollHeight);")
        time.sleep(5)
        next_page()

    wait = WebDriverWait(driver, 10)
    wait.until(EC.staleness_of(rows[0]))

    end = time.time()
    # Note: the "Write" figure also includes the page turn and the wait above.
    print(f"Pg{pg:03} | Expand: {expand_time - start:.2f} | Parse: {parse_time - expand_time:.2f} | Write: {end - parse_time:.2f}")


Pg716 | Expand: 0.06 | Parse: 13.86 | Write: 1.78
Pg717 | Expand: 24.06 | Parse: 14.48 | Write: 1.81
Pg718 | Expand: 23.02 | Parse: 13.65 | Write: 1.90
Pg719 | Expand: 23.38 | Parse: 13.84 | Write: 1.77
Pg720 | Expand: 21.59 | Parse: 12.53 | Write: 2.29
Pg721 | Expand: 24.05 | Parse: 13.66 | Write: 1.79
Pg722 | Expand: 51.71 | Parse: 12.65 | Write: 1.73
Pg723 | Expand: 47.92 | Parse: 13.54 | Write: 2.83
Pg724 | Expand: 37.14 | Parse: 13.88 | Write: 1.73
Pg725 | Expand: 34.63 | Parse: 12.65 | Write: 1.77


KeyboardInterrupt: 

In [None]:
# Process the page currently shown in the browser: expand its rows, parse
# them and append them to the output file. The scraping loop above stops as
# soon as reached_end() is true, without parsing that page, so this cell is
# presumably meant to be run once for the final page.
expand_rows()
parsed_rows = parse()
write_out(parsed_rows)

In [None]:
# Manually advance the table by one page (presumably a helper for stepping
# through pages by hand or resuming after an interrupted run).
next_page()

In [15]:
import pandas as pd

# Load the scraped rows back for inspection; read_table defaults to a tab separator.
df = pd.read_table('./data/main-scraped.tsv')
df

Unnamed: 0,company,location,date,level_name,tag,yoe,yoe_total,tc,base,stock,bonus,title,work_arrangement,education,ethnicity,gender,other,track
0,Twitch,"San Francisco, CA",05/04/2022,L4,(hidden),0,0,"$213,360",148K,30K,35K,Software Engineer,Office,(hidden),(hidden),(hidden),,Software Engineer
1,Atlassian,"Bangalore, KA, India",05/04/2022,P4,Distributed Systems (Back-End),2,7,"$86,639",53K,29K,5.3K,Software Engineer,Remote,,,,,Software Engineer
2,Dell,"Bangalore, KA, India",05/04/2022,Software Engineer II,Embedded,3,3,"$21,094",19K,919,919,Software Engineer 2,Hybrid,,,,,Software Engineer
3,Walmart Global Tech,"Sunnyvale, CA",05/04/2022,Senior Software Engineer,(hidden),0,2,"$213,875",145K,30K,39K,Senior Software Engineer,Office,(hidden),(hidden),(hidden),I think they considered my PhD experience in t...,Software Engineer
4,Carvana,"Atlanta, GA",05/04/2022,II,Web Development (Front-End),1,4,"$130,000",130K,,,Software Engineer Ii,Remote,Bachelors degree,White,Female,,Software Engineer
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
542,Pegasystems,"Hyderabad, TS, India",04/30/2022,(hidden),Distributed Systems (Back-End),2-4,2-4,"$26,449",24K,,2.4K,Senior Software Engineer,Office,,,,,Software Engineer
543,Amazon,"Arlington, VA",04/30/2022,SDE II,Web Development (Front-End),1,8,"$278,000",185K,50K,43K,Frontend Software Engineer,Remote,,,,,Software Engineer
544,Facebook,"Menlo Park, CA",04/30/2022,L3,Distributed Systems (Back-End),1,1,"$161,000",121K,40K,,Software Engineer,Office,,,Female,,Software Engineer
545,Red Hat,"Pune, MH, India",04/30/2022,L2,Web Development (Front-End),2,2,"$9,983",10K,,,Associate Software Engineer,Office,Bachelors degree,Asian,Female,,Software Engineer
