In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict

In [2]:
# Load the scraped table, then turn both placeholder spellings of
# "no value" (the empty string and '--') into NaN.
df = (
    pd.read_csv('./data/scraped.tsv', sep='\t')
    .replace('', np.nan)
    .replace('--', np.nan)
)

In [3]:
# Fix rows that are out of column alignment. They are recognised by a YOE
# cell that starts with '$'.
# na=False: for a missing YOE, .str.startswith returns NaN, and a boolean mask
# that contains NaN cannot be used to index the frame.
misaligned = df[df['YOE'].str.startswith('$', na=False)]

# Company, Location and Date stay where they are; every other column is moved
# one position to the right.
# DataFrame.update() ignores NaN in the frame it copies from, so vacated cells
# are filled with '' to make sure they overwrite the stale values in df
# (a later cell converts '' back to NaN).
misaligned_corrected = pd.concat([
    misaligned[['Company', 'Location', 'Date']],
    misaligned.drop(['Company', 'Location', 'Date'], axis=1).shift(axis=1)
], axis=1).fillna('')

df.update(misaligned_corrected)

In [4]:
# Rows whose Base cell is longer than 5 characters: move the five columns
# listed below three places to the right (Base -> Details, Stock -> Gender).
misaligned_cols = ['Base', 'Stock', 'Bonus', 'Details', 'Gender']
misaligned = df.loc[df['Base'].str.len().gt(5)]

# Vacated cells become '' because DataFrame.update() skips NaN values.
misaligned_corrected = pd.concat(
    [
        misaligned.drop(columns=misaligned_cols),
        misaligned[misaligned_cols].shift(periods=3, axis='columns'),
    ],
    axis='columns',
).fillna('')

df.update(misaligned_corrected)

In [5]:
# Rows whose Base cell holds text that does not start with a digit: move the
# five columns listed below three places to the right
# (Base -> Details, Stock -> Gender).
#
# * The pattern is a raw string; '\d' in a normal string literal is an
#   invalid escape sequence.
# * na=True keeps rows with a missing Base out of the selection.
# * Base cells that are already '' are skipped. The earlier alignment fixes
#   write '' into vacated cells, and '' does not start with a digit, so those
#   rows would otherwise be shifted a second time and have the Details and
#   Gender values they just received overwritten with blanks.
misaligned = df[
    ~df['Base'].str.contains(r'^\d', na=True)
    & df['Base'].ne('')
]
misaligned_cols = ['Base', 'Stock', 'Bonus', 'Details', 'Gender']

# Vacated cells become '' because DataFrame.update() skips NaN values.
misaligned_corrected = pd.concat([
    misaligned.drop(misaligned_cols, axis=1),
    misaligned[misaligned_cols].shift(periods=3, axis=1)
], axis=1).fillna('')

df.update(misaligned_corrected)

In [6]:
# Rows whose Stock cell contains the text 'Gender': move the four columns
# listed below three places to the right, which puts the Stock value into
# the Gender column.
misaligned_cols = ['Stock', 'Bonus', 'Details', 'Gender']
misaligned = df.loc[df['Stock'].str.contains('Gender', na=False)]

# Vacated cells become '' because DataFrame.update() skips NaN values.
misaligned_corrected = pd.concat(
    [
        misaligned.drop(columns=misaligned_cols),
        misaligned[misaligned_cols].shift(periods=3, axis='columns'),
    ],
    axis='columns',
).fillna('')

df.update(misaligned_corrected)

In [7]:
# The alignment fixes in the previous cells used '' as a stand-in for missing
# values (DataFrame.update() skips NaN); turn those back into NaN.
df.replace('', np.nan, inplace=True)

# regex=False: 'Gender: ' is a literal prefix, not a pattern, and the default
# value of `regex` is not the same in every pandas version.
df['Gender'] = df['Gender'].str.replace('Gender: ', '', regex=False)
df['Date'] = pd.to_datetime(df['Date'])

# YOE holds two '/'-separated numbers; the first goes to yrs_at_company and
# the second to yoe_total.
# expand=True keeps the frame's own index and yields NaN for a missing YOE.
# Rebuilding a DataFrame from .tolist() would instead discard the index and
# fail as soon as one YOE value is NaN.
# n=1 splits on the first '/' only, so a value with extra slashes raises in
# astype(float) instead of being silently truncated.
yoe_parts = (
    df['YOE'].str.split('/', n=1, expand=True)
    .reindex(columns=[0, 1])
    .astype(float)
)
df['yrs_at_company'] = yoe_parts[0]
df['yoe_total'] = yoe_parts[1]
df = df.drop(['YOE'], axis=1)

In [8]:
def extract_degree(x):
    """Classify a free-text details string as 'phd', 'master' or 'bachelor'.

    The search is a case-insensitive substring match. When several keywords
    are present the first hit in the order phd/doctor, master, bachelor wins.

    Returns NaN when ``x`` is missing or contains none of the keywords.
    """
    if pd.isna(x):
        return np.nan

    text = x.lower()
    # (keyword to look for, label to return), in priority order.
    keyword_to_degree = (
        ('phd', 'phd'),
        ('doctor', 'phd'),
        ('master', 'master'),
        ('bachelor', 'bachelor'),
    )
    return next(
        (degree for keyword, degree in keyword_to_degree if keyword in text),
        np.nan,
    )

# Derive the degree label for every row from its Details text.
df['degree'] = df['Details'].map(extract_degree)

In [9]:
# True where the lower-cased Details text contains 'remote'; rows without
# Details stay NaN.
df['remote'] = df['Details'].str.lower().str.contains('remote', regex=False)

In [10]:
# Build a temporary list-valued column from Details:
#   1. remove commas that sit between two digits, so they are not treated as
#      separators in step 3,
#   2. treat a bare '--' as missing,
#   3. split the remaining text on commas.
df['other'] = (
    df['Details']
    .str.replace(r'(\d),(\d)', '\\1\\2', regex=True)
    .replace('--', np.nan)
    .str.split(',')
)

In [11]:
# Count how often each value appears after 'Race:' in the split Details items.
racial_categories = defaultdict(int)

# Rows without details are NaN rather than a list and are skipped.
race_items = (
    item
    for row in df['other'].tolist()
    if isinstance(row, list)
    for item in row
    if 'Race:' in item
)
for item in race_items:
    # The category is the text after the last ':' of the item.
    racial_categories[item.rsplit(':', 1)[-1].strip().lower()] += 1
racial_categories

defaultdict(int,
            {'white': 9322,
             'asian': 14550,
             'hispanic / latino': 1369,
             'black or african american': 692,
             'two or more races': 877,
             'american indian or alaska native': 81,
             'native hawaiian or other pacific islander': 44,
             'hispanic': 1})

In [12]:
# The list-valued helper column was only needed for the tally above.
df = df.drop(columns='other')

In [13]:
def extract_racial_category(x):
    """Return the lower-cased value of the ``Race:`` entry of a details string.

    The details text is treated as a comma-separated list of ``Key: Value``
    items. Only the item whose key is ``race`` (case-insensitive) is read, so
    category words that appear elsewhere in the text (for example inside a job
    title, or 'asian' inside 'caucasian') cannot produce a false match, and the
    result does not depend on the iteration order of a category lookup.

    Parameters
    ----------
    x : str or missing value
        Free-text details of one row.

    Returns
    -------
    str or float
        The stripped, lower-cased race value, or NaN when ``x`` is missing or
        has no non-empty ``Race:`` entry.
    """
    if pd.isna(x):
        return np.nan

    for item in x.split(','):
        key, separator, value = item.partition(':')
        if separator and key.strip().lower() == 'race':
            category = value.strip().lower()
            if category:
                return category
    return np.nan

# Derive the race label for every row from its Details text.
df['race'] = df['Details'].map(extract_racial_category)

In [14]:
def process_num(x):
    """Convert an amount such as ``'120k'`` or ``'1.5m'`` to a number.

    Parameters
    ----------
    x : str, int, float, NumPy number or missing value
        Strings are matched case-insensitively. A string containing ``k`` is
        multiplied by 1,000 and one containing ``m`` by 1,000,000; a bare
        ``'k'`` / ``'m'`` stands for exactly one thousand / one million.
        Any other string is parsed with ``float``.

    Returns
    -------
    float or the input itself
        NaN for missing input, the unchanged value for numeric input,
        otherwise the parsed float.

    Raises
    ------
    ValueError
        If the string (after removing the suffix) is not a valid float.
    """
    if pd.isna(x):
        return np.nan
    # isinstance rather than an exact type comparison, so that NumPy scalars
    # (np.int64, np.float64, ...) are passed through as well instead of
    # reaching the string branch and failing on .lower().
    if isinstance(x, (int, float, np.number)):
        return x

    x = x.lower()
    # 'k' is checked before 'm', matching the order of the original branches.
    for suffix, multiplier in (('k', 1000), ('m', 1000000)):
        if suffix in x:
            if len(x) > 1:
                return float(x.replace(suffix, '')) * multiplier
            return float(multiplier)
    return float(x)

In [15]:
# Run each of the three pay component columns through process_num.
for pay_col in ['Base', 'Stock', 'Bonus']:
    df[pay_col] = df[pay_col].apply(process_num)

In [16]:
# Strip the literal '$' and ',' characters from TC, then cast to float.
df['TC'] = (
    df['TC']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .astype(float)
)

In [19]:
# Normalise every column name to lower case.
df.columns = list(map(str.lower, df.columns))

In [21]:
# Write the cleaned frame as a tab-separated file, without the index column.
df.to_csv('./data/processed.tsv', sep='\t', index=False)

In [23]:
# Show a subset of the processed columns.
df.loc[
    :,
    ['company', 'location', 'date', 'tc', 'gender', 'yoe_total', 'degree', 'race'],
]

Unnamed: 0,company,location,date,tc,gender,yoe_total,degree,race
0,Veracode,"Burlington, MA",2022-01-30,120000.0,Male,1.0,bachelor,white
1,IBM,"Rochester, MN",2022-01-30,112000.0,Male,0.0,master,white
2,Philips,"Cambridge, MA",2022-01-30,122000.0,Male,3.0,master,asian
3,Teleport,"Oakland, CA",2022-01-30,180000.0,Male,8.0,bachelor,white
4,IBM,"Rochester, MN",2022-01-30,128000.0,Male,7.0,bachelor,white
...,...,...,...,...,...,...,...,...
61218,Microsoft,"Seattle, WA",2017-06-21,208000.0,,8.5,,
61219,Amazon,"Seattle, WA",2017-06-20,190000.0,,3.0,,
61220,Microsoft,"Mountain View, CA",2017-06-20,157000.0,,5.0,,
61221,Amazon,"Vancouver, BC, Canada",2017-06-16,173000.0,,11.0,,


In [33]:
import plotly.express as px

In [36]:
# Display the processed frame with all of its columns.
df

Unnamed: 0,company,location,date,level,title,tc,base,stock,bonus,details,gender,yrs_at_company,yoe_total,degree,remote,race
0,Veracode,"Burlington, MA",2022-01-30,L1,API Development (Back-End),120000.0,120000.0,,,"Remote, Title: Software Engineer, Race: White,...",Male,0.0,1.0,bachelor,True,white
1,IBM,"Rochester, MN",2022-01-30,Staff Engineer,Distributed Systems (Back-End),112000.0,112000.0,,,"$15,000 sign-on bonus, Title: Software Develop...",Male,0.0,0.0,master,False,white
2,Philips,"Cambridge, MA",2022-01-30,L4,Data,122000.0,111000.0,,11000.0,"10K Relocation Bonus, Title: Cloud Software De...",Male,2.0,3.0,master,False,asian
3,Teleport,"Oakland, CA",2022-01-30,L3,Site Reliability (SRE),180000.0,180000.0,,,"Title: Sre, Race: White, Academic Level: Bache...",Male,1.0,8.0,bachelor,False,white
4,IBM,"Rochester, MN",2022-01-30,Advisory Engineer,API Development (Back-End),128000.0,128000.0,,,"Title: Advisory Software Developer, Race: Whit...",Male,7.0,7.0,bachelor,False,white
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
61218,Microsoft,"Seattle, WA",2017-06-21,,63,208000.0,,,,,,8.5,8.5,,,
61219,Amazon,"Seattle, WA",2017-06-20,,L5,190000.0,,,,,,3.0,3.0,,,
61220,Microsoft,"Mountain View, CA",2017-06-20,,60,157000.0,,,,,,3.0,5.0,,,
61221,Amazon,"Vancouver, BC, Canada",2017-06-16,,L5,173000.0,,,,,,1.0,11.0,,,


In [37]:
# Scatter of tc against yoe_total. A title and readable axis labels are set
# so the figure can be understood without reading the code.
px.scatter(
    df,
    x='yoe_total',
    y='tc',
    title='Total compensation vs. total years of experience',
    labels={
        'yoe_total': 'Total years of experience',
        'tc': 'Total compensation',
    },
)