In [4]:
import pandas as pd
import json
import numpy as np

In [5]:
# Load the scraped, tab-separated salary table and show it.
df = pd.read_csv('./data/scraped.tsv', delimiter='\t')
df

Unnamed: 0,Company,Location,Date,Level,Title,YOE,TC,Base,Stock,Bonus,Details,Gender
0,Veracode,"Burlington, MA",1/30/2022,L1,API Development (Back-End),0 / 1,"$120,000",120k,,,"Remote, Title: Software Engineer, Race: White,...",Gender: Male
1,IBM,"Rochester, MN",1/30/2022,Staff Engineer,Distributed Systems (Back-End),0 / 0,"$112,000",112k,,,"$15,000 sign-on bonus, Title: Software Develop...",Gender: Male
2,Philips,"Cambridge, MA",1/30/2022,L4,Data,2 / 3,"$122,000",111k,,11k,"10K Relocation Bonus, Title: Cloud Software De...",Gender: Male
3,Teleport,"Oakland, CA",1/30/2022,L3,Site Reliability (SRE),1 / 8,"$180,000",180k,,,"Title: Sre, Race: White, Academic Level: Bache...",Gender: Male
4,IBM,"Rochester, MN",1/30/2022,Advisory Engineer,API Development (Back-End),7 / 7,"$128,000",128k,,,"Title: Advisory Software Developer, Race: Whit...",Gender: Male
...,...,...,...,...,...,...,...,...,...,...,...,...
61218,Microsoft,"Seattle, WA",6/21/2017,63,8.5 / 8.5,"$208,000",,,,,,
61219,Amazon,"Seattle, WA",6/20/2017,L5,3 / 3,"$190,000",,,,,,
61220,Microsoft,"Mountain View, CA",6/20/2017,60,3 / 5,"$157,000",,,,,,
61221,Amazon,"Vancouver, BC, Canada",6/16/2017,L5,1 / 11,"$173,000",,,,,,


In [6]:
# Fix rows that are misaligned. In the affected rows shown in the table
# preview, the "a / b" experience string sits under Title and the "$..." total
# compensation sits under YOE, while Level still holds a level-like value
# (e.g. 63, L5). So only Title and the columns to its right are off by one.
# na=False keeps rows with a missing YOE out of the mask instead of raising.
misaligned = df[df['YOE'].str.startswith('$', na=False)]

# Columns left of Title are already correct and must not be moved; shifting
# them too would push Level into Title and blank Level.
first_shifted = df.columns.get_loc('Title')
kept_cols = df.columns[:first_shifted]
shifted_cols = df.columns[first_shifted:]

# DataFrame.update skips NaN cells in the frame it is given, so the cells
# vacated by the shift are filled with '' to make sure they overwrite the
# stale values in df.
misaligned_corrected = pd.concat([
    misaligned[kept_cols],
    misaligned[shifted_cols].shift(axis=1),
], axis=1).fillna('')

df.update(misaligned_corrected)

In [7]:
# Strip the literal "Gender: " label so only the value remains. regex=False
# states the literal match explicitly instead of relying on the default of
# str.replace, which is not the same across pandas major versions.
df['Gender'] = df['Gender'].str.replace('Gender: ', '', regex=False)

In [8]:
# Convert the Date strings into pandas datetimes.
df['Date'] = df['Date'].pipe(pd.to_datetime)

In [9]:
# Split the "a / b" YOE strings on '/' into two float columns.
# expand=True returns a frame that keeps df's row index, so the values are
# assigned row-for-row even if df does not have a default 0..n-1 index, and a
# missing YOE simply yields NaN in both new columns.
yoe_parts = df['YOE'].str.split('/', expand=True).astype(float)
df[['yrs_at_company', 'yoe_total']] = yoe_parts
df['yrs_at_company']

0        0.0
1        0.0
2        2.0
3        1.0
4        7.0
        ... 
61218    8.5
61219    3.0
61220    3.0
61221    1.0
61222    3.0
Name: yrs_at_company, Length: 61223, dtype: float64

In [10]:
# Remove the raw YOE column from the frame.
df = df.drop(columns=['YOE'])

In [28]:
# Build the 'other' column from Details in three steps:
#   1. delete commas that sit directly between two digits (commas in numbers),
#   2. turn the "--" placeholder into NaN,
#   3. split what is left on commas into a list of items.
df['other'] = (
    df['Details']
    .str.replace(r'(\d),(\d)', '\\1\\2', regex=True)
    .replace('--', np.nan)
    .str.split(',')
)



In [47]:
from collections import defaultdict

categories = defaultdict(int)

# Tally every "label: value" item found in the split details. The key is the
# text before the first colon, trimmed and lower-cased. Rows that are not
# lists (e.g. NaN) are skipped.
for entry in df['other'].tolist():
    if not isinstance(entry, list):
        continue
    for piece in entry:
        label, colon, _ = piece.partition(':')
        if colon:
            categories[label.strip().lower()] += 1

In [48]:
# Print only the labels that occur more than 10 times.
for label, count in categories.items():
    if count <= 10:
        continue
    print(label, count)

title 36502
race 26936
academic level 30227
negotiated amount 224
sign on 21
sign-on bonus 13
relocation 13
sign on bonus 16
tag 21
signing bonus 45


In [16]:
# Count rows whose Details text mentions "bachelor", ignoring case.
(
    df['Details']
    .str.lower()
    .str.contains('bachelor')
    .sum()
)

16549

In [49]:
# Count of Details entries containing "bachelor" (case-insensitive).
(
    df['Details']
    .str.lower()
    .str.contains('bachelor')
    .sum()
)

2228

In [17]:
# Count rows whose Details text mentions "master", ignoring case.
(
    df['Details']
    .str.lower()
    .str.contains('master')
    .sum()
)

14394

In [18]:
# Count rows whose Details text mentions "phd", ignoring case.
(
    df['Details']
    .str.lower()
    .str.contains('phd')
    .sum()
)

1357

In [51]:
# Count rows whose Details text mentions "doctor", ignoring case.
(
    df['Details']
    .str.lower()
    .str.contains('doctor')
    .sum()
)

1005

In [15]:
# Count rows whose Details text mentions "academic", ignoring case.
(
    df['Details']
    .str.lower()
    .str.contains('academic')
    .sum()
)

30226

In [20]:
# Look at 20 Details values drawn with a fixed seed so the sample is repeatable.
df['Details'].sample(n=20, random_state=1)

32806                                                  NaN
49460                                                   --
54634                                                   --
57398                                                   --
24795    Title: Core Engineering Software Developer, Ra...
47923                                                   --
7534     Title: Associate Software Engineer, Race: Asia...
45459                                                   --
26046    Title: Senior Software Engineer, Race: Asian, ...
14403                 Title: Software Development Engineer
53644                                                  NaN
7408        Remote, Title: Principal Engineer, Race: White
45480                                                   --
41094                                                  NaN
36799    Title: Software Engineer Ii, Race: White, Acad...
7034     Title: Software Development Engineer, Race: As...
50413                                                   

# Old: earlier exploration of `data/salaryData.json`

In [None]:
# Load the salary records from JSON into a DataFrame and show it.
# The file is opened as UTF-8 explicitly so decoding does not depend on the
# platform's default text encoding.
with open('data/salaryData.json', encoding='utf-8') as f:
    data = json.load(f)
df = pd.DataFrame(data)
df

In [20]:
# Convert the timestamp column into pandas datetimes.
df['timestamp'] = df['timestamp'].pipe(pd.to_datetime)

Unnamed: 0,Company,Location,Date,Level,Title,YOE,TC,Base,Stock,Bonus,Details,Gender
0,Veracode,"Burlington, MA",2022-01-30,L1,API Development (Back-End),0 / 1,"$120,000",120k,,,"Remote, Title: Software Engineer, Race: White,...",Gender: Male
1,IBM,"Rochester, MN",2022-01-30,Staff Engineer,Distributed Systems (Back-End),0 / 0,"$112,000",112k,,,"$15,000 sign-on bonus, Title: Software Develop...",Gender: Male
2,Philips,"Cambridge, MA",2022-01-30,L4,Data,2 / 3,"$122,000",111k,,11k,"10K Relocation Bonus, Title: Cloud Software De...",Gender: Male
3,Teleport,"Oakland, CA",2022-01-30,L3,Site Reliability (SRE),1 / 8,"$180,000",180k,,,"Title: Sre, Race: White, Academic Level: Bache...",Gender: Male
4,IBM,"Rochester, MN",2022-01-30,Advisory Engineer,API Development (Back-End),7 / 7,"$128,000",128k,,,"Title: Advisory Software Developer, Race: Whit...",Gender: Male
...,...,...,...,...,...,...,...,...,...,...,...,...
46795,Workday,"Pleasanton, CA",2020-03-11,SWE II,Android,3 / 3,"$145,000",110k,25k,10k,,
46796,Qualcomm,"Boulder, CO",2020-03-11,Engineer,Networking,0 / 0,"$105,000",90k,15k,,Masters,Gender: Male
46797,Guidewire,"San Mateo, CA",2020-03-11,Senior,API Development (Back-End),5 / 13,"$197,000",161k,20k,16k,Masters,Gender: Male
46798,Microsoft,"Redmond, WA",2020-03-11,59,ML / AI,0 / 0,"$160,000",110k,35k,15k,"Joining Bonus of $50,000 to be paid over 2 yea...",Gender: Male


In [5]:
# Last 50 rows where the company is Amazon and the title is Software Engineer.
df.loc[
    df['company'].eq('Amazon') & df['title'].eq("Software Engineer")
].tail(50)

Unnamed: 0,timestamp,company,level,title,totalyearlycompensation,location,yearsofexperience,yearsatcompany,tag,basesalary,stockgrantvalue,bonus,gender,otherdetails,cityid,dmaid,rowNumber
61781,2021-08-10 15:30:22,Amazon,L5,Software Engineer,232,"Newark, NJ",3,3,Full Stack,176,53,3,Male,mcloiapt,1288,501,82641
61796,2021-08-10 16:41:53,Amazon,L4,Software Engineer,166,"Seattle, WA",6,1,Distributed Systems (Back-End),99,45,22,Male,jeiady traiall schmaol schriouw knapp mccleos ...,11527,819,82657
61854,2021-08-11 03:38:38,Amazon,L5,Software Engineer,220,"Sydney, NS, Australia",16,0,Distributed Systems (Back-End),160,29,31,,klesp sqaiantly mcuops kwaum,1311,0,82738
61887,2021-08-11 09:22:30,Amazon,SDE I,Software Engineer,176,"Seattle, WA",1,1,Full Stack,127,41,8,Male,nips schlioc shaay spraorry typehn smaiart hyd...,11527,819,82780
61904,2021-08-11 10:47:17,Amazon,L5,Software Engineer,238,"San Diego, CA",3,0,Security,167,48,23,Female,khiough pfaiv schneent xaemp yiady pholly phearth,7416,825,82801
61907,2021-08-11 12:05:32,Amazon,L5,Software Engineer,242,"Seattle, WA",4,4,Distributed Systems (Back-End),127,115,0,,cleency scraesk shraiacy,11527,819,82805
61908,2021-08-11 12:16:06,Amazon,L6,Software Engineer,270,"Seattle, WA",11,9,Full Stack,89,175,6,Male,rhaor briit thruc tseind mcmoary khoosch grerd...,11527,819,82807
61965,2021-08-11 19:00:13,Amazon,L4,Software Engineer,193,"Seattle, WA",0,0,Web Development (Front-End),162,31,0,Male,leiry mcdeulls scriids stiecy synin,11527,819,82886
61973,2021-08-11 20:52:38,Amazon,L5,Software Engineer,283,"Irvine, CA",2,2,Testing (SDET),194,74,15,Male,schraeps squiss mckaiox feiamp dynoialy sproil...,7229,803,82896
61977,2021-08-11 21:37:59,Amazon,L4,Software Engineer,166,"Seattle, WA",0,0,Full Stack,125,11,30,,skoiaf hypunt pluly juopt rhord hraiows feibs ...,11527,819,82902


In [11]:
# Frequency of each distinct value in the gender column, most common first.
df['gender'].value_counts(sort=True, ascending=False, dropna=True)

Male                               35702
                                   19540
Female                              6999
Other                                400
Title: Senior Software Engineer        1
Name: gender, dtype: int64

In [13]:
# Most recent timestamp in the data.
df['timestamp'].max(skipna=True)

Timestamp('2021-08-17 08:28:57')

In [4]:
# Treat empty strings as missing values, then count the missing entries in
# each column.
df = df.replace(to_replace='', value=np.nan)
df.isnull().sum()

timestamp                      0
company                        0
level                          0
title                          0
totalyearlycompensation        0
location                       0
yearsofexperience              0
yearsatcompany                 0
tag                          786
basesalary                     0
stockgrantvalue                0
bonus                          0
gender                     19540
otherdetails               22503
cityid                         0
dmaid                          2
rowNumber                      0
dtype: int64