In [1]:
import pandas as pd
import numpy as np
from collections import defaultdict
import json
import os

In [2]:
# Read the tab-separated scrape, then convert the strings '', '--' and
# '(hidden)' to NaN in a single pass so they are all treated as missing.
df = pd.read_csv('./data/main-scraped.tsv', sep='\t')
df.replace(['', '--', '(hidden)'], np.nan, inplace=True)

In [3]:
def _parse_years(value):
    """Convert one years-of-experience cell to a float.

    Returns NaN for missing cells, for blank text, and for any text that
    contains a dash (for example a bare '-' or a range such as '2-5').
    A '+' (as in '10+') is dropped before conversion.

    Raises:
        ValueError: if the remaining text is not a valid number.
    """
    # Missing cells arrive as float NaN; testing `'-' in value` on them
    # would raise TypeError, so they are handled before any string work.
    if pd.isna(value):
        return np.nan
    # str() also covers cells that are already numeric (e.g. on a re-run).
    text = str(value).strip()
    if not text or '-' in text:
        return np.nan
    return float(text.replace('+', ''))


# Empty strings are already mapped to NaN when the file is loaded, so that
# replacement is not repeated here.
df['date'] = pd.to_datetime(df['date'])
df['yoe'] = df['yoe'].apply(_parse_years).astype(float)
df['yoe_total'] = df['yoe_total'].apply(_parse_years).astype(float)
# "â€™" is what a right single quote (U+2019) looks like when its UTF-8 bytes
# are decoded as Windows-1252; remove it literally (no regex interpretation).
df['education'] = df['education'].str.replace("â€™", "", regex=False)

In [4]:
def extract_degree(row):
    """Return the degree for one row, inferring it from free text if needed.

    Args:
        row: mapping-like object (e.g. a DataFrame row) with the keys
            'education' and 'other'.

    Returns:
        The existing 'education' value when it is not missing. Otherwise a
        label chosen by keyword search in the lower-cased 'other' text
        ('Doctorate (PhD)', 'Masters degree' or 'Bachelors degree'), or NaN
        when 'other' is missing or contains none of the keywords.
    """
    listed = row['education']
    if pd.notna(listed):
        return listed

    free_text = row['other']
    if pd.isna(free_text):
        return np.nan

    free_text = free_text.lower()
    # Checked in order, so the highest degree mentioned wins.
    keyword_groups = (
        (('phd', 'doctor'), 'Doctorate (PhD)'),
        (('master',), 'Masters degree'),
        (('bachelor',), 'Bachelors degree'),
    )
    for keywords, label in keyword_groups:
        if any(word in free_text for word in keywords):
            return label
    return np.nan

In [5]:
# Row-wise pass: keep an existing education value, otherwise infer one from
# the row's 'other' text via extract_degree.
df['education'] = df.apply(func=extract_degree, axis='columns')

In [6]:
def process_num(x):
    """Convert a scraped amount such as '150k', '1.2M' or '$95,000' to a number.

    Args:
        x: a missing value, a number (Python or NumPy scalar), or text.

    Returns:
        NaN for missing or blank input; the value itself when it is already
        numeric; otherwise a float, with a 'k' suffix meaning thousands and
        an 'm' suffix meaning millions. A bare 'k' or 'm' yields 1000.0 or
        1000000.0 respectively.

    Raises:
        ValueError: if the text that remains is not a valid number.
    """
    if pd.isna(x):
        return np.nan
    # isinstance (rather than an exact type() comparison) also accepts NumPy
    # scalars such as np.float64 / np.int64, which have no .lower() method.
    if isinstance(x, (int, float, np.integer, np.floating)):
        return x

    text = str(x).strip().lower()
    # Currency symbols, thousands separators and a trailing '+' carry no
    # numeric information.
    for noise in ('$', ',', '+'):
        text = text.replace(noise, '')
    if not text:
        return np.nan

    # 'k' is tested before 'm', matching the original precedence.
    for suffix, multiplier in (('k', 1000), ('m', 1000000)):
        if suffix in text:
            digits = text.replace(suffix, '')
            if digits:
                return float(digits) * multiplier
            return float(multiplier)
    return float(text)

In [7]:
# Run the base, stock and bonus columns through process_num, one column at
# a time.
for pay_column in ('base', 'stock', 'bonus'):
    df[pay_column] = df[pay_column].apply(process_num)

In [8]:
# Strip the literal '$', ',' and '+' characters from the tc text, then hand
# each cleaned value to process_num.
df['tc'] = (
    df['tc']
    .str.replace('$', '', regex=False)
    .str.replace(',', '', regex=False)
    .str.replace('+', '', regex=False)
    .apply(process_num)
)

# Geocoding

In [9]:
# Load the location lookup from disk when it is available. location_dict is
# always bound (to an empty dict when the file is absent) so that later cells
# reading it do not fail with NameError; every lookup then simply misses.
location_dict = {}
if os.path.exists('./data/locations.json'):
    # Explicit encoding so the read does not depend on the platform default.
    with open('./data/locations.json', 'r', encoding='utf-8') as f:
        location_dict = json.load(f)
else:
    print("./data/locations.json not found; continuing with an empty location lookup")

In [10]:
def get_country(x):
    """Return the country recorded for location string ``x``, or None.

    Looks ``x`` up in the module-level ``location_dict`` and reads
    ``record['address']['country']``.

    Returns None when ``x`` is empty/falsy, when it is not a key of
    ``location_dict``, when its record is empty/None, or when the record has
    no 'address' or no 'country' entry (mirroring how get_state treats a
    missing 'state' instead of raising KeyError).
    """
    if not x:
        return None
    record = location_dict.get(x)
    if not record:
        return None
    return record.get('address', {}).get('country')
# Derive a country for every row from its 'location' value; rows that
# get_country cannot resolve receive None.
df['country'] = df['location'].apply(func=get_country)

def get_state(x):
    """Return the state recorded for location string ``x``, or None.

    Looks ``x`` up in the module-level ``location_dict`` and reads
    ``record['address']['state']``. Returns None when ``x`` is empty/falsy,
    is not a key of ``location_dict``, has an empty/None record, or has an
    address without a 'state' entry.
    """
    if not x:
        return None
    if x not in location_dict:
        return None

    record = location_dict[x]
    if not record:
        return None

    address = record['address']
    return address['state'] if 'state' in address else None
# Derive a state for every row from its 'location' value; rows that
# get_state cannot resolve receive None.
df['state'] = df['location'].apply(func=get_state)

In [14]:
# Write the processed frame as tab-separated text, without the index column.
df.to_csv(path_or_buf='./data/main-processed.tsv', index=False, sep='\t')