In [2]:
import json
from collections import defaultdict
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns

In [3]:
# Load the processed, tab-separated dataset into the working frame.
df = pd.read_csv(
    'data/processed.tsv',
    delimiter='\t',
)

In [4]:
# Keep only rows with at most 10 total years of experience whose degree is
# not 'phd'. Rows with a missing `yoe_total` fail the comparison and are
# dropped; rows with a missing `degree` are kept.
keep = (df['yoe_total'] <= 10) & (df['degree'] != 'phd')

# Take an explicit copy: later cells assign columns on `df`, and doing that
# on a boolean-filtered slice triggers pandas' SettingWithCopyWarning.
df = df[keep].copy()

In [5]:
# Capitalise the 'bachelor' and 'master' labels for display; any other value
# in the column is left untouched.
df['degree'] = df['degree'].replace(
    to_replace=['bachelor', 'master'],
    value=['Bachelor', 'Master'],
)

In [6]:
from plotly.subplots import make_subplots

In [9]:
# Mean, median and standard deviation of total compensation (`tc`) for every
# (degree, years-of-experience) pair. Named aggregation yields the flat
# `tc_mean` / `tc_median` / `tc_std` columns directly.
grped = df.groupby(['degree', 'yoe_total'])['tc'].agg(
    tc_mean='mean',
    tc_median='median',
    tc_std='std',
)
# One-standard-deviation band around the mean. Not drawn in the figure below;
# kept on the frame for inspection.
grped['lower'] = grped['tc_mean'] - grped['tc_std']
grped['upper'] = grped['tc_mean'] + grped['tc_std']

# The grouping keys live in the row index, so they are handed to plotly as
# arrays; the "x" and "color" entries in `labels` supply their display names.
fig = px.line(
    grped,
    x=grped.index.get_level_values('yoe_total'),
    y='tc_mean',
    color=grped.index.get_level_values('degree'),
    labels={
        "x": "Years of Experience",
        "tc_mean": "Total Compensation ($)",
        "color": "Degree Level"
    },
    title="Compensation Over Time",
)
fig.update_layout(title_x=0.5)  # centre the title horizontally
fig.update_yaxes(range=[110000, 390000])  # same fixed y-range as fig2

# Make sure the output directory exists before exporting; writing into a
# missing directory would otherwise fail.
Path('./vis').mkdir(parents=True, exist_ok=True)
fig.write_image('./vis/fig1.png')

In [10]:
# Same analysis as the all-locations figure, restricted to rows whose
# `location` is exactly 'San Francisco, CA'.
temp_df = df[df['location'] == 'San Francisco, CA']

# Mean, median and standard deviation of total compensation (`tc`) for every
# (degree, years-of-experience) pair. Named aggregation yields the flat
# `tc_mean` / `tc_median` / `tc_std` columns directly.
grped = temp_df.groupby(['degree', 'yoe_total'])['tc'].agg(
    tc_mean='mean',
    tc_median='median',
    tc_std='std',
)
# One-standard-deviation band around the mean. Not drawn in the figure below;
# kept on the frame for inspection.
grped['lower'] = grped['tc_mean'] - grped['tc_std']
grped['upper'] = grped['tc_mean'] + grped['tc_std']

# The grouping keys live in the row index, so they are handed to plotly as
# arrays; the "x" and "color" entries in `labels` supply their display names.
fig = px.line(
    grped,
    x=grped.index.get_level_values('yoe_total'),
    y='tc_mean',
    color=grped.index.get_level_values('degree'),
    labels={
        "x": "Years of Experience",
        "tc_mean": "Total Compensation ($)",
        "color": "Degree Level"
    },
    title="Compensation Over Time (San Francisco only)",
)
fig.update_layout(title_x=0.5)  # centre the title horizontally
fig.update_yaxes(range=[110000, 390000])  # same fixed y-range as fig1

# Make sure the output directory exists before exporting; writing into a
# missing directory would otherwise fail.
Path('./vis').mkdir(parents=True, exist_ok=True)
fig.write_image('./vis/fig2.png')