In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats

# Load the salary dataset from the CSV file into a DataFrame
with open('ds_salaries.csv', mode='r') as csv_handle:
    salary = pd.read_csv(csv_handle)

The Job Salary visualization suggests that there are three drops in salaries:
 - 2020 -> 2021 EN
 - 2020 -> 2021 SE
 - 2021 -> 2022 EX

To check whether these drops are statistically significant, I'm going to run a two-sample t-test on each pair of years.

In [34]:
# Summary statistics of salary_in_usd per work_year, one table per experience level.
# exp_group maps each experience_level value to a DataFrame indexed by work_year
# with the columns 'mean', 'std' and 'count'.
# A single .agg() call computes all three statistics in one groupby pass and
# names the columns after the aggregations, so no manual renaming or concat is
# needed and no loop temporaries are left behind in the kernel namespace.
exp_group = {
    level: level_frame.groupby('work_year')['salary_in_usd'].agg(['mean', 'std', 'count'])
    for level, level_frame in salary.groupby('experience_level')
}

In [36]:
# Bind each experience level's summary table (looked up by its key in exp_group)
# to a variable of the same name.
EN, MI, SE, EX = (exp_group[level] for level in ('EN', 'MI', 'SE', 'EX'))

In [52]:
# EN: 2020 vs 2021
# Independent two-sample t-test computed from each year's summary statistics
# (mean, std, count) instead of from the raw salaries.
# The first value returned is the t statistic, so it is named t_stat.
# NOTE(review): equal_var is left at its default (True, i.e. pooled-variance
# Student's t-test); consider equal_var=False (Welch) if the two years'
# variances differ noticeably.
first_year, second_year = 2020, 2021
t_stat, p_value = stats.ttest_ind_from_stats(
    mean1=EN.loc[first_year, 'mean'],
    std1=EN.loc[first_year, 'std'],
    nobs1=EN.loc[first_year, 'count'],
    mean2=EN.loc[second_year, 'mean'],
    std2=EN.loc[second_year, 'std'],
    nobs2=EN.loc[second_year, 'count'],
)
print(f'T-stat: {t_stat}, p-value: {p_value}')

T-stat: 0.3689691325215797, p-value: 0.7133498969708484


The p-value (≈ 0.71) is far above 0.05, so the drop is not statistically significant.

In [53]:
# SE: 2020 vs 2021
# Independent two-sample t-test from each year's summary statistics
# (mean, std, count) in the SE table.
first_year, second_year = 2020, 2021
t_stat, p_value = stats.ttest_ind_from_stats(
    mean1=SE.loc[first_year, 'mean'],
    std1=SE.loc[first_year, 'std'],
    nobs1=SE.loc[first_year, 'count'],
    mean2=SE.loc[second_year, 'mean'],
    std2=SE.loc[second_year, 'std'],
    nobs2=SE.loc[second_year, 'count'],
)
print(f'T-stat: {t_stat}, p-value: {p_value}')

T-stat: 0.5877998883264128, p-value: 0.5582250317216748


Again, the drop is not statistically significant (p ≈ 0.56).

In [54]:
# EX: 2021 vs 2022
# Independent two-sample t-test from each year's summary statistics
# (mean, std, count) in the EX table.
first_year, second_year = 2021, 2022
t_stat, p_value = stats.ttest_ind_from_stats(
    mean1=EX.loc[first_year, 'mean'],
    std1=EX.loc[first_year, 'std'],
    nobs1=EX.loc[first_year, 'count'],
    mean2=EX.loc[second_year, 'mean'],
    std2=EX.loc[second_year, 'std'],
    nobs2=EX.loc[second_year, 'count'],
)
print(f'T-stat: {t_stat}, p-value: {p_value}')

T-stat: 0.9493969946202515, p-value: 0.3527324044921114


In summary, none of these drops is statistically significant.

Next, in France, fully remote (100%) workers seemingly earn almost twice as much as the rest.

In [55]:
# Rows whose company is located in France, split into fully remote
# (remote_ratio == 100) and everyone else, then compared with a two-sample t-test.
france_data = salary[salary['company_location'] == 'FR']
is_fully_remote = france_data['remote_ratio'] == 100
remotes = france_data[is_fully_remote]
non_remotes = france_data[~is_fully_remote]
t_stat, p_value = stats.ttest_ind(
    remotes['salary_in_usd'],
    non_remotes['salary_in_usd'],
)
print(f'T-stat: {t_stat}, p-value: {p_value}')

T-stat: 3.1058144724551284, p-value: 0.00835275638773842


The p-value is very small, so we reject H0 and conclude that remote workers earn more than non-remote workers. However, I doubt this result, because there are only 2 fully remote workers from France in the dataset.

I also expect that people working at a large company (size L) earn more.

In [56]:
# Split all rows into large companies (company_size == "L") and the rest,
# then compare their salaries with a two-sample t-test.
is_large_company = salary['company_size'] == 'L'
L_data = salary[is_large_company]
non_L_data = salary[~is_large_company]
t_stat, p_value = stats.ttest_ind(
    L_data['salary_in_usd'],
    non_L_data['salary_in_usd'],
)
print(f'T-stat: {t_stat}, p-value: {p_value}')

T-stat: 1.680354798353194, p-value: 0.09340450658293355


However, this does not appear to be the case: with p ≈ 0.093, the difference is not significant at the 0.05 level.