In [1]:
import numpy as np
import pandas as pd
from scipy.stats import stats

from script.helper import *

In [2]:
# Load the 2019 survey and keep only respondents from the United States.
df = pd.read_csv('ds_2019.csv')

us = df[df['Country'] == 'United States'].copy()

# Mapping cat groups to integers.
# value_counts() orders labels from most to least frequent, so code 0 is the
# largest group, 1 the second largest, and so on.
looking = us['LookingForAnotherJob'].value_counts().index
looking_map = {v: i for i, v in enumerate(looking)}

# Cleaning and converting salary strings to floats.
# - raw string for the regex: '\s' in a plain literal is an invalid escape
#   sequence and triggers a warning on recent Python versions.
# - assign() builds new columns instead of writing through .loc[:, col],
#   which sets values in place and can leave the column as object dtype.
us_cleaned = us.assign(
    LookingForAnotherJob=us['LookingForAnotherJob'].map(looking_map),
    SalaryUSD=(
        us['SalaryUSD']
        .astype(str)
        .str.replace(r',|\s+', '', regex=True)
        .astype(float)
    ),
)


### Breaking into three groups

In [3]:
# Split salaries by the integer job-search code assigned in looking_map
# (0, 1, 2 follow the frequency order used when the map was built).
job_status = us_cleaned['LookingForAnotherJob']

not_looking_salary = us_cleaned.loc[job_status.eq(0), 'SalaryUSD']
passively_looking_salary = us_cleaned.loc[job_status.eq(1), 'SalaryUSD']
actively_looking_salary = us_cleaned.loc[job_status.eq(2), 'SalaryUSD']

**Removing Outliers**

In [4]:
# Apply the project's outliers_removed helper to each group in turn.
# NOTE(review): the names are rebound to the filtered result, so re-running
# this cell filters the already-filtered data again.
not_looking_salary, passively_looking_salary, actively_looking_salary = map(
    outliers_removed,
    (not_looking_salary, passively_looking_salary, actively_looking_salary),
)

## Testing

**Not Looking mean and std**

In [5]:
# Mean salary of the "not looking" group after outliers_removed() was applied.
not_looking_salary.mean()

104183.50745614035

In [6]:
# Salary spread of the "not looking" group.
# NOTE(review): if outliers_removed() returns a pandas Series, .std() uses ddof=1 — confirm.
not_looking_salary.std()

26936.14009484742

**Passively Looking mean and std**

In [7]:
# Mean salary of the "passively looking" group after outliers_removed() was applied.
passively_looking_salary.mean()

100689.68649249584

In [8]:
# Salary spread of the "passively looking" group.
# NOTE(review): if outliers_removed() returns a pandas Series, .std() uses ddof=1 — confirm.
passively_looking_salary.std()

26691.45942625123

**Actively Looking mean and std**

In [9]:
# Mean salary of the "actively looking" group after outliers_removed() was applied.
actively_looking_salary.mean()

98325.48837209302

In [10]:
# Salary spread of the "actively looking" group.
# NOTE(review): if outliers_removed() returns a pandas Series, .std() uses ddof=1 — confirm.
actively_looking_salary.std()

31640.47331099886

### Welch

Null: **Not Looking** = **Passively Looking**  
Alt: **Not Looking** > **Passively Looking**

In [11]:
# Welch's t-test (equal_var=False): Not Looking vs Passively Looking.
welch_nl_pl = stats.ttest_ind(
    not_looking_salary,
    passively_looking_salary,
    equal_var=False,
)
t, p = welch_nl_pl.statistic, welch_nl_pl.pvalue

In [12]:
# ttest_ind reports a two-sided p-value. Halving it gives the one-sided
# p-value for "first group > second group" only when t is positive; when the
# statistic points the other way the one-sided p-value is 1 - p/2.
p / 2 if t > 0 else 1 - p / 2

1.8197053370711403e-05

**Conclusion:** Reject null

Null: **Passively Looking** = **Actively Looking**  
Alt: **Passively Looking** > **Actively Looking**

In [13]:
# Welch's t-test (equal_var=False): Passively Looking vs Actively Looking.
welch_pl_al = stats.ttest_ind(
    passively_looking_salary,
    actively_looking_salary,
    equal_var=False,
)
t, p = welch_pl_al.statistic, welch_pl_al.pvalue

In [14]:
# ttest_ind reports a two-sided p-value. Halving it gives the one-sided
# p-value for "first group > second group" only when t is positive; when the
# statistic points the other way the one-sided p-value is 1 - p/2.
p / 2 if t > 0 else 1 - p / 2

0.11058740836029142

**Conclusion:** Fail to reject null

Null: **Not Looking** = **Actively Looking**  
Alt: **Not Looking** > **Actively Looking**

In [15]:
# Welch's t-test (equal_var=False): Not Looking vs Actively Looking.
welch_nl_al = stats.ttest_ind(
    not_looking_salary,
    actively_looking_salary,
    equal_var=False,
)
t, p = welch_nl_al.statistic, welch_nl_al.pvalue

In [16]:
# ttest_ind reports a two-sided p-value. Halving it gives the one-sided
# p-value for "first group > second group" only when t is positive; when the
# statistic points the other way the one-sided p-value is 1 - p/2.
p / 2 if t > 0 else 1 - p / 2

0.0011566581751661306

**Conclusion:** Reject null

### Bayesian

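Alpha and beta are the posterior parameters of an inverse-gamma distribution over the variance (normal likelihood with known mean, inverse-gamma prior). The update formulas come from the conjugate-prior table here:  
https://en.wikipedia.org/wiki/Conjugate_prior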

![Alpha Beta](images/alpha_beta.png)

In [18]:
def calc_ab(data, prior_a=1, prior_b=1):
    """Return the posterior inverse-gamma parameters for the variance of ``data``.

    Applies the conjugate update for a normal likelihood whose mean is
    treated as known (the sample mean is plugged in) with an
    inverse-gamma(``prior_a``, ``prior_b``) prior on the variance:

        a = prior_a + n / 2
        b = prior_b + sum((x - mean) ** 2) / 2

    Parameters
    ----------
    data : object exposing ``.mean()``, ``.size`` and elementwise arithmetic
        For example a NumPy array or a pandas Series.
    prior_a : float, default 1
        Shape parameter of the prior.
    prior_b : float, default 1
        Scale parameter of the prior.

    Returns
    -------
    tuple
        ``(a, b)``, the posterior shape and scale.
    """
    mu = data.mean()
    a = prior_a + data.size / 2

    # Sum of squared deviations from the sample mean.
    ssd = ((data - mu) ** 2).sum()
    b = prior_b + ssd / 2

    return a, b

In [20]:
# Calculating alpha and beta for not looking, passively looking, actively looking

(nla, nlb), (pla, plb), (ala, alb) = (
    calc_ab(group)
    for group in (
        not_looking_salary,
        passively_looking_salary,
        actively_looking_salary,
    )
)

Inverse gamma  

https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.invgamma.html

In [21]:
# One frozen inverse-gamma distribution per group, built from its (alpha, beta) pair.
nl_ig, pl_ig, al_ig = (
    stats.invgamma(a=shape, scale=rate)
    for shape, rate in ((nla, nlb), (pla, plb), (ala, alb))
)

In [29]:
# simulation of variance

# Fixed seed and a single draw count so that re-running the notebook gives
# the same posterior probabilities every time.
SEED = 42
N_DRAWS = 10000
# Legacy RandomState is used for compatibility with older SciPy releases.
rng = np.random.RandomState(SEED)

# simulation of variance: draws from each group's inverse-gamma distribution
samps_nl = nl_ig.rvs(N_DRAWS, random_state=rng)
samps_pl = pl_ig.rvs(N_DRAWS, random_state=rng)
samps_al = al_ig.rvs(N_DRAWS, random_state=rng)

# calculating standard error: sqrt(sampled variance / group size)
nl_sq = np.sqrt(samps_nl / not_looking_salary.size)
pl_sq = np.sqrt(samps_pl / passively_looking_salary.size)
al_sq = np.sqrt(samps_al / actively_looking_salary.size)

# Generating distribution: one normal draw per sampled standard error,
# centred on the group's sample mean
samps_bay_nl = stats.norm(not_looking_salary.mean(), nl_sq).rvs(N_DRAWS, random_state=rng)
samps_bay_pl = stats.norm(passively_looking_salary.mean(), pl_sq).rvs(N_DRAWS, random_state=rng)
samps_bay_al = stats.norm(actively_looking_salary.mean(), al_sq).rvs(N_DRAWS, random_state=rng)

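Simulated probability that the **Not Looking** mean salary exceeds the **Passively Looking** mean (NL Bayesian > PL Bayesian)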

In [30]:
# Share of paired draws in which the Not Looking draw is the larger one.
np.mean(samps_bay_nl > samps_bay_pl)

1.0

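Simulated probability that the **Passively Looking** mean salary exceeds the **Actively Looking** mean (PL Bayesian > AL Bayesian)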

In [31]:
# Share of paired draws in which the Passively Looking draw is the larger one.
np.mean(samps_bay_pl > samps_bay_al)

0.8849

Simulated probability that the **Not Looking** mean salary exceeds the **Actively Looking** mean (NL Bayesian > AL Bayesian)

In [32]:
# Share of paired draws in which the Not Looking draw is the larger one.
np.mean(samps_bay_nl > samps_bay_al)

0.9986