# Statistical part

## Goals and expectations :

- Is there any difference in salaries between the cities? (ANOVA Test) -> Tukey for advanced conclusions
- Is there any difference in salaries between the industries? (ANOVA Test) -> Tukey for advanced conclusions
- Is rating higher in "easy apply" jobs? (Two samples Hypothesis testing)
- What is the true value (Around) of the salary mean of data scientist and analysts (confidence intervals)

## Environment

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats
from scipy.stats import f_oneway

In [2]:
good_jobs = pd.read_csv('good-job.csv', index_col=0)
good_jobs.reset_index(drop=True, inplace=True)
good_jobs

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Type of ownership,Industry,Sector,Revenue,Easy Apply,Min Salary,Max Salary
0,Junior Data Analyst,59-85 (),Job Description\nJob description\nInterpret da...,5.0,"['Staffigo Technical Services, LLC', '5.0']","New York, NY","Woodridge, IL",51 to 200 employees,Company - Private,IT Services,Information Technology,$50 to $100 million (USD),False,59,85
1,Data Analyst with Rochade,59-85 (),"Hi, Trust Yoursquore Doing Good. Enclosed is t...",4.0,"['Reliable Software Resources', '4.0']","New York, NY","Northville, MI",501 to 1000 employees,Company - Private,IT Services,Information Technology,$50 to $100 million (USD),False,59,85
2,Advertising Data Analyst,59-85 (),Advertising Data Analyst\n\nHearst Television ...,4.3,"['Hearst Television, Inc', '4.3']","New York, NY","Cincinnati, OH",51 to 200 employees,Company - Private,TV Broadcast & Cable Networks,Media,$5 to $10 million (USD),False,59,85
3,Senior Data Analyst,59-85 (),"At Rockstar Games, we create the games we woul...",4.1,"['Rockstar Games', '4.1']","New York, NY","New York, NY",1001 to 5000 employees,Subsidiary or Business Segment,Video Games,Media,$10 to $25 million (USD),False,59,85
4,Data Analyst,59-85 (),Job Description\nPrimary Responsibilities: Exp...,4.2,"['Precision technologies corp', '4.2']","New York, NY","New Brunswick, NJ",201 to 500 employees,Company - Private,IT Services,Information Technology,$10 to $25 million (USD),False,59,85
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
126,Data Analyst Junior,57-100 (),Job Description\nJob description\nInterpret da...,5.0,"['Staffigo Technical Services, LLC', '5.0']","Denver, CO","Woodridge, IL",51 to 200 employees,Company - Private,IT Services,Information Technology,$50 to $100 million (USD),False,57,100
127,Data Security Analyst,57-100 (),Title: Data Security Analyst\nLocation: Denver...,4.3,"['22nd Century Staffing', '4.3']","Denver, CO","McLean, VA",51 to 200 employees,Company - Private,Staffing & Outsourcing,Business Services,$5 to $10 million (USD),True,57,100
128,Data Security Analyst,57-100 (),"Role: Data Security Analyst\nLocation: Denver,...",4.1,"['Mavensoft Technologies, LLC.', '4.1']","Denver, CO","Beaverton, OR",1 to 50 employees,Company - Private,IT Services,Information Technology,Less than $1 million (USD),False,57,100
129,Data Analyst - Junior,57-100 (),Job Description\nJob description\nInterpret da...,5.0,"['Staffigo Technical Services, LLC', '5.0']","Denver, CO","Woodridge, IL",51 to 200 employees,Company - Private,IT Services,Information Technology,$50 to $100 million (USD),False,57,100


## Data exploration

In [3]:
good_jobs['Location'].value_counts()

San Francisco, CA    38
New York, NY         33
San Diego, CA        24
Los Angeles, CA      16
Denver, CO           15
Seattle, WA           5
Name: Location, dtype: int64

In [4]:
cities = good_jobs[['Location','Min Salary','Max Salary']]

cities

Unnamed: 0,Location,Min Salary,Max Salary
0,"New York, NY",59,85
1,"New York, NY",59,85
2,"New York, NY",59,85
3,"New York, NY",59,85
4,"New York, NY",59,85
...,...,...,...
126,"Denver, CO",57,100
127,"Denver, CO",57,100
128,"Denver, CO",57,100
129,"Denver, CO",57,100


In [5]:
sf = cities[cities['Location'] == 'San Francisco, CA']

sf_min_salary = sf['Min Salary']
sf_max_salary = sf['Max Salary']

sf_min_salary = pd.unique(pd.Series(sf_min_salary))
print(sf_min_salary)

sf_max_salary = pd.unique(pd.Series(sf_max_salary ))
print(sf_max_salary)

[57 99 93 82 65]
[104 178 159 116 120]


In [6]:
ny = cities[cities['Location'] == 'New York, NY']

ny_min_salary = ny['Min Salary']
ny_max_salary = ny['Max Salary']

ny_min_salary = pd.unique(pd.Series(ny_min_salary))
print(ny_min_salary)

ny_max_salary = pd.unique(pd.Series(ny_max_salary ))
print(ny_max_salary)

[59 60 73 84 77 98]
[ 85 110 127  90 132 114]


In [7]:
sd = cities[cities['Location'] == 'San Diego, CA']

sd_min_salary = sd['Min Salary']
sd_max_salary = sd['Max Salary']

sd_min_salary = pd.unique(pd.Series(sd_min_salary))
print(sd_min_salary)

sd_max_salary = pd.unique(pd.Series(sd_max_salary ))
print(sd_max_salary)

[76 60]
[122 124]


In [8]:
la = cities[cities['Location'] == 'Los Angeles, CA']

la_min_salary = la['Min Salary']
la_max_salary = la['Max Salary']

la_min_salary = pd.unique(pd.Series(la_min_salary))
print(la_min_salary)

la_max_salary = pd.unique(pd.Series(la_max_salary ))
print(la_max_salary)

[ 55  57 113]
[103 132]


In [9]:
den = cities[cities['Location'] == 'Denver, CO']

den_min_salary = den['Min Salary']
den_max_salary = den['Max Salary']

den_min_salary = pd.unique(pd.Series(den_min_salary))
print(den_min_salary)

den_max_salary = pd.unique(pd.Series(den_max_salary ))
print(den_max_salary)

[57]
[ 67 100]


In [10]:
seat = cities[cities['Location'] == 'Seattle, WA']

seat_min_salary = seat['Min Salary']
seat_max_salary = seat['Max Salary']

seat_min_salary = pd.unique(pd.Series(seat_min_salary))
print(seat_min_salary)

seat_max_salary = pd.unique(pd.Series(seat_max_salary ))
print(seat_max_salary)

[55]
[101]


### Minimun salaries between cities

In [11]:
# ANOVA

f_oneway(sf_min_salary,ny_min_salary,sd_min_salary,la_min_salary,den_min_salary,seat_min_salary)

F_onewayResult(statistic=0.44124824594519513, pvalue=0.8115508427137781)

In [12]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
from statsmodels.stats.multicomp import MultiComparison

In [13]:
# Tukeyhsd()

mc = MultiComparison(cities['Min Salary'], cities['Location'])
result_min = mc.tukeyhsd()

In [14]:
print(result_min)

            Multiple Comparison of Means - Tukey HSD, FWER=0.05            
      group1            group2      meandiff p-adj   lower    upper  reject
---------------------------------------------------------------------------
       Denver, CO   Los Angeles, CA      2.5    0.9 -11.8946 16.8946  False
       Denver, CO      New York, NY  18.3333  0.001   5.8611 30.8055   True
       Denver, CO     San Diego, CA   9.6667 0.2827  -3.5161 22.8494  False
       Denver, CO San Francisco, CA  22.9211  0.001   10.708 35.1341   True
       Denver, CO       Seattle, WA     -2.0    0.9 -22.6828 18.6828  False
  Los Angeles, CA      New York, NY  15.8333 0.0035    3.632 28.0346   True
  Los Angeles, CA     San Diego, CA   7.1667 0.5839  -5.7601 20.0934  False
  Los Angeles, CA San Francisco, CA  20.4211  0.001   8.4847 32.3574   True
  Los Angeles, CA       Seattle, WA     -4.5    0.9 -25.0206 16.0206  False
     New York, NY     San Diego, CA  -8.6667 0.1881 -19.4115  2.0782  False
     New Yor

### Maximum salaries between cities

In [15]:
# ANOVA

f_oneway (sf_max_salary,ny_max_salary,sd_max_salary,la_max_salary,den_max_salary,seat_max_salary)

F_onewayResult(statistic=1.6215259743465684, pvalue=0.2280376721159637)

In [16]:
# Tukeyhsd()

mc = MultiComparison(cities['Max Salary'], cities['Location'])
result_max = mc.tukeyhsd()

In [17]:
print(result_max)

            Multiple Comparison of Means - Tukey HSD, FWER=0.05             
      group1            group2      meandiff p-adj   lower    upper   reject
----------------------------------------------------------------------------
       Denver, CO   Los Angeles, CA  22.4125  0.035   0.9686  43.8564   True
       Denver, CO      New York, NY  28.2061  0.001    9.626  46.7861   True
       Denver, CO     San Diego, CA  40.7667  0.001  21.1281  60.4052   True
       Denver, CO San Francisco, CA  58.1526  0.001  39.9586  76.3467   True
       Denver, CO       Seattle, WA     18.6 0.5026 -12.2115  49.4115  False
  Los Angeles, CA      New York, NY   5.7936    0.9 -12.3829    23.97  False
  Los Angeles, CA     San Diego, CA  18.3542 0.0714   -0.903  37.6113  False
  Los Angeles, CA San Francisco, CA  35.7401  0.001  17.9584  53.5218   True
  Los Angeles, CA       Seattle, WA  -3.8125    0.9 -34.3823  26.7573  False
     New York, NY     San Diego, CA  12.5606 0.2138  -3.4461  28.5673  False

In [18]:
industries = good_jobs[['Sector','Min Salary','Max Salary']]

industries

Unnamed: 0,Sector,Min Salary,Max Salary
0,Information Technology,59,85
1,Information Technology,59,85
2,Media,59,85
3,Media,59,85
4,Information Technology,59,85
...,...,...,...
126,Information Technology,57,100
127,Business Services,57,100
128,Information Technology,57,100
129,Information Technology,57,100


In [19]:
industries['Sector'] = industries['Sector'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [20]:
industries['Sector'].value_counts()

Information Technology              57
Business Services                   26
Health Care                         11
Education                            9
nan                                  8
Media                                5
Accounting & Legal                   5
Finance                              5
Travel & Tourism                     1
Transportation & Logistics           1
Biotech & Pharmaceuticals            1
Arts, Entertainment & Recreation     1
Manufacturing                        1
Name: Sector, dtype: int64

In [21]:
info_tec =industries[industries['Sector'] == 'Information Technology']

info_tec_min_salary = info_tec['Min Salary']
info_tec_max_salary = info_tec['Max Salary']


In [22]:
bus_ser =industries[industries['Sector'] == 'Business Services']

bus_ser_min_salary = bus_ser['Min Salary']
bus_ser_max_salary = bus_ser['Max Salary']


In [23]:
heal_care =industries[industries['Sector'] == 'Business Services']

heal_care_min_salary = heal_care['Min Salary']
heal_care_max_salary = heal_care['Max Salary']

In [24]:
education =industries[industries['Sector'] == 'Education']

education_min_salary = education['Min Salary']
education_max_salary = education['Max Salary']

In [25]:
media =industries[industries['Sector'] == 'Media']

media_min_salary = media['Min Salary']
media_max_salary = media['Max Salary']

In [26]:
finnace =industries[industries['Sector'] == 'Finance']

finnace_min_salary = finnace['Min Salary']
finnace_max_salary = finnace['Max Salary']

In [27]:
acc_legal =industries[industries['Sector'] == 'Accounting & Legal']

acc_legal_min_salary = acc_legal['Min Salary']
acc_legal_max_salary = acc_legal['Max Salary']

In [28]:
bio_phar =industries[industries['Sector'] == 'Biotech & Pharmaceuticals']

bio_phar_min_salary = bio_phar['Min Salary']
bio_phar_max_salary = bio_phar['Max Salary']

In [29]:
manufac =industries[industries['Sector'] == 'Manufacturing']

manufac_min_salary = manufac['Min Salary']
manufac_max_salary = manufac['Max Salary']

In [30]:
tral_tour =industries[industries['Sector'] == 'Travel & Tourism']

tral_tour_min_salary = tral_tour['Min Salary']
tral_tour_max_salary = tral_tour['Max Salary']

In [31]:
arts =industries[industries['Sector'] == 'Arts, Entertainment & Recreation']

arts_min_salary = arts['Min Salary']
arts_max_salary = arts['Max Salary']

In [32]:
tran_logis =industries[industries['Sector'] == 'Transportation & Logistics']

tran_logis_min_salary = tran_logis['Min Salary']
tran_logis_max_salary = tran_logis['Max Salary']

### Minimum salaries between industries

In [33]:
# ANOVA

f_oneway (info_tec_min_salary,bus_ser_min_salary,heal_care_min_salary,education_min_salary,media_min_salary,finnace_min_salary,acc_legal_min_salary,bio_phar_min_salary,manufac_min_salary,tral_tour_min_salary,arts_min_salary,tran_logis_min_salary)

F_onewayResult(statistic=1.7879763044088637, pvalue=0.06263199715386741)

In [34]:
# Tukeyhsd()

mc_sector = MultiComparison(industries['Min Salary'],industries['Sector'])
sector_min = mc_sector.tukeyhsd()
print(sector_min)

                           Multiple Comparison of Means - Tukey HSD, FWER=0.05                            
             group1                           group2              meandiff p-adj    lower    upper  reject
----------------------------------------------------------------------------------------------------------
              Accounting & Legal Arts, Entertainment & Recreation      5.0    0.9  -51.5985 61.5985  False
              Accounting & Legal        Biotech & Pharmaceuticals      8.0    0.9  -48.5985 64.5985  False
              Accounting & Legal                Business Services   1.8462    0.9  -23.3842 27.0765  False
              Accounting & Legal                        Education  -5.4444    0.9   -34.263 23.3741  False
              Accounting & Legal                          Finance     19.8 0.6732  -12.8771 52.4771  False
              Accounting & Legal                      Health Care  12.0909    0.9  -15.7762 39.9581  False
              Accounting & Legal     

### Maximum salaries between industries

In [35]:
# Anova

f_oneway (info_tec_max_salary,bus_ser_max_salary,heal_care_max_salary,education_max_salary,media_min_salary,finnace_max_salary,acc_legal_max_salary,bio_phar_max_salary,manufac_max_salary,tral_tour_max_salary,arts_max_salary,tran_logis_max_salary)

F_onewayResult(statistic=5.449787753125887, pvalue=4.649650417964107e-07)

In [None]:
0.00000004649650417964107

In [45]:
from statsmodels.stats.multicomp import pairwise_tukeyhsd
print(pairwise_tukeyhsd(industries['Max Salary'],industries['Sector']))

                            Multiple Comparison of Means - Tukey HSD, FWER=0.05                            
             group1                           group2              meandiff p-adj    lower    upper   reject
-----------------------------------------------------------------------------------------------------------
              Accounting & Legal Arts, Entertainment & Recreation     18.0    0.9  -77.4193 113.4193  False
              Accounting & Legal        Biotech & Pharmaceuticals     13.0    0.9  -82.4193 108.4193  False
              Accounting & Legal                Business Services   5.8846    0.9  -36.6512  48.4204  False
              Accounting & Legal                        Education   5.3333    0.9  -43.2518  53.9185  False
              Accounting & Legal                          Finance     46.4 0.1949   -8.6903 101.4903  False
              Accounting & Legal                      Health Care  23.6364 0.8871  -23.3448  70.6176  False
              Accounting & L

In [46]:
industries.groupby('Sector').count()["Max Salary"]

Sector
Accounting & Legal                   5
Arts, Entertainment & Recreation     1
Biotech & Pharmaceuticals            1
Business Services                   26
Education                            9
Finance                              5
Health Care                         11
Information Technology              57
Manufacturing                        1
Media                                5
Transportation & Logistics           1
Travel & Tourism                     1
nan                                  8
Name: Max Salary, dtype: int64

In [36]:
# Tukeyhsd()

mc_sector = MultiComparison(industries['Max Salary'],industries['Sector'])
sector_max = mc_sector.tukeyhsd()
print(sector_max)

                            Multiple Comparison of Means - Tukey HSD, FWER=0.05                            
             group1                           group2              meandiff p-adj    lower    upper   reject
-----------------------------------------------------------------------------------------------------------
              Accounting & Legal Arts, Entertainment & Recreation     18.0    0.9  -77.4193 113.4193  False
              Accounting & Legal        Biotech & Pharmaceuticals     13.0    0.9  -82.4193 108.4193  False
              Accounting & Legal                Business Services   5.8846    0.9  -36.6512  48.4204  False
              Accounting & Legal                        Education   5.3333    0.9  -43.2518  53.9185  False
              Accounting & Legal                          Finance     46.4 0.1949   -8.6903 101.4903  False
              Accounting & Legal                      Health Care  23.6364 0.8871  -23.3448  70.6176  False
              Accounting & L

### Is rating higher in "easy apply" jobs?


In [37]:
rating = good_jobs[['Rating','Easy Apply']]

rating

Unnamed: 0,Rating,Easy Apply
0,5.0,False
1,4.0,False
2,4.3,False
3,4.1,False
4,4.2,False
...,...,...
126,5.0,False
127,4.3,True
128,4.1,False
129,5.0,False


In [38]:
easy = rating[rating['Easy Apply']==True]

easy

Unnamed: 0,Rating,Easy Apply
76,4.1,True
83,4.5,True
85,4.0,True
98,4.1,True
99,4.8,True
100,4.1,True
117,4.0,True
127,4.3,True
130,4.3,True


In [39]:
noteasy = rating[rating['Easy Apply']==False]
noteasy

Unnamed: 0,Rating,Easy Apply
0,5.0,False
1,4.0,False
2,4.3,False
3,4.1,False
4,4.2,False
...,...,...
124,4.2,False
125,4.1,False
126,5.0,False
128,4.1,False


In [42]:
import scipy.stats as st

In [41]:
def t_test (samp1, samp2):
    
    t_test = st.ttest_ind (a=samp1 ,b=samp2)
    return t_test

t_test(noteasy["Rating"], easy["Rating"])

Ttest_indResult(statistic=1.6114885744395542, pvalue=0.10951733735866148)