In [1]:
import pandas as pd
import seaborn as sns
import numpy as np
import statsmodels.formula.api as smf

# this allows plots to appear directly in the notebook
%matplotlib inline

In [2]:
# get the data
df = pd.read_csv('data/psam_p21.csv')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,RT,SERIALNO,DIVISION,SPORDER,PUMA,REGION,ST,ADJINC,PWGTP,AGEP,...,PWGTP71,PWGTP72,PWGTP73,PWGTP74,PWGTP75,PWGTP76,PWGTP77,PWGTP78,PWGTP79,PWGTP80
0,P,2014000000017,6,1,1800,3,21,1070673,13,42,...,14,5,5,32,15,10,11,15,11,26
1,P,2014000000017,6,2,1800,3,21,1070673,14,47,...,12,4,4,25,22,15,14,13,15,29
2,P,2014000000017,6,3,1800,3,21,1070673,13,13,...,10,3,4,24,21,11,13,13,11,25
3,P,2014000000017,6,4,1800,3,21,1070673,13,8,...,11,3,3,24,22,12,13,13,13,25
4,P,2014000000076,6,1,2500,3,21,1070673,43,43,...,11,76,46,68,73,41,70,13,38,41


# Calculate the value of a Civil Engineering degree and a degree in Transportation Sciences And Technologies, relative to a college degree in general. 

In [3]:
df = df[df['WKL']==1]

In [4]:
# Is someone a college grad
df['college_grad'] = df['SCHL'].apply(lambda x : x>=20)
df['CE_Grad'] = df['FOD1P'].apply(lambda x : x==2406)
df['TRN_Grad'] = df['FOD1P'].apply(lambda x : x==5901)

mod = smf.ols(formula="WAGP \
                       ~ WKHP \
                       + college_grad \
                       + CE_Grad \
                       + TRN_Grad", 
              data=df)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   WAGP   R-squared:                       0.204
Model:                            OLS   Adj. R-squared:                  0.204
Method:                 Least Squares   F-statistic:                     6975.
Date:                Sun, 29 Mar 2020   Prob (F-statistic):               0.00
Time:                        23:20:07   Log-Likelihood:            -1.3195e+06
No. Observations:              108994   AIC:                         2.639e+06
Df Residuals:                  108989   BIC:                         2.639e+06
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept            -2.014e+04 

# Calculate the value of working in Architectural, Engineering, And Related Services versus working in transportation (anything starting in TRN). 

In [5]:
# National American Industry Classification System
def TRN_Workers(NAICSP): 
        if NAICSP in ['481','482', '483', '484', '4853','485M','486','487','488', '491','492','493']:
            return "TRN"
        elif NAICSP == '5413':
            return "ARCH_ENG_REL_SERV"
        else:
            return "Others"


df['NAICSP2']=df['NAICSP'].apply(TRN_Workers)
df['NAICSP2'].value_counts()

Others               103153
TRN                    5145
ARCH_ENG_REL_SERV       696
Name: NAICSP2, dtype: int64

In [6]:
mod = smf.ols(formula="WAGP \
                       ~ WKHP \
                       + C(NAICSP2)", 
              data=df)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   WAGP   R-squared:                       0.137
Model:                            OLS   Adj. R-squared:                  0.137
Method:                 Least Squares   F-statistic:                     5752.
Date:                Sun, 29 Mar 2020   Prob (F-statistic):               0.00
Time:                        23:21:37   Log-Likelihood:            -1.3239e+06
No. Observations:              108994   AIC:                         2.648e+06
Df Residuals:                  108990   BIC:                         2.648e+06
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept              572.2358 

# Calculate the wage difference between male and female civil engineers

In [7]:
df['CE'] = df['OCCP'].apply(lambda x : x == 1360)
mod = smf.ols(formula="WAGP \
                       ~ CE \
                       + C(SEX) \
                       + college_grad",
              data=df)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   WAGP   R-squared:                       0.120
Model:                            OLS   Adj. R-squared:                  0.120
Method:                 Least Squares   F-statistic:                     4951.
Date:                Sun, 29 Mar 2020   Prob (F-statistic):               0.00
Time:                        23:24:38   Log-Likelihood:            -1.3249e+06
No. Observations:              108994   AIC:                         2.650e+06
Df Residuals:                  108990   BIC:                         2.650e+06
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                           coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------------
Intercept             3.542e+04 

In [16]:
# many of these are not significant, so let's drop some
def CE_TRN_None(schl): 
    if schl ==2406 or schl==5901:
        return 'Civil or Transport'
    else: 
        return "Non-civil"

df['major'] = df['FOD1P'].apply(CE_TRN_None)
df['major'].value_counts()

Non-civil             108641
Civil or Transport       353
Name: major, dtype: int64

In [20]:
mod = smf.ols(formula="WAGP \
                       ~ major", 
              data=df)
res = mod.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                   WAGP   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.003
Method:                 Least Squares   F-statistic:                     382.1
Date:                Sun, 29 Mar 2020   Prob (F-statistic):           6.10e-85
Time:                        22:44:55   Log-Likelihood:            -1.3317e+06
No. Observations:              108994   AIC:                         2.663e+06
Df Residuals:                  108992   BIC:                         2.663e+06
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
Intercept           9.006e+04   2607

In [11]:
df['civil_transp_grad'] = df[(df.FOD1P == 2406)]

ValueError: Wrong number of items passed 287, placement implies 1

In [8]:
df = df[df['WKL']==1]
# Is someone a college grad
# df['college_grad'] = df['SCHL'].apply(lambda x : x>=20)
# df['civil_transp_grad'] = df['FOD1P'].apply(lambda x : x==2406|x==5901)
# df['not_civil_transport'] = df['FOD1P'].apply(lambda x : x!=2406|x!=5901)
df['civil_transp_grad'] = df[(df['SCHL']>=20) & (df['FOD1P'] == 2406 | df['FOD1P'] == 5901)]
df['not_civil_transport'] = df[(df['SCHL']>=20) & (df['FOD1P'] != 2406 | df['FOD1P'] != 5901)]

TypeError: cannot compare a dtyped [float64] array with a scalar of type [bool]

In [None]:
# This is a basic model estimation in statsmodels
# the dependent variable is on the left side of the ~
mod = smf.ols(formula="WAGP \
                      ~ WKHP \
                      + college_grad", 
                      data=df)
res = mod.fit()
print(res.summary())

In [3]:
df[['FOD1P','FOD2P']]

Unnamed: 0,FOD1P,FOD2P
0,3606.0,
1,3600.0,
2,,
3,,
4,,
...,...,...
225035,1901.0,
225036,6004.0,
225037,,
225038,,
