# Exercise 5 | Gender Wage Gap

In [15]:
# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.formula.api as smf
import seaborn as sns
import os

# Google Drive locations: the Colab notebooks root and this exercise's folder
colab_path = '/content/drive/MyDrive/Colab Notebooks/'
file_path = 'Part_5'

# Mount Google Drive so the exercise folder is reachable from the runtime
from google.colab import drive
drive.mount('/content/drive')

# Switch the working directory to the exercise folder; relative paths in
# later cells resolve against it
exercise_dir = colab_path + file_path
os.chdir(exercise_dir)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Gender Wage Gap

In [None]:
# Load the sample and derive the gender dummy variables in one chain.
# NOTE(review): file_path is 'Part_5' with no trailing separator, so this reads
# 'Part_5usa_00041_sample.csv' relative to the current working directory —
# confirm that is the intended file name.
# Assumes SEX is already coded 0/1 with 1 = female — TODO confirm.
df = pd.read_csv(file_path + 'usa_00041_sample.csv').assign(
    MALE=lambda frame: 1 - frame['SEX'],
    FEMALE=lambda frame: frame['SEX'],
)
df.head()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,YEAR,SAMPLE,SERIAL,CBSERIAL,HHWT,CLUSTER,STRATA,GQ,...,AGE,EDUC,EDUCD,DEGFIELD,DEGFIELDD,INCTOT,INCLOG10,EDU,MALE,FEMALE
0,619,1335639,2019,201901,583089,2019000700547,128.0,2019005830891,80023,1,...,51,6,63,0,0,38500,4.585461,12.0,0.0,1.0
1,531,950207,2019,201901,409203,2019001377206,71.0,2019004092031,280013,1,...,24,6,63,0,0,15000,4.176091,12.0,1.0,0.0
2,556,2117874,2019,201901,933278,2019000145752,55.0,2019009332781,330037,1,...,67,7,71,0,0,9600,3.982271,13.0,0.0,1.0
3,256,2905840,2019,201901,1284339,2019001153889,174.0,2019012843391,630248,1,...,42,6,61,0,0,45000,4.653213,12.0,1.0,0.0
4,50,1818924,2019,201901,801415,2019000335128,107.0,2019008014151,30534,1,...,42,10,101,62,6203,100000,5.0,16.0,1.0,0.0


## Model 1 | Fixed Effect, t-Test

In [None]:
# Baseline regression: INCLOG10 on the MALE dummy only
model1 = smf.ols(
    formula='INCLOG10 ~ MALE',
    data=df,
).fit()

# Show only the coefficient table of the summary
print(model1.summary().tables[1])

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.4040      0.021    211.401      0.000       4.363       4.445
MALE           0.1593      0.030      5.310      0.000       0.100       0.218


## Model 2 | Fixed Effect With Education Control

In [None]:
# Regression of INCLOG10 on the MALE dummy, controlling for EDU.
# NOTE(review): this rebinds `model1`, replacing the MALE-only fit from the
# previous section — confirm the overwrite is intended.
model1 = smf.ols(
    formula='INCLOG10 ~ EDU + MALE',
    data=df,
).fit()

# Show only the coefficient table of the summary
print(model1.summary().tables[1])

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.0302      0.089     34.152      0.000       2.856       3.204
EDU            0.0987      0.006     15.836      0.000       0.086       0.111
MALE           0.2072      0.027      7.673      0.000       0.154       0.260


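Holding education constant, being male is associated with roughly 0.21 log10 points higher income (about 61% higher, since $10^{0.2072} \approx 1.61$). With t = 7.67 and p < 0.001, this difference is unlikely to be due to chance.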

## Model 3 | Differential Returns to Education

In [None]:
# Regression with a common intercept and an EDU slope that differs by MALE
# (EDU:MALE interaction, no MALE main effect)
model2 = smf.ols(
    formula='INCLOG10 ~ EDU + EDU:MALE',
    data=df,
).fit()

# Show only the coefficient table of the summary
print(model2.summary().tables[1])

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.1291      0.086     36.230      0.000       2.960       3.299
EDU            0.0917      0.006     14.796      0.000       0.080       0.104
EDU:MALE       0.0150      0.002      7.659      0.000       0.011       0.019


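When the gender gap is modeled only through the slope (a common intercept is imposed), there is a statistically significant difference in returns to education (t = 7.66, p < 0.001). Each additional unit of EDU is associated with about 0.092 log10 points higher income for women and about 0.107 for men, i.e. a return roughly 0.015 log10 points higher for men.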

## Model 4 | Full Model

In [None]:
# Full specification: EDU, the MALE dummy, and their interaction, so both the
# intercept and the EDU slope may differ by MALE
model3 = smf.ols(
    formula='INCLOG10 ~ EDU + MALE + EDU:MALE',
    data=df,
).fit()

# Show only the coefficient table of the summary
print(model3.summary().tables[1])

                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      3.0720      0.120     25.625      0.000       2.837       3.307
EDU            0.0957      0.009     11.247      0.000       0.079       0.112
MALE           0.1186      0.173      0.686      0.493      -0.221       0.458
EDU:MALE       0.0065      0.013      0.519      0.604      -0.018       0.031
