In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt
import seaborn as sns
from linearmodels.iv import IV2SLS
import os
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from scipy import stats

In [3]:
#Importing Dataset
# Read the Stata extract, give the generically named columns used below descriptive
# names, and drop column 'v8'. The steps are chained instead of using inplace=True so
# that re-running the cell always starts from the file on disk.
data = (
    pd.read_stata('./NEW7080.dta')
    .rename(columns={
        'v1': 'AGE', 'v2': 'AGEQ', 'v4': 'EDUC', 'v5': 'ENOCENT', 'v6': 'ESOCENT',
        'v9': 'LWKLYWGE', 'v10': 'MARRIED', 'v11': 'MIDATL', 'v12': 'MT', 'v13': 'NEWENG',
        'v16': 'CENSUS', 'v18': 'QOB', 'v19': 'RACE', 'v20': 'SMSA', 'v21': 'SOATL',
        'v24': 'WNOCENT', 'v25': 'WSOCENT', 'v27': 'YOB'
    })
    .drop(columns=['v8'])
)

# Cohort label from year of birth: 20.29 unless YOB falls in 30-39 or 40-49 (inclusive).
# NOTE(review): YOB is compared with two-digit values here but with four-digit years
# (1920-1929) in the dummy loop below; presumably the file mixes both encodings -- confirm.
data['COHORT'] = 20.29
data.loc[data['YOB'].between(30, 39), 'COHORT'] = 30.39
data.loc[data['YOB'].between(40, 49), 'COHORT'] = 40.49

# Rows with CENSUS == 80 have 1900 subtracted from AGEQ.
data.loc[data['CENSUS'] == 80, 'AGEQ'] = data['AGEQ'] - 1900

data['AGEQSQ'] = data['AGEQ'] ** 2

# Year-of-birth dummies YR20..YR29.
for year in range(1920, 1930):
    data[f'YR{year % 100}'] = (data['YOB'] == year).astype(int)

# Quarter-of-birth dummies QTR1..QTR4.
for q in range(1, 5):
    data[f'QTR{q}'] = (data['QOB'] == q).astype(int)

# Quarter x year interactions for quarters 1-3 (e.g. QTR120 = QTR1 * YR20).
for q in range(1, 4):
    for year in range(1920, 1930):
        data[f'QTR{q}{year % 100}'] = data[f'QTR{q}'] * data[f'YR{year % 100}']

# Keep only the rows still carrying the default 20.29 cohort label. The explicit
# .copy() makes `data` an independent frame, so the later cells that add columns
# to it (pred_education, predicted_LWKLYWGE, ...) do not trigger SettingWithCopyWarning.
data = data[data['COHORT'] < 20.30].copy()

# Extended-precision copy of the frame. np.longdouble is the portable spelling:
# it is float128 where the platform provides it, whereas the literal dtype
# string 'float128' raises on platforms that lack that type.
data_new = data.astype(np.longdouble)

# **TABLE 4**

### <u>OLS REGRESSION (1)</u>:

In [11]:
# Specification (1): OLS of LWKLYWGE on EDUC and the year-of-birth dummies
# YR20..YR28 (YR29 is not in the formula).
formula = 'LWKLYWGE ~ EDUC + YR20 + YR21 + YR22 + YR23 + YR24 + YR25 + YR26 + YR27 + YR28'

model1 = smf.ols(formula=formula, data=data)
results = model1.fit()

# In-sample predictions, used only to report the mean-squared error.
predicted1 = results.predict(data)

# sklearn's signature is mean_squared_error(y_true, y_pred): observed values go first.
mse1 = mean_squared_error(data['LWKLYWGE'], predicted1)

print(f"Mean-Squared Error: {mse1}")

summary = results.summary2()

print(summary)

Mean-Squared Error: 0.35161217451765325
                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.171      
Dependent Variable: LWKLYWGE         AIC:                443163.4304
Date:               2024-06-17 14:26 BIC:                443278.0278
No. Observations:   247199           Log-Likelihood:     -2.2157e+05
Df Model:           10               F-statistic:        5101.      
Df Residuals:       247188           Prob (F-statistic): 0.00       
R-squared:          0.171            Scale:              0.35163    
----------------------------------------------------------------------
               Coef.    Std.Err.      t       P>|t|    [0.025   0.975]
----------------------------------------------------------------------
Intercept      4.2100     0.0056   749.5446   0.0000   4.1990   4.2210
EDUC           0.0802     0.0004   225.6699   0.0000   0.0795   0.0809
YR20           0.0235     0.0054     4.3587   0.0000   0.0129   0.0340
Y

### <u>TSLS REGRESSION (2)</u>:

In [15]:
# Specification (2): two-stage least squares done by hand as two OLS fits.
# NOTE(review): because stage 2 is a plain OLS on fitted education, its reported
# standard errors are presumably not the proper 2SLS standard errors -- confirm
# against linearmodels' IV2SLS (already imported in this notebook) before quoting them.

#Stage 1 OLS Regression where Education is the endogenous variable and Quarter of birth + year are the instruments
stage1_formula = 'EDUC ~ QTR120 + QTR121 + QTR122 + QTR123 + QTR124 + QTR125 + QTR126 + QTR127 + QTR128 + QTR129 + QTR220 + QTR221 + QTR222 + QTR223 + QTR224 + QTR225 + QTR226 + QTR227 + QTR228 + QTR229 + QTR320 + QTR321 + QTR322 + QTR323 + QTR324 + QTR325 + QTR326 + QTR327 + QTR328 + QTR329 + YR20 + YR21 + YR22 + YR23 + YR24 + YR25 + YR26 + YR27 + YR28'
stage1 = smf.ols(stage1_formula, data=data).fit()

#Adding a column to the original data frame of the predicted values of education for the given values of the instruments to
#use in the second stage regression
data['pred_education'] = stage1.predict(data)

# sklearn's signature is mean_squared_error(y_true, y_pred): observed values go first.
mse_stage_one = mean_squared_error(data['EDUC'], data['pred_education'])

print(f"Mean-Squared Error: {mse_stage_one}")

#Stage 2 OLS Regression where Log Weekly Wage is the outcome variable regressed upon year of birth and
#the predicted values of education from the previous regression
stage2_formula = 'LWKLYWGE ~ pred_education + YR20 + YR21 + YR22 + YR23 + YR24 + YR25 + YR26 + YR27 + YR28'
stage2 = smf.ols(stage2_formula, data=data).fit()

#Adding a column to the original data frame of the predicted values of Log Weekly Wage to use in the manual R-Squared Test
data['predicted_LWKLYWGE'] = stage2.predict(data)

mse_stage_two = mean_squared_error(data['LWKLYWGE'], data['predicted_LWKLYWGE'])


#Printing OLS Regression Results with dotted lines for Readability
print(stage1.summary())
print('--------------------------------------------------------------------------------')
print('----------------------Regression of Interest Below------------------------------')
print('--------------------------------------------------------------------------------')
print(f"Mean-Squared Error: {mse_stage_two}")
print(stage2.summary())

Mean-Squared Error: 11.267615399549294
                            OLS Regression Results                            
Dep. Variable:                   EDUC   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     14.85
Date:                Mon, 17 Jun 2024   Prob (F-statistic):           3.29e-97
Time:                        14:30:58   Log-Likelihood:            -6.5011e+05
No. Observations:              247199   AIC:                         1.300e+06
Df Residuals:                  247159   BIC:                         1.301e+06
Df Model:                          39                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    

### <u>OLS REGRESSION (3)</u>:

In [21]:
# Specification (3): the OLS wage equation on EDUC, AGEQ, AGEQSQ and the
# year-of-birth dummies YR20..YR28.
year_dummies = ' + '.join(f'YR{yy}' for yy in range(20, 29))
formula = 'LWKLYWGE ~ EDUC + AGEQ + AGEQSQ + ' + year_dummies

# Build the model object first, then estimate it.
model = smf.ols(formula=formula, data=data)
results = model.fit()

# Render and show the coefficient table.
summary = results.summary2()
print(summary)


                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.171      
Dependent Variable: LWKLYWGE         AIC:                443161.5783
Date:               2024-06-11 16:54 BIC:                443297.0117
No. Observations:   247199           Log-Likelihood:     -2.2157e+05
Df Model:           12               F-statistic:        4251.      
Df Residuals:       247186           Prob (F-statistic): 0.00       
R-squared:          0.171            Scale:              0.35162    
----------------------------------------------------------------------
             Coef.    Std.Err.      t       P>|t|     [0.025    0.975]
----------------------------------------------------------------------
Intercept    0.8830     1.5164     0.5823   0.5604   -2.0891    3.8552
EDUC         0.0802     0.0004   225.6454   0.0000    0.0795    0.0809
AGEQ         0.1446     0.0676     2.1383   0.0325    0.0121    0.2770
AGEQSQ      -0.0015     0.0007    -2.0623

### <u>TSLS REGRESSION (4)</u>:

In [16]:
# Specification (4): the by-hand two-stage fit of specification (2) with AGEQ and
# AGEQSQ added to both stages.
# NOTE(review): because stage 2 is a plain OLS on fitted education, its reported
# standard errors are presumably not the proper 2SLS standard errors -- confirm
# against linearmodels' IV2SLS (already imported in this notebook) before quoting them.

#Stage 1 OLS Regression where Education is the endogenous variable and Quarter of birth + year are the instruments
stage1_formula = 'EDUC ~ QTR120 + QTR121 + QTR122 + QTR123 + QTR124 + QTR125 + QTR126 + QTR127 + QTR128 + QTR129 + QTR220 + QTR221 + QTR222 + QTR223 + QTR224 + QTR225 + QTR226 + QTR227 + QTR228 + QTR229 + QTR320 + QTR321 + QTR322 + QTR323 + QTR324 + QTR325 + QTR326 + QTR327 + QTR328 + QTR329 + YR20 + YR21 + YR22 + YR23 + YR24 + YR25 + YR26 + YR27 + YR28 + AGEQ + AGEQSQ'
stage1 = smf.ols(stage1_formula, data=data).fit()

#Adding a column to the original data frame of the predicted values of education for the given values of the instruments to
#use in the second stage regression
data['pred_education'] = stage1.predict(data)

# sklearn's signature is mean_squared_error(y_true, y_pred): observed values go first.
mse_stage_one = mean_squared_error(data['EDUC'], data['pred_education'])

print(f"Mean-Squared Error: {mse_stage_one}")

#Stage 2 OLS Regression where Log Weekly Wage is the outcome variable regressed upon year of birth and
#the predicted values of education from the previous regression
stage2_formula = 'LWKLYWGE ~ pred_education + YR20 + YR21 + YR22 + YR23 + YR24 + YR25 + YR26 + YR27 + YR28 + AGEQ + AGEQSQ'
stage2 = smf.ols(stage2_formula, data=data).fit()


#Adding a column to the original data frame of the predicted values of Log Weekly Wage to use in the manual R-Squared Test
data['predicted_LWKLYWGE2'] = stage2.predict(data)

# Score the predictions written by THIS cell (predicted_LWKLYWGE2). The earlier
# 'predicted_LWKLYWGE' column belongs to specification (2) and would silently
# report that regression's error instead.
mse_stage_two = mean_squared_error(data['LWKLYWGE'], data['predicted_LWKLYWGE2'])

#Printing OLS Regression Results with dotted lines for Readability
print(stage1.summary())
print('--------------------------------------------------------------------------------')
print('----------------------Regression of Interest Below------------------------------')
print('--------------------------------------------------------------------------------')
print(f"Mean-Squared Error: {mse_stage_two}")
print(stage2.summary())

Mean-Squared Error: 11.267615399549298
                            OLS Regression Results                            
Dep. Variable:                   EDUC   R-squared:                       0.002
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     14.85
Date:                Mon, 17 Jun 2024   Prob (F-statistic):           3.29e-97
Time:                        16:21:32   Log-Likelihood:            -6.5011e+05
No. Observations:              247199   AIC:                         1.300e+06
Df Residuals:                  247159   BIC:                         1.301e+06
Df Model:                          39                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept    

### <u>OLS REGRESSION (5)</u>:

In [23]:
# Specification (5): the OLS wage equation on EDUC, the RACE / MARRIED / SMSA
# and region columns, and the year-of-birth dummies YR20..YR28.
control_terms = [
    'RACE', 'MARRIED', 'SMSA', 'NEWENG', 'MIDATL', 'ENOCENT',
    'WNOCENT', 'SOATL', 'ESOCENT', 'WSOCENT', 'MT',
]
year_terms = [f'YR{yy}' for yy in range(20, 29)]
formula = 'LWKLYWGE ~ ' + ' + '.join(['EDUC'] + control_terms + year_terms)

# Build the model object first, then estimate it.
model = smf.ols(formula=formula, data=data)
results = model.fit()

# Render and show the coefficient table.
summary = results.summary2()
print(summary)

                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.230      
Dependent Variable: LWKLYWGE         AIC:                425068.9266
Date:               2024-06-11 16:54 BIC:                425298.1215
No. Observations:   247199           Log-Likelihood:     -2.1251e+05
Df Model:           21               F-statistic:        3508.      
Df Residuals:       247177           Prob (F-statistic): 0.00       
R-squared:          0.230            Scale:              0.32679    
----------------------------------------------------------------------
             Coef.    Std.Err.      t       P>|t|     [0.025    0.975]
----------------------------------------------------------------------
Intercept    4.1915     0.0072   578.7065   0.0000    4.1773    4.2057
EDUC         0.0701     0.0004   197.7129   0.0000    0.0694    0.0708
RACE        -0.2980     0.0043   -68.5815   0.0000   -0.3065   -0.2894
MARRIED      0.2928     0.0037    78.1871

### <u>TSLS REGRESSION (6)</u>:

In [17]:
# Specification (6): by-hand two-stage fit with the RACE / MARRIED / SMSA and
# region columns included in both stages.
qtr_year_terms = [f'QTR{quarter}{yy}' for quarter in range(1, 4) for yy in range(20, 30)]
year_terms = [f'YR{yy}' for yy in range(20, 29)]
control_terms = [
    'RACE', 'MARRIED', 'SMSA', 'NEWENG', 'MIDATL', 'ENOCENT',
    'WNOCENT', 'SOATL', 'ESOCENT', 'WSOCENT', 'MT',
]

# First stage: EDUC on the quarter x year interactions, the year dummies and the controls.
stage1_formula = 'EDUC ~ ' + ' + '.join(qtr_year_terms + year_terms + control_terms)
stage1 = smf.ols(stage1_formula, data=data).fit()

# Store the first-stage fitted education on the frame; the second stage uses it as a regressor.
data['pred_education'] = stage1.predict(data)

# Second stage: LWKLYWGE on the fitted education, the year dummies and the controls.
stage2_formula = 'LWKLYWGE ~ ' + ' + '.join(['pred_education'] + year_terms + control_terms)
stage2 = smf.ols(stage2_formula, data=data).fit()

# Store the second-stage fitted log weekly wage on the frame as well.
data['predicted_LWKLYWGE2'] = stage2.predict(data)

# Print both summaries, separated by a banner that marks the regression of interest.
banner_rule = '--------------------------------------------------------------------------------'
print(stage1.summary())
print(banner_rule)
print('----------------------Regression of Interest Below------------------------------')
print(banner_rule)
print(stage2.summary())

                            OLS Regression Results                            
Dep. Variable:                   EDUC   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.070
Method:                 Least Squares   F-statistic:                     371.3
Date:                Fri, 21 Jun 2024   Prob (F-statistic):               0.00
Time:                        15:09:37   Log-Likelihood:            -6.4145e+05
No. Observations:              247199   AIC:                         1.283e+06
Df Residuals:                  247148   BIC:                         1.284e+06
Df Model:                          50                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     12.5592      0.048    260.646      0.0

### <u>OLS REGRESSION (7)</u>:

In [25]:
# Specification (7): the OLS wage equation on EDUC, the RACE / MARRIED / SMSA
# and region columns, the year-of-birth dummies YR20..YR28, and AGEQ / AGEQSQ.
control_terms = [
    'RACE', 'MARRIED', 'SMSA', 'NEWENG', 'MIDATL', 'ENOCENT',
    'WNOCENT', 'SOATL', 'ESOCENT', 'WSOCENT', 'MT',
]
year_terms = [f'YR{yy}' for yy in range(20, 29)]
formula = 'LWKLYWGE ~ ' + ' + '.join(['EDUC'] + control_terms + year_terms + ['AGEQ', 'AGEQSQ'])

# Build the model object first, then estimate it.
model = smf.ols(formula=formula, data=data)
results = model.fit()

# Render and show the coefficient table.
summary = results.summary2()
print(summary)


                  Results: Ordinary least squares
Model:              OLS              Adj. R-squared:     0.230      
Dependent Variable: LWKLYWGE         AIC:                425069.2419
Date:               2024-06-11 16:54 BIC:                425319.2727
No. Observations:   247199           Log-Likelihood:     -2.1251e+05
Df Model:           23               F-statistic:        3204.      
Df Residuals:       247175           Prob (F-statistic): 0.00       
R-squared:          0.230            Scale:              0.32679    
----------------------------------------------------------------------
             Coef.    Std.Err.      t       P>|t|     [0.025    0.975]
----------------------------------------------------------------------
Intercept    1.5345     1.4619     1.0496   0.2939   -1.3309    4.3999
EDUC         0.0701     0.0004   197.6855   0.0000    0.0694    0.0708
RACE        -0.2980     0.0043   -68.5814   0.0000   -0.3065   -0.2894
MARRIED      0.2928     0.0037    78.1848

### <u>TSLS REGRESSION (8)</u>:

In [26]:
# Specification (8): by-hand two-stage fit with the RACE / MARRIED / SMSA and
# region columns plus AGEQ / AGEQSQ included in both stages.
qtr_year_terms = [f'QTR{quarter}{yy}' for quarter in range(1, 4) for yy in range(20, 30)]
year_terms = [f'YR{yy}' for yy in range(20, 29)]
control_terms = [
    'RACE', 'MARRIED', 'SMSA', 'NEWENG', 'MIDATL', 'ENOCENT',
    'WNOCENT', 'SOATL', 'ESOCENT', 'WSOCENT', 'MT',
]
age_terms = ['AGEQ', 'AGEQSQ']

# First stage: EDUC on the quarter x year interactions, the year dummies, the controls and the age terms.
stage1_formula = 'EDUC ~ ' + ' + '.join(qtr_year_terms + year_terms + control_terms + age_terms)
stage1 = smf.ols(stage1_formula, data=data).fit()

# Store the first-stage fitted education on the frame; the second stage uses it as a regressor.
data['pred_education'] = stage1.predict(data)

# Second stage: LWKLYWGE on the fitted education, the year dummies, the controls and the age terms.
stage2_formula = 'LWKLYWGE ~ ' + ' + '.join(['pred_education'] + year_terms + control_terms + age_terms)
stage2 = smf.ols(stage2_formula, data=data).fit()

# Store the second-stage fitted log weekly wage on the frame as well.
data['predicted_LWKLYWGE2'] = stage2.predict(data)

# Print both summaries, separated by a banner that marks the regression of interest.
banner_rule = '--------------------------------------------------------------------------------'
print(stage1.summary())
print(banner_rule)
print('----------------------Regression of Interest Below------------------------------')
print(banner_rule)
print(stage2.summary())

                            OLS Regression Results                            
Dep. Variable:                   EDUC   R-squared:                       0.070
Model:                            OLS   Adj. R-squared:                  0.070
Method:                 Least Squares   F-statistic:                     371.3
Date:                Tue, 11 Jun 2024   Prob (F-statistic):               0.00
Time:                        16:54:09   Log-Likelihood:            -6.4145e+05
No. Observations:              247199   AIC:                         1.283e+06
Df Residuals:                  247148   BIC:                         1.284e+06
Df Model:                          50                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      0.0342      0.002     19.944      0.0