## Taneth Germishuys - Economentrics 1 - PS 1
## Question 5

## Part a

In [14]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm

In [15]:
# Load the dataset
data = pd.read_csv('ps1small.csv')

In [16]:
# Create the variable lwage=log(wage)
data['lwage'] = np.log(data['wage'])

In [17]:
# To verify the addition
print(data.head(5))

    wage  education  age      lwage
0  30000         17   30  10.308953
1  16000         12   27   9.680344
2  43500         17   28  10.680516
3  18000         12   30   9.798127
4  25000         17   27  10.126631


In [18]:
# Generating Dummies
# Create a combined education and age column to generate dummies
data['edu_age'] = data['education'].astype(str) + data['age'].astype(str)

# Create dummies for the unique combinations of education and age
dummies = pd.get_dummies(data['edu_age'], prefix='d')

# Convert True/False to 1/0 by casting the dummies to integers
dummies = dummies.astype(int)

# Ensure underscores are removed properly in the column names
dummies.columns = dummies.columns.str.replace(r'_+', '', regex=True)

# Concatenate the dummy variables with the original dataset
data = pd.concat([data, dummies], axis=1)

# Display the first few rows to verify
print(data.head())

    wage  education  age      lwage edu_age  d1226  d1227  d1228  d1229  \
0  30000         17   30  10.308953    1730      0      0      0      0   
1  16000         12   27   9.680344    1227      0      1      0      0   
2  43500         17   28  10.680516    1728      0      0      0      0   
3  18000         12   30   9.798127    1230      0      0      0      0   
4  25000         17   27  10.126631    1727      0      0      0      0   

   d1230  ...  d1626  d1627  d1628  d1629  d1630  d1726  d1727  d1728  d1729  \
0      0  ...      0      0      0      0      0      0      0      0      0   
1      0  ...      0      0      0      0      0      0      0      0      0   
2      0  ...      0      0      0      0      0      0      0      1      0   
3      1  ...      0      0      0      0      0      0      0      0      0   
4      0  ...      0      0      0      0      0      0      1      0      0   

   d1730  
0      1  
1      0  
2      0  
3      0  
4      0  

[

In [19]:
# Running the regression

# Setting variables
X = dummies
y = data['lwage']

# Run the OLS regression
model = sm.OLS(y, X)
results = model.fit()

# Print the summary of the regression results
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.089
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     2.938
Date:                Wed, 16 Oct 2024   Prob (F-statistic):           4.90e-07
Time:                        15:08:52   Log-Likelihood:                -1099.1
No. Observations:                 898   AIC:                             2258.
Df Residuals:                     868   BIC:                             2402.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
d1226          9.4513      0.091    104.112      0.0

In [20]:
# Manually calculate the residual sum of squares (RSS)
rss = sum(results.resid ** 2)
print(f"Residual Sum of Squares (RSS): {rss}")

Residual Sum of Squares (RSS): 608.0274467373473


In [21]:
# Get the coefficients (params) and standard errors (bse)
coefficients = results.params
standard_errors = results.bse

# Combine the two into a Pandas DataFrame
coef_se_pairs = pd.DataFrame({
    'Coefficient': coefficients,
    'Standard Error': standard_errors
})

# Print the DataFrame
print(coef_se_pairs)

       Coefficient  Standard Error
d1226     9.451320        0.090780
d1227     9.300426        0.084980
d1228     9.537856        0.083280
d1229     9.385826        0.085421
d1230     9.628323        0.087737
d1326     9.612120        0.192010
d1327     9.755901        0.192010
d1328     9.378158        0.174517
d1329     9.819982        0.223685
d1330     9.446488        0.202991
d1426     9.580658        0.223685
d1427     9.794423        0.170843
d1428     9.642237        0.216101
d1429     9.712199        0.174517
d1430     9.645440        0.170843
d1526     9.534555        0.241608
d1527     9.385619        0.232129
d1528     9.573155        0.264668
d1529     9.540361        0.252351
d1530     9.919473        0.241608
d1626    10.070500        0.167391
d1627     9.944426        0.187149
d1628     9.991987        0.147954
d1629    10.178524        0.130710
d1630    10.070591        0.164140
d1726     9.958317        0.483216
d1727    10.045760        0.223685
d1728    10.165648  

## Part b

In [22]:
# Unrestricted Model (with 30 dummy variables)
X_unrestricted = dummies  # Assuming 'dummies' contains the 30 dummy variables for education-age combinations
y = data['lwage']  # Dependent variable (log(wage))

# Fit the unrestricted model
unrestricted_model = sm.OLS(y, X_unrestricted)
unrestricted_results = unrestricted_model.fit()

# Print results
print(unrestricted_results.summary())



                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.089
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     2.938
Date:                Wed, 16 Oct 2024   Prob (F-statistic):           4.90e-07
Time:                        15:08:52   Log-Likelihood:                -1099.1
No. Observations:                 898   AIC:                             2258.
Df Residuals:                     868   BIC:                             2402.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
d1226          9.4513      0.091    104.112      0.0

In [23]:
# Manually calculate the residual sum of squares (RSS)
rss = sum(unrestricted_results.resid ** 2)
print(f"Residual Sum of Squares (RSS): {rss}")

Residual Sum of Squares (RSS): 608.0274467373473


In [24]:
# Restricted Model (with linear education, age, and interaction)
data['edu_age_interaction'] = data['education'] * data['age']
X_restricted = data[['education', 'age', 'edu_age_interaction']]
X_restricted = sm.add_constant(X_restricted)  # Add intercept

# Fit the restricted model
restricted_model = sm.OLS(y, X_restricted)
restricted_results = restricted_model.fit()

# Print results
print(restricted_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.064
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     20.48
Date:                Wed, 16 Oct 2024   Prob (F-statistic):           7.69e-13
Time:                        15:08:52   Log-Likelihood:                -1111.3
No. Observations:                 898   AIC:                             2231.
Df Residuals:                     894   BIC:                             2250.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   6.1075    

In [25]:
# Manually calculate the residual sum of squares (RSS)
rss = sum(restricted_results.resid ** 2)
print(f"Residual Sum of Squares (RSS): {rss}")

Residual Sum of Squares (RSS): 624.7819133989772


In [29]:
# Calculate the F-statistic for comparing the two models
RSS_unrestricted = 608.3
q = 26
RSS_restricted = 624.78
n_k = 868

f_stat = ((RSS_restricted-RSS_unrestricted)/q)/(RSS_unrestricted/n_k)
print(f_stat)

0.9044525095158016


In [31]:
# Decision
critical_value = 1.57
if f_stat < 0.05:
    print("Reject the null at 5% significance level")
else:
    print("Do not reject the null at 5% significant level")

Do not reject the null at 5% significant level
