## Taneth Germishuys - Economentrics 1 - PS 1
## Question 5

## Part a

In [27]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.anova import anova_lm

In [28]:
# Load the dataset
data = pd.read_csv('ps1small.csv')

In [29]:
# Create the variable lwage=log(wage)
data['lwage'] = np.log(data['wage'])

In [30]:
# To verify the addition
print(data.head(5))

    wage  education  age      lwage
0  30000         17   30  10.308953
1  16000         12   27   9.680344
2  43500         17   28  10.680516
3  18000         12   30   9.798127
4  25000         17   27  10.126631


In [31]:
# Generating Dummies
# Create a combined education and age column to generate dummies
data['edu_age'] = data['education'].astype(str) + data['age'].astype(str)

# Create dummies for the unique combinations of education and age
dummies = pd.get_dummies(data['edu_age'], prefix='d')

# Convert True/False to 1/0 by casting the dummies to integers
dummies = dummies.astype(int)

# Ensure underscores are removed properly in the column names
dummies.columns = dummies.columns.str.replace(r'_+', '', regex=True)

# Concatenate the dummy variables with the original dataset
data = pd.concat([data, dummies], axis=1)

# Display the first few rows to verify
print(data.head())

    wage  education  age      lwage edu_age  d1226  d1227  d1228  d1229  \
0  30000         17   30  10.308953    1730      0      0      0      0   
1  16000         12   27   9.680344    1227      0      1      0      0   
2  43500         17   28  10.680516    1728      0      0      0      0   
3  18000         12   30   9.798127    1230      0      0      0      0   
4  25000         17   27  10.126631    1727      0      0      0      0   

   d1230  ...  d1626  d1627  d1628  d1629  d1630  d1726  d1727  d1728  d1729  \
0      0  ...      0      0      0      0      0      0      0      0      0   
1      0  ...      0      0      0      0      0      0      0      0      0   
2      0  ...      0      0      0      0      0      0      0      1      0   
3      1  ...      0      0      0      0      0      0      0      0      0   
4      0  ...      0      0      0      0      0      0      1      0      0   

   d1730  
0      1  
1      0  
2      0  
3      0  
4      0  

[

In [32]:
# Running the regression

# Setting variables
X = dummies
y = data['lwage']

# Add a constant term (intercept) to the model
X = sm.add_constant(X)

# Run the OLS regression
model = sm.OLS(y, X)
results = model.fit()

# Print the summary of the regression results
print(results.summary())


                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.089
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     2.938
Date:                Tue, 15 Oct 2024   Prob (F-statistic):           4.90e-07
Time:                        20:55:40   Log-Likelihood:                -1099.1
No. Observations:                 898   AIC:                             2258.
Df Residuals:                     868   BIC:                             2402.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.4098      0.040    235.212      0.0

In [33]:
# Manually calculate the residual sum of squares (RSS)
rss = sum(results.resid ** 2)
print(f"Residual Sum of Squares (RSS): {rss}")

Residual Sum of Squares (RSS): 608.0274467373465


In [34]:
# Get the coefficients (params) and standard errors (bse)
coefficients = results.params
standard_errors = results.bse

# Combine the two into a Pandas DataFrame
coef_se_pairs = pd.DataFrame({
    'Coefficient': coefficients,
    'Standard Error': standard_errors
})

# Print the DataFrame
print(coef_se_pairs)

       Coefficient  Standard Error
const     9.409801        0.040006
d1226     0.041519        0.096488
d1227    -0.109375        0.091412
d1228     0.128055        0.089936
d1229    -0.023974        0.091796
d1230     0.218522        0.093817
d1326     0.202320        0.189973
d1327     0.346100        0.189973
d1328    -0.031643        0.173470
d1329     0.410182        0.220017
d1330     0.036687        0.200368
d1426     0.170857        0.220017
d1427     0.384623        0.170014
d1428     0.232437        0.212808
d1429     0.302398        0.173470
d1430     0.235639        0.170014
d1526     0.124755        0.237084
d1527    -0.024181        0.228053
d1528     0.163354        0.259095
d1529     0.130560        0.247332
d1530     0.509672        0.237084
d1626     0.660699        0.166771
d1627     0.534626        0.185379
d1628     0.582187        0.148589
d1629     0.768723        0.132602
d1630     0.660790        0.163720
d1726     0.548517        0.469077
d1727     0.635959  

## Part b

In [35]:
# Unrestricted Model (with 30 dummy variables)
X_unrestricted = dummies  # Assuming 'dummies' contains the 30 dummy variables for education-age combinations
X_unrestricted = sm.add_constant(X_unrestricted)  # Add intercept
y = data['lwage']  # Dependent variable (log(wage))

# Fit the unrestricted model
unrestricted_model = sm.OLS(y, X_unrestricted)
unrestricted_results = unrestricted_model.fit()

# Print results
print(unrestricted_results.summary())



                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.089
Model:                            OLS   Adj. R-squared:                  0.059
Method:                 Least Squares   F-statistic:                     2.938
Date:                Tue, 15 Oct 2024   Prob (F-statistic):           4.90e-07
Time:                        20:55:40   Log-Likelihood:                -1099.1
No. Observations:                 898   AIC:                             2258.
Df Residuals:                     868   BIC:                             2402.
Df Model:                          29                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const          9.4098      0.040    235.212      0.0

In [36]:
# Manually calculate the residual sum of squares (RSS)
rss = sum(unrestricted_results.resid ** 2)
print(f"Residual Sum of Squares (RSS): {rss}")

Residual Sum of Squares (RSS): 608.0274467373465


In [37]:
# Restricted Model (with linear education, age, and interaction)
data['edu_age_interaction'] = data['education'] * data['age']
X_restricted = data[['education', 'age', 'edu_age_interaction']]
X_restricted = sm.add_constant(X_restricted)  # Add intercept

# Fit the restricted model
restricted_model = sm.OLS(y, X_restricted)
restricted_results = restricted_model.fit()

# Print results
print(restricted_results.summary())

                            OLS Regression Results                            
Dep. Variable:                  lwage   R-squared:                       0.064
Model:                            OLS   Adj. R-squared:                  0.061
Method:                 Least Squares   F-statistic:                     20.48
Date:                Tue, 15 Oct 2024   Prob (F-statistic):           7.69e-13
Time:                        20:55:40   Log-Likelihood:                -1111.3
No. Observations:                 898   AIC:                             2231.
Df Residuals:                     894   BIC:                             2250.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
const                   6.1075    

In [38]:
# Manually calculate the residual sum of squares (RSS)
rss = sum(restricted_results.resid ** 2)
print(f"Residual Sum of Squares (RSS): {rss}")

Residual Sum of Squares (RSS): 624.7819133989772


In [39]:
# Calculate the F-statistic for comparing the two models
f_statistic = restricted_results.compare_f_test(unrestricted_results)
f_value, p_value, df_diff = f_statistic

# Print F-statistic, p-value, and degrees of freedom difference
print(f"F-statistic: {f_value}")
print(f"p-value: {p_value}")
print(f"Degrees of freedom difference: {df_diff}")

# Check whether to reject the null hypothesis at 5% significance level
if p_value < 0.05:
    print("Reject the null hypothesis at 5% significance level.")
else:
    print("Do not reject the null hypothesis at 5% significance level.")

F-statistic: 0.9220751750003997
p-value: nan
Degrees of freedom difference: -26.0
Do not reject the null hypothesis at 5% significance level.
