# Production Technology

The dataset contains `N = 441` firms observed over `T = 12` years, 1968-1979. The variables are: 
* `lcap`: Log of capital stock, $k_{it}$ 
* `lemp`: log of employment, $\ell_{it}$ 
* `ldsa`: log of deflated sales, $y_{it}$
* `year`: the calendar year of the observation, `year` $ = 1968, ..., 1979$, 
* `firmid`: anonymized indicator variable for the firm, $i = 1, ..., N$, with $N=441$. 

In [None]:
%load_ext autoreload
%autoreload 2

import pandas as pd 
import numpy as np
import seaborn as sns
import Project_1 as lm
from scipy.stats import chi2
from numpy import linalg as la


In [None]:
# Load the firm panel from the CSV file into a DataFrame.
dat = pd.read_csv('firms.csv')

In [None]:
# Preview five randomly drawn rows of the data.
dat.sample(5)

In [None]:
# List the distinct values of the year column.
dat.year.unique()

# Descriptives

In [None]:
# Summary statistics produced by pandas' describe().
dat.describe()

In [None]:
# Histograms of log capital, log employment and log deflated sales.
# The trailing semicolon keeps the notebook from echoing the return value.
dat[['lcap','lemp','ldsa']].hist();

In [None]:
# Scatter plot of log deflated sales (ldsa) against log employment (lemp).
sns.scatterplot(x='lemp', y='ldsa', data=dat); 

# Converting data to numpy format 

In [None]:
# Shape of the raw ldsa column as a numpy array, before any reshaping.
dat.ldsa.values.shape

In [None]:
# Panel dimensions: count the distinct firm ids and the distinct years.
N = len(dat.firmid.unique())
T = len(dat.year.unique())

# A balanced panel has exactly one row for every firm-year pair.
assert len(dat) == N * T, 'Error: data is not a balanced panel'
print(f'Data has N={N} and T={T}')

Extract data from `pandas` to `numpy` arrays. 

In [None]:
# Dependent variable: log deflated sales as an (N*T, 1) column vector.
y = dat.ldsa.values.reshape((N*T,1))

# Column of ones; note that it is NOT stacked into x below.
ones = np.ones((N*T,1))
# Log employment and log capital, each as an (N*T, 1) column vector.
l = dat.lemp.values.reshape((N*T,1))
k = dat.lcap.values.reshape((N*T,1))
# Regressor matrix with columns [lemp, lcap].
x = np.hstack([l, k])


In [None]:
# Filter the data for odd years
dat_odd_years = dat[dat['year'] % 2 != 0].copy()

# Update T to the number of years that remain after filtering
T = dat_odd_years.year.unique().size
assert dat_odd_years.shape[0] == N*T, f'Error: data is not a balanced panel'
print(f'Data has N={N} and T={T}')

# Rebuild the numpy arrays from the filtered panel. Without this step y, l, k
# and x would still hold every year while T describes the odd-year panel, so
# any routine that slices the arrays into blocks of T rows per firm would mix
# observations across firms.
y = dat_odd_years.ldsa.values.reshape((N*T,1))
l = dat_odd_years.lemp.values.reshape((N*T,1))
k = dat_odd_years.lcap.values.reshape((N*T,1))
x = np.hstack([l, k])

# Labels for the dependent and independent variables (same column order as x)
label_y = 'Log deflated sales'
label_x = [
    'log of employment',
    'log of adjusted capital stock'
    ]

## Pooled OLS

In [None]:
# Estimate model using OLS on the untransformed y and x.
# NOTE(review): robust_se is passed as the string 'True', not the bool True;
# confirm how lm.estimate interprets this argument.
ols_result = lm.estimate(y,x, transform='', T=T, robust_se='True')

# Print table
lm.print_table((label_y, label_x), ols_result, title="Pooled OLS", floatfmt='.4f')

## FE model

In [None]:
# Within (demeaning) matrix: Q_T = I_T - (1/T) * ones(T, T).
# lm.perm presumably applies the matrix to each unit's block of T rows -- confirm in Project_1.
Q_T = np.eye(T) - np.tile(1/T, (T, T))
y_dot = lm.perm(Q_T, y)
x_dot = lm.perm(Q_T, x)

# Remove the columns that are only zeroes (and the matching labels)
x_dot, label_x_dot = lm.remove_zero_columns(x_dot, label_x)

# Estimate on the demeaned data and print the table
fe_result = lm.estimate(y_dot, x_dot, transform='fe', T=T, robust_se='True')
lm.print_table((label_y, label_x_dot), fe_result, title="Fixed Effects", floatfmt='.4f')

## FD model

In [None]:
# First-difference matrix of shape (T-1, T): each row has 1 on the diagonal
# position and -1 on the position before it; the first row of the T x T
# matrix is dropped because it has no predecessor.
D_T = (np.eye(T) - np.eye(T, k=-1))[1:]

y_diff = lm.perm(D_T, y)
x_diff = lm.perm(D_T, x)

# Remove the columns that are only zeroes (and the matching labels)
x_diff, label_x_diff = lm.remove_zero_columns(x_diff, label_x)

# Estimate on the differenced data; T-1 rows per unit remain after differencing
fd_result = lm.estimate(y_diff, x_diff, transform='fd', T=T-1, robust_se='True')
lm.print_table((label_y, label_x_diff), fd_result, title="First Difference", floatfmt='.4f')

## RE

In [None]:
# Between transformation: the (1, T) row vector P_T holds 1/T in every entry,
# so it averages T rows into a single row.
P_T = np.ones((1,T)) * 1/T

y_mean = lm.perm(P_T, y)
x_mean = lm.perm(P_T, x)

# Estimate the between regression on the averaged data
be_result = lm.estimate(y_mean, x_mean, transform='be', T=T, robust_se='True')
lm.print_table((label_y, label_x), be_result, title="Between Estimator", floatfmt='.4f')

# Calculate lambda (note lambda is a reserved keyword in Python, so we use _lambda instead)
# sigma2_u comes from the FE fit and sigma2_w from the between fit.
sigma2_u = fe_result['sigma2']
sigma2_w = be_result['sigma2']
sigma2_c = sigma2_w - 1/T * sigma2_u
_lambda = 1 - np.sqrt(sigma2_u / (sigma2_u + T*sigma2_c))

# Print lambda 
print(f'Lambda is approximately equal to {_lambda.item():.4f}.')

# Quasi-demeaning matrix C_T = I_T - lambda * P_T. P_T is (1, T) and broadcasts
# against the (T, T) identity, so lambda/T is subtracted from every entry.
# The sign matters: the previous form (-I_T + lambda * P_T) produced the
# negative of the quasi-demeaned data. The coefficient estimates were unchanged
# because the sign cancels, but y_re and x_re themselves were flipped.
C_T = np.eye(T) - _lambda * P_T
y_re = lm.perm(C_T, y)
x_re = lm.perm(C_T, x)

# Estimate on the quasi-demeaned data
re_result = lm.estimate(y_re, x_re, transform='re', T=T, robust_se='True')
lm.print_table((label_y, label_x), re_result, title="Random Effects", floatfmt='.4f')

### Test for constant returns to scale

#### Fixed effects

In [None]:
# Define null hypothesis: R * b_hat = 1 (sum of first two coefficients equals 1)
R = np.array([[1, 1]])
r = np.array([[1]])

# Extract b_hat and covariance matrix from the FE fit
b_hat = fe_result['b_hat']  # Estimated coefficients
cov = fe_result['cov']      # Covariance matrix of coefficients

# Perform Wald test; lm.wald_test returns (statistic, critical value, p-value)
w_stat, crit_val, p_value = lm.wald_test(b_hat, cov, R, r)

print(f'The test statistic is {w_stat.item():.2f}.')
print(f'The critical value at a 5% significance level is {crit_val:.2f}.')
print(f'The p-value is {p_value:.8f}.')

# Reject when the statistic exceeds the critical value
if w_stat > crit_val:
    print(f"Reject null hypothesis: We reject CRS for the FE-estimation - P-value of: {p_value:.4f}.")
else:
    print(f"Fail to reject null hypothesis: We cannot reject CRS for the FE-estimation. P-value of: {p_value:.4f}.")

#### First differences

In [None]:
# Extract b_hat and covariance matrix from the FD fit
b_hat = fd_result['b_hat']  # Estimated coefficients
cov = fd_result['cov']      # Covariance matrix of coefficients

# Perform Wald test. R and r are not defined in this cell; they must already
# exist in the session (the restriction matrices of the constant-returns test).
w_stat, crit_val, p_value = lm.wald_test(b_hat, cov, R, r)

print(f'The test statistic is {w_stat.item():.2f}.')
print(f'The critical value at a 5% significance level is {crit_val:.2f}.')
print(f'The p-value is {p_value:.8f}.')

# Reject when the statistic exceeds the critical value
if w_stat > crit_val:
    print(f"Reject null hypothesis: We reject CRS for the FD-estimation - P-value of: {p_value:.4f}.")
else:
    print(f"Fail to reject null hypothesis: We cannot reject CRS for the FD-estimation. P-value of: {p_value:.4f}.")

## Robustness Tests

### Hausman test

In [None]:
# Coefficient vectors and covariance matrices of the FE and RE fits
b_fe, b_re = fe_result['b_hat'], re_result['b_hat']
cov_fe, cov_re = fe_result['cov'], re_result['cov']

# Hausman statistic: quadratic form of the coefficient gap in the inverse of
# the covariance gap.
# NOTE(review): both fits were requested with robust_se='True'; the classical
# Hausman test is normally built on non-robust covariances -- confirm this is
# intended.
b_diff = b_fe - b_re
cov_diff = cov_fe - cov_re
H = b_diff.T @ la.inv(cov_diff) @ b_diff

# chi^2 reference distribution with one degree of freedom per compared
# coefficient; critical value at the 5% level and the matching p-value
M = b_diff.shape[0]
crit_val = chi2.ppf(0.95, M)
p_val = 1 - chi2.cdf(H.item(), M)

# Report the statistic, the critical value and the p-value
print(f'The test statistic is {H.item():.2f}.')
print(f'The critical value at a 5% significance level is {crit_val:.2f}.')
print(f'The p-value is {p_val:.8f}.')

# Decision rule: reject when the statistic exceeds the critical value
if H > crit_val:
    print(f"Reject null hypothesis: Prefer FE estimator over RE estimator, since the test statistic is greater than the critical value: {H.item():.2f} > {crit_val:.2f}.")
else:
    print("Fail to reject null hypothesis: Prefer RE estimator over FE estimator.")

### Test for serial correlation

Tests assumption FD.3, where the errors $e_{it} = \Delta u_{it}$ should be serially uncorrelated.

In [None]:
# Regress OLS residuals on their own first lag to check for serial correlation
def serial_corr(y, x, T):
    """Regress the OLS residuals of y on x on their one-period lag.

    Args:
        y: dependent variable, stacked in blocks of T rows per individual.
        x: regressors, stacked the same way as y.
        T: number of rows per individual in y and x.

    Returns:
        Whatever lm.estimate returns for the regression of the current
        residual on the lagged residual, with T-1 rows per individual.
    """
    # Residuals from the OLS fit of y on x
    residuals = y - x @ lm.est_ols(y, x)

    # (T-1, T) selector picking the previous period's row for periods 2..T
    lag_selector = np.eye(T, k=-1)[1:]
    lagged_residuals = lm.perm(lag_selector, residuals)

    # (T-1, T) selector dropping the first row of every individual, so the
    # current residuals line up with their lags
    drop_first_selector = np.eye(T)[1:]
    current_residuals = lm.perm(drop_first_selector, residuals)

    # Regress the current residual on the lagged residual
    return lm.estimate(current_residuals, lagged_residuals, T=T-1)

In [None]:
# Estimate serial correlation in the residuals of the first-difference
# regression; the differenced data has T-1 rows per individual.
corr_result = serial_corr(y_diff, x_diff, T-1)

# Print results (labels use Unicode subscripts: e_it and e_it-1)
label_ye = 'OLS residual, e\u1d62\u209c'
label_e = ['e\u1d62\u209c\u208B\u2081']
lm.print_table(
    (label_ye, label_e), corr_result, 
    title='Serial Correlation', floatfmt='.4f'
)

### Test for strict exogeneity

#### Testing FE.1 ####

In [None]:
# Lead employment: F_T is a (T-1, T) selector picking the next period's row.
# Column 0 of x is log employment.
F_T = np.eye(T, k=1)[:-1]
empl_lead = lm.perm(F_T, x[:, 0].reshape(-1, 1))

# Remove the last observed year for every individual (it has no lead)
I_T = np.eye(T, k=0)[:-1]

x_exo = lm.perm(I_T, x)
y_exo = lm.perm(I_T, y)

# Add empl_lead to x_exo as an extra regressor
x_exo = np.hstack((x_exo, empl_lead))

# Within transform the data. NOTE: this rebinds the global Q_T to the
# (T-1) x (T-1) demeaning matrix.
Q_T = np.eye(T-1) - np.tile(1/(T-1), ((T-1), (T-1))) #Demeaning matrix
yw_exo = lm.perm(Q_T, y_exo)
xw_exo = lm.perm(Q_T, x_exo)

# Estimate model
exo_test = lm.estimate(yw_exo, xw_exo, T=T-1, transform='fe', robust_se='True')

# Print results
label_exo = label_x + ['Employment lead']
lm.print_table((label_y, label_exo), exo_test, title='Exogeneity FE test', floatfmt='.4f')


In [None]:
# Lead capital: F_T is a (T-1, T) selector picking the next period's row.
# Column 1 of x is log capital.
F_T = np.eye(T, k=1)[:-1]
cap_lead = lm.perm(F_T, x[:, 1].reshape(-1, 1))

# Remove the last observed year for every individual (it has no lead)
I_T = np.eye(T, k=0)[:-1]

x_exo = lm.perm(I_T, x)
y_exo = lm.perm(I_T, y)

# Add cap_lead to x_exo as an extra regressor
x_exo = np.hstack((x_exo, cap_lead))

# Within transform the data. NOTE: this rebinds the global Q_T to the
# (T-1) x (T-1) demeaning matrix.
Q_T = np.eye(T-1) - np.tile(1/(T-1), ((T-1), (T-1))) #Demeaning matrix
yw_exo = lm.perm(Q_T, y_exo)
xw_exo = lm.perm(Q_T, x_exo)

# Estimate model
exo_test = lm.estimate(yw_exo, xw_exo, T=T-1, transform='fe', robust_se='True')

# Print results
label_exo = label_x + ['Capital lead']
lm.print_table((label_y, label_exo), exo_test, title='Exogeneity FE test', floatfmt='.4f')


"employment lead" is significantly different from 0, meaning that we can reject strict exogeneity.

In [None]:
# Lead capital and employment: F_T is a (T-1, T) selector picking the next
# period's row. Column 0 of x is log employment, column 1 is log capital.
F_T = np.eye(T, k=1)[:-1]
empl_lead = lm.perm(F_T, x[:, 0].reshape(-1, 1))
cap_lead = lm.perm(F_T, x[:, 1].reshape(-1, 1))

# Remove the last observed year for every individual (it has no lead)
I_T = np.eye(T, k=0)[:-1]

x_exo = lm.perm(I_T, x)
y_exo = lm.perm(I_T, y)

# Add both leads (employment, then capital) to x_exo as extra regressors
x_exo = np.hstack((x_exo, empl_lead, cap_lead))

# Within transform the data. NOTE: this rebinds the global Q_T to the
# (T-1) x (T-1) demeaning matrix.
Q_T = np.eye(T-1) - np.tile(1/(T-1), ((T-1), (T-1))) #Demeaning matrix
yw_exo = lm.perm(Q_T, y_exo)
xw_exo = lm.perm(Q_T, x_exo)

# Estimate model
exo_test = lm.estimate(yw_exo, xw_exo, T=T-1, transform='fe', robust_se='True')

# Print results
label_exo = label_x + ['Employment lead'] + ['Capital lead']
lm.print_table((label_y, label_exo), exo_test, title='Exogeneity FE test', floatfmt='.4f')


#### Testing FD.1

In [None]:
# First-differenced employment and capital (columns 0 and 1 of x_diff),
# plus employment in levels as the extra regressor.
l_delta = x_diff[:,0].reshape(-1,1)
k_delta = x_diff[:,1].reshape(-1,1)
l_level = l

# Align dimensions over time: delete every T-th entry starting at index 0
# (the first row of each block of T), so T-1 rows per block remain.
l_level = np.delete(l_level, np.arange(0, l_level.shape[0], T)).reshape(-1,1)

# Stacking in X_delta: [d.employment, d.capital, employment level]
x_delta = np.column_stack((l_delta, k_delta, l_level))

# Estimate the regression by OLS
exo_fd = lm.estimate(y=y_diff, x=x_delta, transform='', T=T-1, robust_se='True')

# Print results
label_exofd = label_x + ['Employment level']
lm.print_table((label_y, label_exofd), exo_fd, title='Exogeneity FD test', floatfmt='.4f')
 

"employment level" is not significantly different from 0, meaning that we cannot reject strict exogeneity.

In [None]:
# First-differenced employment and capital (columns 0 and 1 of x_diff),
# plus capital in levels as the extra regressor.
l_delta = x_diff[:,0].reshape(-1,1)
k_delta = x_diff[:,1].reshape(-1,1)
k_level = k

# Align dimensions over time: delete every T-th entry starting at index 0
# (the first row of each block of T), so T-1 rows per block remain.
k_level = np.delete(k_level, np.arange(0, k_level.shape[0], T)).reshape(-1,1)

# Stacking in X_delta: [d.employment, d.capital, capital level]
x_delta = np.column_stack((l_delta, k_delta, k_level))

# Estimate the regression by OLS
exo_fd = lm.estimate(y=y_diff, x=x_delta, transform='', T=T-1, robust_se='True')

# Print results
label_exofd = label_x + ['Capital level']
lm.print_table((label_y, label_exofd), exo_fd, title='Exogeneity FD test', floatfmt='.4f')
 

"capital level" is not significantly different from 0, meaning that we cannot reject strict exogeneity.

In [None]:
# First-differenced employment and capital (columns 0 and 1 of x_diff),
# plus both variables in levels as extra regressors.
l_delta = x_diff[:,0].reshape(-1,1)
k_delta = x_diff[:,1].reshape(-1,1)
k_level = k
l_level = l

# Align dimensions over time: delete every T-th entry starting at index 0
# (the first row of each block of T), so T-1 rows per block remain.
l_level = np.delete(l_level, np.arange(0, l_level.shape[0], T)).reshape(-1,1)
k_level = np.delete(k_level, np.arange(0, k_level.shape[0], T)).reshape(-1,1)

# Stacking in X_delta: [d.employment, d.capital, employment level, capital level]
x_delta = np.column_stack((l_delta, k_delta, l_level, k_level))

# Estimate the regression by OLS
exo_fd = lm.estimate(y=y_diff, x=x_delta, transform='', T=T-1, robust_se='True')

# Print results
label_exofd = label_x + ['Employment level'] + ['Capital level']
lm.print_table((label_y, label_exofd), exo_fd, title='Exogeneity FD test', floatfmt='.4f')


In [None]:
# F-test of the joint exclusion of the two level regressors from the FD model:
# restricted model = plain FD regression, unrestricted = FD regression
# augmented with employment and capital in levels.
from scipy import stats

# Get the Residual Sum of Squares (RSS) of the restricted and augmented fits
RSS_fd = fd_result['SSR']
RSS_fdlevel = exo_fd['SSR']

# Number of restrictions (q) - the two level regressors (l_level and k_level)
q = 2

# Number of observations actually used in the FD regressions. This is the row
# count of the differenced data, not len(dat), which counts the raw rows.
n = y_diff.shape[0]

# Number of parameters in the augmented model (p_aug)
x_delta = np.column_stack((l_delta, k_delta, l_level, k_level))
p_aug = x_delta.shape[1]

# Denominator degrees of freedom. The regressions contain no intercept, so
# nothing beyond the p_aug slopes is subtracted. The same value is used for
# the statistic and for the p-value.
df_denom = n - p_aug

# Compute the F-statistic
F_stat = ((RSS_fd - RSS_fdlevel) / q) / (RSS_fdlevel / df_denom)

# Upper-tail probability; sf is the survival function 1 - cdf
p_value = stats.f.sf(F_stat, q, df_denom)

print(f"F-statistic: {F_stat}")
print(f"P-value: {p_value}")