# Project 1: Production Technology

The dataset contains `N = 441` firms observed over `T = 12` years, 1968-1979. There variables are: 
* `lcap`: Log of capital stock, $k_{it}$ 
* `lemp`: log of employment, $\ell_{it}$ 
* `ldsa`: log of deflated sales, $y_{it}$
* `year`: the calendar year of the observation, `year` $ = 1968, ..., 1979$, 
* `firmid`: anonymized indicator variable for the firm, $i = 1, ..., N$, with $N=441$. 

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
from numpy import linalg as la
from tabulate import tabulate
from scipy.stats import chi2
import estimationModule as em

# Data

In [2]:
dat = pd.read_csv('firms.csv')
dat.head()

Unnamed: 0,firmid,year,lcap,lemp,ldsa
0,1,1968,0.998602,-0.242185,0.349053
1,1,1969,0.925214,-0.241278,0.312492
2,1,1970,0.879616,-0.265134,0.347566
3,1,1971,0.843098,-0.317875,0.234744
4,1,1972,0.82328,-0.372247,0.182199


In [3]:
## Extract dimension
N = dat.firmid.unique().size
T = dat.year.unique().size
assert dat.shape[0] == N*T, f'Error: data is not a balanced panel'
print(f'Data has N={N} and T={T}')

## From pandas columns to np arrays
y = dat.ldsa.values.reshape((N*T,1))
ones = np.ones((N*T,1))
l = dat.lemp.values.reshape((N*T,1))
k = dat.lcap.values.reshape((N*T,1))
X = np.hstack([ones, l, k])

Data has N=441 and T=12


# Estimation

In [4]:
## Labels
label_y=['log(sales)']
label_x_constant=['constant', 'lemp', 'lcap']
label_x=['lemp', 'lcap']

labels = (label_y, label_x)
labels_constant = (label_y, label_x_constant)

In [5]:
## Estimate Pooled OLS
pols_res = em.estimate(y, X, transform='', t=T, robust_se=True) 

In [6]:
## Print results
em.print_table(labels=labels_constant, results=pols_res ,title="Pooled OLS results\n", floatfmt='.4f')

Pooled OLS results

Dependent variable: ['log(sales)']

            Beta      Se    t-values
--------  ------  ------  ----------
constant  0.0000  0.0161      0.0000
lemp      0.6748  0.0366     18.4526
lcap      0.3100  0.0324      9.5810
R² = 0.914
σ² = 0.131


# Fixed effects

In [7]:
## Demean variables
# Demeaning matrix
def fe_matrix(t):
    return np.eye(t)-np.tile(1/t, (t,t))

Q_T = fe_matrix(T)

# Perform demeaning
x_fe = em.perm(Q_T, X)
y_fe = em.perm(Q_T,y)

# Matrix of regressors must be invertable
assert la.matrix_rank(x_fe) > 0

# Remove zero columns
x_fe = em.remove_zero_cols(x_fe)

In [8]:
## Estimate fixed effects
fe_res = em.estimate(y_fe, x_fe, transform='fe', t=T, robust_se=True) 

In [9]:
## Print results
em.print_table(labels=labels, results=fe_res, title='Fixed effect results\n', floatfmt='.4f')

Fixed effect results

Dependent variable: ['log(sales)']

        Beta      Se    t-values
----  ------  ------  ----------
lemp  0.6942  0.0417     16.6674
lcap  0.1546  0.0299      5.1630
R² = 0.477
σ² = 0.018


# First difference

In [10]:
## First difference variables
# First difference matrix
def fd_matrix(t): 
    return np.eye(t-1,t,1)-np.eye(t-1,t)

D_T = fd_matrix(T)

# Perform the first differencing
x_fd = em.perm(D_T, X)
y_fd = em.perm(D_T, y)

# Matrix of regressors must be invertable
assert la.matrix_rank(x_fd) > 0

# Remove zero columns
x_fd = em.remove_zero_cols(x_fd)

In [11]:
## Estimate first difference
fd_res = em.estimate(y_fd, x_fd, transform='fd', t=T-1, robust_se=True) 

In [12]:
## Print results
em.print_table(labels=labels, results=fd_res, title="First difference results\n", floatfmt='.4f')

First difference results

Dependent variable: ['log(sales)']

        Beta      Se    t-values
----  ------  ------  ----------
lemp  0.5487  0.0292     18.8191
lcap  0.0630  0.0232      2.7097
R² = 0.165
σ² = 0.014


# RE effects

In [13]:
## Between effects estimator
# Demeaning matrix
def be_matrix(t):
    return np.tile(1/t, (t,t))

P_T = be_matrix(T)

# Deamean variables
x_be = em.perm(P_T, X)
y_be = em.perm(P_T, y)

In [14]:
## Estimate between effects
be_res = em.estimate(y_be, x_be, transform='be',t=T)

In [15]:
## Print results
em.print_table(labels=labels_constant, results=be_res, title='Between Estimator results\n', floatfmt='.4f')

Between Estimator results

Dependent variable: ['log(sales)']

            Beta      Se    t-values
--------  ------  ------  ----------
constant  0.0000  0.0046      0.0000
lemp      0.6672  0.0099     67.6130
lcap      0.3188  0.0089     35.8720
R² = 0.923
σ² = 0.114


In [16]:
## Extract variances in order to create lambda_hat
sigma_u = fe_res.get('sigma2')
sigma_v = be_res.get('sigma2')
sigma_c = sigma_v - (1/T*sigma_u)

lambda_hat = 1 - np.sqrt(sigma_u/(sigma_u+T*sigma_c))

In [17]:
## Compute the random effects tranformation matrix
C_T = np.eye(T) - lambda_hat*P_T

x_re = em.perm(C_T, X)
y_re = em.perm(C_T, y)

In [18]:
## Estimate random effects
re_res = em.estimate(y_re, x_re, transform='re', t=T)

In [19]:
## Print results
em.print_table(labels=labels_constant, results=re_res, title='Random Effects results\n', floatfmt='.4f')

Random Effects results

Dependent variable: ['log(sales)']

            Beta      Se    t-values
--------  ------  ------  ----------
constant  0.0000  0.0162      0.0000
lemp      0.7197  0.0131     54.8444
lcap      0.1990  0.0117     17.0431
R² = 0.643
σ² = 0.018


# Hausman Test

The Hausman test is given by:
$$H = \left(\hat{\beta}_{RE} - \hat{\beta}_{FE}\right)^{'} \left(\hat{Avar}(\beta_{RE}) - \hat{Avar}(\beta_{FE})\right)^{-1} \left(\hat{\beta}_{RE} - \hat{\beta}_{FE}\right)$$

In [31]:
## Retrieve parameters from result dicts
# Difference in beta parameters
beta_diff = re_res.get('b_hat')[1:] - fe_res.get('b_hat')

# Difference in covariance matrices
avar_fe = fe_res.get('cov')
avar_re = re_res.get('cov')
cov_diff = avar_re[1:,1:] - avar_fe

In [32]:
## The Hausman test value...
H = beta_diff.T@la.inv(cov_diff)@beta_diff

# ... and the corresponding p-value
p_val = chi2.sf(H.item(), 4)

In [33]:
# Print results
def print_h_test(fe_res, re_res, beta_diff, p_val):
    table = []
    for i in range(len(beta_diff)):
        row = [
            fe_res['b_hat'][i], re_res['b_hat'][1:][i], beta_diff[i]
        ]
        table.append(row)

    print(tabulate(
        table, headers=['b_fe', 'b_re', 'b_diff'], floatfmt='.4f'
        ))
    print(f'\nThe Hausman test statistic is: {H.item():.2f}, with p-value: {p_val:.2f}.')

print_h_test(fe_res, re_res, beta_diff, p_val)

  b_fe    b_re    b_diff
------  ------  --------
0.6942  0.7197    0.0255
0.1546  0.1990    0.0444

The Hausman test statistic is: -6.17, with p-value: 1.00.


# we reject the null -> $c_{i}$ is correlated with $x_{i}$

In [23]:
def test_homogeneity(results):
    betas = results['b_hat']
    Avar = results['cov']
    
    if betas.shape[0] == 3:
        R = np.array([[0,1,1]])
        r = np.array([[1]])
        
    else:
        R = np.array([[1,1]])
        r = np.array([[1]])
              
    W = (R@betas-r).T@la.inv(R@Avar@R.T)@(R@betas-r)
    p_val = chi2.sf(W.item(), 2)
    return W.item(), p_val

for result, name in zip([pols_res, fe_res, re_res, fd_res], ['POLS', 'FE', 'RE', 'FD']):
    test = test_homogeneity(result)
    print('{:^44s}'.format(name))
    print('--------------------------------------------')
    print(f'Beta_1 + Beta_2 = 1: W = {test[0]:.2f} and p = {test[1]:.2f}')
    print('--------------------------------------------\n')

                    POLS                    
--------------------------------------------
Beta_1 + Beta_2 = 1: W = 1.56 and p = 0.46
--------------------------------------------

                     FE                     
--------------------------------------------
Beta_1 + Beta_2 = 1: W = 19.40 and p = 0.00
--------------------------------------------

                     RE                     
--------------------------------------------
Beta_1 + Beta_2 = 1: W = 74.07 and p = 0.00
--------------------------------------------

                     FD                     
--------------------------------------------
Beta_1 + Beta_2 = 1: W = 150.03 and p = 0.00
--------------------------------------------



$$W \equiv (\boldsymbol{R} \hat{\beta} - \boldsymbol{r})'[\boldsymbol{R} Avar(\hat{\beta})R']^{-1}(\boldsymbol{R} \hat{\beta} - \boldsymbol{r})$$

$$(QxK @ Kx1)' -> 1xQ$$

$$Q x K @ K x K @ K x Q$$

In [24]:
year = np.array(dat.year, dtype=int)
year.shape

(5292,)

In [25]:
reduced_year = year[year != 1968]

def serial_corr(y, x, t, year):
    b_hat = em.est_ols(y, x)
    e = y - x@b_hat
    
    # Create a lag to estimate the error on.
    L_T = np.eye(t, k=-1)
    L_T = L_T[1:]

    e_l = em.perm(L_T, e)

    # We then need to remove the first obs for every person again.
    e = e[year != 1969]

    return em.estimate(e, e_l)

corr_result = serial_corr(y_fd, x_fd, T-1, reduced_year)

label_ye = 'OLS residual, e\u1d62\u209c'
label_e = ['e\u1d62\u209c\u208B\u2081']
em.print_table((label_ye, label_e), corr_result, title='Serial Correlation', floatfmt='.4f')

Serial Correlation
Dependent variable: OLS residual, eᵢₜ

          Beta      Se    t-values
-----  -------  ------  ----------
eᵢₜ₋₁  -0.1987  0.0148    -13.4493
R² = 0.039
σ² = 0.014


In [26]:
def demeaning_matrix(t):
    Q_T = np.eye(t) - np.tile(1/t, (t, t))
    return Q_T

In [27]:
def demeaning_matrix(t):
    Q_T = np.eye(t) - np.tile(1/t, (t, t))
    return Q_T

def exogeneity_test(x, y, t, year, var):
    # Create lead
    F_T = np.eye(t, k=1)
    F_T = F_T[:-1]

    # Choose var
    if var == 'l':
        lead = em.perm(F_T, x[:, 1].reshape(-1, 1))
        label_exo = ["lemp","lcap"] + ['Labor lead']
    if var == 'k':
        lead = em.perm(F_T, x[:, 2].reshape(-1, 1))
        label_exo = ["lemp","lcap"] + ['Capital lead']

    # Collect variables to test for exogeneity
    x_exo = x[year != 1979]
    x_exo = np.hstack((x_exo, lead))
    y_exo = y[year != 1979]

    # Within transform the data
    Q_T = demeaning_matrix(t - 1)

    yw_exo = em.perm(Q_T, y_exo)
    xw_exo = em.perm(Q_T, x_exo)
    xw_exo = xw_exo[:, 1:]
    
    n = y.size/t
    # Estimate model
    exo_test = em.estimate(yw_exo, xw_exo, t=t - 1, transform='fe')

    em.print_table((label_y, label_exo), exo_test, title=f'Exogeneity test for {var}', floatfmt='.4f')
    print('\n')

for var in ['k', 'l']:
    exogeneity_test(X, y, T, year, var=var)

Exogeneity test for k
Dependent variable: ['log(sales)']

                Beta      Se    t-values
------------  ------  ------  ----------
lemp          0.6479  0.0162     39.9359
lcap          0.0210  0.0231      0.9093
Capital lead  0.1793  0.0258      6.9500
R² = 0.474
σ² = 0.016


Exogeneity test for l
Dependent variable: ['log(sales)']

              Beta      Se    t-values
----------  ------  ------  ----------
lemp        0.5681  0.0231     24.5458
lcap        0.1495  0.0134     11.1375
Labor lead  0.1532  0.0225      6.8037
R² = 0.473
σ² = 0.016


