# Project 1: Production Technology

The dataset contains `N = 441` firms observed over `T = 12` years, 1968-1979. The variables are: 
* `lcap`: Log of capital stock, $k_{it}$ 
* `lemp`: log of employment, $\ell_{it}$ 
* `ldsa`: log of deflated sales, $y_{it}$
* `year`: the calendar year of the observation, `year` $ = 1968, ..., 1979$, 
* `firmid`: anonymized indicator variable for the firm, $i = 1, ..., N$, with $N=441$. 

In [1]:
import pandas as pd 
import numpy as np 
import seaborn as sns
from numpy import linalg as la
from tabulate import tabulate
from scipy.stats import chi2

In [2]:
# Sort by firm and then by year: the reshapes and the block-wise panel
# transformations further down treat every T consecutive rows as one firm,
# so the row order must not depend on how the CSV happens to be stored.
dat = (
    pd.read_csv('firms.csv')
    .sort_values(['firmid', 'year'])
    .reset_index(drop=True)
)
dat.head()

Unnamed: 0,firmid,year,lcap,lemp,ldsa
0,1,1968,0.998602,-0.242185,0.349053
1,1,1969,0.925214,-0.241278,0.312492
2,1,1970,0.879616,-0.265134,0.347566
3,1,1971,0.843098,-0.317875,0.234744
4,1,1972,0.82328,-0.372247,0.182199


# Converting data to numpy format 

In [6]:
N = dat.firmid.unique().size
T = dat.year.unique().size
# The row count alone can equal N*T by coincidence (e.g. one firm with a
# duplicated year and another with a missing year), so additionally require
# that every firm is observed in exactly T distinct years.
assert dat.shape[0] == N*T, 'Error: data is not a balanced panel'
assert (dat.groupby('firmid').year.nunique() == T).all(), 'Error: data is not a balanced panel'
print(f'Data has N={N} and T={T}')

Data has N=441 and T=12


Extract data from `pandas` to `numpy` arrays. 

In [7]:
# Column vectors of shape (N*T, 1), one per variable.
y = dat['ldsa'].to_numpy().reshape((N*T, 1))
l = dat['lemp'].to_numpy().reshape((N*T, 1))
k = dat['lcap'].to_numpy().reshape((N*T, 1))
ones = np.ones((N*T, 1))

# Regressor matrix with columns [constant, lemp, lcap].
X = np.column_stack((ones, l, k))

In [8]:
def _robust_cov(x, resid, t=None):
    """Sandwich covariance (X'X)^-1 [sum_i s_i s_i'] (X'X)^-1 with s_i = X_i'u_i.

    With ``t`` given, every ``t`` consecutive rows form one block (individual),
    so arbitrary correlation within a block is allowed. Without ``t`` each row
    is its own block, which gives the heteroskedasticity-robust (White) form.
    """
    k = x.shape[1]
    block = int(t) if t else 1
    if x.shape[0] % block != 0:
        raise ValueError('Number of rows in x is not a multiple of t.')

    # Row-wise scores x_it * u_it, summed within each block to X_i'u_i.
    scores = (x*resid.reshape(-1, 1)).reshape(-1, block, k).sum(axis=1)
    bread = la.inv(x.T@x)
    cov = bread@(scores.T@scores)@bread
    se = np.sqrt(np.diag(cov)).reshape(-1, 1)
    return cov, se


def estimate( y, x, transform='', n=None, t=None,robust=False):
    """Takes some np.arrays and estimates regular OLS, FE or FD.

    The data must already be transformed (demeaned, differenced, ...);
    `transform` only selects the degrees-of-freedom correction used for the
    error variance.

    Args:
        y (np.array): The dependent variable, needs to have the shape (n*t, 1)
        x (np.array): The independent variable(s). If only one independent 
        variable, then it needs to have the shape (n*t, 1).
        transform (str, optional): Specify if estimating fe or fd, in order 
        to get correct variance estimation. Defaults to ''.
        n (int, optional): Number of observations. If panel, then the 
        number of individuals. Accepted for compatibility; it is not used in
        the computations. Defaults to None.
        t (int, optional): If panel, then the number of periods an 
        individual is observed. Defaults to None.
        robust (bool, optional): If True, 'cov', 'se' and 't_values' are
        based on a sandwich covariance estimator that treats every `t`
        consecutive rows as one cluster (each row on its own when `t` is
        None). Defaults to False.

    Raises:
        ValueError: If `robust` is True and the number of rows in `x` is not
        a multiple of `t`.

    Returns:
        dict: A dictionary with the results from the ols-estimation, with
        keys 'b_hat', 'se', 'sigma', 't_values', 'R2' and 'cov'.
    """
    
    b_hat = np.dot(la.inv(x.T@x),x.T@y)
    resid = y - x@b_hat
    SSR = np.square(resid).sum()
    SST = np.square(y-np.mean(y)).sum()
    R2 = 1- SSR/SST

    # The error variance always comes from the classical formula; the robust
    # option only replaces the covariance matrix and the standard errors.
    sigma, cov, se = variance(transform, SSR, x, t)
    if robust:
        cov, se = _robust_cov(x, resid, t)
    t_values =  b_hat.reshape(-1,1) / se.reshape(-1,1)

    names = ['b_hat', 'se', 'sigma', 't_values', 'R2', 'cov']
    results = [b_hat, se, sigma, t_values, R2, cov]
    return dict(zip(names, results))

In [9]:
def variance(transform: str, SSR: float, x: np.array, t: int,robust=False, resid=None) -> tuple:
    """Calculates the covariance and standard errors from the OLS
    estimation.

    Args:
        >> transform (str): Defaults to ''. If the data is transformed in 
        any way, the following transformations are allowed (case-insensitive):
            '': No transformations
            'fd': First-difference
            'be': Between transformation
            'fe': Within transformation
            're': Random effects estimation
        >> SSR (float): Sum of squared residuals
        >> x (np.array): Independent variables (regressors) from regression
        >> t (int): The number of time periods in x.
        >> robust (bool): If True, return a sandwich covariance estimate
        instead of the classical one. Defaults to False.
        >> resid (np.array): Residuals of the regression, one per row of x.
        Only needed, and then required, when robust is True.

    Raises:
        ValueError: If an invalid transformation is provided, if robust is
        True but no residuals are given, or if the rows of x cannot be split
        into blocks of t rows.

    Returns:
        tuple: If robust is False: the error variance (mean square error), 
        covariance matrix and standard errors. If robust is True: the robust
        covariance matrix and the standard errors.
    """

    # Validate first, so that a bad transform is reported as such instead of
    # failing in the degrees-of-freedom arithmetic below.
    kind = transform.lower()
    if kind not in ('', 'fd', 'be', 'fe', 're'):
        raise ValueError('Invalid transform provided.')

    if robust:
        if resid is None:
            raise ValueError('Robust standard errors require the residuals (resid).')
        k = x.shape[1]
        # Every t consecutive rows form one cluster; without t each row is
        # its own cluster (heteroskedasticity-robust form).
        block = int(t) if t else 1
        if x.shape[0] % block != 0:
            raise ValueError('Number of rows in x is not a multiple of t.')
        scores = (x*np.asarray(resid).reshape(-1, 1)).reshape(-1, block, k).sum(axis=1)
        bread = la.inv(x.T@x)
        cov = bread@(scores.T@scores)@bread
        se = np.sqrt(np.diag(cov))
        return cov, se

    # Store n and k, used for DF adjustments.
    k = x.shape[1]
    if kind in ('', 'fd', 'be'):
        n = x.shape[0]
        sigma2 = (np.array(SSR/(n - k)))
    else:
        # For 'fe' and 're', n is the number of individuals.
        n = x.shape[0]/t
        if kind == 'fe':
            # Demeaning uses up one degree of freedom per individual.
            sigma2 = np.array(SSR/(n * (t - 1) - k))
        else:
            sigma2 = np.array(SSR/(t * n - k))

    cov = sigma2*la.inv(x.T@x)
    se = np.sqrt(cov.diagonal()).reshape(-1, 1)
    return sigma2, cov, se
    

In [10]:
# Pooled OLS: regress y on X (constant, lemp, lcap) without any panel
# transformation, using the non-robust standard errors.
pool_ols_res = estimate(y, X, transform='', n=N, t=T,robust=False) 

In [11]:
def print_table(
        labels: tuple,
        results: dict,
        headers=["", "Beta", "Se", "t-values"],
        title="Results",
        **kwargs
    ) -> None:
    """Print a coefficient table followed by the R-squared of a regression.

    Args:
        labels (tuple): Pair (label_y, label_x): the label of the dependent
        variable and a sequence with one label per coefficient.
        results (dict): Estimation results; the entries 'b_hat', 'se',
        't_values' and 'R2' are read.
        headers (list, optional): Column headers handed to tabulate.
        title (str, optional): Line printed above the table.
        **kwargs: Passed on to tabulate (e.g. floatfmt).
    """
    label_y, label_x = labels

    # One row per regressor: label, estimate, standard error, t-value.
    table = [
        [
            name,
            results.get('b_hat')[pos],
            results.get('se')[pos],
            results.get('t_values')[pos],
        ]
        for pos, name in enumerate(label_x)
    ]

    # Header lines, then the table itself.
    print(title)
    print(f"Dependent variable: {label_y}\n")
    print(tabulate(table, headers, **kwargs))

    # Goodness of fit.
    r_squared = results.get('R2').item()
    print(f"R\u00b2 = {r_squared:.3f}")

In [12]:
# label_y is a plain string so that the table header reads
# "Dependent variable: log(sales)" instead of printing a list with brackets.
label_y = "log(sales)"
# One label per column of X: constant, log employment, log capital.
label_x = ["constant", "lemp", "lcap"]

In [13]:
# Show the pooled OLS estimates.
print_table(labels=(label_y,label_x),results=pool_ols_res,headers=["", "Beta", "Se", "t-values"],title="Results")

Results
Dependent variable: ['log(sales)']

                 Beta          Se    t-values
--------  -----------  ----------  ----------
constant  1.53588e-08  0.00497209   3.089e-06
lemp      0.674774     0.0101527   66.4625
lcap      0.310041     0.00913935  33.9237
R² = 0.914


# Fixed effects

In [14]:
# Demeaning (within) matrix: identity minus a T x T matrix filled with 1/T.
Q_T = np.eye(T) - np.full((T, T), 1/T)

In [15]:
def perm( Q_T: np.array, A: np.array, t=0) -> np.array:
    """Takes a transformation matrix and performs the transformation on 
    the given vector or matrix, one individual (block of t rows) at a time.

    Args:
        Q_T (np.array): The transformation matrix. Needs to have as many
        columns as the number of rows each individual has in A.
        
        A (np.array): The vector or matrix that is to be transformed. Has
        to be a 2d array in which every t consecutive rows belong to the
        same individual.

        t (int, optional): Number of rows per individual. The default 0
        means: infer it from the number of columns of Q_T.

    Returns:
        np.array: The transformed blocks stacked on top of each other, with
        Q_T.shape[0] rows per individual. Trailing rows of A that do not
        fill a complete block of t rows are ignored.
    """
    # We can infer t from the shape of the transformation matrix.
    if t==0:
        t = Q_T.shape[1]

    # Collect the transformed blocks and stack them once at the end; calling
    # vstack inside the loop would copy the whole result on every iteration.
    # The empty float block keeps the result well-defined (zero rows, float
    # dtype) when A holds no complete block.
    blocks = [np.empty((0, A.shape[1]))]
    for i in range(A.shape[0] // t):
        blocks.append(Q_T@A[i*t: (i + 1)*t])
    return np.vstack(blocks)

In [16]:
# Within transformation: apply the demeaning matrix Q_T to every firm's
# block of T rows in the regressors and in the dependent variable.
x_fe = perm(Q_T, X)
y_fe = perm(Q_T,y)

In [17]:
# Rank check on the demeaned regressors: the constant column of X is turned
# into zeros by Q_T, so x_fe (3 columns) cannot have full column rank.
la.matrix_rank(x_fe)

2

In [18]:
# Rank of the demeaned dependent variable (a single column).
la.matrix_rank(y_fe)

1

In [19]:
# Locate the columns that are numerically zero in every row after demeaning
# and remove them from the regressor matrix.
idx = np.argwhere(np.isclose(x_fe, 0).all(axis=0))
x_fe = np.delete(x_fe, idx, axis=1)

In [20]:
# Fixed-effects estimate: OLS on the demeaned data; transform='fe' selects
# the degrees-of-freedom correction n*(t-1)-k for the error variance.
fe_ols_res = estimate(y_fe, x_fe, transform='fe', n=N, t=T,robust=False) 

In [21]:
# Show the FE estimates; only lemp and lcap remain after the zero column was dropped.
print_table(labels=(label_y,["lemp","lcap"]), results=fe_ols_res,headers=["", "Beta", "Se", "t-values"],title="Results")

Results
Dependent variable: ['log(sales)']

          Beta         Se    t-values
----  --------  ---------  ----------
lemp  0.694226  0.0146943     47.2447
lcap  0.15462   0.0129594     11.9311
R² = 0.477


# First difference

In [22]:
def fd_mat(t):
    """Build the (t-1) x t first-difference matrix.

    Row j has -1 in column j and +1 in column j+1, so multiplying it with a
    block of t consecutive observations yields the t-1 period-to-period
    changes.
    """
    rows = np.arange(t - 1)
    D_t = np.zeros((t - 1, t))
    D_t[rows, rows] = -1.0
    D_t[rows, rows + 1] = 1.0
    return D_t

In [23]:
# (T-1) x T first-difference matrix for the full sample length.
D_T = fd_mat(T)

In [24]:
# First-difference every firm's block of T rows; each firm keeps T-1 rows.
x_dd = perm(D_T, X)
y_dd = perm(D_T,y)

In [25]:
# Locate the columns that are numerically zero in every row after
# differencing and remove them from the regressor matrix.
idx = np.argwhere(np.isclose(x_dd, 0).all(axis=0))
x_dd = np.delete(x_dd, idx, axis=1)

In [26]:
# First-difference estimate: OLS on the differenced data, which has T-1
# rows per firm (hence t=T-1).
fd_ols_res = estimate(y_dd, x_dd, transform='fd', n=N, t=T-1) 

In [27]:
# Show the FD estimates for lemp and lcap.
print_table(labels=(label_y,["lemp","lcap"]), results=fd_ols_res,headers=["", "Beta", "Se", "t-values"],title="Results")

Results
Dependent variable: ['log(sales)']

           Beta         Se    t-values
----  ---------  ---------  ----------
lemp  0.548666   0.0183111    29.9635
lcap  0.0629604  0.0190539     3.30432
R² = 0.165


# Random effects

In [28]:
# Averaging matrix: T x T with every entry equal to 1/T, so P_T @ block
# replaces each row of a firm's block with that firm's time mean.
P_T = np.full((T, T), 1/T)

In [29]:
# Between transformation: replace every observation by its firm mean.
# P_T is T x T, so each firm still contributes T (identical) rows.
x_p = perm(P_T,X)
y_p = perm(P_T,y)

In [30]:
# Drop columns that are zero in every row. A tolerance-based comparison is
# used, as in the FE and FD cells, because an exact == 0 test can miss
# columns that are zero only up to floating-point rounding.
idx = np.argwhere(np.all(np.isclose(x_p[...,:], 0), axis=0))
x_p = np.delete(x_p,idx,axis=1)

In [31]:
# Between regression: OLS on the firm means. Its error variance ('sigma')
# is used below to build the random-effects weight.
# NOTE(review): x_p holds every firm mean T times (N*T rows), so the
# reported standard errors are based on N*T rather than N observations --
# confirm this is intended.
be_result = estimate(y_p, x_p, transform='be',t=T)
print_table(
    (label_y, label_x), 
    be_result, title='BE regression', floatfmt='.4f'
)

BE regression
Dependent variable: ['log(sales)']

            Beta      Se    t-values
--------  ------  ------  ----------
constant  0.0000  0.0046      0.0000
lemp      0.6672  0.0099     67.6130
lcap      0.3188  0.0089     35.8720
R² = 0.923


In [32]:
# Variance components for the random-effects weight:
#   sigma_u: error variance from the FE regression,
#   sigma_v: error variance from the BE regression,
#   sigma_c: sigma_v - sigma_u/T.
# NOTE(review): nothing prevents sigma_c from turning out negative in
# another sample, which would make the square root in the next cell invalid.
sigma_u = fe_ols_res.get("sigma")
sigma_v = be_result.get("sigma")
sigma_c = sigma_v - 1/T*sigma_u

In [33]:
# Quasi-demeaning weight: lambda_hat = 1 - sqrt(sigma_u / (sigma_u + T*sigma_c)).
lambda_hat = 1 - np.sqrt(sigma_u/(sigma_u+T*sigma_c))

In [34]:
# Quasi-demeaning matrix: subtracts lambda_hat times the firm mean from each observation.
C_T = np.eye(T)-lambda_hat*P_T

In [35]:
# Quasi-demean regressors and dependent variable firm by firm.
x_c = perm(C_T,X)
y_c = perm(C_T,y)

In [36]:
# Random-effects estimate: OLS on the quasi-demeaned data. All three columns
# of X are kept, so the first coefficient belongs to the (transformed) constant.
re_result = estimate(y_c, x_c, transform='re',t=T)
print_table(
    (label_y, label_x), 
    re_result, title='RE regression', floatfmt='.4f'
)

RE regression
Dependent variable: ['log(sales)']

            Beta      Se    t-values
--------  ------  ------  ----------
constant  0.0000  0.0162      0.0000
lemp      0.7197  0.0131     54.8444
lcap      0.1990  0.0117     17.0431
R² = 0.643


# Hausman test

In [37]:
# Difference between the FE and RE coefficients on lemp and lcap. The [1:]
# skips the first RE coefficient (the constant), which has no FE counterpart.
hat_diff = fe_ols_res.get("b_hat")-re_result.get("b_hat")[1:]

In [38]:
# Covariance matrices of the FE and RE coefficient estimates.
avar_fe = fe_ols_res.get("cov")
avar_re = re_result.get("cov")

In [39]:
# Covariance of the coefficient difference; the RE block for the constant
# (first row and column) is left out to match the FE coefficients.
cov_diff = avar_fe - avar_re[1:,1:]

In [40]:
H = hat_diff.T@la.inv(cov_diff)@hat_diff # The Hausman test value

# This calculates the p-value of the Hausman test. Under the null, H is
# chi-squared distributed with one degree of freedom per coefficient that is
# compared, i.e. the number of rows of hat_diff (lemp and lcap here).
p_val = chi2.sf(H.item(), hat_diff.shape[0])

In [41]:
# This code takes the results that you have made, and prints a nice looking table.
def print_h_test(fe_result, re_result, hat_diff, p_val, h_stat=None):
    """Print the FE and RE coefficients side by side with the Hausman test.

    Args:
        fe_result (dict): FE estimation results; 'b_hat' is read.
        re_result (dict): RE estimation results; 'b_hat' is read and its
        first entry (the constant) is skipped.
        hat_diff (np.array): Difference between the FE and RE coefficients.
        p_val (float): p-value of the Hausman test.
        h_stat (optional): The Hausman test statistic. When omitted, the
        notebook-level variable H is used, so that existing four-argument
        calls keep working.
    """
    if h_stat is None:
        h_stat = H

    # Use the results passed in as arguments rather than notebook globals.
    b_fe = fe_result['b_hat']
    b_re = re_result['b_hat'][1:]
    table = [
        [b_fe[i], b_re[i], hat_diff[i]]
        for i in range(len(hat_diff))
    ]

    print(tabulate(
        table, headers=['b_fe', 'b_re', 'b_diff'], floatfmt='.4f'
        ))
    print(f'The Hausman test statistic is: {np.asarray(h_stat).item():.2f}, with p-value: {p_val:.2f}.')
print_h_test(fe_ols_res, re_result, hat_diff, p_val, H)

  b_fe    b_re    b_diff
------  ------  --------
0.6942  0.7197   -0.0255
0.1546  0.1990   -0.0444
The Hausman test statistic is: 73.87, with p-value: 0.00.


# We reject the null hypothesis: $c_{i}$ is correlated with $x_{i}$

In [198]:
def test_homogeneity(results):
    betas = results['b_hat']
    Avar = results['cov']
    
    if betas.shape[0] == 3:
        R = np.array([[0,1,1]])
        r = np.array([[1]])
        
    else:
        R = np.array([[1,1]])
        r = np.array([[1]])
              
    W = (R@betas-r).T@la.inv(R@Avar@R.T)@(R@betas-r)
    p_val = chi2.sf(W.item(), 2)
    return W.item(), p_val

# Run the restriction test for every estimator and print one small report each.
for name, result in (('POLS', pool_ols_res), ('FE', fe_ols_res), ('RE', re_result), ('FD', fd_ols_res)):
    test = test_homogeneity(result)
    print(
        f'                     {name}',
        '--------------------------------------------',
        f'Beta_1 + Beta_2 = 1: W = {test[0]:.2f} and p = {test[1]:.2f}',
        '--------------------------------------------\n',
        sep='\n',
    )

                     POLS
--------------------------------------------
Beta_1 + Beta_2 = 1: W = 12.98 and p = 0.00
--------------------------------------------

                     FE
--------------------------------------------
Beta_1 + Beta_2 = 1: W = 135.19 and p = 0.00
--------------------------------------------

                     RE
--------------------------------------------
Beta_1 + Beta_2 = 1: W = 74.07 and p = 0.00
--------------------------------------------

                     FD
--------------------------------------------
Beta_1 + Beta_2 = 1: W = 251.73 and p = 0.00
--------------------------------------------



$$W \equiv (\boldsymbol{R} \hat{\beta} - \boldsymbol{r})'[\boldsymbol{R} Avar(\hat{\beta})R']^{-1}(\boldsymbol{R} \hat{\beta} - \boldsymbol{r})$$

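$$(\boldsymbol{R} \hat{\beta} - \boldsymbol{r})' : \quad \big[(Q \times K)(K \times 1)\big]' \rightarrow 1 \times Q$$

$$\boldsymbol{R}\, Avar(\hat{\beta})\, \boldsymbol{R}' : \quad (Q \times K)(K \times K)(K \times Q) \rightarrow Q \times Q$$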

In [250]:
# Calendar year of every observation as a 1-D integer array.
year = dat['year'].to_numpy(dtype=int)
year.shape

(5292,)

In [251]:
def est_ols( y: np.array, x: np.array) -> np.array:
    """Ordinary least squares coefficients from regressing y on x.

    Args:
        >> y (np.array): Dependent variable, 2D array with one row per observation.
        >> x (np.array): Regressors, 2D array with one row per observation.

    Returns:
        np.array: The coefficient estimates (X'X)^-1 X'y.
    """
    xtx = x.T@x
    xty = x.T@y
    return la.inv(xtx)@xty

In [252]:
# Years that remain after first-differencing: every firm loses its first
# observation, i.e. the first sample year (taken from the data instead of
# being hard-coded).
reduced_year = year[year != year.min()]

def serial_corr(y, x, t, year):
    """Regress the OLS residuals of y on x on their own first lag.

    Args:
        y (np.array): Dependent variable, t rows per individual.
        x (np.array): Regressors, t rows per individual.
        t (int): Number of rows per individual in y and x.
        year (np.array): Year of every row of y and x.

    Returns:
        dict: Results of estimate() for the regression of the residual on
        the lagged residual.
    """
    b_hat = est_ols(y, x)
    e = y - x@b_hat
    
    # Create a lag to estimate the error on: a (t-1) x t selection matrix
    # whose row j picks period j, the lag of period j+1.
    L_T = np.eye(t, k=-1)
    L_T = L_T[1:]

    e_l = perm(L_T, e)

    # We then need to remove the first obs for every person again, because
    # it has no lagged residual. The first period is read from `year` rather
    # than hard-coded.
    year = np.asarray(year)
    e = e[year != year.min()]

    return estimate(e, e_l)

# Serial-correlation regression on the first-differenced data (T-1 rows per firm).
corr_result = serial_corr(y_dd, x_dd, T-1, reduced_year)

# Labels and title for the output table.
title = 'Serial Correlation'
label_ye = 'OLS residual, e\u1d62\u209c'
label_e = ['e\u1d62\u209c\u208B\u2081']
print_table(labels=(label_ye, label_e), results=corr_result, title=title, floatfmt='.4f')

Serial Correlation
Dependent variable: OLS residual, eᵢₜ

          Beta      Se    t-values
-----  -------  ------  ----------
eᵢₜ₋₁  -0.1987  0.0148    -13.4493
R² = 0.039


In [253]:
def demeaning_matrix(t):
    """Return the t x t matrix that subtracts the mean over t entries.

    It is the identity matrix minus a matrix with every entry equal to 1/t.
    """
    return np.eye(t) - np.full((t, t), 1/t)

In [254]:
# Inspect the last entry of the year array.
year[-1]

1979

In [255]:
# Inspect the regressor matrix (columns: constant, lemp, lcap).
X

array([[ 1.       , -0.242185 ,  0.9986017],
       [ 1.       , -0.241278 ,  0.9252139],
       [ 1.       , -0.265134 ,  0.8796163],
       ...,
       [ 1.       , -0.672649 , -0.719267 ],
       [ 1.       , -0.623978 , -0.536874 ],
       [ 1.       , -0.567195 , -0.522616 ]])

In [262]:
# NOTE: this repeats the demeaning_matrix definition from an earlier cell.
def demeaning_matrix(t):
    """Return the t x t demeaning matrix: identity minus the averaging matrix."""
    averaging = np.full((t, t), 1/t)
    return np.eye(t) - averaging

def exogeneity_test(x, y, t, year, var):
    """Fixed-effects regression augmented with the one-period lead of a regressor.

    The last period of every individual is dropped (it has no lead), the
    remaining t-1 periods are demeaned, and the results are printed with
    print_table. The dependent-variable label is taken from the notebook
    variable label_y.

    Args:
        x (np.array): Regressors with t rows per individual. The first
        column is removed after demeaning, so it is expected to be the
        constant; columns 1 and 2 are used as labor and capital.
        y (np.array): Dependent variable with t rows per individual.
        t (int): Number of periods per individual in x and y.
        year (np.array): Year of every row of x and y.
        var (str): 'l' to add the lead of column 1 (labor) or 'k' to add the
        lead of column 2 (capital).

    Raises:
        ValueError: If var is neither 'l' nor 'k'.
    """
    # Create lead: a (t-1) x t selection matrix whose row j picks period j+1.
    F_T = np.eye(t, k=1)
    F_T = F_T[:-1]

    # Choose var
    if var == 'l':
        column, lead_label = 1, 'Labor lead'
    elif var == 'k':
        column, lead_label = 2, 'Capital lead'
    else:
        raise ValueError(f"var must be 'l' or 'k', got {var!r}")
    lead = perm(F_T, x[:, column].reshape(-1, 1))
    label_exo = ["lemp","lcap"] + [lead_label]

    # Collect variables to test for exogeneity. The last period has no lead
    # and is dropped; it is read from `year` rather than hard-coded.
    year = np.asarray(year)
    keep = year != year.max()
    x_exo = np.hstack((x[keep], lead))
    y_exo = y[keep]

    # Within transform the data
    Q_T = demeaning_matrix(t - 1)

    yw_exo = perm(Q_T, y_exo)
    xw_exo = perm(Q_T, x_exo)
    # Drop the first column (the constant, which demeaning turns into zeros).
    xw_exo = xw_exo[:, 1:]
    
    # Estimate model
    exo_test = estimate(
        yw_exo, xw_exo, t=t - 1, transform='fe'
    )

    print_table(
        (label_y, label_exo), 
        exo_test, title='Exogeneity test', floatfmt='.4f'
    )
exogeneity_test(X, y, T, year, var='k')

Exogeneity test
Dependent variable: ['log(sales)']

                Beta      Se    t-values
------------  ------  ------  ----------
lemp          0.6479  0.0162     39.9359
lcap          0.0210  0.0231      0.9093
Capital lead  0.1793  0.0258      6.9500
R² = 0.474
