In [1]:
# Model is: ln(prod,y / pop,y) = k+c1*y+c2*ln(GDP,y/Pop,y)+c3*ln(Prod,y-1/Pop,y-1)

# Q1. how to perform this using a python package
# Q2. how to perform regression over all 21 economies at the same time to generate 21 regressions?

# https://towardsdatascience.com/the-complete-guide-to-linear-regression-in-python-3d3f8f06bf8

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
import statsmodels.api as sm

In [3]:
data = pd.read_csv("04_CHL.csv")
data.head()

Unnamed: 0,year,GDP,Pop,Steel
0,1980,75244937.47,11178.81699,809000
1,1981,78802559.99,11348.44199,724000
2,1982,70661594.54,11527.27403,334000
3,1983,67980358.79,11714.44195,462000
4,1984,73394748.44,11908.18892,628000


In [8]:
# data in per capita
data['GDP_pc'] = data[['GDP']].div(data.Pop, axis=0)
data['Prod_pc'] = data[['Steel']].div(data.Pop, axis=0)
data.head()

Unnamed: 0,year,GDP,Pop,Steel,GDP_pc,Prod_pc
0,1980,75244937.47,11178.81699,809000,6731.028653,72.369017
1,1981,78802559.99,11348.44199,724000,6943.910015,63.797304
2,1982,70661594.54,11527.27403,334000,6129.94836,28.97476
3,1983,67980358.79,11714.44195,462000,5803.123963,39.438498
4,1984,73394748.44,11908.18892,628000,6163.38462,52.736819


In [14]:
# take ln
data['GDP_pc_ln'] = np.log(data.GDP_pc)
data['Prod_pc_ln'] = np.log(data.Prod_pc)

In [15]:
data.head()

Unnamed: 0,year,GDP,Pop,Steel,GDP_pc,Prod_pc,GDP_pc_ln,Prod_pc_ln
0,1980,75244937.47,11178.81699,809000,6731.028653,72.369017,8.814483,4.281778
1,1981,78802559.99,11348.44199,724000,6943.910015,63.797304,8.84562,4.155711
2,1982,70661594.54,11527.27403,334000,6129.94836,28.97476,8.720942,3.366425
3,1983,67980358.79,11714.44195,462000,5803.123963,39.438498,8.666152,3.674742
4,1984,73394748.44,11908.18892,628000,6163.38462,52.736819,8.726381,3.965314


In [23]:
# regression
Xs = data.drop(['GDP','Pop','Steel','GDP_pc','Prod_pc'], axis=1)

In [24]:
y = data['Prod_pc_ln'].values.reshape(-1,1)

In [25]:
reg = LinearRegression()
reg.fit(Xs, y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)

In [21]:
print(reg.coef_)
print(reg.intercept_)

[[-1.36756170e-17  5.73345892e-16  1.00000000e+00]]
[2.30926389e-14]


In [26]:
# need to add lag term
X = np.column_stack((data['year'], data['GDP_pc_ln']))
y = data['Prod_pc_ln']

X2 = sm.add_constant(X)
est = sm.OLS(y, X2)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:             Prod_pc_ln   R-squared:                       0.916
Model:                            OLS   Adj. R-squared:                  0.911
Method:                 Least Squares   F-statistic:                     185.2
Date:                Sat, 22 Jun 2019   Prob (F-statistic):           5.24e-19
Time:                        18:22:08   Log-Likelihood:                 19.091
No. Observations:                  37   AIC:                            -32.18
Df Residuals:                      34   BIC:                            -27.35
Df Model:                           2                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
const         69.1163     20.241      3.415      0.0