In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression

In [2]:
aas = [f'aa{i}' for i in range(1, 55)]
wws = [f'ww{i}' for i in range(1, 31)]

df = pd.read_stata('Accounts_matched_collapsed.dta',
                   columns=[
                       'ly', 'ceo_behavior', 'lemp', 'lempm', 'cons', 'active',
                       'year', 'cty', 'emp_imputed', 'pa', 'reliability', *aas,
                       *wws
                   ]).dropna()
df = pd.get_dummies(df, columns=['year', 'cty'])

# 直接做

In [3]:
y = df['ly']
X = df.drop(columns=['ly'])

In [4]:
reg = LinearRegression().fit(X, y)

In [5]:
reg.coef_[:2]

array([0.36287507, 0.90402293], dtype=float32)

會不一樣是因為weight，stata用aweight，計算比較複雜

# 用我們的方法

In [6]:
df

Unnamed: 0,ly,ceo_behavior,lemp,lempm,cons,active,emp_imputed,pa,reliability,aa1,...,year_2011.0,year_2012.0,year_2013.0,year_2014.0,cty_br,cty_de,cty_fr,cty_gb,cty_in,cty_us
0,12.352137,0.494655,7.429093,0.0,1.0,1.0,0.0,1.0,10.0,0,...,False,False,True,False,True,False,False,False,False,False
1,10.096356,0.035102,7.229839,0.0,0.0,1.0,0.0,0.0,10.0,0,...,False,True,False,False,True,False,False,False,False,False
2,14.075560,0.991736,9.951241,0.0,1.0,1.0,0.0,1.0,10.0,0,...,False,False,True,False,True,False,False,False,False,False
3,12.358381,0.018195,7.949254,0.0,1.0,1.0,0.0,0.0,10.0,0,...,False,True,False,False,True,False,False,False,False,False
4,10.530302,0.228012,6.734591,0.0,1.0,1.0,0.0,0.0,10.0,0,...,False,False,False,False,True,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
964,9.616288,0.269143,5.225747,0.0,0.0,1.0,0.0,0.0,10.0,0,...,False,True,False,False,False,False,False,False,False,True
965,9.941437,0.981228,5.192957,0.0,0.0,1.0,0.0,0.0,10.0,0,...,True,False,False,False,False,False,False,False,False,True
966,10.852594,0.963902,6.633318,0.0,0.0,1.0,0.0,1.0,7.0,0,...,False,True,False,False,False,False,False,False,False,True
967,14.138969,0.953962,8.267854,0.0,1.0,1.0,0.0,1.0,9.0,0,...,False,False,True,False,False,False,False,False,False,True


# 調查

In [7]:
feat = [
    'ly', 'ceo_behavior', 'lemp', 'lempm', 'cons', 'active', 'year', 'cty',
    'emp_imputed', 'pa', 'reliability'
]

In [8]:
ddf = pd.read_stata('Accounts_matched_yearly.dta')
df = pd.read_stata('Accounts_matched_collapsed.dta')

In [9]:
company_id = ddf['company_id'].sample().iloc[0]
company_id

np.float32(268.0)

In [10]:

ddf[ddf['company_id'] == company_id][feat]

Unnamed: 0,ly,ceo_behavior,lemp,lempm,cons,active,year,cty,emp_imputed,pa,reliability
668,,0.025263,,1.0,0.0,1.0,2010.0,br,0.0,1.0,10.0
669,,0.025263,,1.0,0.0,1.0,2011.0,br,0.0,1.0,10.0
670,11.923362,0.025263,5.755742,0.0,0.0,1.0,2012.0,br,0.0,1.0,10.0
671,,0.025263,,1.0,0.0,1.0,2013.0,br,0.0,1.0,10.0
672,,0.025263,,1.0,0.0,1.0,2014.0,br,0.0,1.0,10.0


In [11]:

df[df['company_id'] == company_id][feat]

Unnamed: 0,ly,ceo_behavior,lemp,lempm,cons,active,year,cty,emp_imputed,pa,reliability
242,11.923362,0.025263,5.755742,0.0,0.0,1.0,2012.0,br,0.0,1.0,10.0


# 比較

In [44]:
#lempm, cons, active, pa, reliability, aa*, ww*, cty不隨時間變動，所以demean會估不出來，但是我覺得最後可以加進去：）
feat = [
    'ly', 'ceo_behavior', 'lemp', 'lempm', 'cons', 'active', 'year', 'cty',
    'emp_imputed', 'pa', 'reliability', *aas, *wws
]

no_var = ['lempm', 'cons', 'active', 'pa', 'reliability', *aas, *wws, 'cty']
df = pd.read_stata('Accounts_matched_yearly.dta')
df = df.dropna()
df = df[[*feat, 'company_id']]
ddf = df.drop(columns=no_var)
ddf = pd.get_dummies(ddf, columns=['year'], drop_first=True)

In [45]:
demean = lambda df: df - df.mean()

In [46]:
ddf_demean = ddf.groupby(['company_id']).transform(demean)

In [47]:
ddf_demean

Unnamed: 0,ly,ceo_behavior,lemp,emp_imputed,year_2008.0,year_2009.0,year_2010.0,year_2011.0,year_2012.0,year_2013.0,year_2014.0
0,-0.018352,0.0,0.066449,0.0,0.0,0.0,0.0,0.0,0.500000,-0.500000,0.000000
1,0.018353,0.0,-0.066448,0.0,0.0,0.0,0.0,0.0,-0.500000,0.500000,0.000000
4,0.166066,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.500000,-0.500000,0.000000
5,-0.166066,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,-0.500000,0.500000,0.000000
7,0.245578,0.0,-0.022671,0.0,0.0,0.0,0.0,0.0,0.666667,-0.333333,-0.333333
...,...,...,...,...,...,...,...,...,...,...,...
2843,-0.061296,0.0,-0.051765,0.0,0.0,0.0,0.0,0.0,0.666667,-0.333333,-0.333333
2844,0.025494,0.0,0.000878,0.0,0.0,0.0,0.0,0.0,-0.333333,0.666667,-0.333333
2845,0.035803,0.0,0.050888,0.0,0.0,0.0,0.0,0.0,-0.333333,-0.333333,0.666667
2846,-0.032114,0.0,-0.048337,0.0,0.0,0.0,0.0,0.0,0.000000,0.500000,-0.500000


In [48]:
y = ddf_demean['ly']
X = ddf_demean.drop(columns=['ly', 'ceo_behavior'])

In [49]:
reg_demean = LinearRegression(fit_intercept=False).fit(X, y)

In [50]:
delta = reg_demean.coef_

In [51]:
y_minus_covariates = ddf['ly'] - ddf.drop(columns=['ly', 'ceo_behavior', 'company_id']) @ delta

加入不變動的項直接做

In [59]:
LinearRegression().fit(
    pd.get_dummies(df[['ceo_behavior'] + no_var], drop_first=True),
    y_minus_covariates).coef_[0]

np.float32(0.64894646)

不加入

In [62]:
LinearRegression().fit(
    df[['ceo_behavior']],
    y_minus_covariates).coef_[0]

np.float32(1.1867132)