# Panel Regression - Firm Characteristics

### Random Effects Panel Regression

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sklearn
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects

Data

In [2]:
df = pd.read_csv("Dataframes/analysts_regression.csv")
df["Date"] = pd.to_datetime(df["Date"])
df

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Earnings Per Share – Coefficient of Variation,Number of Analysts,Recommendation - Mean (1-5),Recommendation change,Recommendation - Mean (1-5).1,Price Target - Standard Deviation
0,AVY.N,2013-01-01,Materials,11.178,11.178,3.625620,7.0,3.00000,0.00000,3.000,2.44949
1,AVY.N,2013-04-01,Materials,2.482,2.482,2.046169,7.0,2.87500,-0.12500,3.000,3.92272
2,AVY.N,2013-07-01,Materials,1.068,1.068,1.551601,8.0,2.87500,0.00000,2.875,4.30946
3,AVY.N,2013-10-01,Materials,8.095,8.095,2.775994,7.0,2.62500,-0.25000,2.875,3.45230
4,AVY.N,2014-01-01,Materials,1.471,1.471,3.204412,8.0,2.33333,-0.29167,2.625,3.42555
...,...,...,...,...,...,...,...,...,...,...,...
19196,POOL.OQ,2021-10-01,Retailing,17.194,17.194,2.836295,8.0,2.30000,0.00000,2.300,50.48823
19197,POOL.OQ,2022-01-01,Retailing,40.267,40.267,3.269867,8.0,2.30000,0.00000,2.300,41.33833
19198,POOL.OQ,2022-04-01,Retailing,34.342,34.342,6.630736,9.0,2.00000,-0.30000,2.300,37.49206
19199,POOL.OQ,2022-07-01,Retailing,1.503,1.503,3.625249,9.0,2.20000,0.20000,2.000,57.57373


Standardising

In [3]:
rescale = df.copy()
#rescale["Earnings Per Share - Actual Surprise"] = MinMaxScaler().fit_transform(np.array(rescale["Earnings Per Share - Actual Surprise"]).reshape(-1,1))
rescale["Earnings Per Share – Coefficient of Variation"] = MinMaxScaler().fit_transform(np.array(rescale["Earnings Per Share – Coefficient of Variation"]).reshape(-1,1))
rescale["Number of Analysts"] = MinMaxScaler().fit_transform(np.array(rescale["Number of Analysts"]).reshape(-1,1))
rescale["Recommendation - Mean (1-5)"] = MinMaxScaler().fit_transform(np.array(rescale["Recommendation - Mean (1-5)"]).reshape(-1,1))
rescale["Recommendation change"] = MinMaxScaler().fit_transform(np.array(rescale["Recommendation change"]).reshape(-1,1))
rescale["Recommendation - Mean (1-5).1"] = MinMaxScaler().fit_transform(np.array(rescale["Recommendation - Mean (1-5).1"]).reshape(-1,1))
rescale["Price Target - Standard Deviation"] = MinMaxScaler().fit_transform(np.array(rescale["Price Target - Standard Deviation"]).reshape(-1,1))
df_clean = rescale
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Earnings Per Share – Coefficient of Variation,Number of Analysts,Recommendation - Mean (1-5),Recommendation change,Recommendation - Mean (1-5).1,Price Target - Standard Deviation
0,AVY.N,2013-01-01,Materials,11.178,11.178,0.385569,0.125000,0.742857,0.538462,0.666667,0.002079
1,AVY.N,2013-04-01,Materials,2.482,2.482,0.384977,0.125000,0.696428,0.480770,0.666667,0.003330
2,AVY.N,2013-07-01,Materials,1.068,1.068,0.384792,0.145833,0.696428,0.538462,0.625000,0.003658
3,AVY.N,2013-10-01,Materials,8.095,8.095,0.385251,0.125000,0.603571,0.423078,0.625000,0.002930
4,AVY.N,2014-01-01,Materials,1.471,1.471,0.385411,0.145833,0.495236,0.403846,0.541667,0.002908
...,...,...,...,...,...,...,...,...,...,...,...
19196,POOL.OQ,2021-10-01,Retailing,17.194,17.194,0.385273,0.145833,0.482857,0.538462,0.433333,0.042855
19197,POOL.OQ,2022-01-01,Retailing,40.267,40.267,0.385436,0.145833,0.482857,0.538462,0.433333,0.035088
19198,POOL.OQ,2022-04-01,Retailing,34.342,34.342,0.386695,0.166667,0.371428,0.400001,0.433333,0.031824
19199,POOL.OQ,2022-07-01,Retailing,1.503,1.503,0.385569,0.166667,0.445714,0.630770,0.333333,0.048869


In [4]:
#df_clean = df_clean.drop(["Recommendation - Mean (1-5)","Recommendation - Mean (1-5).1"], axis=1)
#df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Earnings Per Share – Coefficient of Variation,Number of Analysts,Recommendation change,Price Target - Standard Deviation
0,AVY.N,2013-01-01,Materials,11.178,11.178,0.385569,0.125000,0.538462,0.002079
1,AVY.N,2013-04-01,Materials,2.482,2.482,0.384977,0.125000,0.480770,0.003330
2,AVY.N,2013-07-01,Materials,1.068,1.068,0.384792,0.145833,0.538462,0.003658
3,AVY.N,2013-10-01,Materials,8.095,8.095,0.385251,0.125000,0.423078,0.002930
4,AVY.N,2014-01-01,Materials,1.471,1.471,0.385411,0.145833,0.403846,0.002908
...,...,...,...,...,...,...,...,...,...
19196,POOL.OQ,2021-10-01,Retailing,17.194,17.194,0.385273,0.145833,0.538462,0.042855
19197,POOL.OQ,2022-01-01,Retailing,40.267,40.267,0.385436,0.145833,0.538462,0.035088
19198,POOL.OQ,2022-04-01,Retailing,34.342,34.342,0.386695,0.166667,0.400001,0.031824
19199,POOL.OQ,2022-07-01,Retailing,1.503,1.503,0.385569,0.166667,0.630770,0.048869


In [5]:
df_panel = df_clean.copy()

#### Fixed Effects

In a fixed effects panel regression, the individual-specific effects are modeled as fixed variables that do not vary across time. This means that the coefficients of the independent variables are estimated based on the within-entity variation in the data, which eliminates the effect of time-invariant unobserved heterogeneity.

Fixed effects models are useful when there are time-invariant unobserved variables that may affect the dependent variable, but are not included in the model. By modeling the individual-specific effects as fixed variables, fixed effects models can control for this unobserved heterogeneity and estimate the coefficients of the independent variables based on the within-entity variation, which provides more efficient estimates of the coefficients.

One limitation of fixed effects models is that they do not allow for testing the effect of time-invariant variables on the dependent variable. In addition, fixed effects models may suffer from the incidental parameter problem, which may lead to biased estimates of the coefficients of the independent variables in the presence of a large number of fixed effects.

In [6]:
#reformatting indices of dataframe for panel regression
df_clean['Instrument'] = df_clean['Instrument'].astype('category')
df_clean['Date'] = pd.to_datetime(df_clean['Date'])

# set the index to be the time variable and the cross-sectional variable
df_clean.set_index(['Instrument', 'Date'], inplace=True)
#df_clean

FE Panel Regression with **Absolute** Surprise Values

In [8]:
y = df_clean.loc[:, "Earnings Per Share - Actual Surprise AbsVals"]
X = df_clean.loc[:, "Earnings Per Share – Coefficient of Variation":]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise AbsVals,R-squared:,0.0014
Estimator:,PanelOLS,R-squared (Between):,0.2389
No. Observations:,18208,R-squared (Within):,0.0016
Date:,"Tue, Mar 07 2023",R-squared (Overall):,0.0944
Time:,00:04:23,Log-likelihood,-8.859e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,6.1727
Entities:,502,P-value,0.0001
Avg Obs:,36.271,Distribution:,"F(4,17664)"
Min Obs:,0.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Earnings Per Share – Coefficient of Variation,38.342,20.013,1.9159,0.0554,-0.8855,77.570
Number of Analysts,-16.228,4.5421,-3.5729,0.0004,-25.131,-7.3254
Recommendation change,-6.8439,4.1289,-1.6575,0.0974,-14.937,1.2492
Price Target - Standard Deviation,-34.468,17.705,-1.9469,0.0516,-69.171,0.2345


By Industry: example tech industry

In [9]:
group = df_panel[df_panel['GICS Industry Group Name'] == 'Technology Hardware & Equipment']

group['Instrument'] = group['Instrument'].astype('category')
group['Date'] = pd.to_datetime(group['Date'])
group.set_index(['Instrument', 'Date'], inplace=True)

y = group.loc[:, "Earnings Per Share - Actual Surprise AbsVals"]
X = group.loc[:, "Earnings Per Share – Coefficient of Variation":]

model = PanelOLS(y, X, entity_effects=True, time_effects=True).fit()
model

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['Instrument'] = group['Instrument'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['Date'] = pd.to_datetime(group['Date'])
Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise AbsVals,R-squared:,0.1941
Estimator:,PanelOLS,R-squared (Between):,-2.125e+04
No. Observations:,603,R-squared (Within):,0.2097
Date:,"Tue, Mar 07 2023",R-squared (Overall):,-1.051e+04
Time:,00:04:24,Log-likelihood,-2132.2
Cov. Estimator:,Unadjusted,,
,,F-statistic:,32.567
Entities:,19,P-value,0.0000
Avg Obs:,31.737,Distribution:,"F(4,541)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Earnings Per Share – Coefficient of Variation,3721.0,327.75,11.353,0.0000,3077.2,4364.8
Number of Analysts,4.3042,4.9202,0.8748,0.3821,-5.3608,13.969
Recommendation change,-3.8471,6.4659,-0.5950,0.5521,-16.548,8.8542
Price Target - Standard Deviation,9.9472,67.685,0.1470,0.8832,-123.01,142.90


# COVID Tests

In [10]:
df_clean = df_panel.copy()
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Earnings Per Share – Coefficient of Variation,Number of Analysts,Recommendation change,Price Target - Standard Deviation
0,AVY.N,2013-01-01,Materials,11.178,11.178,0.385569,0.125000,0.538462,0.002079
1,AVY.N,2013-04-01,Materials,2.482,2.482,0.384977,0.125000,0.480770,0.003330
2,AVY.N,2013-07-01,Materials,1.068,1.068,0.384792,0.145833,0.538462,0.003658
3,AVY.N,2013-10-01,Materials,8.095,8.095,0.385251,0.125000,0.423078,0.002930
4,AVY.N,2014-01-01,Materials,1.471,1.471,0.385411,0.145833,0.403846,0.002908
...,...,...,...,...,...,...,...,...,...
19196,POOL.OQ,2021-10-01,Retailing,17.194,17.194,0.385273,0.145833,0.538462,0.042855
19197,POOL.OQ,2022-01-01,Retailing,40.267,40.267,0.385436,0.145833,0.538462,0.035088
19198,POOL.OQ,2022-04-01,Retailing,34.342,34.342,0.386695,0.166667,0.400001,0.031824
19199,POOL.OQ,2022-07-01,Retailing,1.503,1.503,0.385569,0.166667,0.630770,0.048869


In [11]:
covid_start = pd.to_datetime("2020-01-01")

df_covid = df_clean[df_clean["Date"] >= covid_start]

df_nocovid = df_clean[df_clean["Date"] < covid_start]

In [12]:
df_covid['Instrument'] = df_covid['Instrument'].astype('category')
df_covid['Date'] = pd.to_datetime(df_covid['Date'])

# set the index to be the time variable and the cross-sectional variable
df_covid.set_index(['Instrument', 'Date'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_covid['Instrument'] = df_covid['Instrument'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_covid['Date'] = pd.to_datetime(df_covid['Date'])


In [13]:
df_nocovid['Instrument'] = df_nocovid['Instrument'].astype('category')
df_nocovid['Date'] = pd.to_datetime(df_nocovid['Date'])

# set the index to be the time variable and the cross-sectional variable
df_nocovid.set_index(['Instrument', 'Date'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nocovid['Instrument'] = df_nocovid['Instrument'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nocovid['Date'] = pd.to_datetime(df_nocovid['Date'])


## COVID

In [14]:
y = df_covid.loc[:, "Earnings Per Share - Actual Surprise AbsVals"]
X = df_covid.loc[:, "Earnings Per Share – Coefficient of Variation":]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary


Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise AbsVals,R-squared:,0.0012
Estimator:,PanelOLS,R-squared (Between):,0.3733
No. Observations:,5450,R-squared (Within):,0.0003
Date:,"Tue, Mar 07 2023",R-squared (Overall):,0.1809
Time:,00:04:35,Log-likelihood,-2.704e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,1.4736
Entities:,501,P-value,0.2074
Avg Obs:,10.878,Distribution:,"F(4,4935)"
Min Obs:,0.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Earnings Per Share – Coefficient of Variation,54.949,33.709,1.6301,0.1031,-11.135,121.03
Number of Analysts,-22.808,17.428,-1.3087,0.1907,-56.975,11.359
Recommendation change,-1.6580,9.0486,-0.1832,0.8546,-19.397,16.081
Price Target - Standard Deviation,48.137,38.458,1.2517,0.2108,-27.258,123.53


In [15]:
y = df_nocovid.loc[:, "Earnings Per Share - Actual Surprise AbsVals"]
X = df_nocovid.loc[:, "Earnings Per Share – Coefficient of Variation":]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary


Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise AbsVals,R-squared:,0.0003
Estimator:,PanelOLS,R-squared (Between):,-0.5730
No. Observations:,12758,R-squared (Within):,0.0003
Date:,"Tue, Mar 07 2023",R-squared (Overall):,-0.2346
Time:,00:04:52,Log-likelihood,-6.056e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,0.7871
Entities:,498,P-value,0.5333
Avg Obs:,25.618,Distribution:,"F(4,12267)"
Min Obs:,0.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Earnings Per Share – Coefficient of Variation,-9.9617,25.300,-0.3937,0.6938,-59.553,39.630
Number of Analysts,-1.5567,5.6078,-0.2776,0.7813,-12.549,9.4355
Recommendation change,-7.0043,4.3790,-1.5995,0.1097,-15.588,1.5791
Price Target - Standard Deviation,-19.549,41.911,-0.4664,0.6409,-101.70,62.604


### Based on these results we can conclude that Analyst factors are a bad indicator for prediction