# Panel Regression - Firm Characteristics

A panel regression is a suitable regression method for analyzing our data, especially since you have quarterly data for multiple companies over a long period of time. A panel regression model accounts for both within-entity and between-entity variations, making it a useful tool to analyze data with a time series and cross-sectional dimension.  
With panel data, we can control for individual-level characteristics that may affect forecast accuracy by including fixed effects for each company or industry. We can also account for time-specific factors that may affect forecast accuracy by including time fixed effects or time-varying covariates.

### Random Effects Panel Regression

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sklearn
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects

Data

In [2]:
df = pd.read_csv("Dataframes/characteristics.csv")
#X_var_names = ["Revenue - Actual","Enterprise Value","Market Capitalization","loss firm status", "Enterprise Value To Sales (Daily Time Series Ratio)", "3 Month Total Return", "Volume"]
df

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,11.178,1.532200e+09,4.413001e+09,3.426001e+09,0.799347,10.637479,207295.0,-1
1,AVY.N,2013-04-01,Materials,2.482,2.482,1.498900e+09,5.459128e+09,4.309428e+09,0.895787,24.148726,0.0,-1
2,AVY.N,2013-07-01,Materials,1.068,1.068,1.552300e+09,5.436229e+09,4.258229e+09,0.906078,-0.054218,0.0,-1
3,AVY.N,2013-10-01,Materials,8.095,8.095,1.504900e+09,5.029514e+09,4.273414e+09,0.903176,2.467620,236291.0,-1
4,AVY.N,2014-01-01,Materials,1.471,1.471,1.583900e+09,5.553068e+09,4.877168e+09,0.920659,16.003852,157761.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
19299,POOL.OQ,2021-10-01,Retailing,17.194,17.194,1.411448e+09,1.769954e+10,1.742020e+10,3.684400,-5.130322,157740.0,-1
19300,POOL.OQ,2022-01-01,Retailing,40.267,40.267,1.035557e+09,2.384882e+10,2.268979e+10,4.504382,30.489705,82349.0,-1
19301,POOL.OQ,2022-04-01,Retailing,34.342,34.342,1.412650e+09,1.843682e+10,1.696712e+10,3.422880,-25.160664,290897.0,-1
19302,POOL.OQ,2022-07-01,Retailing,1.503,1.503,2.055818e+09,1.556187e+10,1.405795e+10,2.749481,-21.661570,185584.0,-1


In [3]:
df["Date"] = pd.to_datetime(df["Date"])

Remove outliers

In [4]:
#removing outliers
summary_stats = df["Earnings Per Share - Actual Surprise"].describe()
Q1 = summary_stats.loc['25%']
Q3 = summary_stats.loc['75%']
IQR = Q3 - Q1
threshold = 7
surprise_outliers_removed = df["Earnings Per Share - Actual Surprise"].loc[~((df["Earnings Per Share - Actual Surprise"] < (Q1 - threshold * IQR)) | (df["Earnings Per Share - Actual Surprise"] > (Q3 + threshold * IQR)))]
df_clean = df.copy()
df_clean["Earnings Per Share - Actual Surprise"] = surprise_outliers_removed
df_clean = df_clean.dropna(subset=["Earnings Per Share - Actual Surprise"])
#df_clean = df_clean.dropna()
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,11.178,1.532200e+09,4.413001e+09,3.426001e+09,0.799347,10.637479,207295.0,-1
1,AVY.N,2013-04-01,Materials,2.482,2.482,1.498900e+09,5.459128e+09,4.309428e+09,0.895787,24.148726,0.0,-1
2,AVY.N,2013-07-01,Materials,1.068,1.068,1.552300e+09,5.436229e+09,4.258229e+09,0.906078,-0.054218,0.0,-1
3,AVY.N,2013-10-01,Materials,8.095,8.095,1.504900e+09,5.029514e+09,4.273414e+09,0.903176,2.467620,236291.0,-1
4,AVY.N,2014-01-01,Materials,1.471,1.471,1.583900e+09,5.553068e+09,4.877168e+09,0.920659,16.003852,157761.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
19299,POOL.OQ,2021-10-01,Retailing,17.194,17.194,1.411448e+09,1.769954e+10,1.742020e+10,3.684400,-5.130322,157740.0,-1
19300,POOL.OQ,2022-01-01,Retailing,40.267,40.267,1.035557e+09,2.384882e+10,2.268979e+10,4.504382,30.489705,82349.0,-1
19301,POOL.OQ,2022-04-01,Retailing,34.342,34.342,1.412650e+09,1.843682e+10,1.696712e+10,3.422880,-25.160664,290897.0,-1
19302,POOL.OQ,2022-07-01,Retailing,1.503,1.503,2.055818e+09,1.556187e+10,1.405795e+10,2.749481,-21.661570,185584.0,-1


Standardising

In [5]:
rescale = df_clean
#rescale["Earnings Per Share - Actual Surprise"] = MinMaxScaler().fit_transform(np.array(rescale["Earnings Per Share - Actual Surprise"]).reshape(-1,1))
rescale["Revenue - Actual"] = MinMaxScaler().fit_transform(np.array(rescale["Revenue - Actual"]).reshape(-1,1))
rescale["Enterprise Value"] = MinMaxScaler().fit_transform(np.array(rescale["Enterprise Value"]).reshape(-1,1))
rescale["Market Capitalization"] = MinMaxScaler().fit_transform(np.array(rescale["Market Capitalization"]).reshape(-1,1))
rescale["Enterprise Value To Sales (Daily Time Series Ratio)"] = MinMaxScaler().fit_transform(np.array(rescale["Enterprise Value To Sales (Daily Time Series Ratio)"]).reshape(-1,1))
rescale["3 Month Total Return"] = MinMaxScaler().fit_transform(np.array(rescale["3 Month Total Return"]).reshape(-1,1))
rescale["Volume"] = MinMaxScaler().fit_transform(np.array(rescale["Volume"]).reshape(-1,1))
df_clean = rescale
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,11.178,0.065123,0.027862,0.001166,0.027195,0.298034,0.001183,-1
1,AVY.N,2013-04-01,Materials,2.482,2.482,0.064917,0.028207,0.001471,0.027308,0.340793,0.000000,-1
2,AVY.N,2013-07-01,Materials,1.068,1.068,0.065247,0.028199,0.001453,0.027321,0.264198,0.000000,-1
3,AVY.N,2013-10-01,Materials,8.095,8.095,0.064955,0.028065,0.001459,0.027317,0.272179,0.001348,-1
4,AVY.N,2014-01-01,Materials,1.471,1.471,0.065443,0.028238,0.001667,0.027338,0.315017,0.000900,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
19299,POOL.OQ,2021-10-01,Retailing,17.194,17.194,0.064377,0.032245,0.006004,0.030598,0.248134,0.000900,-1
19300,POOL.OQ,2022-01-01,Retailing,40.267,40.267,0.062055,0.034274,0.007826,0.031566,0.360861,0.000470,-1
19301,POOL.OQ,2022-04-01,Retailing,34.342,34.342,0.064385,0.032488,0.005848,0.030290,0.184744,0.001660,-1
19302,POOL.OQ,2022-07-01,Retailing,1.503,1.503,0.068358,0.031540,0.004842,0.029495,0.195817,0.001059,-1


### Create 2 new dataframes one with only covid years one with only non-covid years

In [6]:
covid_start = pd.to_datetime("2020-01-01")

df_covid = df_clean[df_clean["Date"] >= covid_start]

df_nocovid = df_clean[df_clean["Date"] < covid_start]

#### Fixed Effects

In a fixed effects panel regression, the individual-specific effects are modeled as fixed variables that do not vary across time. This means that the coefficients of the independent variables are estimated based on the within-entity variation in the data, which eliminates the effect of time-invariant unobserved heterogeneity.

Fixed effects models are useful when there are time-invariant unobserved variables that may affect the dependent variable, but are not included in the model. By modeling the individual-specific effects as fixed variables, fixed effects models can control for this unobserved heterogeneity and estimate the coefficients of the independent variables based on the within-entity variation, which provides more efficient estimates of the coefficients.

One limitation of fixed effects models is that they do not allow for testing the effect of time-invariant variables on the dependent variable. In addition, fixed effects models may suffer from the incidental parameter problem, which may lead to biased estimates of the coefficients of the independent variables in the presence of a large number of fixed effects.

In [7]:
df_covid['Instrument'] = df_covid['Instrument'].astype('category')
df_covid['Date'] = pd.to_datetime(df_covid['Date'])

# set the index to be the time variable and the cross-sectional variable
df_covid.set_index(['Instrument', 'Date'], inplace=True)
df_covid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_covid['Instrument'] = df_covid['Instrument'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_covid['Date'] = pd.to_datetime(df_covid['Date'])


Unnamed: 0_level_0,Unnamed: 1_level_0,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
Instrument,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AVY.N,2020-01-01,Materials,2.173,2.173,0.066610,0.030588,0.003781,0.028372,0.314080,0.000739,-1
AVY.N,2020-04-01,Materials,10.733,10.733,0.066302,0.029848,0.002870,0.027949,0.195548,0.003046,-1
AVY.N,2020-07-01,Materials,11.960,11.960,0.065100,0.030087,0.003146,0.028190,0.294251,0.001339,-1
AVY.N,2020-10-01,Materials,25.104,25.104,0.066339,0.030304,0.003424,0.028455,0.304258,0.001266,-1
AVY.N,2021-01-01,Materials,8.799,8.799,0.067957,0.031290,0.004455,0.028837,0.333446,0.000633,-1
...,...,...,...,...,...,...,...,...,...,...,...
POOL.OQ,2021-10-01,Retailing,17.194,17.194,0.064377,0.032245,0.006004,0.030598,0.248134,0.000900,-1
POOL.OQ,2022-01-01,Retailing,40.267,40.267,0.062055,0.034274,0.007826,0.031566,0.360861,0.000470,-1
POOL.OQ,2022-04-01,Retailing,34.342,34.342,0.064385,0.032488,0.005848,0.030290,0.184744,0.001660,-1
POOL.OQ,2022-07-01,Retailing,1.503,1.503,0.068358,0.031540,0.004842,0.029495,0.195817,0.001059,-1


In [8]:
df_nocovid['Instrument'] = df_nocovid['Instrument'].astype('category')
df_nocovid['Date'] = pd.to_datetime(df_nocovid['Date'])

# set the index to be the time variable and the cross-sectional variable
df_nocovid.set_index(['Instrument', 'Date'], inplace=True)
df_nocovid

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nocovid['Instrument'] = df_nocovid['Instrument'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_nocovid['Date'] = pd.to_datetime(df_nocovid['Date'])


Unnamed: 0_level_0,Unnamed: 1_level_0,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
Instrument,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
AVY.N,2013-01-01,Materials,11.178,11.178,0.065123,0.027862,0.001166,0.027195,0.298034,0.001183,-1
AVY.N,2013-04-01,Materials,2.482,2.482,0.064917,0.028207,0.001471,0.027308,0.340793,0.000000,-1
AVY.N,2013-07-01,Materials,1.068,1.068,0.065247,0.028199,0.001453,0.027321,0.264198,0.000000,-1
AVY.N,2013-10-01,Materials,8.095,8.095,0.064955,0.028065,0.001459,0.027317,0.272179,0.001348,-1
AVY.N,2014-01-01,Materials,1.471,1.471,0.065443,0.028238,0.001667,0.027338,0.315017,0.000900,-1
...,...,...,...,...,...,...,...,...,...,...,...
POOL.OQ,2018-10-01,Retailing,4.779,4.779,0.060670,0.028808,0.002311,0.029245,0.297481,0.000000,-1
POOL.OQ,2019-01-01,Retailing,-7.082,7.082,0.059013,0.028595,0.002050,0.028850,0.230637,0.000743,-1
POOL.OQ,2019-04-01,Retailing,23.077,23.077,0.059349,0.028782,0.002240,0.029078,0.300100,0.000000,-1
POOL.OQ,2019-07-01,Retailing,-0.194,0.194,0.062585,0.029119,0.002606,0.029490,0.315412,0.000000,-1


In [9]:
y = df_covid.loc[:, "Earnings Per Share - Actual Surprise"]
X = df_covid.loc[:, "Revenue - Actual":]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary

0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise,R-squared:,0.0124
Estimator:,PanelOLS,R-squared (Between):,0.4965
No. Observations:,5605,R-squared (Within):,0.0259
Date:,"Thu, Mar 02 2023",R-squared (Overall):,0.2066
Time:,11:49:05,Log-likelihood,-2.389e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,9.1267
Entities:,500,P-value,0.0000
Avg Obs:,11.210,Distribution:,"F(7,5087)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Revenue - Actual,17.625,15.465,1.1397,0.2545,-12.693,47.943
Enterprise Value,-138.23,87.780,-1.5748,0.1154,-310.32,33.852
Market Capitalization,131.61,85.559,1.5382,0.1241,-36.124,299.34
Enterprise Value To Sales (Daily Time Series Ratio),-4.3225,13.304,-0.3249,0.7453,-30.405,21.760
3 Month Total Return,29.511,5.1430,5.7381,0.0000,19.428,39.593
Volume,31.625,17.058,1.8539,0.0638,-1.8171,65.067
loss firm status,-2.2954,0.5506,-4.1687,0.0000,-3.3748,-1.2159


In [10]:
y = df_nocovid.loc[:, "Earnings Per Share - Actual Surprise"]
X = df_nocovid.loc[:, "Revenue - Actual":]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary

0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise,R-squared:,0.0031
Estimator:,PanelOLS,R-squared (Between):,0.1052
No. Observations:,12928,R-squared (Within):,0.0036
Date:,"Thu, Mar 02 2023",R-squared (Overall):,0.0337
Time:,11:49:06,Log-likelihood,-5.283e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,5.5025
Entities:,497,P-value,0.0000
Avg Obs:,26.012,Distribution:,"F(7,12396)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Revenue - Actual,2.6563,10.621,0.2501,0.8025,-18.163,23.476
Enterprise Value,-5.4139,42.680,-0.1268,0.8991,-89.073,78.245
Market Capitalization,28.625,43.591,0.6567,0.5114,-56.820,114.07
Enterprise Value To Sales (Daily Time Series Ratio),-180.23,80.918,-2.2273,0.0259,-338.84,-21.619
3 Month Total Return,20.908,3.7437,5.5850,0.0000,13.570,28.246
Volume,15.169,12.855,1.1800,0.2380,-10.030,40.367
loss firm status,-0.4366,0.3081,-1.4169,0.1565,-1.0405,0.1674


In [11]:
y = df_covid.loc[:, "Earnings Per Share - Actual Surprise AbsVals"]
X = df_covid.loc[:, "Revenue - Actual":]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise AbsVals,R-squared:,0.0019
Estimator:,PanelOLS,R-squared (Between):,0.2251
No. Observations:,5605,R-squared (Within):,0.0076
Date:,"Thu, Mar 02 2023",R-squared (Overall):,0.1453
Time:,11:49:11,Log-likelihood,-2.238e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,1.3732
Entities:,500,P-value,0.2119
Avg Obs:,11.210,Distribution:,"F(7,5087)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Revenue - Actual,-2.9640,11.821,-0.2507,0.8020,-26.138,20.210
Enterprise Value,72.151,67.096,1.0753,0.2823,-59.387,203.69
Market Capitalization,-68.028,65.399,-1.0402,0.2983,-196.24,60.182
Enterprise Value To Sales (Daily Time Series Ratio),-16.697,10.170,-1.6418,0.1007,-36.633,3.2400
3 Month Total Return,5.8447,3.9311,1.4868,0.1371,-1.8620,13.551
Volume,15.815,13.039,1.2129,0.2252,-9.7469,41.377
loss firm status,0.6408,0.4209,1.5226,0.1279,-0.1843,1.4659


In [12]:
y = df_nocovid.loc[:, "Earnings Per Share - Actual Surprise AbsVals"]
X = df_nocovid.loc[:, "Revenue - Actual":]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary

0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise AbsVals,R-squared:,0.0091
Estimator:,PanelOLS,R-squared (Between):,-0.7441
No. Observations:,12928,R-squared (Within):,0.0103
Date:,"Thu, Mar 02 2023",R-squared (Overall):,-0.4798
Time:,11:49:13,Log-likelihood,-4.906e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,16.196
Entities:,497,P-value,0.0000
Avg Obs:,26.012,Distribution:,"F(7,12396)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Revenue - Actual,-41.975,7.9315,-5.2922,0.0000,-57.522,-26.428
Enterprise Value,24.387,31.872,0.7652,0.4442,-38.086,86.860
Market Capitalization,1.0043,32.552,0.0309,0.9754,-62.802,64.811
Enterprise Value To Sales (Daily Time Series Ratio),-93.521,60.426,-1.5477,0.1217,-211.97,24.923
3 Month Total Return,7.6472,2.7956,2.7354,0.0062,2.1674,13.127
Volume,-1.7303,9.5998,-0.1802,0.8570,-20.547,17.087
loss firm status,1.9761,0.2301,8.5887,0.0000,1.5251,2.4271
