# Panel Regression - Firm Characteristics

### Random Effects Panel Regression

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sklearn
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects


Data

In [2]:
# Firm-characteristics panel: one row per (Instrument, Date) observation.
df = pd.read_csv("Dataframes/characteristics.csv")

# Explanatory variables shared by the panel regressions in this notebook.
X_var_names = [
    "Revenue - Actual",
    "Enterprise Value",
    "Market Capitalization",
    "loss firm status",
    "Enterprise Value To Sales (Daily Time Series Ratio)",
    "3 Month Total Return",
    "Volume",
]

df

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,1.532200e+09,4.413001e+09,3.426001e+09,0.799347,10.637479,207295.0,-1
1,AVY.N,2013-04-01,Materials,2.482,1.498900e+09,5.459128e+09,4.309428e+09,0.895787,24.148726,0.0,-1
2,AVY.N,2013-07-01,Materials,1.068,1.552300e+09,5.436229e+09,4.258229e+09,0.906078,-0.054218,0.0,-1
3,AVY.N,2013-10-01,Materials,8.095,1.504900e+09,5.029514e+09,4.273414e+09,0.903176,2.467620,236291.0,-1
4,AVY.N,2014-01-01,Materials,1.471,1.583900e+09,5.553068e+09,4.877168e+09,0.920659,16.003852,157761.0,-1
...,...,...,...,...,...,...,...,...,...,...,...
19335,POOL.OQ,2021-10-01,Retailing,17.194,1.411448e+09,1.769954e+10,1.742020e+10,3.684400,-5.130322,157740.0,-1
19336,POOL.OQ,2022-01-01,Retailing,40.267,1.035557e+09,2.384882e+10,2.268979e+10,4.504382,30.489705,82349.0,-1
19337,POOL.OQ,2022-04-01,Retailing,34.342,1.412650e+09,1.843682e+10,1.696712e+10,3.422880,-25.160664,290897.0,-1
19338,POOL.OQ,2022-07-01,Retailing,1.503,2.055818e+09,1.556187e+10,1.405795e+10,2.749481,-21.661570,185584.0,-1


In [3]:
# Parse the Date column into proper datetime values (read_csv loads it as text).
df = df.assign(Date=pd.to_datetime(df["Date"]))

Remove outliers

In [4]:
# Trim extreme EPS-surprise observations using IQR fences.
surprise_col = "Earnings Per Share - Actual Surprise"
surprise = df[surprise_col]

summary_stats = surprise.describe()
Q1, Q3 = summary_stats.loc['25%'], summary_stats.loc['75%']
IQR = Q3 - Q1
threshold = 7  # fence width, expressed in multiples of the IQR
lower_fence = Q1 - threshold * IQR
upper_fence = Q3 + threshold * IQR

# Only values strictly outside the fences count as outliers.
is_outlier = surprise.lt(lower_fence) | surprise.gt(upper_fence)
surprise_outliers_removed = surprise.loc[~is_outlier]

# Re-attaching the trimmed series aligns on the row index, so trimmed rows
# become NaN in the surprise column; dropna then removes those rows together
# with any row whose surprise was already missing.
df_clean = (
    df.assign(**{surprise_col: surprise_outliers_removed})
      .dropna(subset=[surprise_col])
)
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,1.532200e+09,4.413001e+09,3.426001e+09,0.799347,10.637479,207295.0,-1
1,AVY.N,2013-04-01,Materials,2.482,1.498900e+09,5.459128e+09,4.309428e+09,0.895787,24.148726,0.0,-1
2,AVY.N,2013-07-01,Materials,1.068,1.552300e+09,5.436229e+09,4.258229e+09,0.906078,-0.054218,0.0,-1
3,AVY.N,2013-10-01,Materials,8.095,1.504900e+09,5.029514e+09,4.273414e+09,0.903176,2.467620,236291.0,-1
4,AVY.N,2014-01-01,Materials,1.471,1.583900e+09,5.553068e+09,4.877168e+09,0.920659,16.003852,157761.0,-1
...,...,...,...,...,...,...,...,...,...,...,...
19335,POOL.OQ,2021-10-01,Retailing,17.194,1.411448e+09,1.769954e+10,1.742020e+10,3.684400,-5.130322,157740.0,-1
19336,POOL.OQ,2022-01-01,Retailing,40.267,1.035557e+09,2.384882e+10,2.268979e+10,4.504382,30.489705,82349.0,-1
19337,POOL.OQ,2022-04-01,Retailing,34.342,1.412650e+09,1.843682e+10,1.696712e+10,3.422880,-25.160664,290897.0,-1
19338,POOL.OQ,2022-07-01,Retailing,1.503,2.055818e+09,1.556187e+10,1.405795e+10,2.749481,-21.661570,185584.0,-1


Min-max scaling (regressors rescaled to the [0, 1] range)

In [5]:
# Min-max scale the continuous regressors to [0, 1].
# The dependent variable ("Earnings Per Share - Actual Surprise") and the
# "loss firm status" indicator are deliberately left on their original scale.
columns_to_scale = [
    "Revenue - Actual",
    "Enterprise Value",
    "Market Capitalization",
    "Enterprise Value To Sales (Daily Time Series Ratio)",
    "3 Month Total Return",
    "Volume",
]

# Work on an explicit copy: a plain `rescale = df_clean` would only alias the
# cleaned frame, so the scaling would silently overwrite it in place.
rescale = df_clean.copy()

# MinMaxScaler scales every column independently, so one call over all six
# columns gives the same values as fitting a separate scaler per column.
rescale[columns_to_scale] = MinMaxScaler().fit_transform(rescale[columns_to_scale])

df_clean = rescale
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,0.065123,0.027862,0.001166,0.027195,0.298034,0.001183,-1
1,AVY.N,2013-04-01,Materials,2.482,0.064917,0.028207,0.001471,0.027308,0.340793,0.000000,-1
2,AVY.N,2013-07-01,Materials,1.068,0.065247,0.028199,0.001453,0.027321,0.264198,0.000000,-1
3,AVY.N,2013-10-01,Materials,8.095,0.064955,0.028065,0.001459,0.027317,0.272179,0.001348,-1
4,AVY.N,2014-01-01,Materials,1.471,0.065443,0.028238,0.001667,0.027338,0.315017,0.000900,-1
...,...,...,...,...,...,...,...,...,...,...,...
19335,POOL.OQ,2021-10-01,Retailing,17.194,0.064377,0.032245,0.006004,0.030598,0.248134,0.000900,-1
19336,POOL.OQ,2022-01-01,Retailing,40.267,0.062055,0.034274,0.007826,0.031566,0.360861,0.000470,-1
19337,POOL.OQ,2022-04-01,Retailing,34.342,0.064385,0.032488,0.005848,0.030290,0.184744,0.001660,-1
19338,POOL.OQ,2022-07-01,Retailing,1.503,0.068358,0.031540,0.004842,0.029495,0.195817,0.001059,-1


In [6]:
# Independent snapshot of the cleaned, scaled frame with 'Instrument' and 'Date'
# still available as ordinary columns (used by the per-industry analysis).
df_panel = df_clean.copy(deep=True)

#### Random Effects

In [7]:
# Give the frame the (entity, time) MultiIndex that the panel estimators expect.
# The guard keeps this cell re-runnable: once the frame is indexed, 'Instrument'
# and 'Date' are no longer columns and indexing a second time would raise a KeyError.
if not {'Instrument', 'Date'}.issubset(set(df_clean.index.names)):
    df_clean = df_clean.assign(
        Instrument=df_clean['Instrument'].astype('category'),
        Date=pd.to_datetime(df_clean['Date']),
    ).set_index(['Instrument', 'Date'])

# dependent variable and regressors
# NOTE(review): X holds only the columns in X_var_names, so no constant term is
# passed to the model - confirm that estimating without an intercept is intended.
y = df_clean["Earnings Per Share - Actual Surprise"]
X = df_clean[X_var_names]

# random effects panel regression
random_effects_model = RandomEffects(y, X)

# fit the model and print the summary statistics
random_effects_results = random_effects_model.fit()
print(random_effects_results.summary)


                                 RandomEffects Estimation Summary                                 
Dep. Variable:     Earnings Per Share - Actual Surprise   R-squared:                        0.0525
Estimator:                                RandomEffects   R-squared (Between):              0.5975
No. Observations:                                 18569   R-squared (Within):               0.0141
Date:                                  Mon, Feb 27 2023   R-squared (Overall):              0.1531
Time:                                          20:49:30   Log-likelihood                -7.821e+04
Cov. Estimator:                              Unadjusted                                           
                                                          F-statistic:                      146.84
Entities:                                           501   P-value                           0.0000
Avg Obs:                                         37.064   Distribution:                 F(7,18562)
Min Obs:  

**R-squared (Within)**: This is the proportion of variation in the dependent variable (in our case, Earnings Per Share - Actual Surprise) that is explained by the independent variables (the variables in X) within each individual instrument (the 501 companies). In other words, this R-squared measures how well the independent variables explain the variation in the surprise over time within each instrument. The R-squared (Within) is 0.0141, which means that the independent variables explain only 1.41% of the variation in the dependent variable within each instrument.  

**R-squared (Between)**: This is the proportion of variation in the dependent variable that is explained by the independent variables across the instruments. In other words, this R-squared measures how well the independent variables explain the variation in the dependent variable between different instruments. The R-squared (Between) is 0.5975, which means that the independent variables explain the variation in the dependent variable across instruments quite well (59.75%).  

**R-squared (Overall)**: This is the proportion of variation in the dependent variable that is explained by the independent variables overall, taking into account both the within-instrument and between-instrument variation. The R-squared (Overall) is 0.1531, which means that the independent variables explain only about 15.31% of the variation in the dependent variable overall.

#### Fixed Effects

In [8]:
# Dependent variable and regressors, taken from the MultiIndexed panel frame.
y = df_clean["Earnings Per Share - Actual Surprise"]
X = df_clean.loc[:, X_var_names]

# Panel OLS with both entity (instrument) and time (date) effects.
fixed_effects_model = PanelOLS(
    y,
    X,
    entity_effects=True,
    time_effects=True,
)

# Fit the model; the summary is the cell's displayed output.
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise,R-squared:,0.0091
Estimator:,PanelOLS,R-squared (Between):,0.5647
No. Observations:,18569,R-squared (Within):,0.0137
Date:,"Mon, Feb 27 2023",R-squared (Overall):,0.1448
Time:,20:49:37,Log-likelihood,-7.762e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,23.591
Entities:,501,P-value,0.0000
Avg Obs:,37.064,Distribution:,"F(7,18021)"
Min Obs:,6.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Revenue - Actual,23.627,6.7639,3.4931,0.0005,10.369,36.885
Enterprise Value,-90.805,32.207,-2.8194,0.0048,-153.93,-27.676
Market Capitalization,82.666,31.240,2.6462,0.0081,21.433,143.90
loss firm status,-1.2540,0.2607,-4.8096,0.0000,-1.7651,-0.7430
Enterprise Value To Sales (Daily Time Series Ratio),-41.310,10.797,-3.8260,0.0001,-62.474,-20.147
3 Month Total Return,29.022,2.9316,9.8995,0.0000,23.275,34.768
Volume,7.8320,9.6007,0.8158,0.4146,-10.986,26.650


By Industry: example tech industry (one random-effects model is fitted per firm in "Technology Hardware & Equipment")

In [9]:
# Keep only the Technology Hardware & Equipment firms, then split them by instrument.
is_tech_hardware = df_panel['GICS Industry Group Name'].eq('Technology Hardware & Equipment')
df2 = df_panel.loc[is_tech_hardware]
groups = df2.groupby('Instrument')

def panel_regression_random(group):
    """Fit a random-effects regression of EPS surprise on the firm characteristics.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to estimate on. Must contain 'Instrument', 'Date',
        "Earnings Per Share - Actual Surprise" and every column listed in
        ``X_var_names``. The frame is left untouched.

    Returns
    -------
    The fitted ``RandomEffects`` results object, or ``None`` when the group
    has fewer than two rows.
    """
    if len(group) < 2:
        return None

    # Build a *new* indexed frame instead of editing `group` in place: the
    # frames handed out by groupby.apply must not be mutated (pandas does not
    # support mutating UDFs), and assigning into a slice can hit the parent data.
    panel = group.assign(
        Instrument=group['Instrument'].astype('category'),
        Date=pd.to_datetime(group['Date']),
    ).set_index(['Instrument', 'Date'])

    # NOTE(review): when called per 'Instrument' group, the model sees a single
    # entity - confirm that a per-firm fit (rather than one industry-wide panel)
    # is what is wanted here.
    y = panel["Earnings Per Share - Actual Surprise"]
    X = panel[X_var_names]

    return RandomEffects(y, X).fit()


# Fit one model per instrument, discard the groups that were skipped (None),
# and collect the fitted results in a plain list.
results_random = groups.apply(panel_regression_random).dropna()
models_random = list(results_random)
models_random

[                                 RandomEffects Estimation Summary                                 
 Dep. Variable:     Earnings Per Share - Actual Surprise   R-squared:                        0.4285
 Estimator:                                RandomEffects   R-squared (Between):              0.0000
 No. Observations:                                    40   R-squared (Within):               0.4285
 Date:                                  Mon, Feb 27 2023   R-squared (Overall):              0.4285
 Time:                                          20:49:47   Log-likelihood                   -132.40
 Cov. Estimator:                              Unadjusted                                           
                                                           F-statistic:                      4.1245
 Entities:                                             1   P-value                           0.0034
 Avg Obs:                                         40.000   Distribution:                    F(6,33)
