# Panle Regression - Firm Characteristics

### Random Effects Panel Regression

In [7]:
import pandas as pd
import numpy as np
import datetime as dt
import sklearn
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects


Data

In [8]:
df = pd.read_csv("Dataframes/analysts.csv")
X_var_names = ["Earnings Per Share – Coefficient of Variation", "Number of Analysts", "Recommendation - Mean (1-5)", "Recommendation change", "Price Target - Mean", "Price Target - Standard Deviation"]
df

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share – Coefficient of Variation,Recommendation change,Number of Analysts,Recommendation - Mean (1-5),Price Target - Mean,Price Target - Standard Deviation
0,AVY.N,2013-01-01,Materials,11.178,3.625620,0.00000,7.0,3.00000,36.00000,2.44949
1,AVY.N,2013-04-01,Materials,2.482,2.046169,-0.12500,7.0,2.87500,42.57143,3.92272
2,AVY.N,2013-07-01,Materials,1.068,1.551601,0.00000,8.0,2.87500,45.00000,4.30946
3,AVY.N,2013-10-01,Materials,8.095,2.775994,-0.25000,7.0,2.62500,47.71429,3.45230
4,AVY.N,2014-01-01,Materials,1.471,3.204412,-0.29167,8.0,2.33333,54.62500,3.42555
...,...,...,...,...,...,...,...,...,...,...
20115,POOL.OQ,2021-10-01,Retailing,17.194,2.836295,0.00000,8.0,2.30000,545.28571,50.48823
20116,POOL.OQ,2022-01-01,Retailing,40.267,3.269867,0.00000,8.0,2.30000,571.00000,41.33833
20117,POOL.OQ,2022-04-01,Retailing,34.342,6.630736,-0.30000,9.0,2.00000,519.88889,37.49206
20118,POOL.OQ,2022-07-01,Retailing,1.503,3.625249,0.20000,9.0,2.20000,435.37500,57.57373


In [9]:
df["Date"] = pd.to_datetime(df["Date"])

Remove outliers

In [10]:
#removing outliers
summary_stats = df["Earnings Per Share - Actual Surprise"].describe()
Q1 = summary_stats.loc['25%']
Q3 = summary_stats.loc['75%']
IQR = Q3 - Q1
threshold = 7
surprise_outliers_removed = df["Earnings Per Share - Actual Surprise"].loc[~((df["Earnings Per Share - Actual Surprise"] < (Q1 - threshold * IQR)) | (df["Earnings Per Share - Actual Surprise"] > (Q3 + threshold * IQR)))]
df_clean = df.copy()
df_clean["Earnings Per Share - Actual Surprise"] = surprise_outliers_removed
df_clean = df_clean.dropna(subset=["Earnings Per Share - Actual Surprise"])
#df_clean = df_clean.dropna()
df_panel = df_clean.copy()
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share – Coefficient of Variation,Recommendation change,Number of Analysts,Recommendation - Mean (1-5),Price Target - Mean,Price Target - Standard Deviation
0,AVY.N,2013-01-01,Materials,11.178,3.625620,0.00000,7.0,3.00000,36.00000,2.44949
1,AVY.N,2013-04-01,Materials,2.482,2.046169,-0.12500,7.0,2.87500,42.57143,3.92272
2,AVY.N,2013-07-01,Materials,1.068,1.551601,0.00000,8.0,2.87500,45.00000,4.30946
3,AVY.N,2013-10-01,Materials,8.095,2.775994,-0.25000,7.0,2.62500,47.71429,3.45230
4,AVY.N,2014-01-01,Materials,1.471,3.204412,-0.29167,8.0,2.33333,54.62500,3.42555
...,...,...,...,...,...,...,...,...,...,...
20115,POOL.OQ,2021-10-01,Retailing,17.194,2.836295,0.00000,8.0,2.30000,545.28571,50.48823
20116,POOL.OQ,2022-01-01,Retailing,40.267,3.269867,0.00000,8.0,2.30000,571.00000,41.33833
20117,POOL.OQ,2022-04-01,Retailing,34.342,6.630736,-0.30000,9.0,2.00000,519.88889,37.49206
20118,POOL.OQ,2022-07-01,Retailing,1.503,3.625249,0.20000,9.0,2.20000,435.37500,57.57373


#### Random Effects

In [11]:
# turn dataframe into right dimension for panel regression with multiindex
df_clean['Instrument'] = df_clean['Instrument'].astype('category')
df_clean['Date'] = pd.to_datetime(df_clean['Date'])

# set the index to be the time variable and the cross-sectional variable
df_clean.set_index(['Instrument', 'Date'], inplace=True)

# define the dependent variable
y = df_clean.loc[:, "Earnings Per Share - Actual Surprise"]
X = df_clean[X_var_names]

# perform the random effects panel regression
random_effects_model = RandomEffects(y, X)

# fit the model and print the summary statistics
random_effects_results = random_effects_model.fit()
print(random_effects_results.summary)


                                 RandomEffects Estimation Summary                                 
Dep. Variable:     Earnings Per Share - Actual Surprise   R-squared:                        0.0343
Estimator:                                RandomEffects   R-squared (Between):              0.4126
No. Observations:                                 17707   R-squared (Within):               0.0044
Date:                                  Mon, Feb 27 2023   R-squared (Overall):              0.1409
Time:                                          20:31:41   Log-likelihood                -7.456e+04
Cov. Estimator:                              Unadjusted                                           
                                                          F-statistic:                      104.67
Entities:                                           502   P-value                           0.0000
Avg Obs:                                         35.273   Distribution:                 F(6,17701)
Min Obs:  

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


**R-squared (Within)**: This is the proportion of variation in the dependent variable (in your case, Earnings Per Share - Actual Surprise) that is explained by the independent variables (in our case, the variables in X) within each individual instrument (the 501 companies). In other words, this R-squared measures how well the independent variables explain the variation in the dependent variable (surprise) within each instrument. The R-squared (Within) is 0.0044, which means that the independent variables explain only about 0.44% of the variation in the dependent variable within each instrument.  

**R-squared (Between)**: This is the proportion of variation in the dependent variable that is explained by the independent variables across the instruments. In other words, this R-squared measures how well the independent variables explain the variation in the dependent variable between different instruments. The R-squared (Between) is 0.4126, which means that...

**R-squared (Overall)**: This is the proportion of variation in the dependent variable that is explained by the independent variables overall, taking into account both the within-instrument and between-instrument variation. The R-squared (Overall) is 0.1409, which means that the independent variables explain about 14.09% of the variation in the dependent variable overall.

#### Fixed Effects

In [12]:
y = df_clean.loc[:, "Earnings Per Share - Actual Surprise"]
X = df_clean[X_var_names]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary


Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise,R-squared:,0.0028
Estimator:,PanelOLS,R-squared (Between):,0.1242
No. Observations:,17707,R-squared (Within):,0.0040
Date:,"Mon, Feb 27 2023",R-squared (Overall):,0.0405
Time:,20:34:07,Log-likelihood,-7.399e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,7.9169
Entities:,502,P-value,0.0000
Avg Obs:,35.273,Distribution:,"F(6,17161)"
Min Obs:,0.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
Earnings Per Share – Coefficient of Variation,0.0430,0.0079,5.4714,0.0000,0.0276,0.0583
Number of Analysts,-0.0807,0.0485,-1.6655,0.0958,-0.1757,0.0143
Recommendation - Mean (1-5),0.9575,0.5222,1.8336,0.0667,-0.0661,1.9811
Recommendation change,-3.7171,1.0072,-3.6906,0.0002,-5.6912,-1.7429
Price Target - Mean,0.0008,0.0018,0.4399,0.6600,-0.0027,0.0043
Price Target - Standard Deviation,-0.0047,0.0118,-0.3958,0.6923,-0.0278,0.0185


By Industry: example tech industry

In [13]:
df2 = df_panel[df_panel['GICS Industry Group Name'] == 'Technology Hardware & Equipment']
groups = df2.groupby('Instrument')

def panel_regression_random(group):
    if len(group) < 2:
        return None
    
    group['Instrument'] = group['Instrument'].astype('category')
    group['Date'] = pd.to_datetime(group['Date'])
    group.set_index(['Instrument', 'Date'], inplace=True)

    y = group.loc[:, "Earnings Per Share - Actual Surprise"]
    X = group[X_var_names]

    model = RandomEffects(y, X).fit()
    return model


results_random = groups.apply(panel_regression_random)
results_random = results_random.dropna()
models_random = results_random.tolist()
models_random

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


ValueError: exog does not have full column rank. If you wish to proceed with model estimation irrespective of the numerical accuracy of coefficient estimates, you can set check_rank=False.