# Panle Regression - Firm Characteristics

### Random Effects Panel Regression

In [4]:
import pandas as pd
import numpy as np
import datetime as dt
import sklearn
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects


Data

In [13]:
df = pd.read_csv("macro_2.csv")
X_var_names = ["Inflation Premium(%)", "90-Day AA Financial Commercial Paper Interest Rate",
               "CBOE Crude Oil ETF Volatility Index", "Consumer Sentiment", "Unemployment rate"]
df

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,"WACC Inflation Adjusted Risk Free Rate, (%)",Inflation Premium(%),90-Day AA Financial Commercial Paper Interest Rate,CPFFM,CBOE Crude Oil ETF Volatility Index,Consumer Sentiment,Unemployment rate
0,AVY.N,2013-01-01,Materials,11.178,,0.359537,0.17,0.02,22.54,76.67,7.733333
1,AVY.N,2013-04-01,Materials,2.482,,0.360572,0.14,0.03,23.47,81.67,7.533333
2,AVY.N,2013-07-01,Materials,1.068,,0.463789,0.13,0.04,24.10,81.57,7.233333
3,AVY.N,2013-10-01,Materials,8.095,,0.441854,0.13,0.04,19.74,76.93,6.933333
4,AVY.N,2014-01-01,Materials,1.471,,0.465460,0.13,0.05,19.56,80.93,6.666667
...,...,...,...,...,...,...,...,...,...,...,...
20115,POOL.OQ,2021-10-01,Retailing,17.194,1.527139,0.382645,0.14,0.06,43.04,69.90,4.200000
20116,POOL.OQ,2022-01-01,Retailing,40.267,1.515266,0.380946,0.47,0.32,52.70,63.13,3.800000
20117,POOL.OQ,2022-04-01,Retailing,34.342,2.325202,0.495588,1.39,0.61,50.46,57.87,3.600000
20118,POOL.OQ,2022-07-01,Retailing,1.503,3.092855,0.423045,2.89,0.64,49.86,56.10,3.566667


In [6]:
df["Date"] = pd.to_datetime(df["Date"])

Remove outliers

In [15]:
#removing outliers
summary_stats = df["Earnings Per Share - Actual Surprise"].describe()
Q1 = summary_stats.loc['25%']
Q3 = summary_stats.loc['75%']
IQR = Q3 - Q1
threshold = 7
surprise_outliers_removed = df["Earnings Per Share - Actual Surprise"].loc[~((df["Earnings Per Share - Actual Surprise"] < (Q1 - threshold * IQR)) | (df["Earnings Per Share - Actual Surprise"] > (Q3 + threshold * IQR)))]
df_clean = df.copy()
df_clean["Earnings Per Share - Actual Surprise"] = surprise_outliers_removed
df_clean = df_clean.dropna(subset=["Earnings Per Share - Actual Surprise"])
df_clean = df_clean.dropna()
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,"WACC Inflation Adjusted Risk Free Rate, (%)",Inflation Premium(%),90-Day AA Financial Commercial Paper Interest Rate,CPFFM,CBOE Crude Oil ETF Volatility Index,Consumer Sentiment,Unemployment rate
12,AVY.N,2016-01-01,Materials,8.817,2.304994,0.420570,0.55,0.19,58.78,91.57,4.900000
13,AVY.N,2016-04-01,Materials,9.159,1.829833,0.383655,0.56,0.18,42.38,92.40,4.933333
14,AVY.N,2016-07-01,Materials,7.522,1.491713,0.406042,0.70,0.30,39.96,90.33,4.900000
15,AVY.N,2016-10-01,Materials,1.290,1.606498,0.454642,0.77,0.32,38.25,93.07,4.766667
16,AVY.N,2017-01-01,Materials,6.180,2.431507,0.444016,0.92,0.22,29.34,97.23,4.566667
...,...,...,...,...,...,...,...,...,...,...,...
20115,POOL.OQ,2021-10-01,Retailing,17.194,1.527139,0.382645,0.14,0.06,43.04,69.90,4.200000
20116,POOL.OQ,2022-01-01,Retailing,40.267,1.515266,0.380946,0.47,0.32,52.70,63.13,3.800000
20117,POOL.OQ,2022-04-01,Retailing,34.342,2.325202,0.495588,1.39,0.61,50.46,57.87,3.600000
20118,POOL.OQ,2022-07-01,Retailing,1.503,3.092855,0.423045,2.89,0.64,49.86,56.10,3.566667


Standardising

get wierd results when standardising

In [5]:
'''
rescale = df_clean
#rescale["Earnings Per Share - Actual Surprise"] = MinMaxScaler().fit_transform(np.array(rescale["Earnings Per Share - Actual Surprise"]).reshape(-1,1))
rescale["WACC Inflation Adjusted Risk Free Rate, (%)"] = MinMaxScaler().fit_transform(np.array(rescale["WACC Inflation Adjusted Risk Free Rate, (%)"]).reshape(-1,1))
rescale["Unemployment rate"] = MinMaxScaler().fit_transform(np.array(rescale["Unemployment rate"]).reshape(-1,1))
df_clean = rescale
df_clean
'''

'\nrescale = df_clean\n#rescale["Earnings Per Share - Actual Surprise"] = MinMaxScaler().fit_transform(np.array(rescale["Earnings Per Share - Actual Surprise"]).reshape(-1,1))\nrescale["WACC Inflation Adjusted Risk Free Rate, (%)"] = MinMaxScaler().fit_transform(np.array(rescale["WACC Inflation Adjusted Risk Free Rate, (%)"]).reshape(-1,1))\nrescale["Unemployment rate"] = MinMaxScaler().fit_transform(np.array(rescale["Unemployment rate"]).reshape(-1,1))\ndf_clean = rescale\ndf_clean\n'

In [16]:
df_panel = df_clean.copy()

In [None]:
df_clean['Instrument'] = df_clean['Instrument'].astype('category')
df_clean['Date'] = pd.to_datetime(df_clean['Date'])

# set the index to be the time variable and the cross-sectional variable
df_clean.set_index(['Instrument', 'Date'], inplace=True)

#### Random Effects

**R-squared (Within)**: This is the proportion of variation in the dependent variable (in your case, Earnings Per Share - Actual Surprise) that is explained by the independent variables (in our case, the variables in X) within each individual instrument (the 501 companies). In other words, this R-squared measures how well the independent variables explain the variation in the dependent variable (surprise) within each instrument. The R-squared (Within) is 0.5745, which means that the independent variables explain 57.45% of the variation in the dependent variable within each instrument.  

**R-squared (Between)**: This is the proportion of variation in the dependent variable that is explained by the independent variables across the instruments. In other words, this R-squared measures how well the independent variables explain the variation in the dependent variable between different instruments. The R-squared (Between) is 0.0011 (0.1%), which means that the independent variables are not really able to explain the variation in the dependent variable across instruments.  

**R-squared (Overall)**: This is the proportion of variation in the dependent variable that is explained by the independent variables overall, taking into account both the within-instrument and between-instrument variation. The R-squared (Overall) is 0.1601, which means that the independent variables explain 16.01% of the variation in the dependent variable overall.

#### Fixed Effects

In [11]:
y = df_clean.loc[:, "Earnings Per Share - Actual Surprise"]
X = df_clean[X_var_names]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True, drop_absorbed=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary


Variables have been fully absorbed and have removed from the regression:

90-Day AA Financial Commercial Paper Interest Rate, CBOE Crude Oil ETF Volatility Index, Consumer Sentiment, Unemployment rate

  fixed_effects_results = fixed_effects_model.fit()


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise,R-squared:,0.0002
Estimator:,PanelOLS,R-squared (Between):,-0.5219
No. Observations:,12860,R-squared (Within):,0.0126
Date:,"Tue, Feb 28 2023",R-squared (Overall):,-0.1366
Time:,16:57:51,Log-likelihood,-5.387e+04
Cov. Estimator:,Unadjusted,,
,,F-statistic:,1.8883
Entities:,498,P-value,0.1694
Avg Obs:,25.823,Distribution:,"F(1,12333)"
Min Obs:,1.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
"WACC Inflation Adjusted Risk Free Rate, (%)",-1.4298,1.0405,-1.3742,0.1694,-3.4693,0.6097


By Industry: example tech industry

In [12]:
group = df_panel[df_panel['GICS Industry Group Name'] == 'Technology Hardware & Equipment']

group['Instrument'] = group['Instrument'].astype('category')
group['Date'] = pd.to_datetime(group['Date'])
group.set_index(['Instrument', 'Date'], inplace=True)

y = group.loc[:, "Earnings Per Share - Actual Surprise"]
X = group[X_var_names]

model = RandomEffects(y, X).fit()
model


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['Instrument'] = group['Instrument'].astype('category')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  group['Date'] = pd.to_datetime(group['Date'])


0,1,2,3
Dep. Variable:,Earnings Per Share - Actual Surprise,R-squared:,0.2559
Estimator:,RandomEffects,R-squared (Between):,0.8145
No. Observations:,483,R-squared (Within):,0.0714
Date:,"Tue, Feb 28 2023",R-squared (Overall):,0.4131
Time:,16:57:59,Log-likelihood,-1831.9
Cov. Estimator:,Unadjusted,,
,,F-statistic:,32.871
Entities:,19,P-value,0.0000
Avg Obs:,25.421,Distribution:,"F(5,478)"
Min Obs:,6.0000,,

0,1,2,3,4,5,6
,Parameter,Std. Err.,T-stat,P-value,Lower CI,Upper CI
"WACC Inflation Adjusted Risk Free Rate, (%)",0.7221,1.0772,0.6704,0.5030,-1.3945,2.8388
90-Day AA Financial Commercial Paper Interest Rate,-1.4436,0.8234,-1.7532,0.0802,-3.0615,0.1743
CBOE Crude Oil ETF Volatility Index,-0.1567,0.0413,-3.7901,0.0002,-0.2380,-0.0755
Consumer Sentiment,0.0761,0.0249,3.0532,0.0024,0.0271,0.1251
Unemployment rate,1.7764,0.4013,4.4266,0.0000,0.9879,2.5650
