# Panle Regression - Firm Characteristics

### Random Effects Panel Regression

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sklearn
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects


Data

In [2]:
df = pd.read_csv("Dataframes/macro.csv")
X_var_names = ["WACC Inflation Adjusted Risk Free Rate, (%)", "Unemployment rate"]
df

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,"WACC Inflation Adjusted Risk Free Rate, (%)",Unemployment rate
0,AVY.N,2013-01-01,Materials,11.178,,8.0
1,AVY.N,2013-04-01,Materials,2.482,,7.6
2,AVY.N,2013-07-01,Materials,1.068,,7.3
3,AVY.N,2013-10-01,Materials,8.095,,7.2
4,AVY.N,2014-01-01,Materials,1.471,,6.6
...,...,...,...,...,...,...
20115,POOL.OQ,2021-10-01,Retailing,17.194,1.527139,4.5
20116,POOL.OQ,2022-01-01,Retailing,40.267,1.515266,4.0
20117,POOL.OQ,2022-04-01,Retailing,34.342,2.325202,3.6
20118,POOL.OQ,2022-07-01,Retailing,1.503,3.092855,3.5


In [3]:
df["Date"] = pd.to_datetime(df["Date"])

Remove outliers

In [4]:
#removing outliers
summary_stats = df["Earnings Per Share - Actual Surprise"].describe()
Q1 = summary_stats.loc['25%']
Q3 = summary_stats.loc['75%']
IQR = Q3 - Q1
threshold = 7
surprise_outliers_removed = df["Earnings Per Share - Actual Surprise"].loc[~((df["Earnings Per Share - Actual Surprise"] < (Q1 - threshold * IQR)) | (df["Earnings Per Share - Actual Surprise"] > (Q3 + threshold * IQR)))]
df_clean = df.copy()
df_clean["Earnings Per Share - Actual Surprise"] = surprise_outliers_removed
df_clean = df_clean.dropna(subset=["Earnings Per Share - Actual Surprise"])
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,"WACC Inflation Adjusted Risk Free Rate, (%)",Unemployment rate
0,AVY.N,2013-01-01,Materials,11.178,,8.0
1,AVY.N,2013-04-01,Materials,2.482,,7.6
2,AVY.N,2013-07-01,Materials,1.068,,7.3
3,AVY.N,2013-10-01,Materials,8.095,,7.2
4,AVY.N,2014-01-01,Materials,1.471,,6.6
...,...,...,...,...,...,...
20115,POOL.OQ,2021-10-01,Retailing,17.194,1.527139,4.5
20116,POOL.OQ,2022-01-01,Retailing,40.267,1.515266,4.0
20117,POOL.OQ,2022-04-01,Retailing,34.342,2.325202,3.6
20118,POOL.OQ,2022-07-01,Retailing,1.503,3.092855,3.5


Standardising

get wierd results when standardising

In [5]:
'''
rescale = df_clean
#rescale["Earnings Per Share - Actual Surprise"] = MinMaxScaler().fit_transform(np.array(rescale["Earnings Per Share - Actual Surprise"]).reshape(-1,1))
rescale["WACC Inflation Adjusted Risk Free Rate, (%)"] = MinMaxScaler().fit_transform(np.array(rescale["WACC Inflation Adjusted Risk Free Rate, (%)"]).reshape(-1,1))
rescale["Unemployment rate"] = MinMaxScaler().fit_transform(np.array(rescale["Unemployment rate"]).reshape(-1,1))
df_clean = rescale
df_clean
'''

'\nrescale = df_clean\n#rescale["Earnings Per Share - Actual Surprise"] = MinMaxScaler().fit_transform(np.array(rescale["Earnings Per Share - Actual Surprise"]).reshape(-1,1))\nrescale["WACC Inflation Adjusted Risk Free Rate, (%)"] = MinMaxScaler().fit_transform(np.array(rescale["WACC Inflation Adjusted Risk Free Rate, (%)"]).reshape(-1,1))\nrescale["Unemployment rate"] = MinMaxScaler().fit_transform(np.array(rescale["Unemployment rate"]).reshape(-1,1))\ndf_clean = rescale\ndf_clean\n'

In [6]:
df_panel = df_clean.copy()

#### Random Effects

In [7]:
# turn dataframe into right dimension for panel regression with multiindex
df_clean['Instrument'] = df_clean['Instrument'].astype('category')
df_clean['Date'] = pd.to_datetime(df_clean['Date'])

# set the index to be the time variable and the cross-sectional variable
df_clean.set_index(['Instrument', 'Date'], inplace=True)

# define the dependent variable
y = df_clean.loc[:, "Earnings Per Share - Actual Surprise"]
X = df_clean[X_var_names]

# perform the random effects panel regression
random_effects_model = RandomEffects(y, X)

# fit the model and print the summary statistics
random_effects_results = random_effects_model.fit()
print(random_effects_results.summary)


                                 RandomEffects Estimation Summary                                 
Dep. Variable:     Earnings Per Share - Actual Surprise   R-squared:                        0.0528
Estimator:                                RandomEffects   R-squared (Between):              0.5745
No. Observations:                                 12860   R-squared (Within):               0.0011
Date:                                  Mon, Feb 27 2023   R-squared (Overall):              0.1601
Time:                                          21:01:07   Log-likelihood                -5.446e+04
Cov. Estimator:                              Unadjusted                                           
                                                          F-statistic:                      358.44
Entities:                                           502   P-value                           0.0000
Avg Obs:                                         25.618   Distribution:                 F(2,12858)
Min Obs:  

Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


**R-squared (Within)**: This is the proportion of variation in the dependent variable (in your case, Earnings Per Share - Actual Surprise) that is explained by the independent variables (in our case, the variables in X) within each individual instrument (the 501 companies). In other words, this R-squared measures how well the independent variables explain the variation in the dependent variable (surprise) within each instrument. The R-squared (Within) is 0.5745, which means that the independent variables explain 57.45% of the variation in the dependent variable within each instrument.  

**R-squared (Between)**: This is the proportion of variation in the dependent variable that is explained by the independent variables across the instruments. In other words, this R-squared measures how well the independent variables explain the variation in the dependent variable between different instruments. The R-squared (Between) is 0.0011 (0.1%), which means that the independent variables are not really able to explain the variation in the dependent variable across instruments.  

**R-squared (Overall)**: This is the proportion of variation in the dependent variable that is explained by the independent variables overall, taking into account both the within-instrument and between-instrument variation. The R-squared (Overall) is 0.1601, which means that the independent variables explain 16.01% of the variation in the dependent variable overall.

#### Fixed Effects

In [8]:
y = df_clean.loc[:, "Earnings Per Share - Actual Surprise"]
X = df_clean[X_var_names]

# perform the fixed effects panel regression
fixed_effects_model = PanelOLS(y, X, entity_effects=True, time_effects=True)

# fit the model and print the summary statistics
fixed_effects_results = fixed_effects_model.fit()
fixed_effects_results.summary


Inputs contain missing values. Dropping rows with missing observations.
  super().__init__(dependent, exog, weights=weights, check_rank=check_rank)


AbsorbingEffectError: 
The model cannot be estimated. The included effects have fully absorbed
one or more of the variables. This occurs when one or more of the dependent
variable is perfectly explained using the effects included in the model.

The following variables or variable combinations have been fully absorbed
or have become perfectly collinear after effects are removed:

          Unemployment rate

Set drop_absorbed=True to automatically drop absorbed variables.


By Industry: example tech industry

In [None]:
df2 = df_panel[df_panel['GICS Industry Group Name'] == 'Technology Hardware & Equipment']
groups = df2.groupby('Instrument')

def panel_regression_random(group):
    if len(group) < 2:
        return None
    
    group['Instrument'] = group['Instrument'].astype('category')
    group['Date'] = pd.to_datetime(group['Date'])
    group.set_index(['Instrument', 'Date'], inplace=True)

    y = group.loc[:, "Earnings Per Share - Actual Surprise"]
    X = group[X_var_names]

    model = RandomEffects(y, X).fit()
    return model


results_random = groups.apply(panel_regression_random)
results_random = results_random.dropna()
models_random = results_random.tolist()
models_random