# Gradient Boosting

In [1]:
import pandas as pd
import numpy as np
import datetime as dt
import sklearn
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from linearmodels.panel import PanelOLS
from linearmodels.panel import RandomEffects
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit


Data

In [2]:
df = pd.read_csv("Dataframes/characteristics.csv")
#X_var_names = ["Revenue - Actual","Enterprise Value","Market Capitalization","loss firm status", "Enterprise Value To Sales (Daily Time Series Ratio)", "3 Month Total Return", "Volume"]
df

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,11.178,1.532200e+09,4.413001e+09,3.426001e+09,0.799347,10.637479,207295.0,-1
1,AVY.N,2013-04-01,Materials,2.482,2.482,1.498900e+09,5.459128e+09,4.309428e+09,0.895787,24.148726,0.0,-1
2,AVY.N,2013-07-01,Materials,1.068,1.068,1.552300e+09,5.436229e+09,4.258229e+09,0.906078,-0.054218,0.0,-1
3,AVY.N,2013-10-01,Materials,8.095,8.095,1.504900e+09,5.029514e+09,4.273414e+09,0.903176,2.467620,236291.0,-1
4,AVY.N,2014-01-01,Materials,1.471,1.471,1.583900e+09,5.553068e+09,4.877168e+09,0.920659,16.003852,157761.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
19299,POOL.OQ,2021-10-01,Retailing,17.194,17.194,1.411448e+09,1.769954e+10,1.742020e+10,3.684400,-5.130322,157740.0,-1
19300,POOL.OQ,2022-01-01,Retailing,40.267,40.267,1.035557e+09,2.384882e+10,2.268979e+10,4.504382,30.489705,82349.0,-1
19301,POOL.OQ,2022-04-01,Retailing,34.342,34.342,1.412650e+09,1.843682e+10,1.696712e+10,3.422880,-25.160664,290897.0,-1
19302,POOL.OQ,2022-07-01,Retailing,1.503,1.503,2.055818e+09,1.556187e+10,1.405795e+10,2.749481,-21.661570,185584.0,-1


In [3]:
df["Date"] = pd.to_datetime(df["Date"])

In [4]:
df.dtypes

Instrument                                                     object
Date                                                   datetime64[ns]
GICS Industry Group Name                                       object
Earnings Per Share - Actual Surprise                          float64
Earnings Per Share - Actual Surprise AbsVals                  float64
Revenue - Actual                                              float64
Enterprise Value                                              float64
Market Capitalization                                         float64
Enterprise Value To Sales (Daily Time Series Ratio)           float64
3 Month Total Return                                          float64
Volume                                                        float64
loss firm status                                                int64
dtype: object

Remove outliers

In [5]:
#removing outliers
summary_stats = df["Earnings Per Share - Actual Surprise"].describe()
Q1 = summary_stats.loc['25%']
Q3 = summary_stats.loc['75%']
IQR = Q3 - Q1
threshold = 7
surprise_outliers_removed = df["Earnings Per Share - Actual Surprise"].loc[~((df["Earnings Per Share - Actual Surprise"] < (Q1 - threshold * IQR)) | (df["Earnings Per Share - Actual Surprise"] > (Q3 + threshold * IQR)))]
df_clean = df.copy()
df_clean["Earnings Per Share - Actual Surprise"] = surprise_outliers_removed
df_clean = df_clean.dropna(subset=["Earnings Per Share - Actual Surprise"])
#df_clean = df_clean.dropna()
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,11.178,1.532200e+09,4.413001e+09,3.426001e+09,0.799347,10.637479,207295.0,-1
1,AVY.N,2013-04-01,Materials,2.482,2.482,1.498900e+09,5.459128e+09,4.309428e+09,0.895787,24.148726,0.0,-1
2,AVY.N,2013-07-01,Materials,1.068,1.068,1.552300e+09,5.436229e+09,4.258229e+09,0.906078,-0.054218,0.0,-1
3,AVY.N,2013-10-01,Materials,8.095,8.095,1.504900e+09,5.029514e+09,4.273414e+09,0.903176,2.467620,236291.0,-1
4,AVY.N,2014-01-01,Materials,1.471,1.471,1.583900e+09,5.553068e+09,4.877168e+09,0.920659,16.003852,157761.0,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
19299,POOL.OQ,2021-10-01,Retailing,17.194,17.194,1.411448e+09,1.769954e+10,1.742020e+10,3.684400,-5.130322,157740.0,-1
19300,POOL.OQ,2022-01-01,Retailing,40.267,40.267,1.035557e+09,2.384882e+10,2.268979e+10,4.504382,30.489705,82349.0,-1
19301,POOL.OQ,2022-04-01,Retailing,34.342,34.342,1.412650e+09,1.843682e+10,1.696712e+10,3.422880,-25.160664,290897.0,-1
19302,POOL.OQ,2022-07-01,Retailing,1.503,1.503,2.055818e+09,1.556187e+10,1.405795e+10,2.749481,-21.661570,185584.0,-1


Standardising

In [6]:
rescale = df_clean
rescale["Earnings Per Share - Actual Surprise AbsVals"] = MinMaxScaler().fit_transform(np.array(rescale["Earnings Per Share - Actual Surprise"]).reshape(-1,1))
rescale["Revenue - Actual"] = MinMaxScaler().fit_transform(np.array(rescale["Revenue - Actual"]).reshape(-1,1))
rescale["Enterprise Value"] = MinMaxScaler().fit_transform(np.array(rescale["Enterprise Value"]).reshape(-1,1))
rescale["Market Capitalization"] = MinMaxScaler().fit_transform(np.array(rescale["Market Capitalization"]).reshape(-1,1))
rescale["Enterprise Value To Sales (Daily Time Series Ratio)"] = MinMaxScaler().fit_transform(np.array(rescale["Enterprise Value To Sales (Daily Time Series Ratio)"]).reshape(-1,1))
rescale["3 Month Total Return"] = MinMaxScaler().fit_transform(np.array(rescale["3 Month Total Return"]).reshape(-1,1))
rescale["Volume"] = MinMaxScaler().fit_transform(np.array(rescale["Volume"]).reshape(-1,1))
df_clean = rescale
df_clean

Unnamed: 0,Instrument,Date,GICS Industry Group Name,Earnings Per Share - Actual Surprise,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,Materials,11.178,0.526057,0.065123,0.027862,0.001166,0.027195,0.298034,0.001183,-1
1,AVY.N,2013-04-01,Materials,2.482,0.476798,0.064917,0.028207,0.001471,0.027308,0.340793,0.000000,-1
2,AVY.N,2013-07-01,Materials,1.068,0.468788,0.065247,0.028199,0.001453,0.027321,0.264198,0.000000,-1
3,AVY.N,2013-10-01,Materials,8.095,0.508593,0.064955,0.028065,0.001459,0.027317,0.272179,0.001348,-1
4,AVY.N,2014-01-01,Materials,1.471,0.471071,0.065443,0.028238,0.001667,0.027338,0.315017,0.000900,-1
...,...,...,...,...,...,...,...,...,...,...,...,...
19299,POOL.OQ,2021-10-01,Retailing,17.194,0.560135,0.064377,0.032245,0.006004,0.030598,0.248134,0.000900,-1
19300,POOL.OQ,2022-01-01,Retailing,40.267,0.690834,0.062055,0.034274,0.007826,0.031566,0.360861,0.000470,-1
19301,POOL.OQ,2022-04-01,Retailing,34.342,0.657271,0.064385,0.032488,0.005848,0.030290,0.184744,0.001660,-1
19302,POOL.OQ,2022-07-01,Retailing,1.503,0.471252,0.068358,0.031540,0.004842,0.029495,0.195817,0.001059,-1


In [7]:
# df_clean['Instrument'] = df_clean['Instrument'].astype('category')
# df_clean['Date'] = pd.to_datetime(df_clean['Date'])
#
# # set the index to be the time variable and the cross-sectional variable
# df_clean.set_index(['Instrument', 'Date'], inplace=True)
# df_clean

In [8]:
df_clean =df_clean.drop("GICS Industry Group Name", axis=1)
df_clean =df_clean.drop("Earnings Per Share - Actual Surprise", axis=1)
df_clean

Unnamed: 0,Instrument,Date,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,0.526057,0.065123,0.027862,0.001166,0.027195,0.298034,0.001183,-1
1,AVY.N,2013-04-01,0.476798,0.064917,0.028207,0.001471,0.027308,0.340793,0.000000,-1
2,AVY.N,2013-07-01,0.468788,0.065247,0.028199,0.001453,0.027321,0.264198,0.000000,-1
3,AVY.N,2013-10-01,0.508593,0.064955,0.028065,0.001459,0.027317,0.272179,0.001348,-1
4,AVY.N,2014-01-01,0.471071,0.065443,0.028238,0.001667,0.027338,0.315017,0.000900,-1
...,...,...,...,...,...,...,...,...,...,...
19299,POOL.OQ,2021-10-01,0.560135,0.064377,0.032245,0.006004,0.030598,0.248134,0.000900,-1
19300,POOL.OQ,2022-01-01,0.690834,0.062055,0.034274,0.007826,0.031566,0.360861,0.000470,-1
19301,POOL.OQ,2022-04-01,0.657271,0.064385,0.032488,0.005848,0.030290,0.184744,0.001660,-1
19302,POOL.OQ,2022-07-01,0.471252,0.068358,0.031540,0.004842,0.029495,0.195817,0.001059,-1


In [9]:
df_clean.dtypes

Instrument                                                     object
Date                                                   datetime64[ns]
Earnings Per Share - Actual Surprise AbsVals                  float64
Revenue - Actual                                              float64
Enterprise Value                                              float64
Market Capitalization                                         float64
Enterprise Value To Sales (Daily Time Series Ratio)           float64
3 Month Total Return                                          float64
Volume                                                        float64
loss firm status                                                int64
dtype: object

In [10]:
#df_clean = df_clean[(df_clean['Instrument'] == 'AAPL.OQ')]
df_clean

Unnamed: 0,Instrument,Date,Earnings Per Share - Actual Surprise AbsVals,Revenue - Actual,Enterprise Value,Market Capitalization,Enterprise Value To Sales (Daily Time Series Ratio),3 Month Total Return,Volume,loss firm status
0,AVY.N,2013-01-01,0.526057,0.065123,0.027862,0.001166,0.027195,0.298034,0.001183,-1
1,AVY.N,2013-04-01,0.476798,0.064917,0.028207,0.001471,0.027308,0.340793,0.000000,-1
2,AVY.N,2013-07-01,0.468788,0.065247,0.028199,0.001453,0.027321,0.264198,0.000000,-1
3,AVY.N,2013-10-01,0.508593,0.064955,0.028065,0.001459,0.027317,0.272179,0.001348,-1
4,AVY.N,2014-01-01,0.471071,0.065443,0.028238,0.001667,0.027338,0.315017,0.000900,-1
...,...,...,...,...,...,...,...,...,...,...
19299,POOL.OQ,2021-10-01,0.560135,0.064377,0.032245,0.006004,0.030598,0.248134,0.000900,-1
19300,POOL.OQ,2022-01-01,0.690834,0.062055,0.034274,0.007826,0.031566,0.360861,0.000470,-1
19301,POOL.OQ,2022-04-01,0.657271,0.064385,0.032488,0.005848,0.030290,0.184744,0.001660,-1
19302,POOL.OQ,2022-07-01,0.471252,0.068358,0.031540,0.004842,0.029495,0.195817,0.001059,-1


In [25]:
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error
data = df_clean.dropna() # Remove rows with missing values
encoder = LabelEncoder()
data['Instrument'] = encoder.fit_transform(data['Instrument']) # Encode categorical variable
data.set_index(['Date', 'Instrument'], inplace=True) # Set the index to date and panel ID

# Split the data
split_date = '2017-01-01'
end_date = "2019-01-01"
train = data.loc[data.index.get_level_values('Date') < split_date]
#test = data.loc[data.index.get_level_values('Date') >= split_date]
test = data.loc[(data.index.get_level_values('Date') >= split_date) & (data.index.get_level_values('Date') < end_date)]

# Define the XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=1000, max_depth=3, learning_rate=0.1)

# Define the time series split
tscv = TimeSeriesSplit(n_splits=12)

# Train the model
for train_index, test_index in tscv.split(train):
    X_train, X_test = train.iloc[train_index], train.iloc[test_index]
    y_train, y_test = X_train.pop('Earnings Per Share - Actual Surprise AbsVals'), X_test.pop('Earnings Per Share - Actual Surprise AbsVals')
    xgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)], early_stopping_rounds=50, verbose=False)

# Make predictions
y_pred = xgb_model.predict(test.drop('Earnings Per Share - Actual Surprise AbsVals', axis=1))

# Evaluate the model
mse = mean_squared_error(test['Earnings Per Share - Actual Surprise AbsVals'], y_pred)
mae = mean_absolute_error(test['Earnings Per Share - Actual Surprise AbsVals'], y_pred)
r2 = r2_score(test['Earnings Per Share - Actual Surprise AbsVals'], y_pred)

print(f'MSE: {mse}')
print(f'MAE: {mae}')
print(f'R2: {r2}')

importances = xgb_model.feature_importances_
for i in range(len(importances)):
    print(f"Feature {i}: {importances[i]}")



MSE: 0.007987028929518079
MAE: 0.05266523381994445
R2: 0.00269535854336056
Feature 0: 0.14056053757667542
Feature 1: 0.12205764651298523
Feature 2: 0.14574943482875824
Feature 3: 0.15287086367607117
Feature 4: 0.1554236114025116
Feature 5: 0.11971303075551987
Feature 6: 0.16362477838993073
