# Accuracy Analysis

In [66]:
import eikon as ek
import pandas as pd
import numpy as np
import datetime
import plotly
import plotly.express as px
ek.set_app_key("f47c330480d74c598b7e8ebc2539424e91764dd8")

https://community.developers.refinitiv.com/questions/73493/get-eps-historical-data-for-stocks.html

### Accuracy Variables  

**TR.EPSActValue** - The company's actual value normalized to reflect the I/B/E/S default currency and corporate actions (e.g. stock splits). Earnings Per Share is defined as the EPS that the contributing analyst considers to be that with which to value a security. This figure may include or exclude certain items depending on the contributing analyst's specific model.  

**TR.EPSMean** - The statistical average of all broker estimates determined to be on the majority accounting basis. Earnings Per Share is defined as the EPS that the contributing analyst considers to be that with which to value a security. This figure may include or exclude certain items depending on the contributing analyst's specific model.  

--> this is a analyst forecast variable

**TR.EPSActSurprise** - The difference between the actual and the last mean of the period, expressed as a percentage. Earnings Per Share is defined as the EPS that the contributing analyst considers to be that with which to value a security. This figure may include or exclude certain items depending on the contributing analyst's specific model.  

--> forecast error between actual EPS and TR.EPSMean  


### DataFrame 

In [67]:
accuracy_variables = ['TR.EPSactValue.date', 'TR.EPSActValue', "TR.EPSMean", "TR.EPSActSurprise"]
df_accuracy, e = ek.get_data('0#.SPX',accuracy_variables, parameters = {'SDate':'0','EDate':'-5','Period':'FQ0','Frq':'FQ'})
df_accuracy["Date"] = pd.to_datetime(df_accuracy["Date"])
df_accuracy = df_accuracy.dropna()
df_accuracy

Unnamed: 0,Instrument,Date,Earnings Per Share - Actual,Earnings Per Share - Mean,Earnings Per Share - Actual Surprise
0,POOL.OQ,2023-02-16 07:00:00+00:00,1.82,1.987,-8.405
1,POOL.OQ,2022-10-20 07:00:00+00:00,4.78,4.5875,4.196
2,POOL.OQ,2022-07-21 07:00:00+00:00,7.63,7.517,1.503
3,POOL.OQ,2022-04-21 07:00:00+00:00,4.23,3.14867,34.342
4,POOL.OQ,2022-02-17 07:00:00+00:00,2.63,1.875,40.267
...,...,...,...,...,...
3003,AVY.N,2022-10-26 06:45:00+00:00,2.46,2.46323,-0.131
3004,AVY.N,2022-07-27 06:45:00+00:00,2.64,2.36485,11.635
3005,AVY.N,2022-04-26 06:45:00+00:00,2.4,2.17008,10.595
3006,AVY.N,2022-02-02 06:45:00+00:00,2.13,2.12375,0.294


In [68]:
df_accuracy.dtypes

Instrument                                           string
Date                                    datetime64[ns, UTC]
Earnings Per Share - Actual                         Float64
Earnings Per Share - Mean                           Float64
Earnings Per Share - Actual Surprise                Float64
dtype: object

### Exploratory Data Analysis of Analyst Forcast Accuracy

#### 1) Summary statistics for EPS Actual, EPS Mean (forecast), and EPS Surprise

In [69]:
df_accuracy.describe()

Unnamed: 0,Earnings Per Share - Actual,Earnings Per Share - Mean,Earnings Per Share - Actual Surprise
count,3005.0,3005.0,3005.0
mean,2.226721,2.090364,3.688549
std,5.597372,5.236982,261.898117
min,-7.69,-5.60614,-8858.503
25%,0.78,0.7338,0.474
50%,1.43,1.32212,4.8
75%,2.52,2.35192,12.413
max,133.441,126.76571,3784.211


Min and max values seem very high. Next step is to check for outliers and remove them for better results:  

**Removing Outliers:**

In [70]:
num_cols = df_accuracy.select_dtypes(include=[np.number]).columns.tolist()
num_cols

summary_stats = df_accuracy["Earnings Per Share - Actual Surprise"].describe()
summary_stats

Q1 = summary_stats.loc['25%']
Q3 = summary_stats.loc['75%']
print(Q1, Q3, Q3 - Q1)

0.474 12.413 11.939


In [71]:
#num_cols = df.select_dtypes(include=[np.number]).columns.tolist()
#surprise_column = df_accuracy["Earnings Per Share - Actual Surprise"].tolist()
summary_stats = df_accuracy["Earnings Per Share - Actual Surprise"].describe()
Q1 = summary_stats.loc['25%']
Q3 = summary_stats.loc['75%']
IQR = Q3 - Q1
threshold = 7
surprise_outliers_removed = df_accuracy["Earnings Per Share - Actual Surprise"].loc[~((df_accuracy["Earnings Per Share - Actual Surprise"] < (Q1 - threshold * IQR)) | (df_accuracy["Earnings Per Share - Actual Surprise"] > (Q3 + threshold * IQR)))]
df_accuracy_new = df_accuracy.copy()
df_accuracy_new["Earnings Per Share - Actual Surprise"] = surprise_outliers_removed
df_accuracy_new

Unnamed: 0,Instrument,Date,Earnings Per Share - Actual,Earnings Per Share - Mean,Earnings Per Share - Actual Surprise
0,POOL.OQ,2023-02-16 07:00:00+00:00,1.82,1.987,-8.405
1,POOL.OQ,2022-10-20 07:00:00+00:00,4.78,4.5875,4.196
2,POOL.OQ,2022-07-21 07:00:00+00:00,7.63,7.517,1.503
3,POOL.OQ,2022-04-21 07:00:00+00:00,4.23,3.14867,34.342
4,POOL.OQ,2022-02-17 07:00:00+00:00,2.63,1.875,40.267
...,...,...,...,...,...
3003,AVY.N,2022-10-26 06:45:00+00:00,2.46,2.46323,-0.131
3004,AVY.N,2022-07-27 06:45:00+00:00,2.64,2.36485,11.635
3005,AVY.N,2022-04-26 06:45:00+00:00,2.4,2.17008,10.595
3006,AVY.N,2022-02-02 06:45:00+00:00,2.13,2.12375,0.294


In [72]:
na_count = df_accuracy_new["Earnings Per Share - Actual Surprise"].isna().sum()
na_count

106

--> deteceted outliers at the given threshold

In [73]:
#removing NA
df_accuracy_new = df_accuracy_new.dropna()
df_accuracy_new

Unnamed: 0,Instrument,Date,Earnings Per Share - Actual,Earnings Per Share - Mean,Earnings Per Share - Actual Surprise
0,POOL.OQ,2023-02-16 07:00:00+00:00,1.82,1.987,-8.405
1,POOL.OQ,2022-10-20 07:00:00+00:00,4.78,4.5875,4.196
2,POOL.OQ,2022-07-21 07:00:00+00:00,7.63,7.517,1.503
3,POOL.OQ,2022-04-21 07:00:00+00:00,4.23,3.14867,34.342
4,POOL.OQ,2022-02-17 07:00:00+00:00,2.63,1.875,40.267
...,...,...,...,...,...
3003,AVY.N,2022-10-26 06:45:00+00:00,2.46,2.46323,-0.131
3004,AVY.N,2022-07-27 06:45:00+00:00,2.64,2.36485,11.635
3005,AVY.N,2022-04-26 06:45:00+00:00,2.4,2.17008,10.595
3006,AVY.N,2022-02-02 06:45:00+00:00,2.13,2.12375,0.294


#### Surprise Distribution (with outliers)

In [74]:
fig = px.histogram(df_accuracy, x="Earnings Per Share - Actual Surprise", nbins=100, title="EPS Surprise (%) Distribution")
fig.update_layout(yaxis=dict(tickformat=".2%"))
fig.show()

#### Surprise Distribution (without outliers)

In [75]:
fig = px.histogram(df_accuracy_new, x="Earnings Per Share - Actual Surprise", nbins=100, title="EPS Surprise (%) Distribution")
fig.update_layout(yaxis=dict(tickformat=".2%"))
fig.show()

#### 2) Mean of all time periods per Instrument

In [76]:
df_averages = df_accuracy.groupby("Instrument").mean()
df_averages


The default value of numeric_only in DataFrameGroupBy.mean is deprecated. In a future version, numeric_only will default to False. Either specify numeric_only or select only columns which should be valid for the function.



Unnamed: 0_level_0,Earnings Per Share - Actual,Earnings Per Share - Mean,Earnings Per Share - Actual Surprise
Instrument,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
A.N,1.253333,1.17476,6.625667
AAL.OQ,-0.351667,-0.408517,6.156333
AAP.N,3.138333,3.095295,2.1005
AAPL.OQ,1.538333,1.486917,3.306167
ABBV.N,3.418333,3.349458,2.076
...,...,...,...
YUM.N,1.123333,1.12382,-0.0935
ZBH.N,1.775,1.694997,5.18
ZBRA.OQ,4.43,4.276117,3.820833
ZION.OQ,1.431667,1.396037,2.782167


Summary statistics per instrument

In [77]:
df_averages.describe()

Unnamed: 0,Earnings Per Share - Actual,Earnings Per Share - Mean,Earnings Per Share - Actual Surprise
count,501.0,501.0,501.0
mean,2.225956,2.08996,3.648812
std,5.413813,5.087932,128.39726
min,-3.223333,-2.691338,-2109.542333
25%,0.838333,0.777785,1.9435
50%,1.46,1.347137,5.741667
75%,2.515,2.392125,10.817667
max,111.281833,105.479365,821.322333


remove outliers

Mean surprise (in percentage) per instrument divided into percentile groups

In [78]:
# grouping data by year
df_accuracy_yearly, e = ek.get_data('0#.SPX', accuracy_variables, parameters = {'SDate':'0','EDate':'-5','Period':'FY0','Frq':'FY'})
df_accuracy_yearly["Date"] = pd.to_datetime(df_accuracy_yearly["Date"]).dt.year
df_accuracy_yearly = df_accuracy_yearly.dropna()

In [79]:
bins = [-100, -50, -20, -10, -5, 0, 5, 10, 20, 30, 40, 50, 60, 80, 100]

# Group the data by year and calculate the percentile counts for each year
df_percentiles = pd.DataFrame(index=range(df_accuracy_yearly["Date"].min(), df_accuracy_yearly["Date"].max()+1),
                              columns=[f"{bins[i]}-{bins[i+1]}" for i in range(len(bins)-1)])
for year in df_percentiles.index:
    df_year = df_accuracy_yearly[df_accuracy_yearly["Date"] == year]
    percentile_counts = pd.cut(df_year["Earnings Per Share - Actual Surprise"], bins=bins, labels=df_percentiles.columns).value_counts().sort_index()
    df_percentiles.loc[year] = percentile_counts.values

df_percentiles

Unnamed: 0,-100--50,-50--20,-20--10,-10--5,-5-0,0-5,5-10,10-20,20-30,30-40,40-50,50-60,60-80,80-100
2017,0,1,2,3,31,96,8,5,0,2,0,0,0,0
2018,0,4,11,8,97,309,33,12,8,1,0,1,2,0
2019,1,6,10,7,122,298,26,14,1,1,2,0,1,0
2020,2,6,4,9,98,300,35,22,6,4,3,0,1,2
2021,5,9,10,12,72,258,65,40,11,5,2,0,1,3
2022,0,7,8,13,98,316,30,11,4,3,2,1,2,1
2023,1,5,7,16,81,195,20,10,3,3,0,1,0,0


Surprise development over time

In [80]:
# Group the data by company and date and calculate the mean surprise percentag
df_grouped = df_accuracy.groupby(['Instrument', 'Date'])['Earnings Per Share - Actual Surprise'].mean().reset_index()

# Plot the data for each company as a separate line
for company in df_grouped['Instrument'].unique():
    company_data = df_grouped[df_grouped['Instrument'] == company]
    plt.plot(company_data['Date'], company_data['Earnings Per Share - Actual Surprise'], label=company)

# Add axis labels and legend
plt.xlabel('Date')
plt.ylabel('EPS Surprise (%)')
#plt.legend(loc='upper left')

# Show the plot
plt.show()

NameError: name 'plt' is not defined