In [None]:
import pandas as pd
import numpy as np
import time
import sklearn 

# Load the processed IBES EPS quarterly file (2003-2024, per its filename).
# The relative path uses forward slashes so it resolves on Windows, Linux and
# macOS alike; the earlier '..\\processed\\...' spelling only worked on Windows.
df_proxy = pd.read_csv('../processed/ibes_eps_quarters_2003_2024_processed.csv')

# Preview the first rows to sanity-check the columns that were read.
df_proxy.head()

Unnamed: 0,CUSIP,OFTIC,CNAME,STATPERS,FPI,NUMEST,FPEDATS
0,87482X10,TLMR,TALMER BANCORP,17/04/2014,6,4,31/03/2014
1,87482X10,TLMR,TALMER BANCORP,15/05/2014,6,5,30/06/2014
2,87482X10,TLMR,TALMER BANCORP,19/06/2014,6,5,30/06/2014
3,87482X10,TLMR,TALMER BANCORP,17/07/2014,6,5,30/06/2014
4,87482X10,TLMR,TALMER BANCORP,17/04/2014,7,5,30/06/2014


In [3]:
# Parse the two date columns into datetime values.
# dayfirst=True because the raw strings are written day/month/year (e.g. 17/04/2014).
for date_col in ("STATPERS", "FPEDATS"):
    df_proxy[date_col] = pd.to_datetime(df_proxy[date_col], dayfirst=True)

In [4]:
# Label every row by the calendar half-year of its fiscal period end date:
# 'A' for January-June, 'B' for July-December.
# np.where evaluates the whole column at once instead of calling a Python
# lambda per row; a missing FPEDATS compares False and is labelled 'B',
# exactly as the lambda-based version did.
df_proxy['SemiAnnual'] = np.where(df_proxy['FPEDATS'].dt.month <= 6, 'A', 'B')

# Calendar year of the fiscal period end date (second half of the group key).
df_proxy['Year'] = df_proxy['FPEDATS'].dt.year

# Aggregate by CUSIP, OFTIC, Year, and SemiAnnual (A or B).
# NOTE(review): NUMEST is summed over every input row in the group; the input
# holds several STATPERS snapshots per fiscal period, so the sum is not a
# distinct-analyst count -- confirm this is the intended measure.
# NOTE(review): groupby's default dropna=True omits rows whose CUSIP, OFTIC or
# Year is missing -- confirm that silently dropping them is acceptable.
semi_annual = (
    df_proxy
    .groupby(['CUSIP', 'OFTIC', 'Year', 'SemiAnnual'])
    .agg({
        'NUMEST': 'sum',
        'FPEDATS': 'max'  # latest fiscal period end date within the half-year
    })
    .reset_index()
)

# Expose the half-year label under the name FPI. pop() removes SemiAnnual and
# the assignment appends FPI as the last column, so the resulting column order
# is CUSIP, OFTIC, Year, NUMEST, FPEDATS, FPI.
semi_annual['FPI'] = semi_annual.pop('SemiAnnual')

# Order rows per company chronologically, first half before second half.
semi_annual = semi_annual.sort_values(['CUSIP', 'FPEDATS', 'FPI'])

In [5]:
# Display the aggregated semi-annual frame (the notebook renders a truncated preview).
semi_annual

Unnamed: 0,CUSIP,OFTIC,Year,NUMEST,FPEDATS,FPI
0,00030710,AAC,2014,2,2014-06-30,A
1,00030710,AAC,2014,13,2014-12-31,B
2,00030710,AAC,2015,29,2015-06-30,A
3,00030710,AAC,2015,76,2015-12-31,B
4,00030710,AAC,2016,72,2016-06-30,A
...,...,...,...,...,...,...
168228,U7260311,PLPM,2016,100,2016-12-31,B
168229,U7260311,PLPM,2017,102,2017-06-30,A
168230,U7260311,PLPM,2017,111,2017-12-31,B
168231,U7260311,PLPM,2018,39,2018-06-30,A


In [6]:
# Write the semi-annual table to the baseline data folder, omitting the
# positional index. Forward slashes keep the relative path valid on Windows,
# Linux and macOS; the earlier '..\\baseline_data\\...' spelling was Windows-only.
semi_annual.to_csv('../baseline_data/ibes_eps_semi_annual_by_date_2003_2024.csv', index=False)

In [7]:
# Count the rows of semi_annual whose Year equals 2024.
# This is a row count (one row per CUSIP / OFTIC / half-year group), so a
# company present in both halves of the year is counted twice.
len(semi_annual[semi_annual['Year'] == 2024])

7395