In [None]:
import pandas as pd
import numpy as np
import time
import sklearn 

# Load the processed IBES EPS quarters file that feeds the rest of the notebook.
# The relative path uses forward slashes so it resolves on Windows, macOS and
# Linux alike; a backslash is an ordinary filename character outside Windows,
# which would make the lookup fail there.
df_proxy = pd.read_csv('../processed_data/ibes_eps_quarters_2003_2024_processed.csv')

# Preview the first rows as the cell output
df_proxy.head()

In [None]:
# Parse both date columns into datetimes, reading ambiguous dates day-first
for _date_col in ("STATPERS", "FPEDATS"):
    df_proxy[_date_col] = pd.to_datetime(df_proxy[_date_col], dayfirst=True)

In [None]:
# Label every row with the calendar half-year of its fiscal period end date:
# 'A' for months 1-6, 'B' for months 7-12. The comparison is vectorized; a
# missing FPEDATS compares False and therefore falls into 'B'.
df_proxy['SemiAnnual'] = np.where(df_proxy['FPEDATS'].dt.month <= 6, 'A', 'B')

# Calendar year of the fiscal period end date, used as a grouping key
df_proxy['Year'] = df_proxy['FPEDATS'].dt.year

# Aggregate to one row per (CUSIP, OFTIC, Year, half-year), expose the half-year
# label under the name FPI, and order the rows by company and period end date.
# NOTE(review): NUMEST is summed over every row in a group; if the input holds
# several STATPERS snapshots per fiscal period, the same estimates are counted
# more than once - confirm this is the intended measure.
semi_annual = (
    df_proxy
    .groupby(['CUSIP', 'OFTIC', 'Year', 'SemiAnnual'])
    .agg({
        'NUMEST': 'sum',
        'FPEDATS': 'max',  # latest fiscal period end date within the half-year
    })
    .reset_index()
    .rename(columns={'SemiAnnual': 'FPI'})
    # Keep FPI as the last column so the exported layout is unchanged
    .loc[:, ['CUSIP', 'OFTIC', 'Year', 'NUMEST', 'FPEDATS', 'FPI']]
    .sort_values(['CUSIP', 'FPEDATS', 'FPI'])
)

In [None]:
# Show the aggregated semi-annual table as this cell's output
semi_annual

In [None]:
# Write the semi-annual table to CSV without the index column. Forward slashes
# keep the relative path valid on every platform; with backslashes a non-Windows
# system would create a file whose name literally contains '..\models_data\'.
semi_annual.to_csv('../models_data/ibes_eps_semi_annual_by_date_2003_2024.csv', index=False)

In [None]:
# Count the distinct stocks (CUSIPs) with at least one semi-annual row in 2024.
# A plain row count would tally a stock once per half-year (and per ticker),
# overstating the number of stocks.
semi_annual.loc[semi_annual['Year'] == 2024, 'CUSIP'].nunique()