In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pyreadstat
from datetime import datetime

In [None]:
# Load the CRSP fund extract from Excel and display it.
crsp_path = "Data/CRSP_fund.xlsx"
crsp_data = pd.read_excel(crsp_path)
crsp_data

In [None]:
# Inspect the column labels available in the raw CRSP extract.
crsp_data.columns

In [None]:
# Keep the five CRSP columns used downstream and map each one to the short
# name the rest of the notebook refers to.
crsp_cols = ['Date', 'fundid(CRSP)', 'fund_CUSIP', 'fundname(CRSP)', 'indexflag']
crsp_source_cols = ['Date', 'Fund Identifier', 'CUSIP (8-digit)', 'Fund Name', 'index_fund_flag']
crsp_data2 = crsp_data[crsp_source_cols].rename(
    columns=dict(zip(crsp_source_cols, crsp_cols))
)

In [None]:
# Keep only rows whose index flag is 'B' or 'D'.
# NOTE(review): presumably these codes mark index-based / pure index funds in
# CRSP -- confirm against the CRSP codebook.
index_flags_kept = ['B', 'D']
has_index_flag = crsp_data2['indexflag'].isin(index_flags_kept)
index_flag_funds = crsp_data2[has_index_flag]
index_flag_funds

In [None]:
# Load the LSEG holdings file (Stata format); pyreadstat returns the data
# frame together with its metadata object.
lseg_path = "Data/LSEG_fund.dta"
lseg_data, lseg_meta = pyreadstat.read_dta(lseg_path)
lseg_data

In [None]:
# Inspect the column labels available in the raw LSEG holdings file.
lseg_data.columns

In [None]:
# Drop holdings rows with a missing 'shrout2', keep the columns needed
# downstream, and rename them to the notebook's working names.
# NOTE(review): 'shrout2' is relabelled as shares outstanding here but is later
# summed across funds as "shares held" -- confirm this is the intended column.
lseg_cols = ['Date', 'fundid(LSEG)', 'fundname(LSEG)', 'stock_CUSIP', 'stock_name', 'ticker', 'shares_outstanding(1000s)']
lseg_source_cols = ['fdate', 'fundno', 'fundname', 'cusip', 'stkname', 'ticker', 'shrout2']

lseg_data2 = lseg_data[lseg_data['shrout2'].notna()]
lseg_data3 = lseg_data2[lseg_source_cols].rename(
    columns=dict(zip(lseg_source_cols, lseg_cols))
)
lseg_data3

In [None]:
# Parse both 'Date' columns to datetime64 so the two sources can be joined on
# date. Both frames were derived by slicing another frame, so the columns are
# replaced via .assign (which returns a new frame) instead of item assignment,
# which would raise SettingWithCopyWarning.
index_flag_funds = index_flag_funds.assign(Date=pd.to_datetime(index_flag_funds['Date']))
lseg_data3 = lseg_data3.assign(Date=pd.to_datetime(lseg_data3['Date']))

In [None]:
# Keep holdings dated on or after the start of the sample window.
holdings_start = "2022-01-01"
in_sample = lseg_data3['Date'] >= holdings_start
lseg_data3 = lseg_data3[in_sample]
lseg_data3

In [None]:
# Load the CRSP fund id -> wficn link table and keep just the two link columns
# under the notebook's working names.
CRSP_WFICN = pd.read_excel("Data/CRSP_WFICN.xlsx")
crsp_link_names = {
    'Fund Identifier': 'fundid(CRSP)',
    'Wharton Financial Instution Center Number': 'wficn',
}
CRSP_WFICN2 = CRSP_WFICN[list(crsp_link_names)].rename(columns=crsp_link_names)
CRSP_WFICN2

In [None]:
# Attach wficn to every flagged CRSP fund row; the inner join discards funds
# that have no entry in the link table.
index_flag_funds2 = index_flag_funds.merge(
    CRSP_WFICN2,
    on="fundid(CRSP)",
    how="inner",
)
index_flag_funds2

In [None]:
# Load the LSEG fundno -> wficn link table and keep just the two link columns
# under the notebook's working names.
FUNDNO_WFICN = pd.read_stata("Data/FUNDO_WFICN.dta")
lseg_link_names = {'fundno': 'fundid(LSEG)', 'wficn': 'wficn'}
FUNDNO_WFICN2 = FUNDNO_WFICN[list(lseg_link_names)].rename(columns=lseg_link_names)
FUNDNO_WFICN2

In [None]:
# Collapse repeated (wficn, fundid(LSEG)) pairs to one row each, renumber the
# index, then discard pairs where either identifier is missing.
link_keys = ['wficn', 'fundid(LSEG)']
FUNDNO_WFICN3 = (
    FUNDNO_WFICN2
    .drop_duplicates(subset=link_keys, keep='first')
    .reset_index(drop=True)
    .dropna()
)
FUNDNO_WFICN3

In [None]:
# Attach wficn to each holding row, then drop every row that has a missing
# value in ANY column (this removes unlinked funds, but also rows with e.g. a
# missing ticker or stock name).
lseg_data4 = (
    lseg_data3
    .merge(FUNDNO_WFICN3, on='fundid(LSEG)', how="left")
    .dropna()
)
lseg_data4

In [None]:
# Debugging aid, left disabled: lists the distinct holding dates in lseg_data4.
#lseg_data4['Date'].unique().tolist()

In [None]:
# Match flagged CRSP funds to LSEG holdings reported on the same date for the
# same wficn; only pairs present on both sides survive the inner join.
join_keys = ['Date', 'wficn']
merged_data = index_flag_funds2.merge(lseg_data4, how='inner', on=join_keys)
merged_data

In [None]:
# Aggregate the matched holdings to one total per (Date, ticker).
#
# merged_data comes from a join on (Date, wficn). Whenever several CRSP fund
# ids share one wficn on a date, each LSEG holding row appears once per id, and
# summing those copies would count the same position several times. Collapse
# back to one row per LSEG portfolio/security/date before summing.
#
# NOTE(review): the summed column is labelled as shares outstanding (it was
# renamed from 'shrout2'), yet the total is used as shares held by passive
# funds -- confirm that this is the right source column.
holding_keys = ['Date', 'fundid(LSEG)', 'stock_CUSIP', 'ticker']
unique_holdings = merged_data.drop_duplicates(subset=holding_keys)
passive_data = (
    unique_holdings
    .groupby(['Date', 'ticker'])['shares_outstanding(1000s)']
    .sum()
    .reset_index()
)

In [None]:
# Rename the aggregated column and convert it from thousands of shares to
# shares. Renaming by label (instead of assigning a positional list to
# .columns) keeps this cell re-runnable: it adds a fourth column, so a second
# positional assignment of three names would fail with a length mismatch.
passive_data = passive_data.rename(
    columns={'shares_outstanding(1000s)': 'sharesheldpassive(1000s)'}
)
passive_data['shares_held_passive'] = passive_data['sharesheldpassive(1000s)'] * 1000 #check
passive_data_22 = passive_data.drop(columns=['sharesheldpassive(1000s)'])
passive_data_22

In [None]:
# Cache the aggregated passive holdings. The row index carries no information,
# so it is not written; otherwise it comes back as an 'Unnamed: 0' column when
# the file is read again.
passive_data_22.to_csv("Data/passive_22.csv", index=False)

In [None]:
#Can start from here
# Reload the cached passive holdings. Only empty fields are treated as
# missing: with pandas' default NA markers a ticker spelled "NA", "NULL",
# "NAN", etc. would be read as NaN and then removed by the dropna that follows.
passive_data_22 = pd.read_csv(
    "Data/passive_22.csv",
    keep_default_na=False,
    na_values=[''],
)

In [None]:
# Remove rows containing any missing value, then keep the three columns used
# in the stock-level merge.
passive_keep_cols = ['Date', 'ticker', 'shares_held_passive']
passive_data2 = passive_data_22.dropna()
passive_data3 = passive_data2[passive_keep_cols]
passive_data3

In [None]:
# Load the LSEG earnings-announcement extract from Excel and display it.
earnings_path = "Data/LSEG_earnings.xlsx"
earnings_dates = pd.read_excel(earnings_path)
earnings_dates

In [None]:
# Keep the ticker, period end, announcement date and announcement time, under
# the short names used below.
earnings_names = {
    'oftic': 'ticker',
    'Period End Date, SAS Format': 'quarter',
    'Announce Date, SAS Format': 'earnings_date',
    'Announce time, SAS Format': 'earnings_time',
}
earnings_dates2 = earnings_dates[list(earnings_names)].rename(columns=earnings_names)
earnings_dates2

In [None]:
# Parse the announcement date to datetime64 and the announcement time
# (expected as HH:MM:SS) to datetime.time objects. earnings_dates2 is a column
# slice of earnings_dates, so the columns are replaced through .assign, which
# returns a new frame, rather than item assignment, which would raise
# SettingWithCopyWarning.
earnings_dates2 = earnings_dates2.assign(
    earnings_date=pd.to_datetime(earnings_dates2['earnings_date']),
    earnings_time=pd.to_datetime(earnings_dates2['earnings_time'], format='%H:%M:%S').dt.time,
)
earnings_dates2

In [None]:
# Flag announcements made strictly after 16:30:00.
# NOTE(review): presumably the cutoff stands for the end of the trading day --
# confirm that 16:30 rather than 16:00 is intended.
after_hours_cutoff = pd.to_datetime('16:30:00', format='%H:%M:%S').time()
earnings_dates2['nextday'] = earnings_dates2['earnings_time'] > after_hours_cutoff


In [None]:
# Move after-hours announcements to the following calendar day.
#
# The shift is applied row by row: only rows flagged in 'nextday' move, all
# other dates are left untouched. (Testing `.any()` and then adding a day to
# the whole column would shift every announcement as soon as a single one is
# flagged.)
#
# NOTE(review): the shift is one calendar day, so a flagged Friday announcement
# lands on a Saturday -- confirm whether the next trading day is wanted.
# Re-running this cell shifts the flagged rows again.
is_after_hours = earnings_dates2['nextday']
earnings_dates2 = earnings_dates2.assign(
    earnings_date=earnings_dates2['earnings_date'].where(
        ~is_after_hours,
        earnings_dates2['earnings_date'] + pd.Timedelta(days=1),
    )
)

earnings_dates2

In [None]:
# Drop the helper columns and duplicate the (adjusted) announcement date into
# a 'Date' column so it can serve as a join key.
helper_cols = ['earnings_time', 'nextday', 'quarter']
earnings_dates3 = (
    earnings_dates2
    .drop(columns=helper_cols)
    .assign(Date=lambda frame: frame['earnings_date'])
)
earnings_dates3

In [None]:
# Load the daily stock file (Stata format).
daily_stock_path = "Data/daily_stock.dta"
stock_data = pd.read_stata(daily_stock_path)

In [None]:
# Parse 'date' first so the cutoff comparison is always datetime-vs-datetime,
# whatever dtype the file was read with, then keep observations from
# 2022-01-01 onward. The explicit .copy() makes the result an independent
# frame, so later column assignments do not raise SettingWithCopyWarning.
stock_data = stock_data.assign(date=pd.to_datetime(stock_data['date']))
stock_data = stock_data[stock_data['date'] >= pd.Timestamp("2022-01-01")].copy()

In [None]:
# Assign working names to the nine stock-file columns (positional, so the file
# must contain exactly these nine columns in this order).
stock_colnames = ["permno", "Date", "ticker", "company", "cusip", "price", "vol", "return", "shares_outstnading(1000s)"]
stock_data.columns = stock_colnames
# The source column is expressed in thousands of shares, so the conversion to
# shares is x1,000 -- the same factor applied to the passive holdings. (The
# previous x1,000,000 contradicted the column's own "(1000s)" label and shrank
# every passive-ownership ratio by a factor of 1,000.)
# NOTE(review): confirm the unit against the data provider's documentation.
stock_data["shares_outstanding"] = stock_data["shares_outstnading(1000s)"] * 1000

In [None]:
# Remove the thousands-denominated column now that the share count exists,
# then discard rows with a missing value in any remaining column.
stock_data2 = stock_data.drop(columns=["shares_outstnading(1000s)"])
stock_data3 = stock_data2.dropna()
stock_data3

In [None]:
# Diagnostic: rows that share their (ticker, Date) with at least one other row
# AND carry a negative price.
shares_ticker_date = stock_data3.duplicated(subset=['ticker', 'Date'], keep=False)
price_below_zero = stock_data3['price'] < 0
negative_prices = stock_data3[shares_ticker_date & price_below_zero]
negative_prices

In [None]:
# Keep only rows with a non-negative price.
# NOTE(review): if this is CRSP data, a negative price conventionally marks a
# bid/ask average rather than an invalid quote -- confirm that dropping these
# rows (instead of taking the absolute value) is intended.
price_is_non_negative = stock_data3['price'] >= 0
stock_data3 = stock_data3[price_is_non_negative]

In [None]:
# Where several rows share a (ticker, Date), keep the one with the largest
# share count: sort so that row comes first within each pair, then keep the
# first occurrence.
stock_data3 = (
    stock_data3
    .sort_values(
        by=['ticker', 'Date', 'shares_outstanding'],
        ascending=[True, True, False],
    )
    .drop_duplicates(subset=['ticker', 'Date'], keep='first')
)

In [None]:
def _normalize_merge_keys(frame):
    """Return a copy of ``frame`` with join keys in canonical form.

    'ticker' is cast to str and upper-cased, 'Date' is parsed to datetime64,
    and the rows are sorted by (ticker, Date) with a fresh 0..n-1 index.
    The input frame is not modified.
    """
    return (
        frame
        .assign(
            ticker=frame['ticker'].astype(str).str.upper(),
            Date=pd.to_datetime(frame['Date']),
        )
        .sort_values(['ticker', 'Date'])
        .reset_index(drop=True)
    )


# Apply the same key normalisation to both sides of the upcoming merge. Using
# a helper that returns new frames replaces eight copy-pasted item assignments
# on slice-derived frames (which raise SettingWithCopyWarning).
passive_data3 = _normalize_merge_keys(passive_data3)
stock_data3 = _normalize_merge_keys(stock_data3)

In [None]:
# Left-join passive holdings onto the daily stock panel; stock rows without a
# holdings record on that exact (ticker, Date) get NaN.
# NOTE(review): the match is on the exact date, so a holdings report dated on
# a day with no stock row (e.g. a weekend quarter-end) matches nothing.
merge1 = stock_data3.merge(passive_data3, how='left', on=['ticker', 'Date'])
merge1

In [None]:
# Spread each reported holding over every trading day of the same calendar
# quarter for that ticker: forward-fill, then back-fill, within each
# (ticker, QuarterEnd) group. Groups with no reported value stay all-NaN.
merge2 = merge1.assign(QuarterEnd=merge1['Date'].dt.to_period('Q').dt.end_time)

quarter_keys = ['ticker', 'QuarterEnd']
forward_filled = merge2.groupby(quarter_keys)['shares_held_passive'].ffill()
merge2['shares_held_passive'] = (
    forward_filled
    .groupby([merge2['ticker'], merge2['QuarterEnd']])
    .bfill()
)
merge2

In [None]:
# Discard rows still missing any value (including days with no passive
# holdings after the quarter fill), remove the helper column, and compute the
# fraction of shares outstanding held by the flagged funds.
merge3 = merge2.dropna().drop(columns=['QuarterEnd'])
merge3['share_passive'] = merge3['shares_held_passive'].div(merge3['shares_outstanding'])
merge3

In [None]:
# Tag stock-days with an earnings announcement: 'earnings_date' is populated
# where a (Date, ticker) match exists in earnings_dates3 and NaT elsewhere.
# NOTE(review): earnings tickers are not upper-cased like the other two
# sources -- confirm they already match.
merge4 = merge3.merge(earnings_dates3, how='left', on=['Date', 'ticker'])
merge4

In [None]:
# Sanity check: count tickers that ever show a passive share of 100% or more
# against the total number of tickers in the panel.
at_or_above_one = merge3['share_passive'] >= 1
greaterthan1 = merge3[at_or_above_one]
problem_tickers = greaterthan1['ticker'].unique()
totaltickers = merge3['ticker'].unique()
n_total, n_problem = len(totaltickers), len(problem_tickers)
print(f"total:{n_total}, problems: {n_problem}")