# Cleaning stocks

In [None]:
import pandas as pd
# Load the raw monthly stock file and preview its first rows.
# Forward slashes keep the relative path portable across operating systems and
# avoid the invalid "\d" / "\s" escape sequences of the backslash form.
stocks = pd.read_csv('../data/stock_price_monthly_2003_2024.csv')

stocks.head()

In [None]:
# Columns removed from the stocks table once the dummy below has been derived.
drop_stock_cols = ["CUSIP", "vwretx", "PERMCO", "PERMNO", "RCRDDT", "SHRCD", "EXCHCD", "SICCD"]

# Build the 0/1 NasdaqDummy column before EXCHCD is dropped: 1 where EXCHCD == 3.
# NOTE(review): presumably exchange code 3 denotes NASDAQ -- confirm against
# the data dictionary.
try:
    stocks["NasdaqDummy"] = (stocks["EXCHCD"] == 3).astype(int)
except KeyError:
    # Only a missing column is tolerated (e.g. the cell was re-run after the
    # drop); any other failure should surface instead of being swallowed.
    print("EXCHCD col not found")

# errors="ignore" skips labels that are not present in the frame.
stocks.drop(columns=drop_stock_cols, inplace=True, errors="ignore")
stocks.head()

In [None]:
# Count the missing entries in each column of stocks.

stocks.isna().sum()

In [None]:
# Preview up to 10 rows of stocks that have a missing value in at least one column.

stocks.loc[stocks.isna().any(axis=1)].head(10)

In [None]:
# Remove, in place, every row of stocks that has a missing value in any column.

stocks.dropna(axis=0, how="any", inplace=True)

In [None]:
# Write the cleaned stocks table to the processed-data folder, omitting the
# row index. Forward slashes keep the relative path portable and avoid the
# invalid "\p" / "\s" escape sequences of the backslash form.

stocks.to_csv('../processed_data/stock_price_monthly_2003_2024_processed.csv', index=False)

# Cleaning IBES

In [None]:
import pandas as pd
# Load the raw IBES quarterly EPS file and preview its first rows.
# Forward slashes keep the relative path portable across operating systems and
# avoid the invalid "\d" / "\i" escape sequences of the backslash form.
ibes = pd.read_csv('../data/ibes_eps_quarters_2003_2024.csv')
ibes.head()

In [None]:
# Columns to remove from the IBES table.
drop_ibes_cols = ["MEASURE", "TICKER"]

# Drop them in place; errors="ignore" skips labels that are not present.
ibes.drop(columns=drop_ibes_cols, inplace=True, errors="ignore")

ibes.head()

In [None]:
# Count the missing entries in each column of ibes.

ibes.isna().sum()

In [None]:
# Preview up to 10 rows of ibes that have a missing value in at least one column.

ibes.loc[ibes.isna().any(axis=1)].head(10)

In [None]:
# Discard, in place, rows where CUSIP and OFTIC are both missing;
# rows missing only one of the two are kept.

ibes.dropna(axis=0, how="all", subset=["CUSIP", "OFTIC"], inplace=True)

In [None]:
# Write the cleaned IBES table to the processed-data folder, omitting the
# row index. Forward slashes keep the relative path portable and avoid the
# invalid "\p" / "\i" escape sequences of the backslash form.

ibes.to_csv('../processed_data/ibes_eps_quarters_2003_2024_processed.csv', index=False)

# Check for connection between 2 tables

In [None]:
# Reload the processed stocks table from disk. Forward slashes keep the path
# portable and avoid invalid backslash escape sequences.
stocks = pd.read_csv('../processed_data/stock_price_monthly_2003_2024_processed.csv')

In [None]:
# Reload the processed IBES table from disk. Forward slashes keep the path
# portable and avoid invalid backslash escape sequences.
ibes = pd.read_csv('../processed_data/ibes_eps_quarters_2003_2024_processed.csv')

In [None]:
# Distinct (CUSIP, OFTIC) combinations found in ibes; the first occurrence of
# each combination is the row that is kept.
ibes_unique_CUSIP_OFTIC = ibes.loc[:, ["CUSIP", "OFTIC"]].drop_duplicates(keep="first")

# Distinct values of the CNAME column in ibes.
ibes_unique_CNAME = ibes["CNAME"].unique()

# Display the distinct identifier combinations.
ibes_unique_CUSIP_OFTIC

In [None]:
# Distinct (NCUSIP, TICKER) combinations found in stocks; the first occurrence
# of each combination is the row that is kept.
stocks_unique_NCUSIP_TICKER = stocks.loc[:, ["NCUSIP", "TICKER"]].drop_duplicates(keep="first")

# Distinct values of the COMNAM column in stocks.
stocks_unique_COMNAM = stocks["COMNAM"].unique()

# Display the distinct identifier combinations.
stocks_unique_NCUSIP_TICKER

In [None]:
# # if (NCUSIP, TICKER) in both stocks and ibes, then merge

# stocks_ibes_uniques = pd.merge(stocks_unique_NCUSIP_TICKER, ibes_unique_CUSIP_OFTIC,
#                             how="inner", left_on=["NCUSIP", "TICKER"], right_on=["CUSIP", "OFTIC"])
# stocks_ibes_uniques 

In [None]:
# # keep only stocks that have both NCUSIP and TICKER in stocks_ibes_uniques
# stocks = stocks[stocks["NCUSIP"].isin(stocks_ibes_uniques["NCUSIP"]) & stocks["TICKER"].isin(stocks_ibes_uniques["TICKER"])]
# stocks

In [None]:
# Keep only the ibes rows whose CUSIP appears among the stocks NCUSIP values
# and whose OFTIC appears among the stocks TICKER values.
# NOTE(review): the two membership tests are independent, so a row passes even
# when its CUSIP and its OFTIC match two different stocks -- confirm that
# pair-wise (CUSIP, OFTIC) matching is not what is intended here.
ibes = ibes.loc[
    ibes["CUSIP"].isin(stocks_unique_NCUSIP_TICKER["NCUSIP"])
    & ibes["OFTIC"].isin(stocks_unique_NCUSIP_TICKER["TICKER"])
]
ibes

In [None]:
# Write both tables back to the processed-data folder, omitting the row index.
# Forward slashes keep the relative paths portable and avoid the invalid
# backslash escape sequences ("\p", "\s", "\i") of the original form.

stocks.to_csv('../processed_data/stock_price_monthly_2003_2024_processed.csv', index=False)
ibes.to_csv('../processed_data/ibes_eps_quarters_2003_2024_processed.csv', index=False)