In [17]:
import pandas as pd
from IPython.display import clear_output

The purpose here is to calculate market cap with somewhat clean data but net income empty rows haven't been removed.

In [8]:
# raw dataset
df4 = pd.read_csv(r"compustat data v4.csv")
df4 = df4.rename(columns={"datadate": "Date",
                            "tic":"Ticker","conm":"Company", "datacqtr":"Calendar Quarter","datafqtr":"Fiscal Quarter", 
                            "actq":"Current Assets","dpq":"Depreciation and Amortisation",
                            "lctq":"Current Liabilities","wcapq":"Working Capital",
                            "ggroup":"Group","gind":"Industry","gsector":"Sector","gsubind":"Subindustry",
                            "cshopq":"Total Shares Repurchased", "cshoq":"Common Shares Outstanding",
                            "dlcq":"Debt in Current Liabilities","dlttq":"Total Long Term Debt",
                            "niq":"Net Income","dltisy":"Long Term Debt Issuance YTD",
                            "dltry":"Long Term Debt Reduction YTD",
                            "aqcy":"Acquisitions YTD","capxy": "Capital Expenditures YTD",
                            "dvpq":"Preferred Dividends", "saleq":"Sales", "ceqq":"Book Value",# i.e. common equity
                            "xrdq":"R&D Expenses",
                            "cheq":"Cash and Short Term Investments",
                            "cshtrq":"Common Shares Traded",
                            "dvpsxq":"Dividends per Share Ex-date","prccq":"Price"})
df4 = df4[df4["Price"].notna()]
df4["Date"]=pd.to_datetime(df4["Date"], format='%Y%m%d')
columns = list(df4.columns)
columns.remove("Date")
columns.remove("Calendar Quarter")
columns.remove("Price")
columns.remove("LPERMNO")
columns.remove("Common Shares Outstanding")
df4.drop(columns=columns,inplace = True)

In [10]:
df4 = df4.drop_duplicates(subset = ["LPERMNO","Calendar Quarter"], keep ="last")
df4 = df4.drop_duplicates(subset = ["LPERMNO","Date"], keep ="last").reset_index(drop = True)

In [None]:
df4["Adjusted Date"] = pd.to_datetime(df4["Calendar Quarter"])+pd.tseries.offsets.QuarterEnd(0)

df4.loc[df4["Calendar Quarter"].isna(),"Adjusted Date"]= df4["Date"] + pd.tseries.offsets.QuarterEnd(0)
df4 = df4.drop_duplicates(subset = ["LPERMNO","Adjusted Date"], keep ="last")
df4["Date"]= df4["Adjusted Date"]
df4.drop(columns=["Adjusted Date"],inplace = True)

In [16]:
df2 = pd.read_csv(r"C:\Users\Administrator.LAPTOP-3KUBTES6\Documents\Dissertation Sandbox v4\crsp data.csv")
df2 = df2.rename(columns={"date": "Date", "EXCHCD": "Exchange Code",
                          "TICKER":"Ticker","COMNAM":"Company",
                          "DIVAMT":"Dividend","PRC":"Price","VOL":"Volume","SHROUT":"Shares Outstanding"
                         })
df2 = df2[df2["Price"].notna()]
df2 = df2[df2["Exchange Code"]==3]
df2["Dividend"] = df2["Dividend"].fillna(0)
# CRSP's hyphenated price values means there was no close price, but instead a bid/ask average.
df2["Price"]= df2["Price"].abs()

df2["Date"]=pd.to_datetime(df2["Date"], format='%Y%m%d', errors='ignore')

df2= df2.sort_values(by=['PERMNO','Date','Dividend']) 
# things are mostly ordered, but I want to sort by dividend too in order to drop duplicates and keep the largest dividend entry.

df2 = df2.drop_duplicates(subset = ["PERMNO","Date"], keep ="last").reset_index(drop = True)

In [18]:
j = 0
length = len(df4["LPERMNO"].unique())

df4["Exchange Check"] = 0

for permno, group in df4.groupby("LPERMNO"):
    clear_output(wait=True)
    match = df2[df2["PERMNO"]==permno]
    firstdate = match["Date"].iloc[0]
    lastdate = match["Date"].iloc[-1]
    # aim here is to remove all observations that weren't when the stock was listed on the NASDAQ
    df4.loc[(df4.LPERMNO ==permno) & (df4.Date < firstdate),"Exchange Check"] = 1
    df4.loc[(df4.LPERMNO ==permno) & (df4.Date > lastdate),"Exchange Check"]=1 # these are the observations I want to drop!
    # this is much quicker than dropping observations in place
    j+=1
    print(j,"/",length)
    
df4 = df4[df4["Exchange Check"]==0].reset_index(drop = True)
    
df4.drop("Exchange Check",axis=1,inplace=True)


14006 / 14006


In [20]:
# 426000 rows get cut down to 360000 after checking exchanges

In [21]:
i = 0
length = len(df4["LPERMNO"].unique())
for permno, group in df4.groupby("LPERMNO"):
    
    clear_output(wait=True)
    
    if (group["Common Shares Outstanding"].isna()).all():
        df4.drop(group.index, inplace= True)
    elif (group["Common Shares Outstanding"].notna()).all():
        pass
    else:
        group["Common Shares Outstanding"] = group["Common Shares Outstanding"].interpolate(method='linear',limit_direction="both")
        # need to specify limit direction otherwise it can't interpolate backwards to values at start of series if missing
        df4.update(group["Common Shares Outstanding"]) # updates based on column name and index
        
    i+=1
    print(i,"/",length)
        
# I don't interpolate gaps in price because it was likely that the stock underwent some sort of fundamental change
# effectively can treat as a different security once it relists
# but gaps in common shares outstanding can be interpolated
df4["Market Cap"] = df4["Price"]*df4["Common Shares Outstanding"]
df4 = df4.reset_index(drop=True)

13974 / 13974


In [23]:
df4.to_csv("compustat cleaned raw.csv")