In [1]:
import numpy as np
import pandas as pd
from IPython.display import clear_output

In [None]:
# Read the cleaned base file and convert the "Date" column from ISO-formatted
# text (YYYY-MM-DD) into datetime values in a single chained expression.
df = pd.read_csv(r"compustat cleaned base.csv").assign(
    Date=lambda frame: pd.to_datetime(frame["Date"], format="%Y-%m-%d")
)

Adjust the list of dropped columns in the cell below depending on which FCFE definition I'm using.

In [3]:
# Columns removed from the working frame in this run. The list is maintained
# by hand; edit it here rather than in the drop call.
columns_to_remove = [
    'fqtr',
    'Cash and Short Term Investments',
    'Debt in Current Liabilities',
    'Total Long Term Debt',
    'Depreciation and Amortisation',
    'Preferred Dividends',
    'Working Capital',
    'Long Term Debt Issuance',
    'Long Term Debt Reduction',
    'Change in Current Debt',
    'Capital Expenditures',
    'Net Borrowing',
    'Acquisitions',
    'Total Shares Repurchased',
    'Change in Long Term Debt v2',
    'Net Borrowing v2',
    'NCWC',
    'R&D Expenses',
    'Change in NCWC',
]

# Dropped in place so the existing `df` object itself is modified.
df.drop(columns=columns_to_remove, inplace=True)

^ The step above creates gaps in the series, so I need to test for gaps again.

In [5]:
# Removing gaps.
#
# Each "Adjusted_LPERMNO" series is cut wherever two consecutive observations
# are more than one quarter apart. A segment that ends at a break is
#   * relabelled (original id + running marker) if it has at least 9 rows, or
#   * discarded if it is shorter.
# The segment after the last break keeps its unsuffixed label; the length
# filter after the loop removes it if it is too short.
#
# Rows to discard are collected and dropped once after the loop, so `df` is not
# shrunk while its own groupby is being iterated and the frame is not rebuilt
# for every short segment.
#
# NOTE(review): labels are built by plain string concatenation
# (e.g. 10001 -> 100010, 100011), so a suffixed label could coincide with
# another id that already has that spelling -- confirm this cannot happen
# with the ids in this data set.
max_days_between_quarters = 92  # max 92 days in a quarter
min_index_span = 7              # segment needs i - start > 7, i.e. >= 9 rows

df["Adjusted_LPERMNO2"] = df["Adjusted_LPERMNO"].astype(str)

series_groups = df.groupby("Adjusted_LPERMNO")
length = series_groups.ngroups  # number of groups the loop will actually visit
counter = 0
rows_to_drop = []

for permno, group in series_groups:

    clear_output(wait=True)

    indices = list(group.index)
    dates = list(group["Date"])
    # Number of quarter-end dates between the first and last observation.
    # No need to offset to the end of the quarter here because the start and
    # end points are already offset.
    # NOTE(review): newer pandas versions spell this alias "QE" and warn on
    # "Q"; kept as-is to match the environment this notebook was run in.
    expected_dates = pd.date_range(group["Date"].min(), group["Date"].max(), freq="Q")

    # Only scan for breaks when the series has a different number of
    # observations than a gap-free quarterly series over the same span.
    if len(dates) != len(expected_dates):

        start = 0   # position (within this group) where the current segment begins
        marker = 0  # suffix for the next usable segment of this group

        for i in range(len(dates) - 1):

            if (dates[i + 1] - dates[i]).days > max_days_between_quarters:
                # There is a break between positions i and i+1, so the
                # current segment runs from 'start' to i inclusive.
                segment = indices[start:i + 1]

                if i - start > min_index_span:
                    # Usable segment: append the marker to its label.
                    # .loc takes index labels (not positions).
                    df.loc[segment, "Adjusted_LPERMNO2"] += str(marker)
                    marker += 1
                else:
                    # Segment not long enough: schedule its rows for removal.
                    rows_to_drop.extend(segment)

                # The next segment starts right after the break.
                start = i + 1

    counter += 1
    print(counter, "/", length)

# Remove all too-short segments in one pass (drop takes index labels).
df = df.drop(index=rows_to_drop)

# The loop never touches the segment after the last break (or a series with no
# breaks at all), so enforce the minimum length of 9 rows on every label here.
df = df.groupby(["Adjusted_LPERMNO2"]).filter(lambda x: len(x) > 8)
df.drop("Adjusted_LPERMNO", axis=1, inplace=True)
df = df.rename(columns={"Adjusted_LPERMNO2": "Adjusted_LPERMNO"})
df = df.reset_index(drop=True)

8661 / 8661


Interpolate gaps in common shares outstanding so that market cap can be calculated.

In [7]:
# Fill missing "Common Shares Outstanding" values per security, then compute
# market cap.
#
# For each "LPERMNO" group:
#   * every value missing -> all rows of the group are removed;
#   * no value missing    -> left untouched;
#   * otherwise           -> linearly interpolated.
# The filled values and the rows to remove are collected during the loop and
# applied once afterwards, so `df` is never modified while its groupby is being
# iterated and no column is assigned on a group copy.
#
# NOTE(review): grouping is by "LPERMNO", not the gap-split "Adjusted_LPERMNO",
# so interpolation can run across rows that belong to different segments --
# confirm this is intended.
shares_col = "Common Shares Outstanding"

securities = df.groupby("LPERMNO")
length = securities.ngroups  # number of groups the loop will actually visit
i = 0
rows_without_shares = []
filled_pieces = []

for permno, group in securities:

    clear_output(wait=True)

    shares = group[shares_col]
    is_missing = shares.isna()

    if is_missing.all():
        # Nothing to interpolate from: schedule the whole security for removal.
        rows_without_shares.extend(group.index)
    elif is_missing.any():
        # Need to specify limit_direction, otherwise values missing at the
        # start of the series cannot be filled backwards.
        filled_pieces.append(
            shares.interpolate(method='linear', limit_direction="both")
        )

    i += 1
    print(i, "/", length)

# Write all interpolated values back in one aligned assignment (by index label).
if filled_pieces:
    filled_shares = pd.concat(filled_pieces)
    df.loc[filled_shares.index, shares_col] = filled_shares

# Remove securities that had no share count at all.
df = df.drop(index=rows_without_shares)

# I don't interpolate gaps in price because it is likely that the stock
# underwent some sort of fundamental change; it can effectively be treated as a
# different security once it relists. Gaps in common shares outstanding can be
# interpolated.
df["Market Cap"] = df["Price"] * df[shares_col]
df = df.reset_index(drop=True)

6163 / 6163


In [75]:
# Number of distinct LPERMNO values currently in the frame.
len(pd.unique(df["LPERMNO"]))

10557

In [8]:
# Write the current frame to disk without the index column.
df.to_csv(path_or_buf="fcfe1_check.csv", index=False)
# Alternative target (disabled): basically compustat cleaned v2 but with market cap.
# df.to_csv("compustat cleaned base.csv",index=False)