# Magic Formula Investing: Implementation and Simulation Using Python

## Cleaning the Data

In [1]:
# importing packages that are used throughout the code
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from matplotlib import rcParams

In [2]:
# importing our raw data and viewing a tiny subset of it 

# df contains quarterly company fundamentals, such as debt, cash, PPE, etc.
# does not contain EBIT, which is crucial to our calculations
df=pd.read_csv("QuarterlyRawDataNoEBIT.csv")

# the database only offers EBIT on an annual basis, so ebit_df 
# contains annual data for each company's EBIT 
ebit_df=pd.read_csv("AnnualRawInputData.csv")
print len(df), len(ebit_df)
df.head()

512530 140522


Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,cstkq,dlcq,dlttq,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic
0,1004,02/28/1990,1989,3,INDL,C,D,STD,AIR,AAR CORP,...,16.07,32.551,72.606,53.428,0,188.1,A,,31.125,5080
1,1004,05/31/1990,1989,4,INDL,C,D,STD,AIR,AAR CORP,...,16.082,33.821,72.329,63.441,0,184.932,A,,21.25,5080
2,1004,08/31/1990,1990,1,INDL,C,D,STD,AIR,AAR CORP,...,16.086,27.427,71.806,63.545,0,189.351,A,,15.875,5080
3,1004,11/30/1990,1990,2,INDL,C,D,STD,AIR,AAR CORP,...,16.086,33.563,71.769,63.075,0,186.955,A,,11.875,5080
4,1004,02/28/1991,1990,3,INDL,C,D,STD,AIR,AAR CORP,...,16.097,11.436,69.02,63.626,0,184.665,A,,12.875,5080


In [3]:
# trimming data for years past 2014
df=df[df['fyearq']<2015]
ebit_df=ebit_df[ebit_df['fyear']<2015]

# eliminating duplicate data that is double-listed under FS and INDL 
ebit_df=ebit_df[ebit_df['indfmt']!="FS"]

# the length of both dataframes should be shortening
print len(df), len(ebit_df)

487706 124596


In [4]:
# extracting the companies that have missing EBIT values
# (a ticker is listed as soon as any one of its annual rows has a null ebit)
missing_ebit_tics=ebit_df[ebit_df['ebit'].isnull()].tic.unique()

# removing those companies from our datasets
# (every row of a listed ticker is dropped, in both the quarterly and the annual frame)
df = df[~df.tic.isin(missing_ebit_tics)]
ebit_df = ebit_df[~ebit_df.tic.isin(missing_ebit_tics)]

# the length of both dataframes should be shortening
print len(df), len(ebit_df)

350463 87305


In [5]:
# function to get the appropriate annual EBIT value for each row in the quarterly dataframe
def get_row_ebit(row):
    """Look up the annual EBIT that belongs to one quarterly row.

    The lookup runs against the module-level ``ebit_df`` and selects the
    records whose ``fyear`` equals the row's ``fyearq`` and whose ``tic``
    equals the row's ``tic``.

    Args:
        row: one quarterly record (anything indexable by ``'fyearq'`` and
            ``'tic'``, e.g. the Series passed in by ``df.apply(..., axis=1)``).

    Returns:
        The matching ``ebit`` value as a float, or 0 when no annual record
        matches. When several annual records match, the first one is used.
    """
    matches = ebit_df.loc[
        (ebit_df['fyear'] == row['fyearq']) & (ebit_df['tic'] == row['tic']),
        'ebit',
    ]
    if matches.empty:
        # NOTE(review): a missing annual record is reported as 0, not NaN, so
        # downstream ratios become 0 for such rows; kept for compatibility.
        return 0
    # float(Series) only works when exactly one row matched (and is deprecated
    # in recent pandas); take the scalar explicitly instead.
    return float(matches.iloc[0])

In [None]:
%%time 
# takes a while to run so don't run this every time
# the edited CSVs have been saved and can be loaded in separately 
df['ebit']=df.apply(get_row_ebit,axis=1)

In [9]:
# function to get the appropriate annual cash values for 
# rows with missing cash values in the quarterly dataframe
# quarterly cash values are generally missing before 2007
def get_row_cash(row):
    """Return the cash value for one quarterly row, filling gaps from annual data.

    When the row's ``chq`` is not NaN it is returned unchanged. Otherwise the
    module-level ``ebit_df`` is searched for records with the same
    ``datadate`` and ``tic`` and their ``ch`` value is used.

    Args:
        row: one quarterly record (anything indexable by ``'chq'``,
            ``'datadate'`` and ``'tic'``, e.g. the Series passed in by
            ``df.apply(..., axis=1)``).

    Returns:
        ``row['chq']`` when present; otherwise the matching annual ``ch`` as a
        float (the first one when several records match), or NaN when no
        annual record matches.
    """
    if not math.isnan(row['chq']):
        return row['chq']

    matches = ebit_df.loc[
        (ebit_df['datadate'] == row['datadate']) & (ebit_df['tic'] == row['tic']),
        'ch',
    ]
    if matches.empty:
        return float('NaN')
    # float(Series) only works when exactly one row matched (and is deprecated
    # in recent pandas); take the scalar explicitly instead.
    return float(matches.iloc[0])

In [None]:
%%time
# also takes around 10-15 minutes
df['chq']=df.apply(get_row_cash,axis=1)

In [None]:
df.head()

In [None]:
# dropping the rows that still have missing cash values in the quarterly dataset
df=df[~np.isnan(df['chq'])]

# export to CSV so we don't have to rerun the cleaning, get_row_ebit and get_row_cash steps every time
df.to_csv("CleanedQuarterlyEBITCash.csv")
ebit_df.to_csv("CleanedAnnualEBIT.csv")

# Start here to avoid running data-cleaning code above
## which takes ~30 minutes

In [None]:
# re-read in the cleaned dataframes
df=pd.read_csv("CleanedQuarterlyEBITCash.csv")
ebit_df=pd.read_csv("CleanedAnnualEBIT.csv")

# drop the 'Unnamed: 0' column: to_csv writes the DataFrame index as an unnamed first column unless index=False is passed
df=df.drop(['Unnamed: 0'],axis=1)
ebit_df=ebit_df.drop(['Unnamed: 0'], axis=1)

## 1. Establish minimum market capitalization value of 50M

#### Filter out companies that do not meet this minimum

In [None]:
# we write a function here that will be used later once the year of simulation is determined
def set_min_market_cap(df, year):
    """List the tickers that clear the minimum market cap in a given year.

    Args:
        df: frame with ``year``, ``mkvaltq`` and ``tic`` columns.
        year: value compared for equality against the ``year`` column.

    Returns:
        A list with the ``tic`` of every row whose ``year`` equals ``year``
        and whose ``mkvaltq`` is strictly greater than 50, in frame order.
        A ticker appears once per qualifying row, so duplicates are possible.
    """
    # 50 is the notebook's "50M" minimum (presumably mkvaltq is in millions -- confirm)
    qualifies = (df['year'] == year) & (df['mkvaltq'] > 50)
    return list(df.loc[qualifies, 'tic'])

## 2. Excluding utility and financial companies

In [None]:
# remove SIC Division H Companies: Finance, Insurance, and Real Estate
# https://www.osha.gov/pls/imis/sic_manual.html
# i.e. keep only the rows whose SIC code lies outside 6000-6999
df_below_6000=df[df['sic']<6000]
df_above_7000=df[df['sic']>=7000]

# after the concat, the sic < 6000 rows come first, followed by the sic >= 7000 rows
df=pd.concat([df_below_6000,df_above_7000])

# df should be shorter; ebit_df is not filtered in this cell, so its length is unchanged
print len(df), len(ebit_df)

In [None]:
# remove SIC Division E Companies: Transportation, Communications, Electric, Gas, and Sanitary Services
# https://www.osha.gov/pls/imis/sic_manual.html
# i.e. keep only the rows whose SIC code lies outside 4000-4999
df_below_4000=df[df['sic']<4000]
df_above_5000=df[df['sic']>=5000]

# after the concat, the sic < 4000 rows come first, followed by the sic >= 5000 rows
df=pd.concat([df_below_4000,df_above_5000])

# df should be shorter; ebit_df is not filtered in this cell, so its length is unchanged
print len(df), len(ebit_df)

## 3. Calculating Earnings Yield: EBIT/EV

In [None]:
# basic function to calculate ratio 1 across a row
# ratio 1 = EBIT / EV
# EBIT = earnings before interest and taxes, after subtracting depreciation and amortization
# EV = enterprise value = market cap + debt - cash
# EV = MKVALTQ + DTQ - CHQ
# MKVALTQ = market cap = value of preferred stock + value of common stock
# NOTE: ratio_one below does not read the mkvaltq column; it substitutes PSTKQ + CSTKQ for it
# DTQ = debt = long-term debt + net current debt
# DTQ = DLTTQ + DLCQ
# CHQ = cash
# ratio 1 as computed below = EBIT / (PSTKQ + CSTKQ + DLCQ + DLTTQ - CHQ)
def ratio_one(row):
    """Compute ratio 1 (earnings yield) for a single row.

    The value is ``ebit / (pstkq + cstkq + dlcq + dlttq - chq)``.

    Args:
        row: one record indexable by ``'ebit'``, ``'pstkq'``, ``'cstkq'``,
            ``'dlcq'``, ``'dlttq'`` and ``'chq'``.

    Returns:
        The ratio, or NaN when any of the five denominator inputs is NaN or
        when the denominator is exactly zero.
    """
    denominator_inputs = ('pstkq', 'cstkq', 'dlcq', 'dlttq', 'chq')
    if any(math.isnan(row[col]) for col in denominator_inputs):
        return float('NaN')

    enterprise_value = (
        row['pstkq'] + row['cstkq'] + row['dlcq'] + row['dlttq'] - row['chq']
    )
    if enterprise_value == 0:
        # avoid dividing by zero
        return float('NaN')
    return row['ebit'] / enterprise_value

In [None]:
%%time
# add the ratio1 column to the dataframe
df['ratio1']=df.apply(ratio_one,axis=1)

## 4. Calculating Return on Capital: EBIT/(NFA + NWC)

In [None]:
# basic function to calculate ratio 2 across a row
# ratio 2 = EBIT / (NFA + NWC)
# EBIT = earnings before interest and taxes, after subtracting depreciation and amortization
# NFA = net fixed assets = net book value of Property Plant and Equipment (PPENTQ)
# NWC = net working capital = working capital - cash = WCAPQ - CHQ
def ratio_two(row):
    """Compute ratio 2 (return on capital) for a single row.

    The value is ``ebit / (ppentq + wcapq - chq)``.

    Args:
        row: one record indexable by ``'ebit'``, ``'ppentq'``, ``'wcapq'``
            and ``'chq'``.

    Returns:
        The ratio, or NaN when any of the three denominator inputs is NaN or
        when the denominator is exactly zero.
    """
    for col in ('ppentq', 'wcapq', 'chq'):
        if math.isnan(row[col]):
            return float('NaN')

    capital = row['ppentq'] + row['wcapq'] - row['chq']
    # a zero capital base would divide by zero, so report NaN instead
    return row['ebit'] / capital if capital != 0 else float('NaN')

In [None]:
%%time
# add the ratio2 column to the dataframe
df['ratio2']=df.apply(ratio_two,axis=1)

In [None]:
plt.hist(df['ratio1'])

## 5. Ranking the companies based on calculated ratios
### and extracting the top 20-30 companies

In [None]:
def get_top(df, year, ratio1weight):
    """Rank companies on ratio1 and ratio2 and return the best-ranked tickers.

    Args:
        df: frame with ``tic``, ``ratio1`` and ``ratio2`` columns, plus the
            columns read by ``set_min_market_cap``.
        year: year passed to ``set_min_market_cap`` to pick eligible tickers.
        ratio1weight: accepted but not used; both ranks are summed unweighted.

    Returns:
        A list of the unique tickers whose combined rank is 35 or better.
    """
    # eliminate companies that do not meet the min market cap
    eligible_tickers = set_min_market_cap(df, year)
    # `df['tic'] in some_list` is not an element-wise test (it raises on the
    # ambiguous truth value of a Series); .isin() builds the boolean mask.
    # .copy() makes the rank columns land on an independent frame, not on a
    # slice of the caller's df.
    short_df = df[df['tic'].isin(eligible_tickers)].copy()
    # NOTE(review): this keeps the eligible tickers' rows for every year, not
    # only `year`, so the ranking below spans all of them -- confirm intent.

    # rank the rows based on ratio1 and ratio2, with a larger ratio ==> better ranking
    # (rows with a NaN ratio are ranked last)
    short_df['rank1'] = short_df['ratio1'].rank(ascending=False, na_option='bottom')
    short_df['rank2'] = short_df['ratio2'].rank(ascending=False, na_option='bottom')
    short_df['ranksum'] = short_df['rank1'] + short_df['rank2']
    # a smaller rank sum is better, hence ascending=True here
    short_df['finalrank'] = short_df['ranksum'].rank(ascending=True, na_option='bottom')

    # get a list of the top tickers that we should invest in
    top_tickers = list(short_df[short_df['finalrank'] <= 35].tic.unique())
    return top_tickers

