# Magic Formula Investing: Implementation and Simulation

## Implementation in Python

In [43]:
# importing packages
# import sys
# print sys.version
import math
import numpy as np
import pandas as pd
# import scipy.stats as stats
import matplotlib.pyplot as plt
# import sklearn
# import statsmodels.api as sm

# import seaborn as sns
# sns.set_style("whitegrid")
# sns.set_context("poster")

# special matplotlib argument for improved plots
from matplotlib import rcParams

In [45]:
df=pd.read_csv("QuarterlyRawDataNoEBIT.csv")
ebit_df=pd.read_csv("AnnualRawInputData.csv")
print len(df), len(ebit_df)
df.head()

512530 140522


Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,cstkq,dlcq,dlttq,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic
0,1004,02/28/1990,1989,3,INDL,C,D,STD,AIR,AAR CORP,...,16.07,32.551,72.606,53.428,0,188.1,A,,31.125,5080
1,1004,05/31/1990,1989,4,INDL,C,D,STD,AIR,AAR CORP,...,16.082,33.821,72.329,63.441,0,184.932,A,,21.25,5080
2,1004,08/31/1990,1990,1,INDL,C,D,STD,AIR,AAR CORP,...,16.086,27.427,71.806,63.545,0,189.351,A,,15.875,5080
3,1004,11/30/1990,1990,2,INDL,C,D,STD,AIR,AAR CORP,...,16.086,33.563,71.769,63.075,0,186.955,A,,11.875,5080
4,1004,02/28/1991,1990,3,INDL,C,D,STD,AIR,AAR CORP,...,16.097,11.436,69.02,63.626,0,184.665,A,,12.875,5080


# Cleaning the Data

In [46]:
# trimming data for years past 2014
df=df[df['fyearq']<2015]
ebit_df=ebit_df[ebit_df['fyear']<2015]

# eliminating duplicate data listed under FS and INDL
ebit_df=ebit_df[ebit_df['indfmt']!="FS"]

print len(df), len(ebit_df)

487706 124596


In [58]:
missing_ebit_tics=ebit_df[ebit_df['ebit'].isnull()].tic.unique()
# print type(missing_ebit_tics)

# df=df.drop(missing_ebit_tics,)

df = df[~df.tic.isin(missing_ebit_tics)]
ebit_df = ebit_df[~ebit_df.tic.isin(missing_ebit_tics)]

print len(df), len(ebit_df)

239972 87305


In [59]:
# remove SIC Division H Companies: Finance, Insurance, and Real Estate
df_below_6000=df[df['sic']<6000]
df_above_7000=df[df['sic']>=7000]

df=pd.concat([df_below_6000,df_above_7000])

print len(df), len(ebit_df)

239972 87305


In [60]:
# remove SIC Division E Companies: Transportation, Communications, Electric, Gas, and Sanitary Services
df_below_4000=df[df['sic']<4000]
df_above_5000=df[df['sic']>=5000]

df=pd.concat([df_below_4000,df_above_5000])

print len(df), len(ebit_df)

239972 87305


In [61]:
def get_row_ebit(row):
    ebit_row = ebit_df[ebit_df['fyear']==row['fyearq']]
    ebit_row = ebit_row[ebit_row['tic']== row['tic']]
    if ebit_row['ebit'].empty:
        return 0
    else: 
        return float(ebit_row['ebit'])
# ebit_df.head()

In [55]:
%%time 
# takes a while to run so don't run this every time
# the edited CSVs have been saved and can be loaded in separately 
df['ebit']=df.apply(get_row_ebit,axis=1)

CPU times: user 11min 46s, sys: 3.2 s, total: 11min 49s
Wall time: 2h 2min 27s


In [56]:
df.head()

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,dlcq,dlttq,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic,ebit
206,1050,02/28/1990,1989,3,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.069,1.085,0.024,0,0.382,A,,0.156,3564,-0.229
207,1050,05/31/1990,1989,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.057,0.804,0.024,0,0.372,A,,0.281,3564,-0.229
208,1050,08/31/1990,1990,1,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.061,1.171,0.155,0,0.553,A,,0.281,3564,-0.032
209,1050,11/30/1990,1990,2,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.059,1.182,0.099,0,0.583,A,,0.125,3564,-0.032
210,1050,02/28/1991,1990,3,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.073,0.244,0.024,0,-0.641,A,,0.25,3564,-0.032


In [62]:
ebit_df.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,conm,curcd,...,dt,ebit,ppent,pstk,wcap,costat,prcc_c,mkvalt,prcc_f,sic
0,1004,05/31/1990,1989,INDL,C,D,STD,AIR,AAR CORP,USD,...,,46.851,63.441,0,184.932,A,36.0,,21.25,5080
1,1004,05/31/1991,1990,INDL,C,D,STD,AIR,AAR CORP,USD,...,,33.701,63.415,0,189.172,A,11.5,,14.125,5080
2,1004,05/31/1992,1991,INDL,C,D,STD,AIR,AAR CORP,USD,...,,26.53,60.422,0,197.246,A,13.5,,12.875,5080
3,1004,05/31/1993,1992,INDL,C,D,STD,AIR,AAR CORP,USD,...,,16.343,56.052,0,193.399,A,11.75,,13.5,5080
4,1004,05/31/1994,1993,INDL,C,D,STD,AIR,AAR CORP,USD,...,,21.824,54.783,0,240.009,A,14.5,,14.375,5080


In [63]:
df.to_csv("CleanedQuarterlyRawWithEBIT.csv")
ebit_df.to_csv("CleanedAnnualEBIT.csv")

In [207]:
df=pd.read_csv("CleanedQuarterlyRawWithEBIT.csv")
ebit_df=pd.read_csv("CleanedAnnualEBIT.csv")
print len(df), len(ebit_df)

239972 87305


In [208]:
df=df.drop(['Unnamed: 0'],axis=1)
df.head()

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,dlcq,dlttq,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic,ebit
0,1050,02/28/1990,1989,3,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.069,1.085,0.024,0,0.382,A,,0.156,3564,-0.229
1,1050,05/31/1990,1989,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.057,0.804,0.024,0,0.372,A,,0.281,3564,-0.229
2,1050,08/31/1990,1990,1,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.061,1.171,0.155,0,0.553,A,,0.281,3564,-0.032
3,1050,11/30/1990,1990,2,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.059,1.182,0.099,0,0.583,A,,0.125,3564,-0.032
4,1050,02/28/1991,1990,3,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.073,0.244,0.024,0,-0.641,A,,0.25,3564,-0.032


In [209]:
ebit_df=ebit_df.drop(['Unnamed: 0'], axis=1)
ebit_df.head()

Unnamed: 0,gvkey,datadate,fyear,indfmt,consol,popsrc,datafmt,tic,conm,curcd,...,dt,ebit,ppent,pstk,wcap,costat,prcc_c,mkvalt,prcc_f,sic
0,1004,05/31/1990,1989,INDL,C,D,STD,AIR,AAR CORP,USD,...,,46.851,63.441,0,184.932,A,36.0,,21.25,5080
1,1004,05/31/1991,1990,INDL,C,D,STD,AIR,AAR CORP,USD,...,,33.701,63.415,0,189.172,A,11.5,,14.125,5080
2,1004,05/31/1992,1991,INDL,C,D,STD,AIR,AAR CORP,USD,...,,26.53,60.422,0,197.246,A,13.5,,12.875,5080
3,1004,05/31/1993,1992,INDL,C,D,STD,AIR,AAR CORP,USD,...,,16.343,56.052,0,193.399,A,11.75,,13.5,5080
4,1004,05/31/1994,1993,INDL,C,D,STD,AIR,AAR CORP,USD,...,,21.824,54.783,0,240.009,A,14.5,,14.375,5080


In [210]:
def get_row_cash(row):
    if math.isnan(row['chq']):
        cash_row = ebit_df[ebit_df['datadate']==row['datadate']]
        cash_row = cash_row[cash_row['tic']==row['tic']]
    # print cash_row
    # print cash_row['datadate']
    # print cash_row['tic']
        if cash_row['ch'].empty:# and math.isnan(cash_row['ch']):
            return float('NaN')
        else: 
            return float(cash_row['ch'])
    else: 
        return row['chq']    

In [211]:
%%time
# sampledf=df.head(100)
# sampledf['chq']=sampledf.apply(get_row_cash,axis=1)
# sampledf
df['chq']=df.apply(get_row_cash,axis=1)

CPU times: user 14min 54s, sys: 2.26 s, total: 14min 56s
Wall time: 14min 57s


In [212]:
df.head()

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,dlcq,dlttq,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic,ebit
0,1050,02/28/1990,1989,3,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.069,1.085,0.024,0,0.382,A,,0.156,3564,-0.229
1,1050,05/31/1990,1989,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.057,0.804,0.024,0,0.372,A,,0.281,3564,-0.229
2,1050,08/31/1990,1990,1,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.061,1.171,0.155,0,0.553,A,,0.281,3564,-0.032
3,1050,11/30/1990,1990,2,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.059,1.182,0.099,0,0.583,A,,0.125,3564,-0.032
4,1050,02/28/1991,1990,3,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.073,0.244,0.024,0,-0.641,A,,0.25,3564,-0.032


In [215]:
df=df[~np.isnan(df['chq'])]

In [216]:
df.to_csv("CleanedQuarterlyEBITCash.csv")
df=pd.read_csv("CleanedQuarterlyEBITCash.csv")
df=df.drop(['Unnamed: 0'],axis=1)
df.head(100)

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,dlcq,dlttq,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic,ebit
0,1050,05/31/1990,1989,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.057,0.804,0.024,0,0.372,A,,0.2810,3564,-0.229
1,1050,05/31/1991,1990,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.624,0.226,0.024,0,-0.674,A,,0.5000,3564,-0.032
2,1050,05/31/1992,1991,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.826,0.224,0.024,0,-0.824,A,,0.6560,3564,-0.069
3,1050,12/31/1992,1992,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.903,0.000,0.024,0,-0.924,A,,3.0310,3564,-0.048
4,1050,12/31/1993,1993,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.560,2.149,2.268,0,0.506,A,,3.1870,3564,-0.281
5,1050,12/31/1994,1994,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.804,1.399,2.138,0,0.496,A,,1.8750,3564,0.900
6,1050,12/31/1995,1995,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.028,1.253,2.020,0,1.427,A,,3.0620,3564,0.057
7,1050,12/31/1996,1996,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.489,1.142,1.806,0,2.187,A,,2.0000,3564,0.718
8,1050,12/31/1997,1997,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.339,1.737,1.947,0,0.649,A,,3.0620,3564,0.475
9,1050,12/31/1998,1998,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.588,1.570,2.062,0,0.372,A,,3.0000,3564,1.137


In [217]:
# basic function to calculate ratio 1 across a row
# ratio 1 = EBIT / (NFA + NWC)
# EBIT = earnings before interest and taxes, after subtracting depreciation and amortization
# NFA = net fixed assets = net book value of Property Plant and Equipment (PPENTQ)
# NMW = net working capital = working capital - cash = WCAPQ - CHQ
def ratio_one(row):
    if math.isnan(row['ppentq']) or math.isnan(row['wcapq']) or math.isnan(row['chq']) or row['ppentq']+row['wcapq']-row['chq']==0:
        ratio = float('NaN')
    else: 
        ratio=row['ebit']/(row['ppentq']+row['wcapq']-row['chq'])
    return ratio

In [218]:
%%time
# add the ratio1 column to the dataframe
df['ratio1']=df.apply(ratio_one,axis=1)

CPU times: user 5.4 s, sys: 42 ms, total: 5.44 s
Wall time: 5.45 s


In [219]:
df.head(100)

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,dlttq,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic,ebit,ratio1
0,1050,05/31/1990,1989,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.804,0.024,0,0.372,A,,0.2810,3564,-0.229,-0.578283
1,1050,05/31/1991,1990,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.226,0.024,0,-0.674,A,,0.5000,3564,-0.032,0.049231
2,1050,05/31/1992,1991,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.224,0.024,0,-0.824,A,,0.6560,3564,-0.069,0.086142
3,1050,12/31/1992,1992,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.000,0.024,0,-0.924,A,,3.0310,3564,-0.048,0.053274
4,1050,12/31/1993,1993,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.149,2.268,0,0.506,A,,3.1870,3564,-0.281,-0.102145
5,1050,12/31/1994,1994,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.399,2.138,0,0.496,A,,1.8750,3564,0.900,0.390625
6,1050,12/31/1995,1995,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.253,2.020,0,1.427,A,,3.0620,3564,0.057,0.023710
7,1050,12/31/1996,1996,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.142,1.806,0,2.187,A,,2.0000,3564,0.718,0.200503
8,1050,12/31/1997,1997,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.737,1.947,0,0.649,A,,3.0620,3564,0.475,0.271739
9,1050,12/31/1998,1998,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.570,2.062,0,0.372,A,,3.0000,3564,1.137,0.549541


In [221]:
# check the validities of the ratio1 values
print len(df[df['ratio1']==0])
print df['ratio1'].isnull().sum()

830
3209


In [222]:
# basic function to calculate ratio 2 across a row
# ratio 2 = EBIT / EV
# EBIT = earnings before interest and taxes, after subtracting depreciation and amortization
# EV = enterprise value = market cap + debt - cash
# EV = MKVALTQ + DTQ - CHQ
# MKVALTQ = market cap = value of preferred stock + value of common stock
# MKVALTQ = PSTKQ + CSTKQ 
# DTQ = debt = long-term debt + net current debt
# DTQ = DLTTQ + DLCQ
# CHQ = cash
# ratio 2 = EBIT / (MKVALTQ + DLCQ + DLTTQ - CHQ)
def ratio_two(row):
    if math.isnan(row['pstkq']) or math.isnan(row['cstkq']) or math.isnan(row['dlcq']) or math.isnan(row['dlttq']) or math.isnan(row['chq']) or (row['pstkq']+row['cstkq']+row['dlcq']+row['dlttq']-row['chq'])==0: 
        ratio = float('NaN')
    else: 
        # ratio=row['ebit']/(row['mkvaltq']+row['dlcq']+row['dlttq']-row['chq'])
        ratio=row['ebit']/(row['pstkq']+row['cstkq']+row['dlcq']+row['dlttq']-row['chq'])
    return ratio

In [223]:
df['ratio2']=df.apply(ratio_two,axis=1)

In [224]:
print len(df[df['ratio2']==0])
print df['ratio2'].isnull().sum()

827
7135


In [225]:
df.head(100)

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic,ebit,ratio1,ratio2
0,1050,05/31/1990,1989,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.024,0,0.372,A,,0.2810,3564,-0.229,-0.578283,-0.173222
1,1050,05/31/1991,1990,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.024,0,-0.674,A,,0.5000,3564,-0.032,0.049231,-0.033649
2,1050,05/31/1992,1991,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.024,0,-0.824,A,,0.6560,3564,-0.069,0.086142,-0.055825
3,1050,12/31/1992,1992,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.024,0,-0.924,A,,3.0310,3564,-0.048,0.053274,-0.051064
4,1050,12/31/1993,1993,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.268,0,0.506,A,,3.1870,3564,-0.281,-0.102145,-0.075013
5,1050,12/31/1994,1994,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.138,0,0.496,A,,1.8750,3564,0.900,0.390625,0.465116
6,1050,12/31/1995,1995,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.020,0,1.427,A,,3.0620,3564,0.057,0.023710,0.043578
7,1050,12/31/1996,1996,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.806,0,2.187,A,,2.0000,3564,0.718,0.200503,0.555728
8,1050,12/31/1997,1997,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.947,0,0.649,A,,3.0620,3564,0.475,0.271739,0.362872
9,1050,12/31/1998,1998,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.062,0,0.372,A,,3.0000,3564,1.137,0.549541,0.395203


# Split into test and train data sets

In [230]:
test = df[df['fyearq']<2000]
train = df[df['fyearq']>=2000]

test['rank1'] = test['ratio1'].rank(ascending=False,na_option='bottom')
# train['rank1'] = train['ratio1'].rank(ascending=False)
test['rank2'] = test['ratio2'].rank(ascending=False,na_option='bottom')
# train['rank2'] = train['ratio2'].rank(ascending=False)
test['ranksum'] = test['rank1']+ test['rank2']
# train['ranksum'] = train['rank1']+ train['rank2']
test['finalrank'] = test['ranksum'].rank(ascending=True,na_option='bottom')

top_tickers = list(test[test['finalrank']<=30].tic)
print top_tickers

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


['AVP', 'LCAR', 'LCAR', 'PARS', 'SCIE', 'SCIE', 'RGRX', 'ADLI', 'TFRY', 'NAVB', 'PENC', 'OXGN', 'AOGN', 'SMDM', 'PWVI', 'AMAR', 'CERS', 'PGNX', 'MTEX', 'MTEX', 'SKVI', 'XMEX', 'IPG', 'CPXP', 'MIND', 'ADSV', 'TCX', 'SLP', 'AGBR', 'MHTX']


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


# Moving forward:
1. Rank companies based on ratio1 for given year (From high-better, to low-worse) 
2. Rank companies based on ratio2 for given year
3. Add the two rankings to get company's overall status for the year
4. Year over year, buy high-ranking companies and sell low-ranking companies
## have pricing data from 1990 (on monthly basis) in CSV file
## have adjustment data from 1990 (on quarterly basis) in CSV file
1. Adjust the prices based on AJEX
2. Run simulations based on ^^ 