# Magic Formula Investing: Implementation and Simulation

# ADD FEATURE TO RESTRICT MARKET CAP!!!!! based on 1999 value

## Implementation in Python

In [43]:
# importing packages that are used throughout the code
import math
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt

from matplotlib import rcParams

In [45]:
# importing our raw data and viewing a tiny subset of it 

# df containly quarterly company fundamentals, such as debt, cash, PPE, etc
# does not contain EBIT, which is crucial to our calculations
df=pd.read_csv("QuarterlyRawDataNoEBIT.csv")

# the database only offers EBIT on an annual basis, so ebit_df 
# contains annual data for each company's EBIT 
ebit_df=pd.read_csv("AnnualRawInputData.csv")
print len(df), len(ebit_df)
df.head()

512530 140522


Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,cstkq,dlcq,dlttq,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic
0,1004,02/28/1990,1989,3,INDL,C,D,STD,AIR,AAR CORP,...,16.07,32.551,72.606,53.428,0,188.1,A,,31.125,5080
1,1004,05/31/1990,1989,4,INDL,C,D,STD,AIR,AAR CORP,...,16.082,33.821,72.329,63.441,0,184.932,A,,21.25,5080
2,1004,08/31/1990,1990,1,INDL,C,D,STD,AIR,AAR CORP,...,16.086,27.427,71.806,63.545,0,189.351,A,,15.875,5080
3,1004,11/30/1990,1990,2,INDL,C,D,STD,AIR,AAR CORP,...,16.086,33.563,71.769,63.075,0,186.955,A,,11.875,5080
4,1004,02/28/1991,1990,3,INDL,C,D,STD,AIR,AAR CORP,...,16.097,11.436,69.02,63.626,0,184.665,A,,12.875,5080


## Cleaning the Data

In [46]:
# trimming data for years past 2014
df=df[df['fyearq']<2015]
ebit_df=ebit_df[ebit_df['fyear']<2015]

# eliminating duplicate data that is double-listed under FS and INDL 
ebit_df=ebit_df[ebit_df['indfmt']!="FS"]

# the length of both dataframes should be shortening
print len(df), len(ebit_df)

487706 124596


In [58]:
# extracting the companies that have missing EBIT values
missing_ebit_tics=ebit_df[ebit_df['ebit'].isnull()].tic.unique()

# removing those companies from our datasets 
df = df[~df.tic.isin(missing_ebit_tics)]
ebit_df = ebit_df[~ebit_df.tic.isin(missing_ebit_tics)]

# the length of both dataframes should be shortening
print len(df), len(ebit_df)

239972 87305


In [59]:
# remove SIC Division H Companies: Finance, Insurance, and Real Estate
# https://www.osha.gov/pls/imis/sic_manual.html
df_below_6000=df[df['sic']<6000]
df_above_7000=df[df['sic']>=7000]

df=pd.concat([df_below_6000,df_above_7000])

# still shortening
print len(df), len(ebit_df)

239972 87305


In [60]:
# remove SIC Division E Companies: Transportation, Communications, Electric, Gas, and Sanitary Services
# https://www.osha.gov/pls/imis/sic_manual.html
df_below_4000=df[df['sic']<4000]
df_above_5000=df[df['sic']>=5000]

df=pd.concat([df_below_4000,df_above_5000])

# still shortening
print len(df), len(ebit_df)

239972 87305


In [61]:
# function to get the appropriate annual EBIT value for each row in the quarterly dataframe
def get_row_ebit(row):
    ebit_row = ebit_df[ebit_df['fyear']==row['fyearq']]
    ebit_row = ebit_row[ebit_row['tic']== row['tic']]
    if ebit_row['ebit'].empty:
        return 0
    else: 
        return float(ebit_row['ebit'])

In [55]:
%%time 
# takes a while to run so don't run this every time
# the edited CSVs have been saved and can be loaded in separately 
df['ebit']=df.apply(get_row_ebit,axis=1)

CPU times: user 11min 46s, sys: 3.2 s, total: 11min 49s
Wall time: 2h 2min 27s


In [210]:
# function to get the appropriate annual cash values for 
# rows with missing cash values in the quarterly dataframe
# quarterly cash values are generally missing before 2007
def get_row_cash(row):
    if math.isnan(row['chq']):
        cash_row = ebit_df[ebit_df['datadate']==row['datadate']]
        cash_row = cash_row[cash_row['tic']==row['tic']]
        if cash_row['ch'].empty:
            return float('NaN')
        else: 
            return float(cash_row['ch'])
    else: 
        return row['chq']    

In [211]:
%%time
# also takes around 10-15 minutes
df['chq']=df.apply(get_row_cash,axis=1)

CPU times: user 14min 54s, sys: 2.26 s, total: 14min 56s
Wall time: 14min 57s


In [215]:
# dropping the rows that still have missing cash values in the quarterly dataset
df=df[~np.isnan(df['chq'])]

# export to CSV so we don't have to rerun the cleaning and get_row_ebit functions everytime
df.to_csv("CleanedQuarterlyEBITCash.csv")
ebit_df.to_csv("CleanedAnnualEBIT.csv")

# Start here to avoid running data-cleaning code above
## which takes ~30 minutes

In [246]:
# re-read in the cleaned dataframes
df=pd.read_csv("CleanedQuarterlyEBITCash.csv")
ebit_df=pd.read_csv("CleanedAnnualEBIT.csv")

# drop the index column that gets added mysteriously during exporting
df=df.drop(['Unnamed: 0'],axis=1)
ebit_df=ebit_df.drop(['Unnamed: 0'], axis=1)

## 1. Establish minimum market capitalization value of 50M

#### Filter out companies that do not meet this minimum

## Calculating Greenblatt's Ratios

### Ratio 1: EBIT/(NFA + NWC)

In [232]:
# basic function to calculate ratio 1 across a row
# ratio 1 = EBIT / (NFA + NWC)
# EBIT = earnings before interest and taxes, after subtracting depreciation and amortization
# NFA = net fixed assets = net book value of Property Plant and Equipment (PPENTQ)
# NMW = net working capital = working capital - cash = WCAPQ - CHQ
def ratio_one(row):
    if math.isnan(row['ppentq']) or math.isnan(row['wcapq']) or math.isnan(row['chq']) or row['ppentq']+row['wcapq']-row['chq']==0:
        ratio = float('NaN')
    else: 
        ratio=row['ebit']/(row['ppentq']+row['wcapq']-row['chq'])
    return ratio

In [233]:
%%time
# add the ratio1 column to the dataframe
df['ratio1']=df.apply(ratio_one,axis=1)

CPU times: user 5 s, sys: 52 ms, total: 5.05 s
Wall time: 5.06 s


### Ratio 2: EBIT/EV

In [234]:
# basic function to calculate ratio 2 across a row
# ratio 2 = EBIT / EV
# EBIT = earnings before interest and taxes, after subtracting depreciation and amortization
# EV = enterprise value = market cap + debt - cash
# EV = MKVALTQ + DTQ - CHQ
# MKVALTQ = market cap = value of preferred stock + value of common stock
# MKVALTQ = PSTKQ + CSTKQ 
# DTQ = debt = long-term debt + net current debt
# DTQ = DLTTQ + DLCQ
# CHQ = cash
# ratio 2 = EBIT / (MKVALTQ + DLCQ + DLTTQ - CHQ)
def ratio_two(row):
    if math.isnan(row['pstkq']) or math.isnan(row['cstkq']) or math.isnan(row['dlcq']) or math.isnan(row['dlttq']) or math.isnan(row['chq']) or (row['pstkq']+row['cstkq']+row['dlcq']+row['dlttq']-row['chq'])==0: 
        ratio = float('NaN')
    else: 
        # ratio=row['ebit']/(row['mkvaltq']+row['dlcq']+row['dlttq']-row['chq'])
        ratio=row['ebit']/(row['pstkq']+row['cstkq']+row['dlcq']+row['dlttq']-row['chq'])
    return ratio

In [235]:
%%time
# add the ratio1 column to the dataframe
df['ratio2']=df.apply(ratio_two,axis=1)

CPU times: user 7.86 s, sys: 44.9 ms, total: 7.91 s
Wall time: 7.91 s


### Let's see what our dataframe looks like now...

In [236]:
# note the additional new columns! 
df.head(100)

Unnamed: 0,gvkey,datadate,fyearq,fqtr,indfmt,consol,popsrc,datafmt,tic,conm,...,ppentq,pstkq,wcapq,costat,mkvaltq,prccq,sic,ebit,ratio1,ratio2
0,1050,05/31/1990,1989,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.024,0,0.372,A,,0.2810,3564,-0.229,-0.578283,-0.173222
1,1050,05/31/1991,1990,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.024,0,-0.674,A,,0.5000,3564,-0.032,0.049231,-0.033649
2,1050,05/31/1992,1991,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.024,0,-0.824,A,,0.6560,3564,-0.069,0.086142,-0.055825
3,1050,12/31/1992,1992,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,0.024,0,-0.924,A,,3.0310,3564,-0.048,0.053274,-0.051064
4,1050,12/31/1993,1993,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.268,0,0.506,A,,3.1870,3564,-0.281,-0.102145,-0.075013
5,1050,12/31/1994,1994,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.138,0,0.496,A,,1.8750,3564,0.900,0.390625,0.465116
6,1050,12/31/1995,1995,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.020,0,1.427,A,,3.0620,3564,0.057,0.023710,0.043578
7,1050,12/31/1996,1996,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.806,0,2.187,A,,2.0000,3564,0.718,0.200503,0.555728
8,1050,12/31/1997,1997,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,1.947,0,0.649,A,,3.0620,3564,0.475,0.271739,0.362872
9,1050,12/31/1998,1998,4,INDL,C,D,STD,CECE,CECO ENVIRONMENTAL CORP,...,2.062,0,0.372,A,,3.0000,3564,1.137,0.549541,0.395203


## Split into train and test data sets
The train dataset will be used for finding which stocks we should invest in, and then we simulate a 100,000 portfolio on those stocks using the prices and adjustment factors from the test data set. Here, we take the train data as the data from years 1990 to 1999, and the test data from 2000 to 2014. 

In [242]:
# splitting the data into test and train
train = df[df['fyearq']<2000]
test = df[df['fyearq']>=2000]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


### Rank the companies in the train data set and extract our 30 chosen stocks

In [244]:
# rank the rows based on ratio1 and ratio2, with a larger ratio ==> better ranking
train['rank1'] = train['ratio1'].rank(ascending=False,na_option='bottom')
train['rank2'] = train['ratio2'].rank(ascending=False,na_option='bottom')
train['ranksum'] = train['rank1']+ train['rank2']
train['finalrank'] = train['ranksum'].rank(ascending=True,na_option='bottom')

# get a list of the top tickers that we should invest in
top_tickers = list(train[train['finalrank']<=35].tic.unique())
print len(top_tickers)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  from ipykernel import kernelapp as app
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the the caveats in the documentation: http://pandas.p

30


# Simulating a 

# Moving forward:
1. Rank companies based on ratio1 for given year (From high-better, to low-worse) 
2. Rank companies based on ratio2 for given year
3. Add the two rankings to get company's overall status for the year
4. Year over year, buy high-ranking companies and sell low-ranking companies
## have pricing data from 1990 (on monthly basis) in CSV file
## have adjustment data from 1990 (on quarterly basis) in CSV file
1. Adjust the prices based on AJEX
2. Run simulations based on ^^ 