In [1]:
from bs4 import BeautifulSoup
import pandas as pd
import matplotlib.pylab as plt
import numpy as np
from sklearn import linear_model

# Betterment Performance Data

Betterment's performance data is located on [their performance page](https://www.betterment.com/resources/betterment-historical-performance/) in an IFrame [that points to CloudFront](https://d1svladlv4b69d.cloudfront.net/src/d3/bmt-hist-perf-line-graph/bmt-hist-perf.html).  We downloaded the HTML of the latter on June 21st.

In [2]:
# Parse the saved Betterment performance page.  The parser is named
# explicitly so the resulting tree does not depend on which optional parsers
# happen to be installed, and so bs4 does not warn about guessing one.
with open("data/Betterment_Performance.html") as fh:
    soup = BeautifulSoup(fh, "html.parser")

# One <path> per plotted series: every `path.g-port-line` element found
# inside a `g.g-linecontainer` group.
paths = soup.select("g.g-linecontainer path.g-port-line")


## SVG Graphics

SVG path data consists of an `M[x],[y]` (move-to) command followed by a series of `L[x],[y]` (line-to) commands ([source](https://developer.mozilla.org/en-US/docs/Web/SVG/Tutorial/Paths)).  The coordinates can be parsed from the `d` attribute of each "path" object.  Remember that the x coordinates increase left to right but the y coordinates increase *top to bottom* ([source](https://www.w3.org/TR/SVG/coords.html)), as with most coordinate systems for monitor displays.

In [3]:
# extract Jan 2004 to Feb 2019 (inclusive) values
# One month-end date per plotted point.  The frequency is passed as an offset
# object rather than the 'M' string alias: newer pandas releases deprecate
# that alias, while the offset object means "month end" in every version.
dates = pd.date_range(start='2004/1/31', end='2019/2/28', freq=pd.offsets.MonthEnd())

def extract_y_coors(data):
    """Return the y coordinates encoded in an SVG path ``d`` string.

    Parameters
    ----------
    data : str
        Path data made of one ``M`` command followed by ``L`` commands, each
        carrying a comma-separated ``x,y`` pair (e.g. ``"M0,1L2,3L4,5"``).

    Returns
    -------
    list of float
        The y value of every point, in path order.

    Raises
    ------
    AssertionError
        If ``data`` does not start with ``'M'`` or the number of points
        differs from ``len(dates)``.
    ValueError
        If a pair is not exactly two comma-separated numbers.
    """
    assert data.startswith('M'), "path data must start with an 'M' command"
    # Drop the leading 'M'; every remaining 'L' separates one "x,y" pair.
    pairs = [[float(x) for x in pair.split(',')] for pair in data[1:].split('L')]
    # Each path is expected to carry exactly one point per month in `dates`.
    assert len(pairs) == len(dates), (
        "expected {} points, got {}".format(len(dates), len(pairs))
    )
    return [y for _, y in pairs]

# One column of y coordinates per <path>.  The column key is the second entry
# of the path's class list (e.g. 'sp500'); rows are the month-end dates.
df_y = pd.DataFrame(
    dict(
        (svg_path.get('class')[1], extract_y_coors(svg_path.get('d')))
        for svg_path in paths
    ),
    index=dates,
)

df_y.tail()


Unnamed: 0,sp500,bmt100,bmt90,bmt80,bmt70,bmt60,bmt50,arc90,bmt40,arc70,bmt30,arc50,bmt20,arc30,tBillFiveYear,bmt10,bmt0
2018-10-31,170.775383,202.890228,215.271566,229.386392,243.772043,259.51212,275.115015,292.934511,294.27335,308.020408,318.620544,329.148161,341.131677,348.365767,353.509172,361.758855,380.478592
2018-11-30,164.693801,196.469022,209.56967,224.435519,239.514242,255.956349,272.190415,289.434571,292.018606,306.211663,317.050883,328.267733,340.125891,348.112342,353.157105,361.207331,380.283313
2018-12-31,194.095807,219.506013,228.983556,240.34829,252.222506,265.667843,279.242003,302.496131,296.760893,314.648834,320.095164,333.580762,341.802337,349.926239,352.836172,361.809996,380.069927
2019-01-31,169.713198,195.745954,207.951434,222.042486,236.497121,252.414333,268.255538,289.353984,288.231552,305.453852,314.357799,327.478384,338.338211,346.6533,352.531162,360.151851,379.806212
2019-02-28,159.051086,187.940678,201.232324,216.368741,231.79112,248.606233,265.25144,284.542966,285.988979,301.97782,312.819141,325.424053,337.370215,345.589342,352.231359,359.635952,379.63845


## Parse Returns Table

In [4]:
def parse_percent(x):
    """Convert a percent string such as ``'8.3%'`` to a fraction.

    Strings ending in ``'%'`` have the sign stripped and the remainder parsed
    as a float and divided by 100.  Every other value (non-strings, and
    strings without a trailing ``'%'``) is returned unchanged.
    """
    looks_like_percent = isinstance(x, str) and x.endswith('%')
    if not looks_like_percent:
        return x
    return float(x[:-1]) / 100

# Read the first <table> found on the saved page.
with open("data/Betterment_Performance.html") as fh:
    df_raw = pd.read_html(fh)[0]

# Row 1 of the raw table supplies the column names and the data starts at
# row 2, so rebuild the frame from those pieces and index it by portfolio.
# Percent strings are converted column by column with Series.map: it is
# element-wise like DataFrame.applymap, but applymap is deprecated in newer
# pandas releases.
df_r = pd.DataFrame(
    df_raw.iloc[2:, :].values,
    columns=df_raw.iloc[1, :].values
).set_index('Portfolio').apply(lambda col: col.map(parse_percent))

df_r

Unnamed: 0_level_0,Avg. Annual Return,Cumulative Return,Sharpe Ratio,Best month,Worst month,Last 12 months
Portfolio,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
S&P 500 Index,0.083,2.318,0.55,0.109,-0.165,0.045
Betterment 100% stock,0.076,2.036,0.47,0.131,-0.202,-0.007
Betterment 90% stock,0.073,1.906,0.48,0.12,-0.187,-0.003
Betterment 80% stock,0.07,1.758,0.48,0.109,-0.173,0.001
Betterment 70% stock,0.066,1.607,0.49,0.098,-0.159,0.005
Betterment 60% stock,0.061,1.443,0.5,0.087,-0.146,0.009
Betterment 50% stock,0.056,1.28,0.5,0.077,-0.134,0.013
Avg. private client investor 80-100% equity risk,0.05,1.092,0.36,0.085,-0.16,0.003
Betterment 40% stock,0.05,1.078,0.5,0.064,-0.117,0.016
Avg. private client investor 60-80% equity risk,0.044,0.922,0.38,0.067,-0.119,-0.006


# Join path data and Return Table

We're in luck -- the path data and the returns table list the portfolios in the same order, so it's easy to join them by position.

In [5]:
# Pair each path-derived column key with the portfolio name found at the same
# position in the returns table (the two are taken to be in the same order).
label_dict = {
    column_key: portfolio_name
    for column_key, portfolio_name in zip(df_y.columns, df_r.index)
}
label_dict

{'sp500': 'S&P 500 Index',
 'bmt100': 'Betterment 100% stock',
 'bmt90': 'Betterment 90% stock',
 'bmt80': 'Betterment 80% stock',
 'bmt70': 'Betterment 70% stock',
 'bmt60': 'Betterment 60% stock',
 'bmt50': 'Betterment 50% stock',
 'arc90': 'Avg. private client investor 80-100% equity risk',
 'bmt40': 'Betterment 40% stock',
 'arc70': 'Avg. private client investor 60-80% equity risk',
 'bmt30': 'Betterment 30% stock',
 'arc50': 'Avg. private client investor 40-60% equity risk',
 'bmt20': 'Betterment 20% stock',
 'arc30': 'Avg. private client investor 0-40% equity risk',
 'tBillFiveYear': 'Five year U.S. Treasury Bills',
 'bmt10': 'Betterment 10% stock',
 'bmt0': 'Betterment 0% stock'}

In [6]:
# Swap the short column keys for full portfolio names.  The set comparison
# skips the rename when the columns already carry those names, so re-running
# this cell does not raise a KeyError.
if set(label_dict.values()) != set(df_y.columns):
    df_y.columns = list(map(label_dict.__getitem__, df_y.columns))
df_y.tail()

Unnamed: 0,S&P 500 Index,Betterment 100% stock,Betterment 90% stock,Betterment 80% stock,Betterment 70% stock,Betterment 60% stock,Betterment 50% stock,Avg. private client investor 80-100% equity risk,Betterment 40% stock,Avg. private client investor 60-80% equity risk,Betterment 30% stock,Avg. private client investor 40-60% equity risk,Betterment 20% stock,Avg. private client investor 0-40% equity risk,Five year U.S. Treasury Bills,Betterment 10% stock,Betterment 0% stock
2018-10-31,170.775383,202.890228,215.271566,229.386392,243.772043,259.51212,275.115015,292.934511,294.27335,308.020408,318.620544,329.148161,341.131677,348.365767,353.509172,361.758855,380.478592
2018-11-30,164.693801,196.469022,209.56967,224.435519,239.514242,255.956349,272.190415,289.434571,292.018606,306.211663,317.050883,328.267733,340.125891,348.112342,353.157105,361.207331,380.283313
2018-12-31,194.095807,219.506013,228.983556,240.34829,252.222506,265.667843,279.242003,302.496131,296.760893,314.648834,320.095164,333.580762,341.802337,349.926239,352.836172,361.809996,380.069927
2019-01-31,169.713198,195.745954,207.951434,222.042486,236.497121,252.414333,268.255538,289.353984,288.231552,305.453852,314.357799,327.478384,338.338211,346.6533,352.531162,360.151851,379.806212
2019-02-28,159.051086,187.940678,201.232324,216.368741,231.79112,248.606233,265.25144,284.542966,285.988979,301.97782,312.819141,325.424053,337.370215,345.589342,352.231359,359.635952,379.63845


In [7]:
# Attach each portfolio's final path y coordinate to its row of the returns
# table: take the last row of df_y, transpose it so portfolios become the
# index, and join on that index.
df_ry = df_r.merge(
    df_y.tail(1).T,
    left_index=True,
    right_index=True,
)
df_ry

Unnamed: 0,Avg. Annual Return,Cumulative Return,Sharpe Ratio,Best month,Worst month,Last 12 months,2019-02-28 00:00:00
S&P 500 Index,0.083,2.318,0.55,0.109,-0.165,0.045,159.051086
Betterment 100% stock,0.076,2.036,0.47,0.131,-0.202,-0.007,187.940678
Betterment 90% stock,0.073,1.906,0.48,0.12,-0.187,-0.003,201.232324
Betterment 80% stock,0.07,1.758,0.48,0.109,-0.173,0.001,216.368741
Betterment 70% stock,0.066,1.607,0.49,0.098,-0.159,0.005,231.79112
Betterment 60% stock,0.061,1.443,0.5,0.087,-0.146,0.009,248.606233
Betterment 50% stock,0.056,1.28,0.5,0.077,-0.134,0.013,265.25144
Avg. private client investor 80-100% equity risk,0.05,1.092,0.36,0.085,-0.16,0.003,284.542966
Betterment 40% stock,0.05,1.078,0.5,0.064,-0.117,0.016,285.988979
Avg. private client investor 60-80% equity risk,0.044,0.922,0.38,0.067,-0.119,-0.006,301.97782


## Connect path coordinates with returns

We use the terminal coordinates for each index and match them (via linear regression) against the cumulative returns to infer the monthly index values (from which the monthly returns follow).

In [13]:
# Slightly overdone use of linear regression
# Fit the index value (cumulative return + 1) as a linear function of each
# portfolio's final path y coordinate.
lm = linear_model.LinearRegression()
# The y-coordinate column of df_ry is labelled with the last date of df_y's
# index.  Taking the label from there avoids pd.Timestamp(..., freq='M'):
# the `freq` argument is deprecated and newer pandas rejects it.
X = df_ry[[df_y.index[-1]]]
y = df_ry['Cumulative Return'] + 1.  # index values

lm.fit(X, y)
print("R^2: {}".format(lm.score(X, y)))

R^2: 0.999999777266464


In [9]:
# Convert every path y coordinate into an index value with the fitted line.
# predict() needs a 2-D (n_samples, 1) input.  The column is converted to a
# NumPy array before adding the axis, because indexing a Series with
# [:, np.newaxis] is no longer supported by pandas.
df = df_y.apply(lambda col: lm.predict(col.to_numpy()[:, np.newaxis]))
df.head()

Unnamed: 0,S&P 500 Index,Betterment 100% stock,Betterment 90% stock,Betterment 80% stock,Betterment 70% stock,Betterment 60% stock,Betterment 50% stock,Avg. private client investor 80-100% equity risk,Betterment 40% stock,Avg. private client investor 60-80% equity risk,Betterment 30% stock,Avg. private client investor 40-60% equity risk,Betterment 20% stock,Avg. private client investor 0-40% equity risk,Five year U.S. Treasury Bills,Betterment 10% stock,Betterment 0% stock
2004-01-31,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141,1.000141
2004-02-29,1.01371,1.023066,1.021723,1.020303,1.018885,1.017295,1.015653,1.017914,1.013466,1.013497,1.010195,1.012086,1.006924,1.004918,1.002663,1.003652,1.000376
2004-03-31,1.000284,1.019918,1.019862,1.019715,1.019531,1.019209,1.018802,1.020915,1.017251,1.014244,1.013197,1.016407,1.009117,1.008565,1.004965,1.005016,1.000887
2004-04-30,0.981362,0.978533,0.979485,0.98045,0.981341,0.982058,0.982626,0.997677,0.984319,0.993892,0.988633,0.9999,0.992907,0.996934,1.007759,0.997133,1.001312
2004-05-31,0.998161,0.992444,0.991605,0.990707,0.989781,0.988496,0.98704,0.995963,0.987093,0.990842,0.990916,0.997153,0.994655,0.993866,1.010936,0.998301,1.001854


In [10]:
# Write the inferred index values (rows are dates, columns are portfolios) to CSV.
df.to_csv("data/betterment_values.csv")