In [234]:
import os
import pandas as pd

## Load data

In [281]:
# Folder holding the processed per-ETF CSV files. The trailing slash is required:
# later cells build file paths by plain string concatenation with this value.
processed_data_folder_path = "../data/etf_data/processed/"

In [282]:
# Keep only the CSV exports. Filtering by extension (rather than removing
# '.DS_Store' by name) also works when that macOS metadata file is absent,
# a case in which list.remove() raises ValueError.
etf_list = [
    file_name
    for file_name in os.listdir(processed_data_folder_path)
    if file_name.endswith(".csv")
]
etf_list

['SPYD.csv', 'JEPI.csv', 'VYM.csv', 'VOO.csv', 'SPY.csv']

In [283]:
# Read each ETF's processed CSV into its own DataFrame (same files, same order).
spyd, spy, jepi, vym, voo = (
    pd.read_csv(processed_data_folder_path + file_name)
    for file_name in ("SPYD.csv", "SPY.csv", "JEPI.csv", "VYM.csv", "VOO.csv")
)

In [284]:
# Replace every cell whose value is exactly " N/A" (note the leading space) with
# the string "0", modifying jepi in place. clean_string() cannot turn " N/A" into
# a number and would raise on it.
# NOTE(review): only jepi is patched; presumably the other frames contain no
# " N/A" values - confirm.
jepi.replace(" N/A", "0", inplace=True)

In [238]:
# Show the distinct values of SPYD's "category" column; the cells below filter
# the frames on these values.
spyd["category"].unique()

array(['profile', 'valuation_and_dividend', 'expense', 'tax_analysis',
       'holdings_analysis', 'performance'], dtype=object)

## Visualization

In [68]:
# Container for everything that gets plotted, keyed by chart name.
vis = {}

### 1. Annual Dividend Yield

In [239]:
# Annual dividend yield: take the "Annual Dividend Yield" entry from each ETF's
# "valuation_and_dividend" rows. Values are stored unparsed here and converted
# to floats in a later cell.
etf_frames = {"spyd": spyd, "spy": spy, "jepi": jepi, "vym": vym, "voo": voo}

vis["ady"] = dict()
for ticker, frame in etf_frames.items():
    is_yield_row = (
        (frame["category"] == "valuation_and_dividend")
        & (frame["key"] == "Annual Dividend Yield")
    )
    # The first matching row is used; IndexError is raised if there is none.
    vis["ady"][ticker] = frame.loc[is_yield_row, "value"].tolist()[0]

In [240]:
def clean_string(text):
    """Extract the first number found in ``text`` and return it as a float.

    Surrounding characters such as currency symbols, percent signs and
    whitespace are ignored, and thousands separators (commas) inside the
    number are dropped. Only the first number is used, so digits from any
    later number in the string are not merged into it.

    Args:
        text: The raw value. Usually a string; ints and floats (including
            NaN) are passed through unchanged as floats.

    Returns:
        The parsed value as a float.

    Raises:
        ValueError: If ``text`` contains no number at all.
    """
    import re  # local import so the cell does not depend on the imports cell

    # Already numeric (e.g. a value parsed earlier, or NaN for a missing cell).
    if isinstance(text, (int, float)):
        return float(text)

    # Optional sign, optional gap, then digits with optional commas/decimals
    # (or a bare ".5"-style fraction).
    match = re.search(r"[-+]?\s*(?:\d[\d,]*(?:\.\d*)?|\.\d+)", str(text))
    if match is None:
        raise ValueError(f"no numeric value found in {text!r}")

    # Drop the commas and any gap between the sign and the digits.
    return float(re.sub(r"[\s,]", "", match.group()))

In [241]:
# Parse the raw yield strings into floats. Entries that are already floats are
# left untouched so that re-running this cell is harmless.
vis["ady"] = {
    ticker: raw if isinstance(raw, float) else clean_string(raw)
    for ticker, raw in vis["ady"].items()
}
vis

{'ady': {'spyd': 4.81, 'spy': 1.43, 'jepi': 8.75, 'vym': 3.14, 'voo': 1.49},
 'exp': {'spyd': {'key': ['SPYD',
    'ETF Database Category Average',
    'FactSet Segment Average'],
   'value': [0.07, 0.49, 0.37]},
  'spy': {'key': ['SPY',
    'ETF Database Category Average',
    'FactSet Segment Average'],
   'value': [0.09, 0.37, 0.58]}},
 'top15': {'spyd': {'key': ['Seagate Technology Holdings PLC',
    'NRG Energy, Inc.',
    'Packaging Corporation of America',
    'Phillips 66',
    'International Business Machines Corporation',
    'Amgen Inc.',
    'AT&T Inc.',
    'Verizon Communications Inc.',
    'International Paper Company',
    'Digital Realty Trust, Inc.',
    'KeyCorp',
    'Iron Mountain, Inc.',
    'Simon Property Group, Inc.',
    'Fifth Third Bancorp',
    'Williams Companies, Inc.'],
   'value': [1.77,
    1.69,
    1.59,
    1.57,
    1.55,
    1.54,
    1.53,
    1.52,
    1.52,
    1.48,
    1.47,
    1.43,
    1.43,
    1.42,
    1.42]},
  'spy': {'key': ['Apple I

In [243]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, CustomJS, Select
from bokeh.layouts import column
from bokeh.transform import factor_cmap

output_notebook()

# One bar per ETF, in the insertion order of vis["ady"].
tickers = list(vis["ady"])
yields = [vis["ady"][ticker] for ticker in tickers]

source = ColumnDataSource(data=dict(categories=tickers, values=yields))

p = figure(x_range=tickers, height=250, title="Annual Dividend Yield",
           toolbar_location=None, tools="",
           y_axis_label="Percentage (%)")

# Each ticker gets its own palette color and a matching legend entry.
p.vbar(x='categories', top='values', width=0.9, source=source,
       legend_field="categories",
       line_color='white',
       fill_color=factor_cmap('categories', palette="Spectral5", factors=tickers))

# The chart has a single dataset, so it is shown directly without a selector.
show(p)


### 2. Expense

In [244]:
# Expense: for every ETF, collect the labels and the parsed numeric values of
# its "expense" rows.
etf_frames = {"spyd": spyd, "spy": spy, "jepi": jepi, "vym": vym, "voo": voo}

vis["exp"] = dict()
for ticker, frame in etf_frames.items():
    expense_rows = frame[frame["category"] == "expense"]
    vis["exp"][ticker] = {
        "key": expense_rows["key"].tolist(),
        "value": expense_rows["value"].apply(clean_string).tolist(),
    }

In [250]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, CustomJS, Select
from bokeh.layouts import column
from bokeh.palettes import Spectral5

output_notebook()

# Dropdown entries in display order; vis["exp"] is keyed by the lower-case ticker.
tickers = ["SPYD", "SPY", "JEPI", "VYM", "VOO"]

# Parallel lists, one entry per ticker, handed to the JS callback below.
all_keys = [vis["exp"][ticker.lower()]["key"] for ticker in tickers]
all_values = [vis["exp"][ticker.lower()]["value"] for ticker in tickers]
# Colors live in the data source (assigned by bar position) instead of in a
# factor_cmap tied to one ETF's labels, so every bar keeps a palette color
# after the dropdown switches to another ETF.
all_colors = [
    [Spectral5[i % len(Spectral5)] for i in range(len(keys))]
    for keys in all_keys
]

# Start with the first ticker, matching the dropdown's initial value.
source = ColumnDataSource(data=dict(categories=all_keys[0],
                                    values=all_values[0],
                                    colors=all_colors[0]))

p = figure(x_range=all_keys[0], height=250, title="Expense",
           toolbar_location=None, tools="",
           y_axis_label="Percentage (%)")

p.vbar(x='categories', top='values', width=0.9, source=source,
       line_color='white',
       fill_color='colors')

# Swap the bars, their colors and the x-axis factors to the selected ETF.
callback = CustomJS(args=dict(source=source, p=p, tickers=tickers,
                              all_keys=all_keys, all_values=all_values,
                              all_colors=all_colors), code="""
    const i = tickers.indexOf(cb_obj.value);
    if (i < 0) {
        return;
    }
    const data = source.data;
    data['categories'] = all_keys[i];
    data['values'] = all_values[i];
    data['colors'] = all_colors[i];
    p.x_range.factors = all_keys[i];
    source.change.emit();
    p.x_range.change.emit();
""")

select = Select(title="ETF List", value=tickers[0], options=tickers)
select.js_on_change('value', callback)

layout = column(select, p)
show(layout)


### 3. Top 15 holdings

In [256]:
# Holdings: for every ETF, collect the labels and the parsed numeric values of
# its "holdings_analysis" rows.
etf_frames = {"spyd": spyd, "spy": spy, "jepi": jepi, "vym": vym, "voo": voo}

vis["top15"] = dict()
for ticker, frame in etf_frames.items():
    holdings_rows = frame[frame["category"] == "holdings_analysis"]
    vis["top15"][ticker] = {
        "key": holdings_rows["key"].tolist(),
        "value": holdings_rows["value"].apply(clean_string).tolist(),
    }

In [262]:
from bokeh.plotting import figure, show, output_notebook
from bokeh.models import ColumnDataSource, CustomJS, Select, LabelSet
from bokeh.layouts import column
from bokeh.palettes import plasma
import numpy as np

output_notebook()

# Order matters: position n in this list feeds categories<n>/values<n>/colors<n>
# in the JS callback.
tickers = ["spyd", "spy", "jepi", "vym", "voo"]

# Per-ticker plot inputs. Holdings are reversed for display; the palette is
# sized to the number of holdings and is not reversed.
series = {}
for ticker in tickers:
    holdings = vis["top15"][ticker]
    names = holdings["key"][::-1]
    weights = holdings["value"][::-1]
    series[ticker] = {
        "categories": names,
        "values": weights,
        "colors": plasma(len(names)),
        "title": f"{ticker.upper()} - Top 15 Holdings Total Percentage {round(sum(weights),1)}",
    }

# The chart opens on the first ticker, matching the dropdown's initial value.
initial = series[tickers[0]]
source = ColumnDataSource(data=dict(categories=initial["categories"],
                                    values=initial["values"],
                                    colors=initial["colors"]))

p = figure(y_range=initial["categories"], width=800, height=800, title=initial["title"],
           toolbar_location=None, tools="",
           x_axis_label="Percentage (%)")

bar_height = 0.7
bars = p.hbar(y='categories', right='values', height=bar_height, source=source,
              line_color='white', fill_color='colors')

# Print each bar's value just past its right end.
labels = LabelSet(y='categories', x='values', text='values', level='glyph',
                  y_offset=-13.5, x_offset=5, source=source)
p.add_layout(labels)

# Arguments exposed to the JS callback under the names its code expects.
callback_args = dict(source=source, p=p)
for slot, ticker in enumerate(tickers, start=1):
    callback_args[f"categories{slot}"] = series[ticker]["categories"]
    callback_args[f"values{slot}"] = series[ticker]["values"]
    callback_args[f"colors{slot}"] = series[ticker]["colors"]
    callback_args[f"title_{ticker}"] = series[ticker]["title"]

# Swap bars, colors, y-axis factors and title to the selected ETF.
callback = CustomJS(args=callback_args, code="""
    var data = source.data;
    var f = cb_obj.value;
    if (f === 'SPYD') {
       data['categories'] = categories1;
       data['values'] = values1;
       data['colors'] = colors1;
       p.y_range.factors = categories1;
       p.title.text = title_spyd;
    } else if (f === 'SPY') {
       data['categories'] = categories2;
       data['values'] = values2;
       data['colors'] = colors2;
       p.y_range.factors = categories2;
       p.title.text = title_spy;
    } else if (f === 'JEPI') {
       data['categories'] = categories3;
       data['values'] = values3;
       data['colors'] = colors3;
       p.y_range.factors = categories3;
       p.title.text = title_jepi;
    } else if (f === 'VYM') {
       data['categories'] = categories4;
       data['values'] = values4;
       data['colors'] = colors4;
       p.y_range.factors = categories4;
       p.title.text = title_vym;
    } else if (f === 'VOO') {
       data['categories'] = categories5;
       data['values'] = values5;
       data['colors'] = colors5;
       p.y_range.factors = categories5;
       p.title.text = title_voo;
    }
    source.change.emit();
    p.y_range.change.emit();
""")

select = Select(title="ETF List", value="SPYD", options=["SPYD", "SPY", "JEPI", "VYM", "VOO"])
select.js_on_change('value', callback)

layout = column(select, p)
show(layout)


### 4. 1-, 3-, and 5-Year Returns

In [267]:
# Returns: for every ETF, collect the labels and the parsed numeric values of
# its "performance" rows.
etf_frames = {"spyd": spyd, "spy": spy, "jepi": jepi, "vym": vym, "voo": voo}

vis["yr"] = dict()
for ticker, frame in etf_frames.items():
    performance_rows = frame[frame["category"] == "performance"]
    vis["yr"][ticker] = {
        "key": performance_rows["key"].tolist(),
        "value": performance_rows["value"].apply(clean_string).tolist(),
    }

In [296]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_notebook
from bokeh.transform import dodge
from bokeh.palettes import plasma

output_notebook()

# One entry per ETF: (column / legend name, horizontal shift of its bar within
# each category slot, index of its color in plasma(10)).
series_layout = [
    ("SPYD", -0.3, 9),
    ("SPY", -0.15, 0),
    ("JEPI", 0, 7),
    ("VYM", 0.15, 4),
    ("VOO", 0.3, 5),
]

# The x-axis categories are taken from SPYD's performance labels.
categories = vis["yr"]["spyd"]["key"]

# Flat table: one 'categories' column plus one value column per ETF.
data = {'categories': categories}
for name, _, _ in series_layout:
    data[name] = vis["yr"][name.lower()]["value"]

source = ColumnDataSource(data=data)

p = figure(x_range=categories, width=800, height=400, title="Year Return",
           toolbar_location=None, tools="")

palette = plasma(10)

# Grouped bars: each ETF's bar is dodged sideways so the five sit side by side.
for name, shift, color_index in series_layout:
    p.vbar(x=dodge('categories', shift, range=p.x_range), top=name, width=0.1, source=source,
           color=palette[color_index], legend_label=name)

# Value labels, centered on each bar and lifted 4 screen units from its top value.
for name, shift, _ in series_layout:
    p.add_layout(LabelSet(x=dodge('categories', shift, range=p.x_range), y=name, text=name,
                          y_offset=4, text_font_size="8pt", text_color="black",
                          source=source, text_align='center'))

show(p)
