In [1]:
import pandas as pd
import numpy as np
import math

In [2]:
from bokeh.io import output_notebook, output_file, show
from bokeh.plotting import figure
from bokeh.models import Div, Arrow, NormalHead, Label, Span, Legend, NumeralTickFormatter, DataTable, TableColumn

In [3]:
# Load the two CSV inputs used throughout the notebook: the event-level
# sample and the per-multiplier simulation scores (read in that order).
sample, simulations = (
    pd.read_csv(csv_name)
    for csv_name in ('simulated_ddos_data.csv', 'simulation_scores.csv')
)

In [4]:
# Draft of the normal-vs-malicious summary as a single HTML-flavoured string.
# The malicious "Standard Deviation" value previously opened with a closing
# tag (</b>...</b>), leaving the bold markup unbalanced; it now uses <b>...</b>.
stats_output = '''
<h2>Normal vs Malicious Summary</h2> 
<i>metric = magnitude</i>

<h3>Normal:</h3>
-----------------------------
Observations: <b>86100</b>
Average: <b>5015.62</b>
Standard Deviation: <b>2843.19</b>

<h3>Malicious:</h3>
-----------------------------
Observations: <b>300</b>
Average: <b>25110.47</b>
Standard Deviation: <b>1938.8</b>

A threshold at (average + 3x standard deviations) magnitude would result in:
    - True Positives (correctly identified malicious events: <b>300</b>
    - False Positives (wrongly identified normal events: <b>1,229</b>
    - True Negatives (correctly identified normal events: <b>84,871</b>
    - False Negatives (wrongly identified malicious events: <b>0</b>

Accuracy Metrics:
    - Precision (what % of events above threshold are actually malicious): <b>19.6%</b>
    - Recall (what % of malicious events did we catch): <b>100.0%</b>
    - F1 Score (blends precision and recall): <b>32.8%</b>

<i>You may want to be cautious as your normal traffic's magnitude 
has a long tail towards high values. The median is 4372.82 
compared to 5015.62 for the average.</i>
'''

In [5]:
# Route Bokeh output to the notebook, then render the summary statistics
# (table + rule-of-thumb threshold outcome) as a single Div.
output_notebook()
# The table's inline style separates its declarations with ';' -- the previous
# ',' made the whole style attribute invalid CSS.
text = """
<h1>Normal vs Malicious Summary</h1> 
<i>metric = magnitude</i>

<table style="width:100%; text-align: right">
  <tr>
    <th style="text-align:left">Metric</th>
    <th style="text-align:left">Normal Events</th>
    <th style="text-align:left">Malicious Events</th>
  </tr>
  <tr>
    <td style="text-align:left">Observations</td>
    <td style="text-align:left">86,100</td>
    <td style="text-align:left">300</td>
  </tr>
  <tr>
    <td style="text-align:left">Average</td>
    <td style="text-align:left">5,015.62</td>
    <td style="text-align:left">25,110.47</td>
  </tr>
  <tr>
    <td style="text-align:left">Standard Deviation</td>
    <td style="text-align:left">2,843.19</td>
    <td style="text-align:left">1,938.80</td>
  </tr>  
</table>

<p>A threshold at <i>(average + 3x standard deviations)</i> magnitude would result in:</p>
<ul>
    <li>True Positives (correctly identified malicious events: <b>300</b></li>
    <li>False Positives (wrongly identified normal events: <b>1,229</b></li>
    <li>True Negatives (correctly identified normal events: <b>84,871</b></li>
    <li>False Negatives (wrongly identified malicious events: <b>0</b></li>
</ul>
<h3>Accuracy Metrics</h3>
<ul>
    <li>Precision (what % of events above threshold are actually malicious): <b>19.6%</b></li>
    <li>Recall (what % of malicious events did we catch): <b>100.0%</b></li>
    <li>F1 Score (blends precision and recall): <b>32.8%</b></li>
</ul>
"""
stats_div = Div(text=text, width=500, height=200)
show(stats_div)

In [6]:
# Stand-alone Div with the outcome of the rule-of-thumb threshold
# (confusion-matrix counts followed by precision / recall / F1).
hypothetical_threshold = '''
<p>A threshold at <i>(average + 3x standard deviations)</i> magnitude would result in:</p>
<ul>
    <li>True Positives (correctly identified malicious events: <b>300</b></li>
    <li>False Positives (wrongly identified normal events: <b>1,229</b></li>
    <li>True Negatives (correctly identified normal events: <b>84,871</b></li>
    <li>False Negatives (wrongly identified malicious events: <b>0</b></li>
</ul>
<h3>Accuracy Metrics</h3>
<ul>
    <li>Precision (what % of events above threshold are actually malicious): <b>19.6%</b></li>
    <li>Recall (what % of malicious events did we catch): <b>100.0%</b></li>
    <li>F1 Score (blends precision and recall): <b>32.8%</b></li>
</ul>
'''

hypo_div = Div(
    width=500,
    height=200,
    text=hypothetical_threshold,
)
show(hypo_div)

In [7]:
# Short caution about the skew of normal traffic, shown as its own Div.
warning_msg = '''
<p><i>You may want to be cautious as your normal traffic's magnitude 
has a long tail towards high values. 
The median is 4372.82 compared to 5015.62 for the average.</i></p>
'''

warning_div = Div(
    width=500,
    height=50,
    text=warning_msg,
)
show(warning_div)

In [8]:
# Exploratory chart: overlaid magnitude histograms for normal vs malicious events.
def _histogram_frame(counts, edges):
    """Pair per-bin counts with the left/right edges np.histogram returned for them."""
    return pd.DataFrame({'magnitude': counts, 'left': edges[:-1], 'right': edges[1:]})


# Split the sample's magnitude column by the is_ddos flag.
malicious = sample.loc[sample.is_ddos == 1, 'magnitude']
normal = sample.loc[sample.is_ddos == 0, 'magnitude']

# Summary statistics reused by later cells (axis ranges, sigma markers).
mal_mean, mal_std, mal_count = malicious.mean(), malicious.std(), malicious.size
normal_mean, normal_std, normal_count = normal.mean(), normal.std(), normal.size

# 100-bin histograms, each stored as a frame of (count, left edge, right edge).
malicious_hist, malicious_edge = np.histogram(malicious, bins=100)
mal_hist_df = _histogram_frame(malicious_hist, malicious_edge)

normal_hist, normal_edge = np.histogram(normal, bins=100)
norm_hist_df = _histogram_frame(normal_hist, normal_edge)

exploratory = figure(
    plot_width=900,
    plot_height=600,
    title='Magnitude Distribution Across Normal vs Malicious Events',
    x_axis_label='Magnitude',
    y_axis_label='Observations',
)

exploratory.quad(
    bottom=0, top=mal_hist_df.magnitude,
    left=mal_hist_df.left, right=mal_hist_df.right,
    legend_label='malicious', fill_color='purple', alpha=.85,
)
exploratory.quad(
    bottom=0, top=norm_hist_df.magnitude,
    left=norm_hist_df.left, right=norm_hist_df.right,
    legend_label='normal', fill_color='cyan', alpha=.35,
)

# Thousands separators on both axes.
exploratory.xaxis.formatter = NumeralTickFormatter(format='0,0')
exploratory.yaxis.formatter = NumeralTickFormatter(format='0,0')

# Arrow pointing down at the malicious mean, with a label placed above its tail.
exploratory.add_layout(
    Arrow(
        end=NormalHead(fill_color='#FFA38B', line_color='#FFA38B', size=10),
        x_start=mal_mean, y_start=mal_count,
        x_end=mal_mean, y_end=0,
    )
)
arrow_label = Label(
    x=mal_mean, y=mal_count * 1.2,
    text='Malicious Events',
    text_color='#FFA38B', text_font_style='bold',
)
exploratory.add_layout(arrow_label)

exploratory.legend.location = "top_right"
show(exploratory)

In [139]:
# Zoomed-in view of the gap between the two distributions, with sigma markers.
from bokeh.themes import built_in_themes
from bokeh.io import curdoc

# Applies to the current document, i.e. to figures shown from here on.
curdoc().theme = 'dark_minimal'

overlap_view = figure(
    plot_width=600,
    plot_height=480,
    title='Zoomed in w/Example Threshold',
    x_axis_label='Magnitude',
    y_axis_label='Observations',
    y_range=(0, mal_count * .33),
    x_range=(normal_mean + (normal_std * 2.5), mal_mean + (mal_std * 3)),
)

overlap_view.title.text_font_size = '12pt'

# Dark panel styling and generous margins.
overlap_view.border_fill_color = "#243749"
overlap_view.border_fill_alpha = 1
overlap_view.background_fill_color = "#243749"
overlap_view.background_fill_alpha = .95
overlap_view.min_border_left = 90
overlap_view.min_border_right = 70
overlap_view.min_border_top = 70
overlap_view.min_border_bottom = 70

# Same histograms as the exploratory chart, recoloured for the dark theme.
overlap_view.quad(
    bottom=0, top=mal_hist_df.magnitude,
    left=mal_hist_df.left, right=mal_hist_df.right,
    legend_label='malicious', fill_color='#FFA38B', alpha=.85, line_alpha=0.15,
)
overlap_view.quad(
    bottom=0, top=norm_hist_df.magnitude,
    left=norm_hist_df.left, right=norm_hist_df.right,
    legend_label='normal', fill_color='#74D1EA', alpha=.35, line_alpha=0.5,
)
overlap_view.xaxis.formatter = NumeralTickFormatter(format='0,0')
overlap_view.yaxis.formatter = NumeralTickFormatter(format='0,0')

# Reference lines at normal_mean + k * normal_std for k = 0..8, each labelled "kσ".
for n_sigma in range(9):
    sigma_x = normal_mean + (normal_std * n_sigma)
    overlap_view.add_layout(
        Span(location=sigma_x, dimension='height', line_color='white',
             line_dash='dashed', line_width=2)
    )
    overlap_view.add_layout(
        Label(x=sigma_x, x_offset=.05, y=160, y_units='screen',
              text=f'{n_sigma}σ', text_font_style='bold', text_color='white')
    )

overlap_view.legend.location = "top_right"
show(overlap_view)

In [10]:
# Density version: the same 100-bin histograms, normalised with density=True.
malicious_hist_dense, malicious_edge_dense = np.histogram(malicious, bins=100, density=True)
mal_hist_dense_df = pd.DataFrame(dict(
    magnitude=malicious_hist_dense,
    left=malicious_edge_dense[:-1],
    right=malicious_edge_dense[1:],
))

normal_hist_dense, normal_edge_dense = np.histogram(normal, bins=100, density=True)
norm_hist_dense_df = pd.DataFrame(dict(
    magnitude=normal_hist_dense,
    left=normal_edge_dense[:-1],
    right=normal_edge_dense[1:],
))

# NOTE(review): the y axis is labelled '% of Group Total' and formatted as a
# percentage, but density=True yields a probability density (area sums to 1),
# not a per-bin share -- confirm which one is intended.
density = figure(
    plot_width=900,
    plot_height=600,
    title='Probability Density Across Normal vs Malicious Events',
    x_axis_label='Magnitude',
    y_axis_label='% of Group Total',
)

density.quad(
    bottom=0, top=mal_hist_dense_df.magnitude,
    left=mal_hist_dense_df.left, right=mal_hist_dense_df.right,
    legend_label='malicious', fill_color='purple', alpha=.85,
)
density.quad(
    bottom=0, top=norm_hist_dense_df.magnitude,
    left=norm_hist_dense_df.left, right=norm_hist_dense_df.right,
    legend_label='normal', fill_color='#74D1EA', alpha=.35,
)
density.xaxis.formatter = NumeralTickFormatter(format='0,0')
density.yaxis.formatter = NumeralTickFormatter(format='0.000%')

density.legend.location = "top_right"
show(density)

In [127]:
    # ColumnDataSource is imported here as well: it is used below but was
    # otherwise only imported by a later cell, so a fresh Run All raised NameError.
    from bokeh.models import ColumnDataSource, VArea
    # Simulation Series to be used
    false_positives = simulations.false_positives
    false_negatives = simulations.false_negatives
    multiplier = simulations.multiplier
    precision = simulations.precision
    recall = simulations.recall
    f1_score = simulations.f1_score

    # False Positives vs False Negatives

    errors = figure(
        plot_width=800,
        plot_height=600,
        x_range=(multiplier.min(), multiplier.max()),

        title='False Positives vs False Negatives Across Multiplier Levels',
        x_axis_label='Multiplier',
        y_axis_label='Count',

        tools="pan,box_select,zoom_in,zoom_out,save,reset"
    )

    # Area between zero (y2) and the false-positive count (y1) at each multiplier.
    source = ColumnDataSource(dict(x=multiplier, y1=false_positives, y2=np.zeros(false_positives.size)))

    # NOTE(review): hatch_color is set without a hatch_pattern; presumably a
    # fill colour was intended -- confirm.
    fps = VArea(x='x', y1='y1', y2='y2', hatch_color="#A8C0BB")
    errors.add_glyph(source, fps)
    # False negatives drawn as a line on the same axes.
    errors.line(multiplier, false_negatives, legend_label='false_negatives', line_width=2, color="#FA4616")
    errors.legend.location = "top_center"

    show(errors)

In [12]:
# Peek at the first five rows of the simulation scores.
simulations.head(n=5)

Unnamed: 0,multiplier,threshold,false_positives,false_negatives,precision,recall,f1_score,fp_cost,fn_cost,estimated_total_cost_thousands
0,3.0,13545,1229,0,0.1962,1.0,0.328,409666.666667,0.0,409.666667
1,3.1,13829,1084,0,0.2168,1.0,0.3563,361333.333333,0.0,361.333333
2,3.2,14113,954,0,0.2392,1.0,0.3861,318000.0,0.0,318.0
3,3.3,14398,845,0,0.262,1.0,0.4152,281666.666667,0.0,281.666667
4,3.4,14682,748,0,0.2863,1.0,0.4451,249333.333333,0.0,249.333333


In [13]:
# Weighted-error simulation results; loss_min is the multiplier with the
# lowest total_weighted_errors.
df = pd.read_csv('simulation_weighted_results_10x.csv')
# idxmin returns the first row attaining the minimum, so loss_min is always a
# single scalar even when several multipliers tie (filter + squeeze returned a
# Series in that case).
loss_min = df.loc[df.total_weighted_errors.idxmin(), 'multiplier']
df['loss_min'] = loss_min
df.head()

Unnamed: 0,multiplier,TP,FP,TN,FN,precision,recall,f1_score,weighted_FN,total_weighted_errors,loss_min
0,2.0,300,4035,82065,0,0.069204,1.0,0.12945,0.0,4035.0,5.5
1,2.1,300,3594,82506,0,0.077042,1.0,0.143062,0.0,3594.0,5.5
2,2.2,300,3200,82900,0,0.085714,1.0,0.157895,0.0,3200.0,5.5
3,2.3,300,2854,83246,0,0.095117,1.0,0.173712,0.0,2854.0,5.5
4,2.4,300,2523,83577,0,0.10627,1.0,0.192123,0.0,2523.0,5.5


In [16]:
# Interactive chart: total weighted errors (FP + ratio * FN) and precision /
# recall / F1 per multiplier, with a slider controlling the FN:FP ratio.

from bokeh.layouts import column, row
from bokeh.models import CustomJS, ColumnDataSource, LinearAxis, Range1d, Band
from bokeh.plotting import Figure, output_file, show
from bokeh.models.widgets import Slider

df = pd.read_csv('simulation_weighted_results_10x.csv')
ratio = 10
x = df.multiplier
y = df.FN
z = y * ratio        # weighted false negatives
a = df.FP
b = a + z            # total weighted errors
c = df.f1_score
d = df.precision
e = df.recall
# First multiplier attaining the minimum total weighted errors (always a scalar,
# even when several rows tie).
loss_min = df.loc[df.total_weighted_errors.idxmin(), 'multiplier']

source = ColumnDataSource(data=dict(x=x,
                                    y=y,
                                    z=z,
                                    a=a,
                                    b=b,
                                    c=c,
                                    d=d,
                                    e=e
                                    ))
plot = Figure(plot_width=900, plot_height=480, x_axis_label='multiplier', y_axis_label='Errors')

plot.border_fill_color = "whitesmoke"
plot.background_fill_color = "whitesmoke"
plot.background_fill_alpha = 0.5
plot.min_border_left = 40
plot.min_border_right = 40
plot.min_border_top = 20
plot.min_border_bottom = 20

# Errors use the left axis; the three score lines share a 0-1.1 right axis.
plot.line('x', 'b', source=source, line_width=3, line_alpha=0.6, 
          color='#00C65E', legend_label='Total Weighted Errors')
plot.extra_y_ranges = {"y2": Range1d(start = 0, end = 1.1)}
plot.add_layout(LinearAxis(y_range_name = "y2", axis_label="Score"), 'right')
plot.line('x', 'c', source=source, line_width=3, line_alpha=0.6, 
          color='#87037B', legend_label='F1 score', y_range_name = "y2")
plot.line('x', 'd', source=source, line_width=3, line_alpha=0.6, 
          color='#E40046', legend_label='Precision', y_range_name = "y2")
plot.line('x', 'e', source=source, line_width=3, line_alpha=0.6, 
          color='#00C1D4', legend_label='Recall', y_range_name = "y2")

slider = Slider(start=1.0, end=50, value=10, step=.25, title="Slider Value",
               bar_color='#FFD100', height=50, margin=(5,0,5,0), background='whitesmoke')
# Dashed vertical marker at the multiplier that minimises total weighted errors.
twe_thresh = Span(location=loss_min, dimension='height', line_color='grey',
              line_dash='dashed', line_width=2)
plot.add_layout(twe_thresh)

# The Span is exposed to the callback under the name `thresh`; the JS must use
# that name (it previously referenced `twe_thresh`, which is not defined in the
# callback's scope).
handler = CustomJS(args=dict(source=source, thresh=twe_thresh), code="""
   var data = source.data;
   var f = cb_obj.value
   var x = data['x']
   var y = data['y']
   var z = data['z']
   var a = data['a']
   var b = data['b']
   var c = data['c']
   var d = data['d']
   var e = data['e']
   for (var i = 0; i < x.length; i++) {
      z[i] = y[i] * f
      b[i] = z[i] + a[i]
   }
   
   var min_loss = Math.min.apply(null,b)
   var new_thresh = 0
   for (var i = 0; i < x.length; i++) {
      if (b[i] == min_loss) {
          new_thresh = x[i]
      }
   }
   thresh.location = new_thresh
   thresh.change.emit();
   source.change.emit();
""")

slider.js_on_change('value', handler)

weighting_intro = f'''
    <p><b>Error types differ in impact</b> - in the case of security incidents, a false negative, 
though possibly rarer than false positives, is likely more costly. For example, downtime suffered 
from a DDoS attack (lost sales/customers) incurs more loss than time wasted chasing a false positive 
(labor hours). </p>

<p>Try playing around with the slider below to see how your thresholding strategy might change 
depending on the relative weight of false negatives to false positives. What does it look like at
10:1, 50:1, etc.?</p>
'''

weighting_one = Div(text=weighting_intro, width=400, height=280, margin=(25,25,25,25))
weighting_two = Div(text=weighting_intro, width=400, height=200, margin=(25,25,25,25))


plot.legend.location = "bottom_right"
plot.legend.background_fill_alpha = .5
layout = row(column(weighting_one, weighting_two), column(plot, slider))
show(layout)

In [28]:
# Demo: a TextInput drives y = x ** power on a line plot and updates a banner.
from bokeh.layouts import column
# Paragraph is imported here because this cell uses it; it was otherwise only
# imported by a later cell, which raised NameError on a fresh Run All.
from bokeh.models import CustomJS, ColumnDataSource, Paragraph, Slider
from bokeh.models import TextInput
from bokeh.plotting import figure, show

# PREP DATA
welcome_message = f'You have selected: 1'

# TAKE ONLY OUTPUT
text_banner = Paragraph(text=welcome_message, width=200, height=100)

x = [x*0.005 for x in range(0, 200)]
y = x

source = ColumnDataSource(data=dict(x=x, y=y))

plot = figure(plot_width=400, plot_height=400)
plot.line('x', 'y', source=source, line_width=3, line_alpha=0.6)

# On every change of the input's value: recompute y in place and refresh the banner.
handler = CustomJS(args=dict(banner=text_banner,
                            source=source), code="""
    var data = source.data
    var f = cb_obj.value
    var x = data['x']
    var y = data['y']
    for (var i = 0; i < x.length; i++) {
    y[i] = Math.pow(x[i], f)
    }
    banner.text = `power: ${f}`
    banner.change.emit();
    source.change.emit();
""")

text_input = TextInput(value="1", title="Power:")
text_input.js_on_change('value', handler)

layout = column(text_input, text_banner, plot)

show(layout)

In [22]:
# Demo: echo a widget's current value into a Paragraph banner.
from bokeh.models import CustomJS, TextInput, TextAreaInput, Paragraph
from bokeh.plotting import output_file, show

# PREP DATA
welcome_message = f'You have selected: 10'

# TAKE ONLY OUTPUT
text_banner = Paragraph(text=welcome_message, width=200, height=100)

# USER INTERACTIONS
# Copies the triggering widget's value into the banner text.
handler = CustomJS(args=dict(banner=text_banner), code="""
 var f = cb_obj.value
 banner.text = `You have selected: ${f}`
 banner.change.emit();
 """)

# NOTE(review): this slider is wired to the handler but is not part of the
# layout shown below -- confirm whether it should be displayed or removed.
slider = Slider(start=0, end=50, value=10, step=1, title="Slider Value")
slider.js_on_change('value', handler)

# Created once (an identical TextInput used to be built earlier in the cell and
# immediately discarded).
text_input = TextInput(value="10", title="Enter row number:")
text_input.js_on_change('value', handler)


# LAYOUT
widg = column(text_input, text_banner)
show(widg)

In [37]:
# Demo: a slider sets the exponent of y = x ** power and updates a banner.
from bokeh.layouts import column
from bokeh.models import CustomJS, ColumnDataSource
from bokeh.plotting import Figure, output_file, show
from bokeh.models.widgets import Slider

# 200 evenly spaced x values starting at 0 in steps of 0.05; y starts as the same list.
x = [x * 0.05 for x in range(200)]
y = x

# Banner text shown before the first interaction.
welcome_message = f'You have selected: 10'
text_banner = Paragraph(width=200, height=100, text=welcome_message)

# Text box is created here but not placed in the layout below.
text_input = TextInput(title="Enter row number:", value="10")

source = ColumnDataSource(data={'x': x, 'y': y})

plot = Figure(plot_width=400, plot_height=400)
plot.line('x', 'y', line_width=3, line_alpha=0.6, source=source)

# Rewrites y in place from the slider value and refreshes the banner text.
handler = CustomJS(
    args=dict(source=source, banner=text_banner),
    code="""
 var data = source.data
 var f = cb_obj.value
 var x = data['x']
 var y = data['y']
 for (var i = 0; i < x.length; i++) {
 y[i] = Math.pow(x[i], f)
 }
 banner.text = `power: ${f}`
 banner.change.emit();
 source.change.emit();
 """,
)

slider = Slider(title="Power", start=0.0, end=5, value=1, step=.25)
slider.js_on_change('value', handler)

layout = column(slider, plot, text_banner)
show(layout)

In [None]:
from bokeh.io import output_file, show
from bokeh.models import Button

# Render a single "success"-styled button labelled "Foo".
button = Button(button_type="success", label="Foo")
show(button)

In [120]:
# Cost explorer: weighted-error chart + data table, driven by an FN:FP ratio
# slider and a "cost per false positive" text box.

from bokeh.layouts import column, row
# NumberFormatter and TextInput are imported here because this cell uses them;
# NumberFormatter was otherwise only imported by a later cell.
from bokeh.models import (CustomJS, ColumnDataSource, LinearAxis, Range1d, Band,
                          NumberFormatter, TextInput)
from bokeh.plotting import Figure, output_file, show
from bokeh.models.widgets import Slider

df = pd.read_csv('simulation_weighted_results_10x.csv')

# Initial cost of one false positive; total_cost scales the weighted errors by it.
init_fp_cost = 100
df['fp_cost'] = init_fp_cost
df['total_cost'] = df.total_weighted_errors * df.fp_cost
ratio = 10
x = df.multiplier
y = df.FN
z = y * ratio        # weighted false negatives
a = df.FP
b = a + z            # total weighted errors
c = df.f1_score
d = df.precision
e = df.recall
# First multiplier minimising total weighted errors / maximising F1.
loss_min = df[df.total_weighted_errors == df.total_weighted_errors.min()].head(1).squeeze()['multiplier']
f1_max = df[df.f1_score == df.f1_score.max()].head(1).squeeze()['multiplier']
loss_min_twe = df[df.multiplier == loss_min].squeeze()['total_weighted_errors']
# Total weighted errors at the generic 3.0 multiplier (compared after rounding
# to 2 decimals to sidestep float noise in the CSV values).
generic_twe = df[df.multiplier.apply(lambda x: round(x,2)) == 3.00].squeeze()['total_weighted_errors']


# PREP DATA
message = f'''
Based on your inputs, the optimal threshold is around {loss_min}.
This would result in an estimated {int(loss_min_twe)} total weighted errors and 
${int(loss_min_twe * init_fp_cost):,} in cost.
          
The generic threshold of 3.0 standard deviations would result in {generic_twe} 
total weighted errors and ${int(generic_twe * init_fp_cost):,} cost.
          
Using the optimal threshold would save ${int((generic_twe - loss_min_twe) * init_fp_cost):,}, 
reducing costs by {(generic_twe - loss_min_twe) / generic_twe * 100:.1f}% 
(assuming near-future events are distributed similarly to those from the past).
'''

# TAKE ONLY OUTPUT
text_banner = Div(text=message, width=400, height=230, margin=(25,25,25,25))

# Input widgets are created before the callback so they can be passed to it
# (fp_cost used to be referenced before it was defined).
slider = Slider(start=1.0, end=50, value=10, step=.25, title="FN:FP ratio",
               bar_color='#FFD100', height=70, margin=(10,10,10,10), background='whitesmoke')
fp_cost = TextInput(value=f"{init_fp_cost}", title="How much a false positive costs", height=50,
                   margin=(25,25,25,25))

source = ColumnDataSource(data=dict(x=x,
                                    y=y,
                                    z=z,
                                    a=a,
                                    b=b,
                                    c=c,
                                    d=d,
                                    e=e,
                                    f=df.fp_cost,
                                    g=df.total_cost
                                    ))

plot = Figure(plot_width=900, plot_height=480,
              x_axis_label='multiplier', y_axis_label='Errors')
# Errors use the left axis; the three score lines share a 0-1.1 right axis.
plot.line('x', 'b', source=source, line_width=3, line_alpha=0.6, 
          color='green', legend_label='Total Weighted Errors')
plot.extra_y_ranges = {"y2": Range1d(start = 0, end = 1.1)}
plot.add_layout(LinearAxis(y_range_name = "y2", axis_label="Score"), 'right')
plot.line('x', 'c', source=source, line_width=3, line_alpha=0.6, 
          color='purple', legend_label='F1 score', y_range_name = "y2")
plot.line('x', 'd', source=source, line_width=3, line_alpha=0.6, 
          color='red', legend_label='Precision', y_range_name = "y2")
plot.line('x', 'e', source=source, line_width=3, line_alpha=0.6, 
          color='blue', legend_label='Recall', y_range_name = "y2")

# Marker + label for the multiplier minimising total weighted errors.
twe_thresh = Span(location=loss_min, dimension='height', line_color='grey',
              line_dash='dashed', line_width=2)
twe_label = Label(x=loss_min - .05, y=240, y_units='screen', text=f'TWE Min: {round(loss_min,2)}', 
                  text_font_size='9pt', text_font_style='bold', 
                  text_align='right', text_color='green')

plot.add_layout(twe_thresh)
plot.add_layout(twe_label)

# Marker + label for the multiplier maximising F1 (static; not updated by the callback).
f1_thresh = Span(location=f1_max, dimension='height', line_color='purple',
              line_dash='dashed', line_width=2)
f1_label = Label(x=f1_max + .05, y=200, y_units='screen', text=f'F1 Max: {round(f1_max,2)}', 
                  text_font_size='9pt', text_font_style='bold', 
                  text_align='left', text_color='purple')
plot.add_layout(f1_thresh)
plot.add_layout(f1_label)

# One callback serves both widgets: it reads the current slider ratio and the
# current cost, recomputes every derived column, then refreshes marker, label
# and banner. Raw string so the regex backslashes reach JS untouched.
handler = CustomJS(args=dict(source=source,
                             thresh=twe_thresh,
                             label=twe_label,
                             lmin=text_banner,
                             lmin2=loss_min_twe,
                             ratio_slider=slider,
                             cost=fp_cost), code=r"""
   const data = source.data
   const ratio = Number(ratio_slider.value)
   const unit_cost = Number(cost.value)
   const x = data['x']
   const y = data['y']
   const z = data['z']
   const a = data['a']
   const b = data['b']
   const f = data['f']
   const g = data['g']

   let generic_twe = 0
   for (let i = 0; i < x.length; i++) {
      z[i] = y[i] * ratio
      b[i] = z[i] + a[i]
      f[i] = unit_cost
      g[i] = b[i] * unit_cost
      // Only the 3.0 multiplier counts as the generic threshold (compared at
      // 2 decimals, like the Python side); rounding to an integer also matched
      // neighbouring multipliers such as 3.4.
      if (Math.round(x[i] * 100) / 100 == 3) {
          generic_twe = b[i]
      }
   }

   function round(value, decimals) {
       return Number(Math.round(value+'e'+decimals)+'e-'+decimals);
   }

   function numberWithCommas(v) {
       return v.toString().replace(/\B(?<!\.\d*)(?=(\d{3})+(?!\d))/g, ",");
   }

   // First row attaining the minimum, mirroring .head(1) on the Python side.
   const min_loss = Math.min.apply(null, b)
   const best = b.indexOf(min_loss)
   if (best >= 0) {
       const new_thresh = x[best]
       thresh.location = new_thresh
       lmin.text = `Based on your inputs, the optimal threshold is around ${numberWithCommas(new_thresh)}.
          This would result in an estimated ${b[best]} total weighted errors and 
          $${numberWithCommas(b[best] * unit_cost)} in cost.
          
          The generic threshold of 3.0 standard deviations would result in ${numberWithCommas(generic_twe)} 
          total weighted errors and $${numberWithCommas(generic_twe * unit_cost)} cost.
          
          Using the optimal threshold would save $${numberWithCommas((generic_twe - b[best]) * unit_cost)}, 
          reducing costs by ${numberWithCommas(round(((generic_twe - b[best]) / generic_twe) * 100,2))}% 
          (assuming near-future events are distributed similarly to those from the past).
          `
       label.text = `TWE Min: ${round(new_thresh,2)}`
       label.x = new_thresh - .05
       lmin.change.emit()
       thresh.change.emit()
       label.change.emit()
   }
   source.change.emit();
""")
# Kept as an alias so the previously separate name still resolves.
cost_handler = handler

slider.js_on_change('value', handler)
fp_cost.js_on_change('value', handler)

# Table titles follow the source columns: c = f1_score, d = precision, e = recall.
columns = [
    TableColumn(field="x", title="Multiplier"),
    TableColumn(field="a", title="False Positives"),
    TableColumn(field="y", title="False Negatives"),
    TableColumn(field="z", title="Weighted False Negatives", formatter=NumberFormatter(format='0,0.00')),
    TableColumn(field="b", title="Total Weighted Errors", formatter=NumberFormatter(format='0,0.00')),
    TableColumn(field="f", title="Estimated FP Cost", formatter=NumberFormatter(format='$0,0.00')),
    TableColumn(field="g", title="Estimated Total Cost", formatter=NumberFormatter(format='$0,0.00')),
    TableColumn(field="d", title="Precision", formatter=NumberFormatter(format='0.000%')),
    TableColumn(field="e", title="Recall", formatter=NumberFormatter(format='0.000%')),
    TableColumn(field="c", title="F1 Score", formatter=NumberFormatter(format='0.000%'))
    ]
                    
data_table = DataTable(source=source, columns=columns, width=900, height=400,
                      fit_columns=True, reorderable=True, sortable=True)

weighting_intro = f'''
    <h3>Error types differ in impact</h3> 
    <p>in the case of security incidents, a false negative, 
though possibly rarer than false positives, is likely more costly. For example, downtime suffered 
from a DDoS attack (lost sales/customers) incurs more loss than time wasted chasing a false positive 
(labor hours). </p>

<p>Try playing around with the slider below to see how your thresholding strategy might change 
depending on the relative weight of false negatives to false positives. What does it look like at
10:1, 50:1, etc.?</p>
'''

weighting_one = Div(text=weighting_intro, width=400, height=160, margin=(25,25,25,25))

plot.legend.location = "bottom_right"
layout = column(row(column(weighting_one, fp_cost, text_banner,), column(plot, slider)), data_table)
show(layout)

In [90]:
# Show the row(s) whose multiplier equals 3.00 after rounding to two decimals.
df.loc[df.multiplier.map(lambda m: round(m, 2)) == 3.00]

Unnamed: 0,multiplier,TP,FP,TN,FN,precision,recall,f1_score,weighted_FN,total_weighted_errors,fp_cost,total_cost
10,3.0,300,1229,84871,0,0.196207,1.0,0.328048,0.0,1229.0,100,122900.0


In [None]:
# Display f1_max: the multiplier of the first row where f1_score reaches its maximum.
f1_max

In [None]:
# Scratch lookup of the first value in column 'w' of `source`.
# NOTE(review): none of the ColumnDataSource objects built in the cells above
# define a 'w' column, so this presumably raises KeyError -- confirm or remove.
source.data['w'][0]

In [29]:
# Data table of the weighted-error simulation, with a slider that re-weights
# the false negatives and recomputes the totals in the browser.
from bokeh.models import NumberFormatter

df = pd.read_csv('simulation_weighted_results_10x.csv')
source = ColumnDataSource(dict(df))

# (field, title, extra TableColumn keyword arguments) for each displayed column.
_table_spec = [
    ("multiplier", "Multiplier", {}),
    ("TP", "True Positives", {}),
    ("FP", "False Positives", {}),
    ("TN", "True Negatives", {}),
    ("FN", "False Negatives", {}),
    ("precision", "Precision", {}),
    ("recall", "Recall", {}),
    ("f1_score", "F1 Score", {"formatter": NumberFormatter(format='0.000%')}),
    ("weighted_FN", "Weighted False Negatives", {}),
    ("total_weighted_errors", "Total Weighted Errors", {}),
]
columns = [
    TableColumn(field=field, title=title, **extra)
    for field, title, extra in _table_spec
]

data_table = DataTable(
    source=source,
    columns=columns,
    width=900,
    height=400,
    fit_columns=True,
    reorderable=True,
    sortable=True,
)

# weighted_FN = FN * slider value; total_weighted_errors = FP + weighted_FN.
handler = CustomJS(
    args=dict(source=source),
    code="""
   var data = source.data;
   var f = cb_obj.value
   var w = data['FP']
   var x = data['FN']
   var y = data['weighted_FN']
   var z = data['total_weighted_errors']
   for (var i = 0; i < x.length; i++) {
      y[i] = x[i] * f
      z[i] = w[i] + y[i]
   }
   source.change.emit();
""",
)

slider = Slider(title="Slider Value", start=1.0, end=50, value=10, step=.25)
slider.js_on_change('value', handler)

layout = column(slider, data_table)
show(layout)

In [35]:
# Assemble a stand-alone HTML page (title banner, summary statistics,
# rule-of-thumb threshold, small interactive plot), write it to render.html
# and open it in the default browser.

from bokeh.layouts import row, column, gridplot, grid
from bokeh.models import ColumnDataSource, CustomJS, Div, TextInput
from bokeh.plotting import figure
#tools for creating html file
from bokeh.resources import CDN
from bokeh.embed import file_html
import webbrowser, os


# Page-wide CSS plus the title banner markup. The stray closing </div> was
# removed and the inline "font-size=18px" corrected to valid "font-size:18px".
title_text = '''
<style>

@font-face {
    font-family: MontrealBold;
    src: url(fonts/NeueMontreal-Bold.otf);
    font-weight: bold;
}

@font-face {
    font-family: MontrealLight;
    src: url(fonts/NeueMontreal-Light.otf);
}

body {
    background-color: #f2ebe6;
}

title_header {
    font-size: 80px;
    font-style: bold;
    font-family: MontrealBold, Helvetica;
    font-weight: bold;
    margin-bottom: -200px;
}

h1 {
    color: #313596;
}

p {
    font-size: 12px;
}

b {
    color: #58c491;
}

th, td {
    text-align:left;
    padding: 5px;
}

tr:nth-child(even) {
    background-color: white;
    opacity: .7;
}

.vertical { 
    border-left: 1px solid black; 
    height: 190px; 
        } 
</style>

    <title_header style="text-align:left; color: white;">
        Cream.
    </title_header>
    <p style="font-family: MontrealBold, Helvetica;
    font-size:18px;
    margin-top: 0px;
    margin-left: 5px;">
        Time is money, and <b style="font-size:18px;">"Cash Rules Everything Around Me"</b>.
    </p>
'''

title_div = Div(text=title_text, width=800, height=160, margin=(40,0,0,70))

# The Median row used to repeat the Standard Deviation values; the medians are
# now computed from the loaded sample (same is_ddos split as the exploratory cell).
normal_median = sample.loc[sample.is_ddos == 0, 'magnitude'].median()
malicious_median = sample.loc[sample.is_ddos == 1, 'magnitude'].median()

stats = f"""
<h1>Summary Statistics</h1> 
<p><i>metric = magnitude</i></p>

<table style="width:420px; height:160px; vertical-align: center;">
  <tr>
    <th>Metric</th>
    <th>Normal Events</th>
    <th>Malicious Events</th>
  </tr>
  <tr>
    <td>Observations</td>
    <td>86,100</td>
    <td>300</td>
  </tr>
  <tr>
    <td>Average</td>
    <td>5,015.62</td>
    <td>25,110.47</td>
  </tr>
  <tr>
    <td>Standard Deviation</td>
    <td>2,843.19</td>
    <td>1,938.80</td>
  </tr>  
  <tr>
    <td>Median</td>
    <td>{normal_median:,.2f}</td>
    <td>{malicious_median:,.2f}</td>
  </tr>    
</table>
"""
stats_div = Div(text=stats, width=480, height=480, margin=(3,0,0,73))

hypothetical_threshold = '''
<h1>\"Rule of Thumb\" Hypothetical Threshold</h1>
<p>A threshold at the <i>(average + 3x standard deviations)</i> normal magnitude would result in:</p>
<ul>
    <li>True Positives (correctly identified malicious events: <b>300</b></li>
    <li>False Positives (wrongly identified normal events: <b>1,229</b></li>
    <li>True Negatives (correctly identified normal events: <b>84,871</b></li>
    <li>False Negatives (wrongly identified malicious events: <b>0</b></li>
</ul>
<h3>Accuracy Metrics</h3>
<ul>
    <li>Precision (what % of events above threshold are actually malicious): <b>19.6%</b></li>
    <li>Recall (what % of malicious events did we catch): <b>100.0%</b></li>
    <li>F1 Score (blends precision and recall): <b>32.8%</b></li>
</ul>
'''

hypo_div = Div(text=hypothetical_threshold, width=600, height=480, margin=(5,0,0,80))

# Thin vertical rule between the two text panels (styled by the .vertical class above).
line = '''
<div class="vertical"></div>
'''

vertical_line = Div(text=line, width=20, height=380, margin=(80,0,0,40))

# Small interactive plot: y = x ** power, power typed into a text box.
x = [x*0.005 for x in range(0, 200)]
y = x

source = ColumnDataSource(data=dict(x=x, y=y))

plot = figure(plot_width=400, plot_height=400)
plot.line('x', 'y', source=source, line_width=3, line_alpha=0.6)

# Uses the same property-style BokehJS access as the other callbacks in this
# notebook (source.data / cb_obj.value / source.change.emit()) instead of the
# legacy .get()/.trigger() calls, and declares its variables explicitly.
callback = CustomJS(args=dict(source=source), code="""
        const data = source.data;
        const f = cb_obj.value;
        const x = data['x'];
        const y = data['y'];
        for (let i = 0; i < x.length; i++) {
            y[i] = Math.pow(x[i], f);
        }
        source.change.emit();
    """)

text_input = TextInput(value="1", title="power")
text_input.js_on_change('value', callback)
layout = column(text_input, plot)


# Page grid: title, then the two text panels split by the rule, then the plot.
l = grid([
        [title_div],
        [row(stats_div,vertical_line, hypo_div)],
        [layout]
    ])
html = file_html(l, CDN, "CREAM")
# Write with an explicit encoding so the result does not depend on the
# platform's default locale encoding.
with open("render.html", "w", encoding="utf-8") as file:
    file.write(html)
webbrowser.open("file://" + os.path.realpath("render.html"))

True