In [1]:
import pandas as pd
import numpy as np
import math

In [103]:
from bokeh.io import output_notebook, output_file, show
from bokeh.plotting import figure
from bokeh.models import Div, Arrow, NormalHead, Label, Span, Legend, NumeralTickFormatter, DataTable, TableColumn

In [82]:
# Event-level sample; later cells read its `is_ddos` and `magnitude` columns.
sample = pd.read_csv(filepath_or_buffer='simulated_ddos_data.csv')
# Threshold-simulation results; later cells read columns such as
# `multiplier`, `false_positives` and `false_negatives`.
simulations = pd.read_csv(filepath_or_buffer='simulation_scores.csv')

In [22]:
# Static HTML summary of the normal vs malicious magnitude statistics.
# The figures are hard-coded text, not computed from the loaded data.
# Fixes over the earlier draft: the malicious standard deviation is wrapped in
# a proper <b>...</b> pair (it used to open with a closing tag), and each
# confusion-matrix bullet now closes its parenthesis.
stats_output = '''
<h2>Normal vs Malicious Summary</h2> 
<i>metric = magnitude</i>

<h3>Normal:</h3>
-----------------------------
Observations: <b>86100</b>
Average: <b>5015.62</b>
Standard Deviation: <b>2843.19</b>

<h3>Malicious:</h3>
-----------------------------
Observations: <b>300</b>
Average: <b>25110.47</b>
Standard Deviation: <b>1938.8</b>

A threshold at (average + 3x standard deviations) magnitude would result in:
    - True Positives (correctly identified malicious events): <b>300</b>
    - False Positives (wrongly identified normal events): <b>1,229</b>
    - True Negatives (correctly identified normal events): <b>84,871</b>
    - False Negatives (wrongly identified malicious events): <b>0</b>

Accuracy Metrics:
    - Precision (what % of events above threshold are actually malicious): <b>19.6%</b>
    - Recall (what % of malicious events did we catch): <b>100.0%</b>
    - F1 Score (blends precision and recall): <b>32.8%</b>

<i>You may want to be cautious as your normal traffic's magnitude 
has a long tail towards high values. The median is 4372.82 
compared to 5015.62 for the average.</i>
'''

In [81]:
# Route Bokeh output to the notebook, then render the summary as an HTML Div.
output_notebook()

# Hard-coded HTML summary (the numbers are literal text, not computed here).
# The table's inline style uses ';' between CSS declarations; a ',' there makes
# the whole declaration invalid so the browser ignores it. Each confusion-matrix
# bullet also closes its parenthesis.
text = """
<h1>Normal vs Malicious Summary</h1> 
<i>metric = magnitude</i>

<table style="width:100%; text-align:right">
  <tr>
    <th style="text-align:left">Metric</th>
    <th style="text-align:left">Normal Events</th>
    <th style="text-align:left">Malicious Events</th>
  </tr>
  <tr>
    <td style="text-align:left">Observations</td>
    <td style="text-align:left">86,100</td>
    <td style="text-align:left">300</td>
  </tr>
  <tr>
    <td style="text-align:left">Average</td>
    <td style="text-align:left">5,015.62</td>
    <td style="text-align:left">25,110.47</td>
  </tr>
  <tr>
    <td style="text-align:left">Standard Deviation</td>
    <td style="text-align:left">2,843.19</td>
    <td style="text-align:left">1,938.80</td>
  </tr>  
</table>

<p>A threshold at <i>(average + 3x standard deviations)</i> magnitude would result in:</p>
<ul>
    <li>True Positives (correctly identified malicious events): <b>300</b></li>
    <li>False Positives (wrongly identified normal events): <b>1,229</b></li>
    <li>True Negatives (correctly identified normal events): <b>84,871</b></li>
    <li>False Negatives (wrongly identified malicious events): <b>0</b></li>
</ul>
<h3>Accuracy Metrics</h3>
<ul>
    <li>Precision (what % of events above threshold are actually malicious): <b>19.6%</b></li>
    <li>Recall (what % of malicious events did we catch): <b>100.0%</b></li>
    <li>F1 Score (blends precision and recall): <b>32.8%</b></li>
</ul>
"""
stats_div = Div(text=text, width=500, height=200)
show(stats_div)

In [36]:
# Hard-coded HTML describing the outcome of an example 3-sigma threshold.
# Each confusion-matrix bullet closes the parenthesis it opens, so the rendered
# text no longer shows dangling "(" characters.
hypothetical_threshold = '''
<p>A threshold at <i>(average + 3x standard deviations)</i> magnitude would result in:</p>
<ul>
    <li>True Positives (correctly identified malicious events): <b>300</b></li>
    <li>False Positives (wrongly identified normal events): <b>1,229</b></li>
    <li>True Negatives (correctly identified normal events): <b>84,871</b></li>
    <li>False Negatives (wrongly identified malicious events): <b>0</b></li>
</ul>
<h3>Accuracy Metrics</h3>
<ul>
    <li>Precision (what % of events above threshold are actually malicious): <b>19.6%</b></li>
    <li>Recall (what % of malicious events did we catch): <b>100.0%</b></li>
    <li>F1 Score (blends precision and recall): <b>32.8%</b></li>
</ul>
'''

# Wrap the HTML in a Bokeh Div and display it.
hypo_div = Div(text=hypothetical_threshold, width=500, height=200)
show(hypo_div)

In [31]:
# Hard-coded cautionary note about the skew of the normal-traffic magnitude
# (median vs average); the numbers are literal text, not computed here.
warning_msg = '''
<p><i>You may want to be cautious as your normal traffic's magnitude 
has a long tail towards high values. 
The median is 4372.82 compared to 5015.62 for the average.</i></p>
'''

# Render the note as a short Bokeh Div.
warning_div = Div(
    width=500,
    height=50,
    text=warning_msg,
)
show(warning_div)

In [93]:
# Exploratory chart: overlaid magnitude histograms for malicious vs normal events.

# Split the magnitude column by the is_ddos flag (1 = malicious, 0 = normal).
malicious = sample.loc[sample['is_ddos'] == 1, 'magnitude']
normal = sample.loc[sample['is_ddos'] == 0, 'magnitude']

# Per-group summary statistics; later cells reuse these for ranges and thresholds.
mal_mean, mal_std, mal_count = malicious.mean(), malicious.std(), malicious.size
normal_mean, normal_std, normal_count = normal.mean(), normal.std(), normal.size

# 100-bin count histograms. Each row is one bin: its count plus left/right edges,
# which is the shape the quad glyph needs.
malicious_hist, malicious_edge = np.histogram(malicious, bins=100)
mal_hist_df = pd.DataFrame(dict(
    magnitude=malicious_hist,
    left=malicious_edge[:-1],
    right=malicious_edge[1:],
))

normal_hist, normal_edge = np.histogram(normal, bins=100)
norm_hist_df = pd.DataFrame(dict(
    magnitude=normal_hist,
    left=normal_edge[:-1],
    right=normal_edge[1:],
))

exploratory = figure(
    title='Magnitude Distribution Across Normal vs Malicious Events',
    x_axis_label='Magnitude',
    y_axis_label='Observations',
    plot_width=900,
    plot_height=600,
)

# Malicious bars are drawn first and mostly opaque; normal bars are layered
# on top with a lower alpha so both stay visible.
exploratory.quad(
    top=mal_hist_df['magnitude'], bottom=0,
    left=mal_hist_df['left'], right=mal_hist_df['right'],
    legend_label='malicious', fill_color='purple', alpha=.85,
)
exploratory.quad(
    top=norm_hist_df['magnitude'], bottom=0,
    left=norm_hist_df['left'], right=norm_hist_df['right'],
    legend_label='normal', fill_color='cyan', alpha=.35,
)

# Thousands-separated integer tick labels on both axes.
exploratory.xaxis.formatter = NumeralTickFormatter(format='0,0')
exploratory.yaxis.formatter = NumeralTickFormatter(format='0,0')

# Vertical arrow at the malicious mean, running from y = mal_count down to 0,
# with a text label placed 20% above the arrow's start.
mean_arrow = Arrow(
    end=NormalHead(fill_color='red', size=10),
    x_start=mal_mean, x_end=mal_mean,
    y_start=mal_count, y_end=0,
)
exploratory.add_layout(mean_arrow)
arrow_label = Label(text='Malicious Events', x=mal_mean, y=mal_count * 1.2)
exploratory.add_layout(arrow_label)

exploratory.legend.location = "top_right"
show(exploratory)

In [92]:
# Zoomed-in version of the histogram chart, restricted to the band between the
# upper tail of normal traffic and the malicious events, with an example
# 3-standard-deviation threshold marked.
zoom_y_top = mal_count * .33
three_sigma = normal_mean + (normal_std * 3)

overlap_view = figure(
    title='Magnitude Distribution Across Normal vs Malicious Events (Zoomed in w/Example Threshold)',
    x_axis_label='Magnitude',
    y_axis_label='Observations',
    plot_width=900,
    plot_height=600,
    # x spans from 2.5 normal std devs above the normal mean
    # to 3 malicious std devs above the malicious mean.
    x_range=(normal_mean + (normal_std * 2.5), mal_mean + (mal_std * 3)),
    y_range=(0, zoom_y_top),
)

# Same two histogram layers as the full-range chart (bins computed earlier).
overlap_view.quad(
    top=mal_hist_df['magnitude'], bottom=0,
    left=mal_hist_df['left'], right=mal_hist_df['right'],
    legend_label='malicious', fill_color='purple', alpha=.85,
)
overlap_view.quad(
    top=norm_hist_df['magnitude'], bottom=0,
    left=norm_hist_df['left'], right=norm_hist_df['right'],
    legend_label='normal', fill_color='cyan', alpha=.35,
)
overlap_view.xaxis.formatter = NumeralTickFormatter(format='0,0')
overlap_view.yaxis.formatter = NumeralTickFormatter(format='0,0')

# 3 sigma reference line: dashed vertical rule plus a label near the top of the view.
thresh = Span(
    location=three_sigma,
    dimension='height',
    line_color='grey',
    line_dash='dashed',
    line_width=2,
)
thresh_label = Label(text='3 Std Dev Threshold', x=three_sigma, y=zoom_y_top * .95)
overlap_view.add_layout(thresh)
overlap_view.add_layout(thresh_label)

overlap_view.legend.location = "top_right"
show(overlap_view)

In [108]:
# Density version: with density=True each histogram is normalised so that its
# bars integrate to 1 over magnitude. That makes the two groups comparable even
# though they differ greatly in size and in bin width.
malicious_hist_dense, malicious_edge_dense = np.histogram(malicious, density=True, bins=100)
mal_hist_dense_df = pd.DataFrame({
    'magnitude': malicious_hist_dense,
    'left': malicious_edge_dense[:-1],
    'right': malicious_edge_dense[1:]
})

normal_hist_dense, normal_edge_dense = np.histogram(normal, density=True, bins=100)
norm_hist_dense_df = pd.DataFrame({
    'magnitude': normal_hist_dense,
    'left': normal_edge_dense[:-1],
    'right': normal_edge_dense[1:]
})

# The plotted heights are probability densities (per unit of magnitude), not
# shares of the group, so the y-axis is labelled as a density rather than as
# "% of Group Total".
density = figure(plot_width = 900, plot_height = 600,  
           title = 'Probability Density Across Normal vs Malicious Events',
           x_axis_label = 'Magnitude', 
           y_axis_label = 'Probability Density'
          )

density.quad(bottom = 0, top=mal_hist_dense_df.magnitude, left=mal_hist_dense_df.left, 
             right=mal_hist_dense_df.right, legend_label='malicious', fill_color='purple', alpha=.85)
density.quad(bottom = 0, top=norm_hist_dense_df.magnitude, left=norm_hist_dense_df.left, 
             right=norm_hist_dense_df.right, legend_label='normal', fill_color='cyan', alpha=.35)
density.xaxis.formatter = NumeralTickFormatter(format='0,0')
# No percent formatter on the y-axis: a density is not a percentage, so Bokeh's
# default tick formatter is left in place for these values.

density.legend.location = "top_right"
show(density)

In [113]:
# Density version drawn as lines, with a Gaussian KDE of the malicious
# magnitudes overlaid on the two normalised histograms.
# gaussian_kde is imported from the public scipy.stats namespace; the
# scipy.stats.kde module path is deprecated.
from scipy.stats import gaussian_kde
malicious_hist_dense, malicious_edge_dense = np.histogram(malicious, density=True, bins=100)
mal_hist_dense_df = pd.DataFrame({
    'magnitude': malicious_hist_dense,
    'left': malicious_edge_dense[:-1],
    'right': malicious_edge_dense[1:]
})

normal_hist_dense, normal_edge_dense = np.histogram(normal, density=True, bins=100)
norm_hist_dense_df = pd.DataFrame({
    'magnitude': normal_hist_dense,
    'left': normal_edge_dense[:-1],
    'right': normal_edge_dense[1:]
})

# The plotted values are probability densities (density=True / KDE output),
# so the y-axis is labelled as a density rather than as "% of Group Total".
density = figure(plot_width = 900, plot_height = 600,  
           title = 'Probability Density Across Normal vs Malicious Events',
           x_axis_label = 'Magnitude', 
           y_axis_label = 'Probability Density'
          )

# Anchor each histogram value at its bin centre; using the right edge would
# shift every line half a bin to the right.
norm_centers = (norm_hist_dense_df.left + norm_hist_dense_df.right) / 2
mal_centers = (mal_hist_dense_df.left + mal_hist_dense_df.right) / 2

# Evaluate the KDE on an evenly spaced, increasing grid over the observed range.
# Drawing the line through the raw samples in their original (unsorted) order
# makes it jump back and forth instead of tracing the curve.
mal_pdf = gaussian_kde(malicious)
kde_grid = np.linspace(malicious.min(), malicious.max(), 200)

density.line(norm_centers, norm_hist_dense_df.magnitude, 
             legend_label='normal', line_width=2, color="grey")
density.line(mal_centers, mal_hist_dense_df.magnitude, 
             legend_label='malicious', line_width=2, color="red")
density.line(kde_grid, mal_pdf(kde_grid),
            legend_label='mal_pdf', line_width=2, color="blue")
density.xaxis.formatter = NumeralTickFormatter(format='0,0')
# No percent formatter on the y-axis: a density is not a percentage, so Bokeh's
# default tick formatter is left in place for these values.

density.legend.location = "top_right"
show(density)

In [85]:
    # Simulation series to be used: one value per row of the simulation results.
    false_positives = simulations.false_positives
    false_negatives = simulations.false_negatives
    multiplier = simulations.multiplier
    precision = simulations.precision
    recall = simulations.recall
    f1_score = simulations.f1_score

    # False Positives vs False Negatives: both error counts plotted against the
    # multiplier, with the x-axis clamped to the simulated multiplier range.

    errors = figure(
        plot_width=800,
        plot_height=600,
        x_range=(multiplier.min(), multiplier.max()),

        title='False Positives vs False Negatives Across Multiplier Levels',
        x_axis_label='Multiplier',
        y_axis_label='Count',

        tools="pan,box_select,zoom_in,zoom_out,save,reset"
    )

    # Legend entries are user-facing text, so both use plain words; the second
    # one previously showed the identifier 'false_negatives'.
    errors.line(multiplier, false_positives, legend_label='false positives', line_width=2, color="grey")
    errors.line(multiplier, false_negatives, legend_label='false negatives', line_width=2, color="red")
    errors.legend.location = "top_center"

    show(errors)

In [84]:
# Preview the first rows of the simulation results loaded from 'simulation_scores.csv'.
simulations.head()

Unnamed: 0,multiplier,threshold,false_positives,false_negatives,precision,recall,f1_score,fp_cost,fn_cost,estimated_total_cost_thousands
0,3.0,13545,1229,0,0.1962,1.0,0.328,409666.666667,0.0,409.666667
1,3.1,13829,1084,0,0.2168,1.0,0.3563,361333.333333,0.0,361.333333
2,3.2,14113,954,0,0.2392,1.0,0.3861,318000.0,0.0,318.0
3,3.3,14398,845,0,0.262,1.0,0.4152,281666.666667,0.0,281.666667
4,3.4,14682,748,0,0.2863,1.0,0.4451,249333.333333,0.0,249.333333


In [133]:
# Load the weighted simulation results and tag every row with the multiplier
# that minimises total weighted errors.
df = pd.read_csv('simulation_weighted_results_10x.csv')

# Select the minimising multiplier as a scalar. When several rows tie for the
# minimum, the last tied row is used. The previous `.squeeze()['multiplier']`
# lookup returned a Series in that case, and assigning it to the column left
# NaN on every non-tied row.
is_min_loss = df.total_weighted_errors == df.total_weighted_errors.min()
loss_min = df.loc[is_min_loss, 'multiplier'].iloc[-1]
df['loss_min'] = loss_min
df.head()

Unnamed: 0,multiplier,TP,FP,TN,FN,precision,recall,f1_score,weighted_FN,total_weighted_errors,loss_min
0,2.0,300,4035,82065,0,0.069204,1.0,0.12945,0.0,4035.0,5.5
1,2.1,300,3594,82506,0,0.077042,1.0,0.143062,0.0,3594.0,5.5
2,2.2,300,3200,82900,0,0.085714,1.0,0.157895,0.0,3200.0,5.5
3,2.3,300,2854,83246,0,0.095117,1.0,0.173712,0.0,2854.0,5.5
4,2.4,300,2523,83577,0,0.10627,1.0,0.192123,0.0,2523.0,5.5


In [135]:
# Slider chart: total weighted errors (left axis) against F1 / precision /
# recall (right axis) per multiplier. The slider sets how many false positives
# one false negative is worth; the curve and the dashed optimum marker both
# update in the browser.

from bokeh.layouts import column
from bokeh.models import CustomJS, ColumnDataSource, LinearAxis, Range1d, Band
from bokeh.plotting import Figure, output_file, show
from bokeh.models.widgets import Slider

df = pd.read_csv('simulation_weighted_results_10x.csv')
ratio = 10              # initial weight applied to each false negative
x = df.multiplier       # threshold multiplier (x-axis)
y = df.FN               # false negatives
z = y * ratio           # weighted false negatives
a = df.FP               # false positives
b = a + z               # total weighted errors (plotted curve)
c = df.f1_score
d = df.precision
e = df.recall

# Multiplier with the lowest total weighted errors, taken as a scalar. When
# several rows tie, the last tied row wins, matching the JS callback below.
# The previous `.squeeze()['multiplier']` lookup produced a Series on ties.
is_min_loss = df.total_weighted_errors == df.total_weighted_errors.min()
loss_min = df.loc[is_min_loss, 'multiplier'].iloc[-1]
df['loss_min'] = loss_min
w = df.loss_min         # optimum multiplier repeated on every row

source = ColumnDataSource(data=dict(x=x,
                                    y=y,
                                    z=z,
                                    a=a,
                                    b=b,
                                    c=c,
                                    d=d,
                                    e=e,
                                    w=w
                                    ))
plot = Figure(plot_width=900, plot_height=360, x_axis_label='multiplier', y_axis_label='Errors')
plot.line('x', 'b', source=source, line_width=3, line_alpha=0.6, 
          color='green', legend_label='Total Weighted Errors')
# Secondary y-axis (0 to 1.1) for the score curves.
plot.extra_y_ranges = {"y2": Range1d(start = 0, end = 1.1)}
plot.add_layout(LinearAxis(y_range_name = "y2", axis_label="Score"), 'right')
plot.line('x', 'c', source=source, line_width=3, line_alpha=0.6, 
          color='purple', legend_label='F1 score', y_range_name = "y2")
plot.line('x', 'd', source=source, line_width=3, line_alpha=0.6, 
          color='red', legend_label='Precision', y_range_name = "y2")
plot.line('x', 'e', source=source, line_width=3, line_alpha=0.6, 
          color='blue', legend_label='Recall', y_range_name = "y2")

# Dashed vertical marker at the current optimum. It is created before the
# callback so it can be passed in and moved when the slider changes.
twe_thresh = Span(location=float(loss_min), dimension='height', line_color='grey',
              line_dash='dashed', line_width=2)
plot.add_layout(twe_thresh)

handler = CustomJS(args=dict(source=source, span=twe_thresh), code="""
   var data = source.data;
   var f = cb_obj.value;
   var x = data['x'];
   var y = data['y'];
   var z = data['z'];
   var a = data['a'];
   var b = data['b'];
   var w = data['w'];

   // Re-weight the false negatives and, in the same pass, track the
   // multiplier with the lowest total (the last row wins on ties).
   var min_loss = Infinity;
   var loss_min = 0;
   for (var i = 0; i < x.length; i++) {
      z[i] = y[i] * f;
      b[i] = z[i] + a[i];
      if (b[i] <= min_loss) {
          min_loss = b[i];
          loss_min = x[i];
      }
   }
   for (var j = 0; j < x.length; j++) {
      w[j] = loss_min;
   }

   // Move the dashed marker; updating column 'w' alone does not move the
   // Span, whose location is a separate model property.
   span.location = loss_min;
   source.change.emit();
""")

# The slider starts at `ratio`, so the widget and the initial curve agree.
slider = Slider(start=1.0, end=50, value=ratio, step=.25, title="Slider Value")
slider.js_on_change('value', handler)

plot.legend.location = "bottom_right"
layout = column(plot, slider)
show(layout)

In [130]:
# Inspect the first entry of the 'x' column held by `source`
# (in file order, the slider chart's data source, where x = df.multiplier).
source.data['x'][0]

2.0

In [114]:
# Displaying data tables: the simulation results as a sortable table, with a
# slider that re-weights false negatives and refreshes the weighted columns.
from bokeh.models import NumberFormatter
df = pd.read_csv('simulation_weighted_results_10x.csv')
source = ColumnDataSource(dict(df))

# Precision, recall and F1 are all ratios, so all three share one percent
# format; previously only F1 was formatted and the other two showed raw fractions.
ratio_format = '0.000%'
columns = [
    TableColumn(field="multiplier", title="Multiplier"),
    TableColumn(field="TP", title="True Positives"),
    TableColumn(field="FP", title="False Positives"),
    TableColumn(field="TN", title="True Negatives"),
    TableColumn(field="FN", title="False Negatives"),
    TableColumn(field="precision", title="Precision", formatter=NumberFormatter(format=ratio_format)),
    TableColumn(field="recall", title="Recall", formatter=NumberFormatter(format=ratio_format)),
    TableColumn(field="f1_score", title="F1 Score", formatter=NumberFormatter(format=ratio_format)),
    TableColumn(field="weighted_FN", title="Weighted False Negatives"),
    TableColumn(field="total_weighted_errors", title="Total Weighted Errors"),
    ]
                    
data_table = DataTable(source=source, columns=columns, width=900, height=400,
                      fit_columns=True, reorderable=True, sortable=True)

# Browser-side callback: weighted_FN = FN * slider value and
# total_weighted_errors = FP + weighted_FN, followed by a table refresh.
handler = CustomJS(args=dict(source=source), code="""
   var data = source.data;
   var f = cb_obj.value
   var w = data['FP']
   var x = data['FN']
   var y = data['weighted_FN']
   var z = data['total_weighted_errors']
   for (var i = 0; i < x.length; i++) {
      y[i] = x[i] * f
      z[i] = w[i] + y[i]
   }
   source.change.emit();
""")

slider = Slider(start=1.0, end=50, value=10, step=.25, title="Slider Value")
slider.js_on_change('value', handler)

layout = column(slider, data_table)
show(layout)