In [1]:
import pandas as pd
import numpy as np
import math

In [67]:
from bokeh.io import output_notebook, output_file, show
from bokeh.plotting import figure
from bokeh.models import Div, Arrow, NormalHead, Label, Span, Legend

In [37]:
# Input files, given as paths relative to the notebook's working directory.
SAMPLE_CSV = 'simulated_ddos_data.csv'
SCORES_CSV = 'simulation_scores.csv'

# Per-event traffic sample; later cells read its `is_ddos` and `magnitude` columns.
sample = pd.read_csv(SAMPLE_CSV)
# Simulation scores table (not referenced by the cells visible in this section).
simulation = pd.read_csv(SCORES_CSV)

In [22]:
# Plain-text/HTML draft of the full "Normal vs Malicious" summary.
# The figures are hard-coded snapshots of a previous run, not computed here.
# Fixes relative to the earlier draft: the malicious standard deviation is now
# wrapped in a proper <b>...</b> pair (it previously started with a stray
# closing tag), and the explanatory parentheses on the confusion-matrix lines
# are closed.
stats_output = '''
<h2>Normal vs Malicious Summary</h2> 
<i>metric = magnitude</i>

<h3>Normal:</h3>
-----------------------------
Observations: <b>86100</b>
Average: <b>5015.62</b>
Standard Deviation: <b>2843.19</b>

<h3>Malicious:</h3>
-----------------------------
Observations: <b>300</b>
Average: <b>25110.47</b>
Standard Deviation: <b>1938.8</b>

A threshold at (average + 3x standard deviations) magnitude would result in:
    - True Positives (correctly identified malicious events): <b>300</b>
    - False Positives (wrongly identified normal events): <b>1,229</b>
    - True Negatives (correctly identified normal events): <b>84,871</b>
    - False Negatives (wrongly identified malicious events): <b>0</b>

Accuracy Metrics:
    - Precision (what % of events above threshold are actually malicious): <b>19.6%</b>
    - Recall (what % of malicious events did we catch): <b>100.0%</b>
    - F1 Score (blends precision and recall): <b>32.8%</b>

<i>You may want to be cautious as your normal traffic's magnitude 
has a long tail towards high values. The median is 4372.82 
compared to 5015.62 for the average.</i>
'''

In [44]:
# Configure Bokeh so that subsequent show() calls render inline in the notebook.
output_notebook()
# HTML for the summary table (hard-coded figures from a previous run).
# The table's inline style separates its CSS declarations with ';' -- a ','
# there makes the whole declaration invalid, so the width would be ignored.
text = """
<h1>Normal vs Malicious Summary</h1> 
<i>metric = magnitude</i>

<table style="width:100%;text-align: right">
  <tr>
    <th style="text-align:left">Metric</th>
    <th style="text-align:left">Normal Events</th>
    <th style="text-align:left">Malicious Events</th>
  </tr>
  <tr>
    <td style="text-align:left">Observations</td>
    <td style="text-align:left">86,100</td>
    <td style="text-align:left">300</td>
  </tr>
  <tr>
    <td style="text-align:left">Average</td>
    <td style="text-align:left">5,015.62</td>
    <td style="text-align:left">25,110.47</td>
  </tr>
  <tr>
    <td style="text-align:left">Standard Deviation</td>
    <td style="text-align:left">2,843.19</td>
    <td style="text-align:left">1,938.80</td>
  </tr>  
</table>
"""
# Wrap the summary HTML in a fixed-size Div and display it.
stats_div = Div(
    text=text,
    width=500,
    height=200,
)
show(stats_div)

In [36]:
# HTML describing the outcome of a hypothetical (mean + 3 sigma) threshold.
# Figures are hard-coded snapshots. The explanatory parentheses on the
# confusion-matrix items are closed so the rendered text reads correctly.
hypothetical_threshold = '''
<p>A threshold at <i>(average + 3x standard deviations)</i> magnitude would result in:</p>
<ul>
    <li>True Positives (correctly identified malicious events): <b>300</b></li>
    <li>False Positives (wrongly identified normal events): <b>1,229</b></li>
    <li>True Negatives (correctly identified normal events): <b>84,871</b></li>
    <li>False Negatives (wrongly identified malicious events): <b>0</b></li>
</ul>
<h3>Accuracy Metrics</h3>
<ul>
    <li>Precision (what % of events above threshold are actually malicious): <b>19.6%</b></li>
    <li>Recall (what % of malicious events did we catch): <b>100.0%</b></li>
    <li>F1 Score (blends precision and recall): <b>32.8%</b></li>
</ul>
'''

# Wrap the threshold write-up in a fixed-size Div and display it.
hypo_div = Div(
    text=hypothetical_threshold,
    width=500,
    height=200,
)
show(hypo_div)

In [31]:
# Caveat text: the message contrasts the hard-coded median and average of the
# normal traffic's magnitude and flags a long tail towards high values.
warning_msg = '''
<p><i>You may want to be cautious as your normal traffic's magnitude 
has a long tail towards high values. 
The median is 4372.82 compared to 5015.62 for the average.</i></p>
'''

# Short Div (50px tall) holding the caveat, displayed on its own.
warning_div = Div(
    text=warning_msg,
    width=500,
    height=50,
)
show(warning_div)

In [64]:
# Prepare the inputs for the exploratory charts: split the magnitude column
# by the is_ddos label, summarise each group, and bin each group's values.
ddos_flag = sample.is_ddos
malicious = sample.loc[ddos_flag == 1, 'magnitude']
normal = sample.loc[ddos_flag == 0, 'magnitude']

# Mean, standard deviation and number of observations for each group.
mal_mean, mal_std, mal_count = malicious.mean(), malicious.std(), malicious.size
normal_mean, normal_std, normal_count = normal.mean(), normal.std(), normal.size

# 100-bin histograms. NOTE: the 'magnitude' column of each frame holds the
# per-bin observation count; 'left' and 'right' hold the bin edges.
malicious_hist, malicious_edge = np.histogram(malicious, bins=100)
mal_hist_df = pd.DataFrame(dict(
    magnitude=malicious_hist,
    left=malicious_edge[:-1],
    right=malicious_edge[1:],
))

normal_hist, normal_edge = np.histogram(normal, bins=100)
norm_hist_df = pd.DataFrame(dict(
    magnitude=normal_hist,
    left=normal_edge[:-1],
    right=normal_edge[1:],
))

# Overlay the raw-count histograms of both groups on one figure.
# `width`/`height` are used rather than `plot_width`/`plot_height`: the latter
# were deprecated in Bokeh 2.4 and removed in 3.0, where they raise an error.
exploratory = figure(width = 900, height = 600,
           title = 'Magnitude Distribution Across Normal vs Malicious Events',
           x_axis_label = 'Magnitude', 
           y_axis_label = 'Observations'
          )

# One quad per histogram bin; the 'magnitude' column holds the bin counts.
exploratory.quad(bottom = 0, top=mal_hist_df.magnitude, left=mal_hist_df.left, right=mal_hist_df.right,
      legend_label='malicious', fill_color='purple', alpha=.85)
exploratory.quad(bottom = 0, top=norm_hist_df.magnitude, left=norm_hist_df.left, right=norm_hist_df.right,
      legend_label='normal', fill_color='cyan', alpha=.35)

# Vertical arrow at the malicious mean, running from y = mal_count down to the
# x-axis, with a text label placed slightly above the arrow's start.
exploratory.add_layout(Arrow(end=NormalHead(fill_color='red', size=10),
                   x_start=mal_mean, y_start=mal_count, x_end=mal_mean, y_end=0))
arrow_label = Label(x=mal_mean, y=mal_count * 1.2, text='Malicious Events')
exploratory.add_layout(arrow_label)

exploratory.legend.location = "top_right"
show(exploratory)

In [79]:
# Zoomed in version: same histograms, restricted to the region between the
# upper part of the normal distribution and the malicious distribution.
# `width`/`height` replace `plot_width`/`plot_height`, which were deprecated in
# Bokeh 2.4 and removed in 3.0.
zoom_y_max = mal_count * .33                      # upper limit of the y-axis
three_sigma = normal_mean + (normal_std * 3)      # example threshold location

overlap_view = figure(width = 900, height = 600,
           title = 'Magnitude Distribution Across Normal vs Malicious Events (Zoomed in w/Example Threshold)',
           x_axis_label = 'Magnitude', 
           y_axis_label = 'Observations',
           y_range=(0, zoom_y_max),
           x_range=(normal_mean + (normal_std * 2.5), mal_mean + (mal_std * 3)),
          )

# One quad per histogram bin; the 'magnitude' column holds the bin counts.
overlap_view.quad(bottom = 0, top=mal_hist_df.magnitude, left=mal_hist_df.left, right=mal_hist_df.right,
      legend_label='malicious', fill_color='purple', alpha=.85)
overlap_view.quad(bottom = 0, top=norm_hist_df.magnitude, left=norm_hist_df.left, right=norm_hist_df.right,
      legend_label='normal', fill_color='cyan', alpha=.35)

# 3 sigma reference line: dashed vertical span at (normal mean + 3 std), with
# its label placed at 95% of the visible y-range.
thresh = Span(location=three_sigma, dimension='height', line_color='grey',
              line_dash='dashed', line_width=2)
thresh_label = Label(x=three_sigma, y=zoom_y_max * .95,
                     text='3 Std Dev Threshold')
overlap_view.add_layout(thresh)
overlap_view.add_layout(thresh_label)

overlap_view.legend.location = "top_right"
show(overlap_view)

In [80]:
# density version: re-bin both groups with density=True so each histogram is
# normalised independently of its group's size.
# NOTE: the 'magnitude' column of each frame holds the per-bin density value;
# 'left' and 'right' hold the bin edges.
malicious_hist_dense, malicious_edge_dense = np.histogram(malicious, bins=100, density=True)
mal_hist_dense_df = pd.DataFrame(dict(
    magnitude=malicious_hist_dense,
    left=malicious_edge_dense[:-1],
    right=malicious_edge_dense[1:],
))

normal_hist_dense, normal_edge_dense = np.histogram(normal, bins=100, density=True)
norm_hist_dense_df = pd.DataFrame(dict(
    magnitude=normal_hist_dense,
    left=normal_edge_dense[:-1],
    right=normal_edge_dense[1:],
))

# Overlay the density-normalised histograms of both groups.
# - `width`/`height` replace `plot_width`/`plot_height`, which were deprecated
#   in Bokeh 2.4 and removed in 3.0.
# - The y-axis is labelled as a probability density: np.histogram(density=True)
#   returns density values (the area under the bars is 1), not percentages of
#   the group total.
density = figure(width = 900, height = 600,
           title = 'Probability Density Across Normal vs Malicious Events',
           x_axis_label = 'Magnitude', 
           y_axis_label = 'Probability Density'
          )

# One quad per histogram bin; the 'magnitude' column holds the density values.
density.quad(bottom = 0, top=mal_hist_dense_df.magnitude, left=mal_hist_dense_df.left, 
             right=mal_hist_dense_df.right, legend_label='malicious', fill_color='purple', alpha=.85)
density.quad(bottom = 0, top=norm_hist_dense_df.magnitude, left=norm_hist_dense_df.left, 
             right=norm_hist_dense_df.right, legend_label='normal', fill_color='cyan', alpha=.35)

density.legend.location = "top_right"
show(density)