In [7]:
#Import relevant libraries and functions
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn import metrics
import matplotlib.pyplot as plt
from matplotlib import cm
from matplotlib.ticker import LinearLocator
import pandas as pd
import numpy as np

# Clear workspace
!rm -rf result_figs
!rm result_figs.zip
!mkdir result_figs

rm: cannot remove 'result_figs.zip': No such file or directory


### Exercise 1: Using the Negative Selection Algorithm

In [None]:
# Specify the input
input = 'english_hiligayon'

# Read the csv file
df = pd.read_csv('output.csv')

# Encode labels (target = 0; anomalous = 1)
labels = np.array(df.iloc[:,2])
for i in range(len(labels)):
  labels[i] = 0 if labels[i] == ' english.test' else 1

# Extract and preprocess data
r = np.array(df.iloc[:,3])
r_values = np.unique(r)
num_values = np.where(r==r_values[0])

anomaly_scores = np.reshape(np.array(df.iloc[:,1]), (len(r_values), len(num_values[0])))
labels = np.reshape(labels, (len(r_values), len(num_values[0])))

# Specify figure features
image_format = 'svg'
figs, axs = plt.subplots(1, len(r_values), figsize = (6*len(r_values),5))
idx = 0
AUCs = []

# Plot Receiver Operating Characteristic for different r values
for value in r_values:
  title = "Receiver Operating Characteristic for r = {f}".format(f=value)

  # Compute relevant metrics
  specitifity, sensitivity,_ = metrics.roc_curve(labels[idx,:].astype(int), anomaly_scores[idx,:])
  AUC = round(metrics.roc_auc_score(labels[idx,:].astype(int), anomaly_scores[idx,:]),5)
  AUCs.append(AUC)

  # Individual figures
  image_name = 'result_figs/' + input + '_r' + value.astype(str) + '.' + image_format 
  fig, ax = plt.subplots(1, figsize=(4,4))
  plt.title(title)
  plt.plot( specitifity, sensitivity)
  plt.plot([0, 1], ls="--")
  plt.plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
  plt.text(0.2, 0, r"AUC="+str(AUC), fontsize=15)
  plt.ylabel('sensitivity')
  plt.xlabel('1 - specitifity')
  
  # Composite figure
  composite_image_name = 'result_figs/All_' + input + '.' + image_format
  axs[idx].set_title(title)
  axs[idx].plot( specitifity, sensitivity, label="AUC="+str(AUC))
  axs[idx].plot([0, 1], ls="--")
  axs[idx].plot([0, 0], [1, 0] , c=".7"), plt.plot([1, 1] , c=".7")
  axs[idx].set_ylabel('sensitivity')
  axs[idx].set_xlabel('1 - specitifity')
  axs[idx].text(0.3, 0, r"AUC="+str(AUC), fontsize=15)

  fig.savefig(image_name, format = image_format, dpi=1200)
  idx+=1

# Print the average AUC over all different r values
print("Average AUC over all r values ranging from 1 to 9: {}".format(round(np.mean(AUCs),5)))

# Save and zip the figures
figs.savefig(composite_image_name, format = image_format, dpi=1200)
!zip -r /content/result_figs.zip /content/result_figs

### Exercise 2: Intrusion Detection for Unix Processes

In [11]:
# Exercise 2: compute the AUC of the anomaly scores for every (n, r) parameter
# combination in snd-unm.csv and report the best performing combination.

# Read the csv file
df = pd.read_csv('snd-unm.csv')

# Extract data: column 1 = anomaly score, column 2 = label, column 3 = r, column 4 = n
r = np.array(df.iloc[:, 3])
r_values = np.unique(r)
n = np.array(df.iloc[:, 4])
n_values = np.unique(n)

# Every (n, r) combination must contribute the same number of rows, otherwise
# the data cannot be arranged in a regular (n, r, sample) grid.
num_param_sets = len(n_values) * len(r_values)
if num_param_sets == 0 or len(r) % num_param_sets != 0:
    raise ValueError('snd-unm.csv does not hold an equal number of rows per (n, r) combination')
num_elem_per_param_set = len(r) // num_param_sets
grid_shape = (len(n_values), len(r_values), num_elem_per_param_set)

# Order the rows by n, then by r (lexsort treats its LAST key as the primary one
# and is stable), so the reshape below is correct whatever the row order of the csv.
row_order = np.lexsort((r, n))

# Verify that each grid cell really contains a single (n, r) combination
n_grid = np.reshape(n[row_order], grid_shape)
r_grid = np.reshape(r[row_order], grid_shape)
if not (np.all(n_grid == n_values[:, None, None]) and np.all(r_grid == r_values[None, :, None])):
    raise ValueError('snd-unm.csv does not hold an equal number of rows per (n, r) combination')

# Axis 0 indexes n_values, axis 1 indexes r_values, axis 2 the individual samples
anomaly_scores = np.reshape(np.array(df.iloc[:, 1])[row_order], grid_shape)
labels = np.reshape(np.array(df.iloc[:, 2])[row_order], grid_shape)

# Compute the AUCs for all parameter combinations
AUCs = np.zeros((len(n_values), len(r_values)))
for n_idx in range(len(n_values)):
    for r_idx in range(len(r_values)):
        AUCs[n_idx, r_idx] = metrics.roc_auc_score(labels[n_idx, r_idx], anomaly_scores[n_idx, r_idx])

# Find the best performing parameter set.
# optimal_param is a (row indices, column indices) tuple; on ties the first
# match in row-major order is reported.
optimal_param = np.where(AUCs == np.max(AUCs))
optimal_n_value = n_values[optimal_param[0][0]]
optimal_r_value = r_values[optimal_param[1][0]]

print('r = {} and n = {} gave the highest AUC of: {}'.format(optimal_r_value,optimal_n_value,np.max(AUCs)))

r = 4 and n = 8 gave the highest AUC of: 0.9331763440860215


In [None]:
# 3-D surface plot of the AUC over the (r, n) parameter grid, with contour
# projections on the three walls and the maximum marked on the surface and walls.
# Relies on r_values, n_values and AUCs computed in the previous cell.

# Indicate chunk-merging method (only used in the title and the file names)
merging = 'max'

# Specify figure features
image_format = 'svg'
image_name = merging + '_AUC_surface_plot.' + image_format

fig, ax = plt.subplots(subplot_kw={"projection": "3d"}, figsize=(12, 12))

# Compute the r-n grid (rows follow n_values, columns follow r_values, like AUCs)
X, Y = np.meshgrid(r_values, n_values)

# Positions of the walls the contours and the maximum are projected onto
x_wall = np.min(r_values) - 1
y_wall = np.max(n_values) + 1
z_floor = 0.5

# Plot the surface.
surf = ax.plot_surface(X, Y, AUCs, cmap=cm.coolwarm, edgecolor='k', lw=0.5,
                       antialiased=False, rstride=1, cstride=1, alpha=0.5)

# Contour projections on the floor and the two side walls
ax.contour(X, Y, AUCs, zdir='z', offset=z_floor, cmap=cm.coolwarm)
ax.contour(X, Y, AUCs, zdir='x', offset=x_wall, cmap=cm.coolwarm)
ax.contour(X, Y, AUCs, zdir='y', offset=y_wall, cmap=cm.coolwarm)

# Grid position (row = n index, column = r index) of the highest AUC
max_row, max_col = np.unravel_index(np.argmax(AUCs), AUCs.shape)

# Maximum in 3D space (x, y, z). Not called `max`, to keep the builtin usable.
max_point = (X[max_row, max_col], Y[max_row, max_col], AUCs.max())

# Four copies of the maximum: rows 0-2 are moved onto the x wall, the y wall and
# the floor respectively; row 3 stays on the surface itself.
A_max = np.array([max_point]*4)
for i, v in enumerate([x_wall, y_wall, z_floor]):
    A_max[i, i] = v

# Plot max point and its projections
ax.plot(A_max[:, 0], A_max[:, 1], A_max[:, 2], marker="o", ls="", c=cm.coolwarm(1.))

# Customize plot
ax.set_zlim(z_floor, 1)
ax.set_xlim(x_wall, np.max(r_values)+1)
ax.set_ylim(np.min(n_values)-1, y_wall)

ax.zaxis.set_major_locator(LinearLocator(10))
ax.zaxis.set_major_formatter('{x:.02f}')

# Text properties may only be passed to set_xticks/set_yticks together with
# explicit labels, so the tick label size is set through tick_params instead.
ax.set_xticks(r_values)
ax.set_yticks(n_values)
ax.tick_params(axis='x', labelsize=20)
ax.tick_params(axis='y', labelsize=20)
ax.set_xlabel('r value', fontsize=20)
ax.set_ylabel('n value', fontsize=20)
ax.set_zlabel('AUC', fontsize=20)
ax.set_title('AUC surface plot for ' + merging + '-chunking', fontsize=25)

fig.colorbar(surf, ax=ax, shrink=0.5, aspect=5)

# Save the image before showing it, so the file never depends on what the
# backend does with the figure once it has been displayed.
fig.savefig(image_name, format=image_format, dpi=1200)

plt.show()

In [None]:
# ROC curve of the best performing (n, r) parameter set.
# Relies on labels, anomaly_scores, optimal_param, optimal_r_value and
# optimal_n_value from the AUC cell, and on merging and image_format from the
# surface plot cell.

# Samples of the best parameter set (first index = n, second index = r)
best_n_idx = optimal_param[0][0]
best_r_idx = optimal_param[1][0]
best_labels = labels[best_n_idx][best_r_idx]
best_scores = anomaly_scores[best_n_idx][best_r_idx]

# Compute relevant metrics (roc_curve returns false/true positive rates)
fpr, tpr, _ = metrics.roc_curve(best_labels, best_scores)
max_AUC = round(metrics.roc_auc_score(best_labels, best_scores), 5)

# Plot the ROC for the best performing parameter set
image_name = merging + '_ROC.' + image_format
fig, ax = plt.subplots(1, figsize=(12, 12))
ax.set_title('ROC for r = {} and n = {}'.format(optimal_r_value, optimal_n_value), fontsize=25)
ax.plot(fpr, tpr)
ax.plot([0, 1], ls="--")         # chance diagonal from (0, 0) to (1, 1)
ax.plot([0, 0], [1, 0], c=".7")  # grey left border (x = 0)
ax.plot([1, 1], c=".7")          # grey top border (y = 1)
ax.text(0.2, 0, "AUC=" + str(max_AUC), fontsize=25)
ax.set_ylabel('sensitivity', fontsize=20)
ax.set_xlabel('1 - specificity', fontsize=20)
ax.tick_params(axis='both', labelsize=20)

# Save the image
fig.savefig(image_name, format=image_format, dpi=1200)