In [None]:
# Mount Google Drive at /content/drive. You will be asked to authenticate with a
# Google account; that account needs some free space in its Drive storage.
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
# Copy the archive final5.zip from the mounted Google Drive ("My Drive") into /content on the Colab machine.
!cp "/content/drive/My Drive/final5.zip" "/content"

In [None]:
#unzip the folder
!unzip /content/final5.zip -d /content

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  inflating: /content/content/NAB/results/e239/realAdExchange/e239_exchange-2_cpc_results.csv  
   creating: /content/content/NAB/results/e239/artificialNoAnomaly/
  inflating: /content/content/NAB/results/e239/artificialNoAnomaly/e239_art_daily_small_noise.csv  
  inflating: /content/content/NAB/results/e239/artificialNoAnomaly/e239_art_flatline.csv  
  inflating: /content/content/NAB/results/e239/artificialNoAnomaly/e239_art_daily_no_noise.csv  
  inflating: /content/content/NAB/results/e239/artificialNoAnomaly/e239_art_noisy.csv  
  inflating: /content/content/NAB/results/e239/artificialNoAnomaly/e239_art_daily_perfect_square_wave.csv  
  inflating: /content/content/NAB/results/e239/e239_reward_low_FN_rate_scores.csv  
  inflating: /content/content/NAB/results/e239/e239_standard_scores.csv  
   creating: /content/content/NAB/results/e239/realTweets/
  inflating: /content/content/NAB/results/e239/realTweets/e239_Twitter

In [None]:
#import libraries
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from IPython.display import display
import matplotlib.dates as mdates
import json
from itertools import combinations
import shutil

In [None]:
# Make the NAB folder the current working directory so that NAB's command-line script
# (e.g. `python run.py ... --score --normalize`, used at the end of this notebook) can be run from here.
os.chdir('/content/NAB')

To create ensembles with a new algorithm using NAB data:

## 1.   Run the new algorithm on the NAB data files and, for each data file, save its results as a CSV with the following columns:

*   'timestamp' (from the original data)
*   'value' (from the original data)
*   'anomaly' (the algorithm's prediction for each row: 0 = NOT an anomaly, 1 = anomaly)
*   'label' (containing the provided ground truth label, 0 or 1)

## 2.   Save these result files to the appropriate Colab directory, following the naming convention below:
*   Directory: "/content/NAB/results_with_predictions/" + *algorithmName* + "/" + *dataFolderName*, where *algorithmName* is a name of your choice with NO SPACES, UNDERSCORES, OR SPECIAL CHARACTERS, and *dataFolderName* is exactly the name of the original NAB data folder that the data file came from (e.g. "artificialNoAnomaly")
*   File Name: *algorithmName* + "_" + original NAB data file name. If your algorithm is called "algorithm", and this csv contains the results of running it on the NAB file called "art_daily_no_noise.csv", then your results file name would be "algorithm_art_daily_no_noise.csv" 

## 3.   Assign *algorithmName*, as a string, to the variable below:





 





In [None]:
# Name of the new algorithm that is added to every ensemble. It must be non-empty and must
# match both the folder name under /content/NAB/results_with_predictions/ and the prefix of
# the result file names. Do not use spaces, underscores or special characters: a later cell
# strips the algorithm prefix by splitting each file name at its first underscore.
new_algorithm_name = ""

## 4.   Run the code below:



In [None]:
# Build every three-member ensemble: each unordered pair of the 16 baseline detectors plus
# the new algorithm. `ensembles` maps an integer id to the tuple of member names; later
# cells prefix the id with "e" to name the ensemble's result folder and files.

# Fail early with a clear message: an empty or malformed name would otherwise be baked into
# every ensemble and only surface later as an obscure error while result files are copied.
if not new_algorithm_name or not new_algorithm_name.isalnum():
  raise ValueError(
      "Set new_algorithm_name to a non-empty name without spaces, underscores or "
      "special characters before running this cell (got %r)." % (new_algorithm_name,))

algorithm_list = ["ARTime","bayesChangePt","contextOSE","earthgeckoSkyline","expose","htmjava","knncad","null","numenta","numentaTM","random","randomCutForest","relativeEntropy","skyline","twitterADVec","windowedGaussian"]

# First id given to the new ensembles.
# NOTE(review): presumably ids below 559 are already used by existing ensembles -- confirm.
FIRST_ENSEMBLE_ID = 559

ensemble_list = list(combinations(algorithm_list, 2))
ensembles = {
    FIRST_ENSEMBLE_ID + offset: pair + (new_algorithm_name,)
    for offset, pair in enumerate(ensemble_list)
}

# Show a short summary rather than dumping the whole dictionary.
print(len(ensembles), "ensembles created, ids", min(ensembles), "to", max(ensembles))

{559: ('ARTime', 'bayesChangePt', ''), 560: ('ARTime', 'contextOSE', ''), 561: ('ARTime', 'earthgeckoSkyline', ''), 562: ('ARTime', 'expose', ''), 563: ('ARTime', 'htmjava', ''), 564: ('ARTime', 'knncad', ''), 565: ('ARTime', 'null', ''), 566: ('ARTime', 'numenta', ''), 567: ('ARTime', 'numentaTM', ''), 568: ('ARTime', 'random', ''), 569: ('ARTime', 'randomCutForest', ''), 570: ('ARTime', 'relativeEntropy', ''), 571: ('ARTime', 'skyline', ''), 572: ('ARTime', 'twitterADVec', ''), 573: ('ARTime', 'windowedGaussian', ''), 574: ('bayesChangePt', 'contextOSE', ''), 575: ('bayesChangePt', 'earthgeckoSkyline', ''), 576: ('bayesChangePt', 'expose', ''), 577: ('bayesChangePt', 'htmjava', ''), 578: ('bayesChangePt', 'knncad', ''), 579: ('bayesChangePt', 'null', ''), 580: ('bayesChangePt', 'numenta', ''), 581: ('bayesChangePt', 'numentaTM', ''), 582: ('bayesChangePt', 'random', ''), 583: ('bayesChangePt', 'randomCutForest', ''), 584: ('bayesChangePt', 'relativeEntropy', ''), 585: ('bayesChangePt

In [None]:
# Build the result files of every ensemble by summing the votes of its members.
# For each ensemble e<id> and each member algorithm, every prediction file
#   /content/NAB/results_with_predictions/<algorithm>/<dataFolder>/<algorithm>_<name>
# is combined into
#   /content/NAB/results/e<id>/<dataFolder>/e<id>_<name>
# The first member that provides a file is copied as-is; the 'anomaly' column of every
# later member is added to it, so 'anomaly' ends up as the sum of the members' predictions.
predictions_root = "/content/NAB/results_with_predictions"
results_root = "/content/NAB/results"

# Destination files already written during THIS run. A destination that is not in this set
# is (re)created from scratch, so files left over from a previous run are replaced instead
# of being added to again -- re-running the cell does not double-count votes.
written_files = set()

for ensemble_key, ensemble_value in ensembles.items():
  ensemble_name = "e" + str(ensemble_key)
  for algorithm in ensemble_value:
    if not algorithm:
      # An empty name would make source_path the predictions root itself.
      raise ValueError(
          "Ensemble " + ensemble_name + " contains an empty algorithm name; "
          "set new_algorithm_name and rebuild `ensembles` first.")
    source_path = predictions_root + "/" + algorithm
    for data_folder in os.listdir(source_path):
      source_filepath = source_path + "/" + data_folder
      if not os.path.isdir(source_filepath):
        continue  # stray file next to the data folders
      write_path = results_root + "/" + ensemble_name + "/" + data_folder
      os.makedirs(write_path, exist_ok=True)
      for file in os.listdir(source_filepath):
        # Drop the "<algorithm>_" prefix. Files without an underscore do not follow the
        # naming convention and are skipped (they used to raise IndexError).
        algorithm_prefix, separator, shortened_file_name = file.partition('_')
        if not separator:
          continue
        source_full_path = source_filepath + "/" + file
        target_full_path = write_path + "/" + ensemble_name + "_" + shortened_file_name
        if target_full_path not in written_files:
          # First member to provide this file in this run: start from a plain copy.
          shutil.copy(source_full_path, target_full_path)
          written_files.add(target_full_path)
        else:
          # Later member: add its predictions to the running total (rows are matched by position).
          df_existing = pd.read_csv(target_full_path)
          df_to_add = pd.read_csv(source_full_path)
          df_existing['anomaly'] += df_to_add['anomaly']
          # index=False: otherwise every rewrite adds another unnamed index column to the CSV.
          df_existing.to_csv(target_full_path, index=False)

print(len(written_files), "ensemble result files written")

/content/content/NAB/results_with_predictions/ARTime/artificialNoAnomaly
ARTime_art_noisy.csv
/content/content/NAB/results_with_predictions/ARTime/artificialNoAnomaly
ARTime_art_daily_small_noise.csv
/content/content/NAB/results_with_predictions/ARTime/artificialNoAnomaly
ARTime_art_daily_perfect_square_wave.csv
/content/content/NAB/results_with_predictions/ARTime/artificialNoAnomaly
ARTime_art_daily_no_noise.csv
/content/content/NAB/results_with_predictions/ARTime/artificialNoAnomaly
ARTime_art_flatline.csv
/content/content/NAB/results_with_predictions/ARTime/artificialWithAnomaly
ARTime_art_daily_jumpsup.csv
/content/content/NAB/results_with_predictions/ARTime/artificialWithAnomaly
ARTime_art_daily_jumpsdown.csv
/content/content/NAB/results_with_predictions/ARTime/artificialWithAnomaly
ARTime_art_daily_flatmiddle.csv
/content/content/NAB/results_with_predictions/ARTime/artificialWithAnomaly
ARTime_art_load_balancer_spikes.csv
/content/content/NAB/results_with_predictions/ARTime/artif

IndexError: ignored

In [None]:
# Rewrite every ensemble result file in place so that it contains exactly the columns
# 'timestamp', 'value', 'anomaly_score' and 'label', where 'anomaly_score' is the former
# 'anomaly' column.
results_dir = "/content/NAB/results"  # not named `dir`, which would shadow the builtin
for ensemble_key in ensembles.keys():
  # Ensemble folders are named "e<id>"; the keys of `ensembles` are plain integers, so the
  # name has to be built explicitly (concatenating the integer key raised TypeError).
  subdir = os.path.join(results_dir, "e" + str(ensemble_key))
  for data_folder in os.listdir(subdir):
    sub_subdir = os.path.join(subdir, data_folder)
    if not os.path.isdir(sub_subdir):
      continue  # only the data folders are processed, not files lying next to them
    for file in os.listdir(sub_subdir):
      full_path = os.path.join(sub_subdir, file)
      df = pd.read_csv(full_path)
      if "anomaly" not in df.columns and "anomaly_score" in df.columns:
        continue  # already converted by an earlier run of this cell
      # Selecting the four columns explicitly also drops any extra column in the file.
      df_to_write = df[["timestamp", "value", "anomaly", "label"]].rename(
          columns={"anomaly": "anomaly_score"})
      df_to_write.to_csv(full_path, index=False)

In [None]:
# Register a detection threshold for every ensemble, under each of the three scoring
# profiles, in NAB's thresholds configuration file.
thresholds_path = '/content/NAB/config/thresholds.json'

# Start from the thresholds already on disk: `thresholds` was never defined before this
# cell (NameError), and the file is rewritten below, so anything not loaded here is lost.
try:
  with open(thresholds_path) as fp:
    thresholds = json.load(fp)
except FileNotFoundError:
  thresholds = {}

# Each ensemble has three members and its score is the sum of their 0/1 predictions, so
# 1.5 separates "at most one member voted anomaly" from "at least two members voted anomaly".
# NOTE(review): presumably NAB flags rows whose score reaches the threshold -- confirm.
ENSEMBLE_THRESHOLD = 1.5
SCORING_PROFILES = ('standard', 'reward_low_FN_rate', 'reward_low_FP_rate')

for ensemble_key in ensembles:
  thresholds["e" + str(ensemble_key)] = {
      profile: {'threshold': ENSEMBLE_THRESHOLD} for profile in SCORING_PROFILES
  }

with open(thresholds_path, 'w') as fp:
    json.dump(thresholds, fp)



## 5.   Run the following command, replacing "e560 e561" with the names of all your ensembles ("e" followed by each key in `ensembles.keys()`, e.g. "e559"), separated by single spaces:



In [None]:
# Run NAB's run.py with the --score and --normalize steps for the names listed after -d.
# "e560 e561" is only an example: list every ensemble name, i.e. "e" + str(key) for each
# key in `ensembles`, separated by single spaces. The list can be generated with
# " ".join("e" + str(key) for key in ensembles).
!python run.py -d e560 e561 --score --normalize