In [20]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [32]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from typing import Iterable, Set, Tuple
import pandas as pd
import glob
import os
import re
import logging

log = logging.getLogger("debug")
log.setLevel(logging.DEBUG)

In [22]:
## constants
results_folder = "/home/pablo/results/bb_path_1" 
cleanup_base_dir = "/tmp"

## Cargando los resultados de evaluar las quieres de evaluacion

In [23]:
# Take as input results folder, and assume that there exists a `v0` and `worse` folder with the evaluation results
v0_dir = "v0/"
boost_dir = "worse/"

results_boost = {}
results_v0 = {}

for f in glob.glob(os.path.join(results_folder, boost_dir, "*.csv")):
    results_boost[os.path.basename(f)] = pd.read_csv(f)

for f in glob.glob(os.path.join(results_folder, v0_dir, "*.csv")):
    results_v0[os.path.basename(f)] = pd.read_csv(f)

print("Read total %s dataframes" % (len(results_boost) + len(results_v0)))

Read total 100 dataframes


In [24]:
# Post-processing required for results:
# - remove the leading /tesis/tmp/*/ from the filePathSource and filePathSink columns
# - drop score
# - drop the origin worse results

# Post-processing results
# Clean filePath* columns
def cleanup_filePath_col(value: str) -> str:
    # /tesis/tmp/78fi2yky
    leading_replace_pat = "%s/[a-zA-Z0-9]+/" % (cleanup_base_dir)
    return re.sub(leading_replace_pat, "", value)

# This bit below will cleanup the /tesis/tmp/[a-zA-Z0-9]+ bit of the path column in the results
for df in results_boost.values():
    # Cleanup filePath* columns
    df['filePathSource'] = df['filePathSource'].map(cleanup_filePath_col)
    df['filePathSink'] = df['filePathSink'].map(cleanup_filePath_col)

for df in results_v0.values():
    df['filePathSource'] = df['filePathSource'].map(cleanup_filePath_col)
    df['filePathSink'] = df['filePathSink'].map(cleanup_filePath_col)

# utility function used for hashing each row in a consistent way, so they
# can be operated in sets
def hash_tuple(x: Tuple):
    return "#".join([
        str(v) for v in x
    ])

# Now, generate the a set for worse, v0 and boosted alone
v0 = set()
worse = set()
boosted = set()

# cleanup of each dataframe, dropping columns that will not give repeatable results between
# evaluation sets.
def cleanup(df):
    return df.drop(["score", "origin", "source", "sink"], axis=1)

def calculate_df_intersect(a,b):
    sa = set(cleanup(a).apply(lambda x: hash_tuple(tuple(x)), axis=1))
    sb = set(cleanup(b).apply(lambda x: hash_tuple(tuple(x)), axis=1))
    return sa.intersection(sb)

# Calculate v0 set
for df in results_v0.values():
    # Drop non hashed columns
    df = cleanup(df)
    # hash and add to set
    v0 = v0 | set(df.apply(lambda x: hash_tuple(tuple(x)), axis=1))

# Calculate boosted
for df in results_boost.values():
    # filter out boosted
    df = df[df['origin'] == 'boosted']
    df = cleanup(df)
    boosted = boosted | set(
        df.apply(lambda x: hash_tuple(tuple(x)), axis=1))

# Calculate worse
for df in results_boost.values():
    # filter out boosted
    df = df[df['origin'] == 'worse']
    df = cleanup(df)
    worse = worse | set(df.apply(lambda x: hash_tuple(tuple(x)), axis=1))

all_results = v0 | worse | boosted

print("Result sets sizes: Worse %d, Boosted %d, V0 %d, All %d" %
         (len(worse), len(boosted), len(v0), len(all_results)))

Result sets sizes: Worse 353, Boosted 217, V0 184, All 365


In [25]:
[k for k in results_boost.keys() if k.startswith('cnodejs_nodeclub')]

[]

## Analizo para db los resultados de las evaluaciones en worse, boosted y v0

In [26]:
counts = []
for n in results_boost.keys():
    intersect_size = len(calculate_df_intersect(results_boost[n], results_v0[n]))
    r_boost = results_boost[n]
    total_worse = r_boost[r_boost['origin'] == 'worse'].shape[0]
    total_boosted = r_boost[r_boost['origin'] == 'boosted'].shape[0]
    counts.append([n, total_worse, total_boosted, results_v0[n].shape[0], intersect_size])
df = pd.DataFrame(counts, columns=["name", "worse rows", "boosted rows", "v0 rows", "intersect"])
df

Unnamed: 0,name,worse rows,boosted rows,v0 rows,intersect
0,Qard_v8-autodoc_e57f9fc1dff83e054c01202a87ca6a...,5,5,2,2
1,JWally_jsLPSolver_2f067bc134d720f1900e291f4ef4...,5,1,4,4
2,HOPLONG_17052017_983bf4170d6b76a6df0c9152926fb...,21,10,11,11
3,WebReflection_testardo_6b52853e1da25509ad5fa16...,4,2,2,2
4,alphagov_location-picker-prototype_2c59195f46f...,7,2,5,5
5,abecms_abecms_f2a52d2635be2ca3f83af13a4985b3e6...,58,56,15,10
6,byteclubfr_uncloak_eb51694605cc01f6b2dac2906d6...,1,0,1,1
7,alphagov_transformation-dependencies_054f1841d...,2,0,2,2
8,MatAtBread_nodent-demo_07fac74f6b38b106175ccc1...,5,2,1,1
9,KIDx_ACdream_516798071f31c04ee1cfe8881fe509de9...,19,6,13,13


In [27]:
df[df["worse rows"]==0].shape

(3, 5)

In [28]:
# set(results_v0['FurutaTakuto_infoVis2017_7b7aa8999b4233eb2b9e5fc7d3ba6488b80ed141.csv'].apply(lambda t: hash_tuple(t), axis=1))

# Helper function to make a set iterable in a repeatable order. Used for generating
# the item sets required by the score calculation functions in sklearn

def repeatable_for_each_set(s: Set[int]) -> Iterable[int]:
    return sorted(list(s))

all_ordered = repeatable_for_each_set(all_results)

y_worse = [
    int(it in worse)
    for it in all_ordered
]

y_pred = [
    int(it in (boosted))
    for it in all_ordered
]

y_true = [
    int(it in v0)
    for it in all_ordered
]

precision = precision_score(y_true, y_worse, pos_label=1)
recall = recall_score(y_true, y_worse, pos_label=1)
accuracy = accuracy_score(y_true, y_worse)
print("WORSE")
print("Score results: Precision: %.4f. Recall: %.4f. Accuracy: %.4f" % (precision, recall, accuracy))

precision = precision_score(y_true, y_pred, pos_label=1)
recall = recall_score(y_true, y_pred, pos_label=1)
accuracy = accuracy_score(y_true, y_pred)
print("BOOSTED")
print("Score results: Precision: %.4f. Recall: %.4f. Accuracy: %.4f" % (precision, recall, accuracy))

WORSE
Score results: Precision: 0.4873. Recall: 0.9348. Accuracy: 0.4712
BOOSTED
Score results: Precision: 0.1982. Recall: 0.2337. Accuracy: 0.1370


In [34]:
from sklearn.metrics import multilabel_confusion_matrix
# set(results_v0['FurutaTakuto_infoVis2017_7b7aa8999b4233eb2b9e5fc7d3ba6488b80ed141.csv'].apply(lambda t: hash_tuple(t), axis=1))

# Helper function to make a set iterable in a repeatable order. Used for generating
# the item sets required by the score calculation functions in sklearn

def repeatable_for_each_set(s: Set[int]) -> Iterable[int]:
    return sorted(list(s))

v0_prime = v0.difference(worse)
all_ordered = repeatable_for_each_set(v0_prime.union(boosted))

print("# of elements in worse: %d" % (len(worse)))
print("# of elements in v0: %d" % (len(v0)))
print("# of elements in v0Prime: %d" % (len(v0_prime)))
print("# of elements in boosted: %d" % (len(boosted)))

alerts_to_recover = len(v0_prime)

# tp son los elementos que estan en v0_prime, es decir que son alertas encontradas en v0, y tambien en boosted
tp = len(v0_prime & boosted)

# fp son los elementos que estan en v0_prime, es decir que son alertas encontradas en v0, y NO en boosted
fp = len(boosted - v0_prime)

# fn son las alertas que estan en boosted, pero no en v0_prime, lo cual quiere decir que no eran alertas reales
fn = len(v0_prime - boosted)

print("alerts to recover: %d" % (alerts_to_recover))
print("vanilla tp: %d" % (tp))
print("vanilla fp: %d" % (fp))
print("vanilla fn: %d" % (fn))

print("vanilla precision: %.2f" % (tp/(tp+fp)))
print("vanilla recall: %.2f" % (tp/(tp+fn)))

y_pred = [
    int(it in (boosted))
    for it in all_ordered
]

y_true = [
    int(it in v0_prime)
    for it in all_ordered
]

m = multilabel_confusion_matrix(y_true, y_pred)
print("tn, fp")
print("fn, tp")
print("")
print(m)

precision = precision_score(y_true, y_pred, pos_label=1)
recall = recall_score(y_true, y_pred, pos_label=1)
accuracy = accuracy_score(y_true, y_pred)
print("BOOSTED")
print("Score results: Precision: %.4f. Recall: %.4f. Accuracy: %.4f" % (precision, recall, accuracy))

# of elements in worse: 353
# of elements in v0: 184
# of elements in v0Prime: 12
# of elements in boosted: 217
alerts to recover: 12
vanilla tp: 0
vanilla fp: 217
vanilla fn: 12
vanilla precision: 0.00
vanilla recall: 0.00
tn, fp
fn, tp

[[[  0.  12.]
  [217.   0.]]

 [[  0. 217.]
  [ 12.   0.]]]
BOOSTED
Score results: Precision: 0.0000. Recall: 0.0000. Accuracy: 0.0000


## Analizando los resultados de todas las corridas

In [30]:
import sys
if "/home/pablo/tesis/tsm-pipeline/code" not in sys.path:
    sys.path.append("/home/pablo/tesis/tsm-pipeline/code")    

import pandas as pd

In [35]:
import os
from scripts.calculate_scores import calculate_scores

results = '/home/pablo/results'
rows = []
for d in os.listdir(results):
    rd = os.path.join(results, d)
    prec, rec, acc = calculate_scores(rd, '/tmp')
    prec2, rec2, acc2 = calculate_scores(rd, '/tmp',  use_v0_prime=False)
    rows.append([d, prec, rec, acc, prec2, rec2, acc2])
pd.DataFrame(rows, columns=['name', 'precision', 'recall', 'accuracy', 'W precision', 'W recall', 'W accuracy'])

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)
  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


Unnamed: 0,name,precision,recall,accuracy,W precision,W recall,W accuracy
0,bb_path_1,0.0,0.0,0.0,0.198157,0.233696,0.120112
1,seldonstar_1,0.0,0.0,0.0,0.0,0.0,0.0
2,seldonstar_1_re,0.0,0.0,,0.0,0.0,
3,seldonstar_nosql_1,0.0,0.0,0.0,0.0,0.0,0.0
4,seldon_xss_1,0.0,0.0,0.0,0.0,0.0,0.0
5,seldon_tainted_path_1,0.0,0.0,0.0,0.006061,0.005435,0.002874
6,seldonstar_2,0.0,0.0,0.0,0.0,0.0,0.0
7,nosql_atm_top100,0.0,0.0,0.0,0.0,0.0,0.0
8,seldonstar_1_repsize_3,0.0,0.0,0.0,0.0,0.0,0.0
9,dell_test_run,0.0,0.0,,0.0,0.0,


In [36]:
import os
from scripts.calculate_scores import _calculate_score_sets

results = '/home/pablo/results'
rows = []
for d in os.listdir(results):
    rd = os.path.join(results, d)
    v0, worse, boosted = _calculate_score_sets(rd, '/tmp')
    v0_prime = v0-worse
    alerts_to_recover = len(v0_prime)
    alerts_recovered = len(v0_prime & boosted)
    spurious_alerts = len(boosted-v0_prime)
    rows.append([d, alerts_to_recover, alerts_recovered, spurious_alerts])
pd.DataFrame(rows, columns=['name', 'alerts to recover', 'alerts recovered', 'suprious alerts'])

Unnamed: 0,name,alerts to recover,alerts recovered,suprious alerts
0,bb_path_1,12,0,217
1,seldonstar_1,436,0,79
2,seldonstar_1_re,0,0,0
3,seldonstar_nosql_1,411,0,66
4,seldon_xss_1,54,0,63
5,seldon_tainted_path_1,74,0,165
6,seldonstar_2,103,0,58
7,nosql_atm_top100,65,0,115
8,seldonstar_1_repsize_3,436,0,69
9,dell_test_run,0,0,0
