In [14]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [15]:
from sklearn.metrics import accuracy_score, precision_score, recall_score
from typing import Iterable, Set, Tuple
import pandas as pd
import glob
import os
import re
import logging

log = logging.getLogger("debug")
log.setLevel(logging.DEBUG)

In [16]:
## constants
results_folder = "/home/pablo/results/ds_nosql_sample_50_2" 
cleanup_base_dir = "/tmp"

## Cargando los resultados de evaluar las quieres de evaluacion

In [17]:
# Take as input results folder, and assume that there exists a `v0` and `worse` folder with the evaluation results
v0_dir = "v0/"
boost_dir = "worse/"

results_boost = {}
results_v0 = {}

for f in glob.glob(os.path.join(results_folder, boost_dir, "*.csv")):
    results_boost[os.path.basename(f)] = pd.read_csv(f)

for f in glob.glob(os.path.join(results_folder, v0_dir, "*.csv")):
    results_v0[os.path.basename(f)] = pd.read_csv(f)

print("Read total %s dataframes" % (len(results_boost) + len(results_v0)))

Read total 100 dataframes


In [18]:
# Post-processing required for results:
# - remove the leading /tesis/tmp/*/ from the filePathSource and filePathSink columns
# - drop score
# - drop the origin worse results

# Post-processing results
# Clean filePath* columns
def cleanup_filePath_col(value: str) -> str:
    # /tesis/tmp/78fi2yky
    leading_replace_pat = "%s/[a-zA-Z0-9]+/" % (cleanup_base_dir)
    return re.sub(leading_replace_pat, "", value)

# This bit below will cleanup the /tesis/tmp/[a-zA-Z0-9]+ bit of the path column in the results
for df in results_boost.values():
    # Cleanup filePath* columns
    df['filePathSource'] = df['filePathSource'].map(cleanup_filePath_col)
    df['filePathSink'] = df['filePathSink'].map(cleanup_filePath_col)

for df in results_v0.values():
    df['filePathSource'] = df['filePathSource'].map(cleanup_filePath_col)
    df['filePathSink'] = df['filePathSink'].map(cleanup_filePath_col)

# utility function used for hashing each row in a consistent way, so they
# can be operated in sets
def hash_tuple(x: Tuple):
    return "#".join([
        str(v) for v in x
    ])

# Now, generate the a set for worse, v0 and boosted alone
v0 = set()
worse = set()
boosted = set()

# cleanup of each dataframe, dropping columns that will not give repeatable results between
# evaluation sets.
def cleanup(df):
    return df.drop(["score", "origin", "source", "sink"], axis=1)

def calculate_df_intersect(a,b):
    sa = set(cleanup(a).apply(lambda x: hash_tuple(tuple(x)), axis=1))
    sb = set(cleanup(b).apply(lambda x: hash_tuple(tuple(x)), axis=1))
    return sa.intersection(sb)

# Calculate v0 set
for df in results_v0.values():
    # Drop non hashed columns
    df = cleanup(df)
    # hash and add to set
    v0 = v0 | set(df.apply(lambda x: hash_tuple(tuple(x)), axis=1))

# Calculate boosted
for df in results_boost.values():
    # filter out boosted
    df = df[df['origin'] == 'boosted']
    df = cleanup(df)
    boosted = boosted | set(
        df.apply(lambda x: hash_tuple(tuple(x)), axis=1))

# Calculate worse
for df in results_boost.values():
    # filter out boosted
    df = df[df['origin'] == 'worse']
    df = cleanup(df)
    worse = worse | set(df.apply(lambda x: hash_tuple(tuple(x)), axis=1))

all_results = v0 | worse | boosted

print("Result sets sizes: Worse %d, Boosted %d, V0 %d, All %d" %
         (len(worse), len(boosted), len(v0), len(all_results)))

Result sets sizes: Worse 394, Boosted 379, V0 94, All 420


In [19]:
[k for k in results_boost.keys() if k.startswith('cnodejs_nodeclub')]

[]

## Analizo para db los resultados de las evaluaciones en worse, boosted y v0

In [26]:
counts = []
for n in results_boost.keys():
    intersect_size = len(calculate_df_intersect(results_boost[n], results_v0[n]))
    r_boost = results_boost[n]
    total_worse = r_boost[r_boost['origin'] == 'worse'].shape[0]
    total_boosted = r_boost[r_boost['origin'] == 'boosted'].shape[0]
    counts.append([n, total_worse, total_boosted, results_v0[n].shape[0], intersect_size])
df = pd.DataFrame(counts, columns=["name", "worse rows", "boosted rows", "v0 rows", "intersect"])
df
# df2 = df[df['intersect']>5]
# for e in df2['name']:
#     print(e)

Unnamed: 0,name,worse rows,boosted rows,v0 rows,intersect
0,bergwhite_nchat_5442684.csv,1,1,0,0
1,HashBrownCMS_hashbrown-cms_b987436.csv,0,0,0,0
2,luin_ioredis_0db527f.csv,0,0,0,0
3,kuzzleio_kuzzle_4eb8ec7.csv,0,0,0,0
4,linnovate_mean_5d0a693.csv,0,0,0,0
5,Binomi0_crm-backoffice-api_b63a84b.csv,14,14,8,0
6,nosplashurinal_homeaway-mongo_af285fd.csv,2,0,1,1
7,oswinso_kids4kids_3e51084.csv,13,13,6,6
8,linagora_openpaas-esn_dc4cb4a.csv,48,42,7,6
9,18F_checklistomania_9865774.csv,8,4,3,3


In [21]:
df[df["worse rows"]==0].shape

(16, 5)

In [22]:
# set(results_v0['FurutaTakuto_infoVis2017_7b7aa8999b4233eb2b9e5fc7d3ba6488b80ed141.csv'].apply(lambda t: hash_tuple(t), axis=1))

# Helper function to make a set iterable in a repeatable order. Used for generating
# the item sets required by the score calculation functions in sklearn

def repeatable_for_each_set(s: Set[int]) -> Iterable[int]:
    return sorted(list(s))

all_ordered = repeatable_for_each_set(all_results)

y_worse = [
    int(it in worse)
    for it in all_ordered
]

y_pred = [
    int(it in (boosted))
    for it in all_ordered
]

y_true = [
    int(it in v0)
    for it in all_ordered
]

precision = precision_score(y_true, y_worse, pos_label=1)
recall = recall_score(y_true, y_worse, pos_label=1)
accuracy = accuracy_score(y_true, y_worse)
print("WORSE")
print("Score results: Precision: %.4f. Recall: %.4f. Accuracy: %.4f" % (precision, recall, accuracy))

precision = precision_score(y_true, y_pred, pos_label=1)
recall = recall_score(y_true, y_pred, pos_label=1)
accuracy = accuracy_score(y_true, y_pred)
print("BOOSTED")
print("Score results: Precision: %.4f. Recall: %.4f. Accuracy: %.4f" % (precision, recall, accuracy))

WORSE
Score results: Precision: 0.1726. Recall: 0.7234. Accuracy: 0.1619
BOOSTED
Score results: Precision: 0.1530. Recall: 0.6170. Accuracy: 0.1500


In [23]:
from sklearn.metrics import multilabel_confusion_matrix
# set(results_v0['FurutaTakuto_infoVis2017_7b7aa8999b4233eb2b9e5fc7d3ba6488b80ed141.csv'].apply(lambda t: hash_tuple(t), axis=1))

# Helper function to make a set iterable in a repeatable order. Used for generating
# the item sets required by the score calculation functions in sklearn

def repeatable_for_each_set(s: Set[int]) -> Iterable[int]:
    return sorted(list(s))

v0_prime = v0.difference(worse)
all_ordered = repeatable_for_each_set(v0_prime.union(boosted))

print("# of elements in worse: %d" % (len(worse)))
print("# of elements in v0: %d" % (len(v0)))
print("# of elements in v0Prime: %d" % (len(v0_prime)))
print("# of elements in boosted: %d" % (len(boosted)))

alerts_to_recover = len(v0_prime)

# tp son los elementos que estan en v0_prime, es decir que son alertas encontradas en v0, y tambien en boosted
tp = len(v0_prime & boosted)

# fp son los elementos que estan en v0_prime, es decir que son alertas encontradas en v0, y NO en boosted
fp = len(boosted - v0_prime)

# fn son las alertas que estan en boosted, pero no en v0_prime, lo cual quiere decir que no eran alertas reales
fn = len(v0_prime - boosted)

print("alerts to recover: %d" % (alerts_to_recover))
print("vanilla tp: %d" % (tp))
print("vanilla fp: %d" % (fp))
print("vanilla fn: %d" % (fn))

print("vanilla precision: %.2f" % (tp/(tp+fp)))
print("vanilla recall: %.2f" % (tp/(tp+fn)))

y_pred = [
    int(it in (boosted))
    for it in all_ordered
]

y_true = [
    int(it in v0_prime)
    for it in all_ordered
]

m = multilabel_confusion_matrix(y_true, y_pred)
print("tn, fp")
print("fn, tp")
print("")
print(m)

precision = precision_score(y_true, y_pred, pos_label=1)
recall = recall_score(y_true, y_pred, pos_label=1)
accuracy = accuracy_score(y_true, y_pred)
print("BOOSTED")
print("Score results: Precision: %.4f. Recall: %.4f. Accuracy: %.4f" % (precision, recall, accuracy))

# of elements in worse: 394
# of elements in v0: 94
# of elements in v0Prime: 26
# of elements in boosted: 379
alerts to recover: 26
vanilla tp: 0
vanilla fp: 379
vanilla fn: 26
vanilla precision: 0.00
vanilla recall: 0.00
tn, fp
fn, tp

[[[  0.  26.]
  [379.   0.]]

 [[  0. 379.]
  [ 26.   0.]]]
BOOSTED
Score results: Precision: 0.0000. Recall: 0.0000. Accuracy: 0.0000
