In [1]:
from itertools import combinations
from functools import reduce
from pyspark.sql.functions import col, when, lit, concat, sum as _sum
from pyspark.sql.types import StringType

In [2]:
# Load every parquet file matching data/*.parquet into one DataFrame, with
# Spark's 'mergeSchema' read option switched on.
traces = (
    spark.read
         .option('mergeSchema', 'true')
         .parquet('data/*.parquet')
)

In [3]:
# Column used further down as the target signal when counting anomalies.
frontEndCol = 'web-service_HomeControllerHome_avg_dur'

# Columns whose name ends in 'avg_self_dur' (the trace id is never a candidate).
selfDurCols = [
    name
    for name in traces.columns
    if name.endswith('avg_self_dur') and name != 'traceId'
]

# Columns whose name ends in 'avg_dur', leaving out the trace id and the
# front-end column itself.
fullDurCols = [
    name
    for name in traces.columns
    if name.endswith('avg_dur') and name not in ('traceId', frontEndCol)
]

In [4]:
# Probability handed to approxQuantile when the per-column thresholds are
# computed (0.5 asks for the median).
p = 0.5
# Backend candidate columns used by the rest of the notebook; this is an alias
# of selfDurCols, not a copy.
backendCols = selfDurCols

In [5]:
# Front-end column first, followed by every backend candidate.
cols = [frontEndCol, *backendCols]

# Map each column name to its p-quantile. approxQuantile returns one list per
# requested column (in the same order) holding one value per probability, so
# the threshold is the first entry. A relative error of 0 is requested.
anomalies = {
    name: quantiles[0]
    for name, quantiles in zip(cols, traces.approxQuantile(cols, [p], 0))
}

In [6]:
def replaceWithBinaryColumn(df,c):
    """Return ``df`` with column ``c`` overwritten by a 0/1 flag.

    The flag is 1 where the column value is strictly greater than
    ``anomalies[c]`` (the module-level threshold lookup) and 0 in every
    other case.
    """
    threshold = anomalies[c]
    isAboveThreshold = col(c) > threshold
    return df.withColumn(c, when(isAboveThreshold, 1).otherwise(0))

# Keep only the columns of interest, then turn them into 0/1 flags one column
# at a time.
selectedTraces = traces.select(cols)
binTraces = reduce(replaceWithBinaryColumn, cols, selectedTraces)

In [7]:
# Add a 'pattern' string column: the backend columns, each cast to string,
# concatenated in backendCols order onto an initial empty string.
binTracesWithPattern = binTraces.withColumn(
    'pattern',
    concat(lit(''), *[col(name).cast(StringType()) for name in backendCols]),
)

In [8]:
# For each distinct pattern, sum the front-end column into 'anomalies', sort
# by that sum in descending order and keep only patterns with a positive sum.
patterns = (
    binTracesWithPattern
    .groupBy('pattern')
    .agg(_sum(frontEndCol).alias('anomalies'))
    .orderBy(col('anomalies').desc())
    .where(col('anomalies') > 0)
)

In [9]:
def allCombinations(l):
    """Lazily yield every non-empty combination of the items in ``l``.

    Combinations come out as tuples, grouped by size from 1 up to ``len(l)``
    and, within a size, in the order produced by ``itertools.combinations``.
    An empty ``l`` yields nothing. For n items this produces 2**n - 1 tuples.
    """
    for size in range(1, len(l) + 1):
        yield from combinations(l, size)

In [10]:
# Score every non-empty combination of backend columns as a predictor of the
# front-end flag. A trace is "retrieved" when every column in the combination
# is flagged (== 1) and "relevant" when the front-end column is flagged.
# Each entry appended to res is (comb, (fmeasure, precision, recall)).
res = []
relevantCount = binTraces.filter(col(frontEndCol)==1).count()
for comb in allCombinations(backendCols):
    retreived = reduce(lambda df, c: df.filter(col(c)==1),
                  comb,
                  binTraces)
    # Count once and reuse the value as the precision denominator.
    retreivedCount = retreived.count()
    # Nothing retrieved means no true positives; skip the extra Spark job.
    tpCount = (retreived.filter(col(frontEndCol)==1).count()
               if retreivedCount else 0)
    # A zero denominator scores 0.0 instead of raising ZeroDivisionError.
    precision = tpCount/retreivedCount if retreivedCount else 0.0
    recall = tpCount/relevantCount if relevantCount else 0.0
    fmeasure = (2*(precision*recall)/(precision+recall)
                if (precision+recall) else 0.0)
    res.append((comb,(fmeasure, precision, recall)))

In [16]:
# Print each combination with its scores, best f-measure first.
for combo, (f1, prec, rec) in sorted(res, key=lambda entry: entry[1][0], reverse=True):
    print(combo)
    print('f-measure:', f1, ', precision:', prec, ', recall:', rec, '\n\n')

('web-service_HomeControllerHome_avg_self_dur',)
f-measure: 0.8251264880451112 , precision: 0.825120929167172 , recall: 0.825132046997952 


('gateway_get_avg_self_dur', 'web-service_HomeControllerHome_avg_self_dur')
f-measure: 0.5500650791031916 , precision: 0.8602001032050914 , recall: 0.4042996119435162 


('items-server_ItemsControllerFinditemrandom_avg_self_dur', 'web-service_HomeControllerHome_avg_self_dur')
f-measure: 0.5482811513959456 , precision: 0.8667802682745663 , recall: 0.40095127735259245 


('items-server_ItemsControllerFinditemsrandombyidproduct_avg_self_dur', 'web-service_HomeControllerHome_avg_self_dur')
f-measure: 0.5249398923787353 , precision: 0.8991371421100798 , recall: 0.37067478710790125 


('categories-server_CategoriesControllerGetcategory_avg_self_dur', 'web-service_HomeControllerHome_avg_self_dur')
f-measure: 0.5236127787915174 , precision: 0.8776027636040947 , recall: 0.37311361431497253 


('gateway_get_avg_self_dur',)
f-measure: 0.5039936130577486 , pr