In [1]:
from itertools import combinations
from functools import reduce
from pyspark.sql.functions import col, when, concat, concat_ws, collect_list
from pyspark.sql.types import StringType
import pickle
import random

In [2]:
# Load every parquet file matching the 14-11-18 pattern into one DataFrame;
# mergeSchema is enabled so files with differing column sets are unified.
traceReader = spark.read.option('mergeSchema', 'true')
traces = traceReader.parquet('data/14-11-18*.parquet')

In [3]:
# Column holding the latency of the frontend request that the SLA is defined on.
frontend = 'web-service_HomeControllerHome_avg_dur'

# Every '*avg_self_dur' column is treated as a backend metric; 'traceId' is
# excluded explicitly.
backends = [name
            for name in traces.columns
            if name.endswith('avg_self_dur') and name != 'traceId']

In [4]:
# SLA threshold = 99th percentile of the frontend column.
# approxQuantile returns one list of quantiles per requested column, hence [0][0].
# NOTE(review): relativeError is 0, which per the Spark docs requests the exact
# quantile and can be expensive on large data - confirm this is intended.
frontendQuantiles = traces.approxQuantile([frontend], [0.99], 0)
frontendSLA = frontendQuantiles[0][0]

In [5]:
# Per-backend maximum and minimum, later used for min-max normalization and
# for mapping normalized thresholds back to raw values.
# rdd.max()/min() return a Row, so [0] unwraps the single selected column.
maxs, mins = {}, {}
for backendColumn in backends:
    columnRows = traces.select(backendColumn).rdd
    maxs[backendColumn] = columnRows.max()[0]
    mins[backendColumn] = columnRows.min()[0]

In [6]:
def _minMaxScale(df, c):
    # Replace column c by (value - min) / (max - min), using the precomputed
    # per-column extrema in `mins` / `maxs`.
    span = maxs[c] - mins[c]
    shifted = col(c) - mins[c]
    # A constant column has span == 0. Dividing by it would not produce a
    # usable score (Spark yields NULL, or an error under ANSI mode), so keep
    # the shifted value instead, which is 0 for every row of such a column.
    return df.withColumn(c, shifted / span if span != 0 else shifted)


# Min-max scale every backend column; all other columns (including the
# frontend column) are carried over unchanged.
normalizedTrace = reduce(_minMaxScale, backends, traces)

In [7]:
from sklearn.metrics import precision_recall_curve

thresholdsDict = {}

# Fetch the frontend column and every normalized backend column in ONE collect
# so that labels and scores come from the same Row objects. The previous
# version paired rows from separate collect() calls by position, which relies
# on every job returning rows in the same order, and ran one job per backend.
# normalizedTrace only rewrites the backend columns, so its frontend column
# holds the same values as traces[frontend].
# NOTE(review): this holds all selected columns in driver memory at once;
# the per-column version held them one at a time - confirm the data fits.
collectedRows = normalizedTrace.select(frontend, *backends).collect()

# Binary label per trace: 1 when the frontend latency violates the SLA.
y = [1 if row[0] > frontendSLA else 0
     for row in collectedRows]

# Column 0 is the frontend; backend i sits at position i + 1.
for position, aBackend in enumerate(backends, start=1):
    scores = [row[position] for row in collectedRows]
    # precision_recall_curve returns (precision, recall, thresholds); only the
    # candidate thresholds are kept. Indexing avoids assigning to `_`, which
    # would shadow IPython's last-output variable.
    thresholdsDict[aBackend] = precision_recall_curve(y, scores)[2]

In [8]:
# Keep only the first k candidate thresholds of every backend.
# NOTE(review): the following cell (also numbered In [8]) reduces the same
# dictionary again with a different k - presumably only one of the two
# reductions is meant to be active; confirm.
k = 27
thresholdsDict.update({backendColumn: thresholdsDict[backendColumn][:k]
                       for backendColumn in backends})

In [8]:
# Thin out the candidate thresholds of every backend by keeping every
# stepSize-th entry. Because the stride is an integer division, the result can
# contain somewhat more than k thresholds.
k=10
for aBackend in backends:
    # A backend with fewer than k thresholds would give a stride of 0, and a
    # slice step of 0 raises ValueError; clamp to 1 to keep all of them.
    stepSize = max(1, len(thresholdsDict[aBackend])//k)
    thresholdsDict[aBackend] = thresholdsDict[aBackend][::stepSize]

In [9]:
def withColumnsTpFp(df, indexThresholdPair):
    """Add the 'tp-<i>' and 'fp-<i>' flag columns for one threshold.

    indexThresholdPair is an (index, threshold) tuple, as produced by
    enumerate(). Both new columns hold the strings '1' or '0':
      * 'tp-<i>' is '1' when the backend value reaches the threshold and the
        frontend exceeds the SLA;
      * 'fp-<i>' is '1' when the backend value reaches the threshold and the
        frontend stays within the SLA.

    The backend column is taken from the module-level name `aBackend`, so the
    function must be called while that name refers to the backend of interest.
    """
    index, threshold = indexThresholdPair
    backendFires = col(aBackend) >= threshold
    slaViolated = col(frontend) > frontendSLA
    slaMet = col(frontend) <= frontendSLA

    truePositive = when(backendFires & slaViolated, '1').otherwise('0')
    falsePositive = when(backendFires & slaMet, '1').otherwise('0')

    withTp = df.withColumn('tp-%d' % index, truePositive)
    return withTp.withColumn('fp-%d' % index, falsePositive)

In [10]:
%%time
# cache[(backend, thresholdIndex)] -> (tpBits, fpBits)
# Each value is a pair of Python ints whose binary digits are the per-trace
# '1'/'0' flags produced by withColumnsTpFp, concatenated over all rows.
# NOTE(review): computePrecRec later ANDs these integers across backends, which
# presumes collect_list visits the rows in the same order in every per-backend
# job - confirm, or sort on a stable key first.
cache = {}
for aBackend in backends:
    thresholdCount = len(thresholdsDict[aBackend])

    # Append one tp-/fp- column pair per candidate threshold of this backend.
    flagged = normalizedTrace
    for indexThresholdPair in enumerate(thresholdsDict[aBackend]):
        flagged = withColumnsTpFp(flagged, indexThresholdPair)

    # All tp columns first, then all fp columns; each is collapsed into a
    # single string of flags by a global (key-less) aggregation.
    flagNames = ['tp-%d' % i for i in range(thresholdCount)]
    flagNames.extend('fp-%d' % i for i in range(thresholdCount))
    bitAggregates = [concat_ws("", collect_list(col(flagName))).alias(flagName)
                     for flagName in flagNames]

    bitStrings = flagged.groupBy().agg(*bitAggregates).collect()[0]

    # Parse the flag strings as base-2 integers so they can be AND-ed cheaply.
    for i in range(thresholdCount):
        cache[aBackend, i] = (int(bitStrings['tp-%d' % i], 2),
                              int(bitStrings['fp-%d' % i], 2))

CPU times: user 821 ms, sys: 229 ms, total: 1.05 s
Wall time: 10.6 s


In [11]:
# Number of traces whose frontend latency exceeds the SLA; used as the recall
# denominator in computePrecRec.
slaViolation = col(frontend) > frontendSLA
posCount = normalizedTrace.where(slaViolation).count()

def computePrecRec(chromosome):
    """Return (precision, recall) of the rule encoded by `chromosome`.

    `chromosome` holds one threshold index per entry of `backends`, in the
    same order. For every (backend, index) pair the precomputed true-positive
    and false-positive bit masks are looked up in `cache` and AND-ed together,
    so a trace counts only if every backend's threshold is reached.

    Precision is 0 when nothing is retrieved, and recall is 0 when there are
    no positive traces at all (posCount == 0), instead of raising
    ZeroDivisionError.

    Raises IndexError for an empty chromosome and KeyError for a
    (backend, index) pair that is missing from `cache`.
    """
    genes = list(zip(backends, chromosome))

    # Bitwise AND across all backends, seeded with the first backend's mask.
    tpBit = reduce(lambda bits, gene: bits & cache[gene][0],
                   genes[1:],
                   cache[genes[0]][0])
    fpBit = reduce(lambda bits, gene: bits & cache[gene][1],
                   genes[1:],
                   cache[genes[0]][1])

    # The number of set bits is the number of traces flagged by every backend.
    truePosCount = bin(tpBit).count("1")
    falsePosCount = bin(fpBit).count("1")

    retrieved = truePosCount + falsePosCount
    precision = truePosCount/retrieved if retrieved > 0 else 0
    recall = truePosCount/posCount if posCount > 0 else 0

    return precision, recall

In [12]:
def computeFMeasure(chromosome):
    """Return the F-measure (harmonic mean of precision and recall).

    Precision and recall come from computePrecRec(chromosome). When neither
    of them is positive the result is 0, which avoids dividing by zero.
    """
    precision, recall = computePrecRec(chromosome)
    if not (precision > 0 or recall > 0):
        return 0
    return 2*(precision*recall)/(precision+recall)

In [13]:
import random, copy
from deap import base, creator, tools, algorithms

In [14]:
# Fitness configuration: two objectives (precision, recall), both with weight
# 1.0, selected with NSGA-II.
# NOTE(review): several cells rebind Fitness / evaluate / selection; whichever
# was executed last is the one in effect - presumably only one should be run.
creator.create("Fitness", base.Fitness, weights=(1.0,1.0))


def evaluate(ind):
    # Fitness tuple = (precision, recall) of the individual.
    return computePrecRec(ind)


selection = dict(function=tools.selNSGA2)

In [40]:
# Fitness configuration: single objective (F-measure) with weight 1.0,
# selected by tournaments of size 3.
# NOTE(review): several cells rebind Fitness / evaluate / selection; whichever
# was executed last is the one in effect - presumably only one should be run.
creator.create("Fitness", base.Fitness, weights=(1.0,))


def evaluate(ind):
    # Fitness values must be a tuple, hence the one-element tuple.
    return (computeFMeasure(ind),)


selection = dict(function=tools.selTournament, tournsize=3)

In [31]:
# Fitness configuration: two objectives with weight 1.0 each - the F-measure
# and the number of genes equal to 0 - selected with NSGA-II.
# NOTE(review): several cells rebind Fitness / evaluate / selection; whichever
# was executed last is the one in effect - presumably only one should be run.
creator.create("Fitness", base.Fitness, weights=(1.0,1.0))


def evaluate(ind):
    # Fitness tuple = (F-measure, count of zero genes).
    zeroGenes = ind.count(0)
    return (computeFMeasure(ind), zeroGenes)


selection = dict(function=tools.selNSGA2)



In [46]:
# Fitness configuration: three objectives with weight 1.0 each - precision,
# recall and the number of genes equal to 0 - selected with NSGA-II.
# NOTE(review): several cells rebind Fitness / evaluate / selection; whichever
# was executed last is the one in effect - presumably only one should be run.
creator.create("Fitness", base.Fitness, weights=(1.0,1.0,1.0))


def evaluate(ind):
    # Fitness tuple = (precision, recall, count of zero genes).
    precision, recall = computePrecRec(ind)
    return (precision, recall, ind.count(0))


selection = dict(function=tools.selNSGA2)



In [42]:
# Toolbox that will hold the generators and genetic operators registered below.
toolbox = base.Toolbox()

# An individual is a plain list of genes carrying a `fitness` attribute of the
# currently defined creator.Fitness class.
creator.create("Individual", list, fitness=creator.Fitness)

In [43]:
def _zeroGene(low, high):
    # Gene generator that ignores its bounds and always yields index 0.
    # The original cell carried `random.randint` as a commented-out
    # alternative, which would draw a random index in [low, high] instead.
    return 0


# Register one gene generator per backend under the backend's own name, bound
# to the valid index range of that backend's threshold list.
for aBackend in backends:
    lastThresholdIndex = len(thresholdsDict[aBackend]) - 1
    toolbox.register(aBackend, _zeroGene, 0, lastThresholdIndex)

In [44]:
# The per-backend gene generators registered on the toolbox, in `backends`
# order; they are looked up in the instance dictionary because the backend
# names are arbitrary strings.
geneGenerators = tuple(vars(toolbox)[b] for b in backends)

# An individual is built by cycling once through the gene generators.
toolbox.register("individual",
                 tools.initCycle,
                 creator.Individual,
                 geneGenerators)

# A population is a list of individuals; its size is supplied at call time.
toolbox.register("population",
                 tools.initRepeat,
                 list,
                 toolbox.individual)

In [45]:
# Per-gene bounds for mutation: gene i is an index into the threshold list of
# backends[i], so it ranges over [0, len(thresholds) - 1].
geneCount = len(backends)
lowerBounds = [0]*geneCount
upperBounds = [len(thresholdsDict[b]) - 1 for b in backends]

# Two-point crossover; uniform integer mutation with a per-gene probability of
# 1/geneCount.
toolbox.register("mate", tools.cxTwoPoint)
toolbox.register("mutate",
                 tools.mutUniformInt,
                 low = lowerBounds,
                 up = upperBounds,
                 indpb = 1.0/geneCount)


In [46]:
# Hook up the evaluation function and selection operator chosen in the fitness
# configuration cell. `selection` carries the operator under the key
# 'function' plus any extra keyword arguments for it.
toolbox.register("evaluate", evaluate)
toolbox.register("select", **selection)

In [47]:
# EA hyper-parameters, stored as plain attributes on the toolbox so that runEA
# can read them: population size, number of generations, mutation probability.
vars(toolbox).update(pop_size=100,
                     max_gen=200,
                     mut_prob=0.2)

In [48]:
def runEA(toolbox, stats=None, verbose=False, seed=None):
    """Run a (mu + lambda) evolutionary algorithm configured by `toolbox`.

    Parameters
    ----------
    toolbox : object providing `population`, `select` and the attributes
        `pop_size`, `max_gen` and `mut_prob`, plus whatever operators
        algorithms.eaMuPlusLambda requires.
    stats, verbose : forwarded unchanged to algorithms.eaMuPlusLambda.
    seed : optional seed for Python's `random` module. When given, the
        generator is seeded before the initial population is created so a run
        can be repeated; when None (the default) the generator is untouched.
        NOTE(review): this assumes all registered operators draw from the
        `random` module - confirm for any custom operators.

    Returns
    -------
    Whatever algorithms.eaMuPlusLambda returns (the call site unpacks it as a
    two-element result whose first element is the final population).
    """
    if seed is not None:
        random.seed(seed)

    pop = toolbox.population(n=toolbox.pop_size)
    # NOTE(review): select is applied before any fitness has been evaluated;
    # presumably this mirrors the NSGA-II recipe that initialises crowding
    # distances - confirm it is intended for the tournament configuration.
    pop = toolbox.select(pop, len(pop))

    # Crossover and mutation probabilities are complementary.
    return algorithms.eaMuPlusLambda(pop, toolbox,
                                     mu=toolbox.pop_size,
                                     lambda_=toolbox.pop_size,
                                     cxpb=1-toolbox.mut_prob,
                                     mutpb=toolbox.mut_prob,
                                     stats=stats,
                                     ngen=toolbox.max_gen,
                                     verbose=verbose)

In [49]:
# runEA returns a two-element result; `res` is the final population.
# The second element is bound to a real name rather than `_`, because
# assigning to `_` shadows IPython's "last output" variable for the rest of
# the session.
res, logbook = runEA(toolbox)

In [24]:
def denormalize(ind):
    """Translate an individual into (raw threshold, label) pairs.

    `ind` holds one threshold index per entry of `backends`. Genes equal to 0
    are left out of the result. For every other gene the normalized threshold
    is mapped back to the column's original scale with the stored min/max,
    and the label is the second '_'-separated token of the column name.

    The previous version also called .replace('avg_self_dur', '') on that
    token; a token produced by split('_') contains no underscore, so the call
    could never match and has been removed. The output is unchanged.
    """
    pairs = []
    for t, b in zip(ind, backends):
        if t > 0:
            rawThreshold = thresholdsDict[b][t] * (maxs[b] - mins[b]) + mins[b]
            pairs.append((rawThreshold, b.split('_')[1]))
    return pairs

In [38]:
# Score every individual of the final population as
# (F-measure, precision, recall, denormalized thresholds), ordered by
# ascending F-measure.
scoredIndividuals = []
for ind in res:
    scoredIndividuals.append(
        (computeFMeasure(ind), *computePrecRec(ind), denormalize(ind)))
solutions = sorted(scoredIndividuals, key=lambda entry: entry[0])

# Print one report per individual (duplicates are not removed).
for fScore, precision, recall, thresholds in solutions:
    print (thresholds)
    print('F-score: ',fScore)
    print('Precision:',precision, '\t Recall: ', recall, '\n\n')
    

[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F-score:  0.029766382693207003
Precision: 0.01510804730552517 	 Recall:  1.0 


[]
F

In [50]:
# Same report as the earlier solutions cell: each individual of `res` becomes
# (F-measure, precision, recall, denormalized thresholds), and the list is
# ordered by ascending F-measure before printing.
solutions = [(computeFMeasure(ind), *computePrecRec(ind), denormalize(ind))
             for ind in res]
solutions.sort(key=lambda entry: entry[0])

for entry in solutions:
    fScore, precision, recall, thresholds = entry
    print (thresholds)
    print('F-score: ',fScore)
    print('Precision:',precision, '\t Recall: ', recall, '\n\n')

[(1800.0000000000002, 'get'), (276232.0, 'HomeControllerHome')]
F-score:  0.44863883847549907
Precision: 0.30548690064260997 	 Recall:  0.8442622950819673 


[(1800.0000000000002, 'get'), (276232.0, 'HomeControllerHome')]
F-score:  0.44863883847549907
Precision: 0.30548690064260997 	 Recall:  0.8442622950819673 


[(1800.0000000000002, 'get'), (276232.0, 'HomeControllerHome')]
F-score:  0.44863883847549907
Precision: 0.30548690064260997 	 Recall:  0.8442622950819673 


[(1800.0000000000002, 'get'), (276232.0, 'HomeControllerHome')]
F-score:  0.44863883847549907
Precision: 0.30548690064260997 	 Recall:  0.8442622950819673 


[(1800.0000000000002, 'get'), (276232.0, 'HomeControllerHome')]
F-score:  0.44863883847549907
Precision: 0.30548690064260997 	 Recall:  0.8442622950819673 


[(1800.0000000000002, 'get'), (276232.0, 'HomeControllerHome')]
F-score:  0.44863883847549907
Precision: 0.30548690064260997 	 Recall:  0.8442622950819673 


[(1800.0000000000002, 'get'), (276232.0, 'HomeContro