In [1]:
from openpyxl import load_workbook, Workbook
import os
import pickle
import pandas as pd
import numpy as np
from numpy import nan as NaN
import re
import math

%run CollectForestInfo.ipynb

## Helpers for reading classification results

In [None]:
# calculate profile's maximum score in each family
def profileScoreInEachFamily(profileScoreInFamilyBTs):
    """Pick, for every process and family, the best behavior-tree score.

    Parameters
    ----------
    profileScoreInFamilyBTs : dict
        ``{process: {family: [(score, gsaLen, modelLen), ...]}}``.

    Returns
    -------
    dict
        ``{process: {family: (score, gsaLen, modelLen)}}`` holding the entry
        with the highest score; ties are resolved in favour of the larger
        ``gsaLen``. A family whose entries never beat the ``(0, 0, 0)``
        starting value keeps ``(0, 0, 0)``.
    """
    bestByProcess = dict()
    for procName, scoresByFamily in profileScoreInFamilyBTs.items():
        bestByFamily = dict()
        for family, candidates in scoresByFamily.items():
            best = (0, 0, 0)
            for candidate in candidates:
                candScore, candGsaLen, candModelLen = candidate
                bestScore, bestGsaLen, _ = best
                if float(candScore) > float(bestScore):
                    replace = True
                else:
                    # equal score: the longer alignment wins
                    replace = (float(candScore) == float(bestScore)
                               and candGsaLen > bestGsaLen)
                if replace:
                    best = (candScore, candGsaLen, candModelLen)
            bestByFamily[family] = best
        bestByProcess[procName] = bestByFamily
    return bestByProcess

In [None]:
def createScoreBox(result_data_dir, ignoreFamilys, weight_dict):
    """Build a table of each process's best score string per family.

    Every file found in ``result_data_dir`` is unpickled and read as
    ``{process: {btName: (gsaLen, commSeq)}}``. ``btName`` is split on ``'_'``:
    the first field is used as the family name and the third as the tree name.

    Parameters
    ----------
    result_data_dir : str
        Directory containing the pickled classification results. A trailing
        path separator is optional.
    ignoreFamilys : collection of str
        Family names whose behavior trees are skipped.
    weight_dict : dict
        ``{family: {tree: (weight, modelLength, memberCount)}}``.

    Returns
    -------
    pandas.DataFrame
        Indexed by process name, one ``'<family> Score'`` column per family,
        each cell formatted as ``'<gsaLen>/<modelLen>=<score>'`` where the
        score is ``gsaLen / modelLen`` truncated to two decimals.
    """
    scoreBox = pd.DataFrame()
    for resPkl in os.listdir(result_data_dir):
        # SECURITY: pickle.load can execute arbitrary code; only use this on
        # result files from a trusted source.
        with open(os.path.join(result_data_dir, resPkl), 'rb') as fHandle:
            result = pickle.load(fHandle)

        # score the profile against every behavior tree, grouped by family
        profileScoreInFamilyBTs = dict()
        for proc in result.keys():
            familyBTScores = dict()
            for bt in result[proc]:
                nameParts = bt.split('_')
                famName = nameParts[0]
                if famName in ignoreFamilys:
                    continue  # skip unwanted family
                trName = nameParts[2]
                gsaLen, _commSeq = result[proc][bt]
                _modelWeight, modelLength, modelMemberCount = weight_dict[famName][trName]

                # only trees with a long enough model and enough members count
                if modelLength > 10 and modelMemberCount > 2:
                    profileInBT = gsaLen / modelLength  # gsa ratio score for this tree
                    familyBTScores.setdefault(famName, []).append(
                        (profileInBT, gsaLen, modelLength))
            profileScoreInFamilyBTs[proc] = familyBTScores

        # keep the maximum score per family, then record it in the score box
        profileFamilyScores = profileScoreInEachFamily(profileScoreInFamilyBTs)
        for proc, familyScores in profileFamilyScores.items():
            for famName, (score, gsaLen, modelLen) in familyScores.items():
                score = math.floor(score * 100) / 100  # truncate, do not round
                outputSentence = str(gsaLen) + '/' + str(modelLen) + '=' + str(score)
                scoreBox.loc[proc, famName + ' Score'] = outputSentence
    return scoreBox

In [None]:
def assignGroundTruth(scoreBox, sample_truth_dict):
    """Return a copy of ``scoreBox`` with a ``'GroundTruthLabel'`` column added.

    The sample name is the part of each index label before the first ``'_'``;
    it is looked up in ``sample_truth_dict`` to obtain the label.

    Parameters
    ----------
    scoreBox : pandas.DataFrame
        Table indexed by process name (``'<sample>_...'``). Not modified.
    sample_truth_dict : dict
        ``{sample name: ground-truth label}``.

    Raises
    ------
    KeyError
        If a sample name is missing from ``sample_truth_dict``.
    """
    myScoreBox = scoreBox.copy()
    # Use the mapping that was passed in; the previous code read an unrelated
    # global name (`truth_dict`) and ignored this parameter.
    myScoreBox['GroundTruthLabel'] = [
        sample_truth_dict[proc.split('_')[0]] for proc in myScoreBox.index
    ]
    return myScoreBox

In [None]:
def getTruthScore(processScores, groundTruth):
    """Return the score of the first ``(family, score)`` pair whose family
    equals ``groundTruth``, or ``None`` when no pair matches."""
    matchingScores = (score for famName, score in processScores
                      if groundTruth == famName)
    return next(matchingScores, None)

In [None]:
def exactMatch(threshold, processScores, groundTruth):
    """Exact-Match decision for one sample.

    Parameters
    ----------
    threshold : float
        Minimum top score required to make a decision.
    processScores : sequence of (family, score)
        Must be sorted by score in descending order: the first pair is taken
        as the maximum.
    groundTruth : str
        True family of the sample.

    Returns
    -------
    int
        ``1`` (match) if ``groundTruth`` is among the families sharing the top
        score, ``2`` (mismatch) if it is not, ``3`` (undecided) if the top
        score is below ``threshold`` or there are no scores at all.
    """
    if len(processScores) == 0:
        return 3  # nothing to decide on

    maxScore = processScores[0][1]
    if maxScore < threshold:
        return 3

    # several families may share the maximum score; the truth only has to be
    # one of them
    topFamilies = {famName for famName, score in processScores if score == maxScore}
    return 1 if groundTruth in topFamilies else 2

def effectiveMatch(threshold, processScores, groundTruth):
    """Effective-Match decision for one sample.

    Every family with at least one score ``>= threshold`` is a candidate.

    Parameters
    ----------
    threshold : float
        Minimum score for a family to become a candidate.
    processScores : iterable of (family, score)
        Score pairs of the sample; order does not matter.
    groundTruth : str
        True family of the sample.

    Returns
    -------
    int
        ``1`` (match) if ``groundTruth`` is a candidate, ``2`` (mismatch) if
        there are candidates but the truth is not one of them, ``3``
        (undecided) if no family reaches the threshold (including empty
        input).
    """
    candidateSet = {pair[0] for pair in processScores if pair[1] >= threshold}

    if len(candidateSet) == 0:
        return 3
    return 1 if groundTruth in candidateSet else 2


In [None]:
def getMatchResult(scoreBox, threshold, matchMethod, familyColumnCount=28):
    """Classify every sample of ``scoreBox`` as Match / Mismatch / Undecided.

    Rows are grouped by sample name (the part of the index label before the
    first ``'_'``); the score pairs of all processes of a sample are pooled
    and handed, sorted by descending score, to the selected match method.

    Parameters
    ----------
    scoreBox : pandas.DataFrame
        Indexed by process name. The first ``familyColumnCount`` columns hold
        ``'<gsaLen>/<modelLen>=<score>'`` strings and are named
        ``'<family> ...'``; a ``'GroundTruthLabel'`` column is also required.
    threshold : float
        Threshold forwarded to the match method.
    matchMethod : str
        ``'Effective_Match'`` or ``'Exact_Match'``.
    familyColumnCount : int, optional
        Number of leading score columns to read (default 28).

    Returns
    -------
    pandas.DataFrame
        Indexed by sample name with columns ``MatchType``, ``PredictFamily``,
        ``GroundTruth``, ``MaxScore`` and ``TruthScore`` (the last two are only
        present when at least one sample was processed).

    Raises
    ------
    ValueError
        If ``matchMethod`` is not one of the two supported names.
    """
    if matchMethod not in ('Effective_Match', 'Exact_Match'):
        raise ValueError(
            "matchMethod must be 'Effective_Match' or 'Exact_Match', got %r" % (matchMethod,))

    sample_scores = dict()
    sample_truth = dict()
    for procName, row in scoreBox.iterrows():
        sampleName = procName.split('_')[0]
        truthLabel = row['GroundTruthLabel']
        if sampleName not in sample_scores:
            sample_scores[sampleName] = []
            sample_truth[sampleName] = truthLabel

        for col in range(familyColumnCount):
            # .iloc: the row is label-indexed, so plain row[col] would rely on
            # pandas' deprecated integer-as-position fallback
            s_in_fam = float(row.iloc[col].split('=')[1])
            famName = scoreBox.columns[col].split(' ')[0]
            sample_scores[sampleName].append((famName, s_in_fam))

    matchTypeNames = {1: 'Match', 2: 'Mismatch'}  # anything else is 'Undecided'
    matchResultTable = pd.DataFrame(columns=['MatchType', 'PredictFamily', 'GroundTruth'])
    for key, val in sample_scores.items():
        truth = sample_truth[key]
        processScores = sorted(val, key=lambda x: x[1], reverse=True)
        maxScoreFamily, maxScore = processScores[0]
        truthScore = getTruthScore(processScores, truth)

        # result: 1-match, 2-mismatch, 3-undecided
        if matchMethod == 'Effective_Match':
            matchResult = effectiveMatch(threshold, processScores, truth)
        else:
            matchResult = exactMatch(threshold, processScores, truth)

        matchResultTable.loc[key, 'MatchType'] = matchTypeNames.get(matchResult, 'Undecided')
        # on a match the truth is reported as the prediction (it may be only
        # one of several qualifying families); otherwise the top-scoring family
        matchResultTable.loc[key, 'PredictFamily'] = truth if matchResult == 1 else maxScoreFamily
        matchResultTable.loc[key, 'GroundTruth'] = truth
        matchResultTable.loc[key, 'MaxScore'] = maxScore
        matchResultTable.loc[key, 'TruthScore'] = truthScore

    return matchResultTable

In [None]:
def readModelFiles(modelBaseDirectory):
    """Load every family forest under ``modelBaseDirectory`` and summarise its trees.

    Each sub-directory ``<tag>`` is expected to contain
    ``pickle/<tag>_intermediate.pickle`` and ``pickle/<tag>_residual.pickle``,
    which are handed to ``CollectForestInfo``. The family name is the part of
    ``<tag>`` before the first ``'_'``. Entries that are not directories are
    skipped.

    Parameters
    ----------
    modelBaseDirectory : str
        Directory holding one sub-directory per family forest. A trailing path
        separator is optional.

    Returns
    -------
    (set, dict)
        ``ignoreFamilys``: families with no tree whose representative sequence
        is longer than 10 and which has more than 2 members.
        ``weight_dict``: ``{family: {tree: (memberCount / forestMemberCount,
        len(repSeq), memberCount)}}``.
    """
    weight_dict = dict()
    for tag in os.listdir(modelBaseDirectory):
        familyDir = os.path.join(modelBaseDirectory, tag)
        if not os.path.isdir(familyDir):
            continue  # stray file, cannot contain a 'pickle' sub-directory

        pickleDir = os.path.join(familyDir, 'pickle')
        interPkl = os.path.join(pickleDir, tag + "_intermediate.pickle")
        residualPkl = os.path.join(pickleDir, tag + "_residual.pickle")
        forestInfo = CollectForestInfo(interPkl,
                                       residualPkl,
                                       True)  # one pickle is a forest

        forestMemberCount = forestInfo.getForestMemberCount()
        familyTrees = dict()
        weight_dict[tag.split("_")[0]] = familyTrees
        for treeName in forestInfo.getTreeRootNameList():
            memberCount = len(forestInfo.getTreeMembers(treeName))
            repSeq = forestInfo.getRepAPISeq(treeName)
            familyTrees[treeName] = (memberCount / forestMemberCount,
                                     len(repSeq), memberCount)

    # a family is ignored when none of its trees passes the size filter
    ignoreFamilys = set()
    for fName, trs in weight_dict.items():
        if not any(info[1] > 10 and info[2] > 2 for info in trs.values()):
            ignoreFamilys.add(fName)

    print("=== Finish building model ===")

    return ignoreFamilys, weight_dict

## Helpers for writing the Excel files

In [None]:
def getProcessMaxScore(dataframeRow, truthLabel):
    """Return the ``'<gsaLen>/<modelLen>=<score>'`` cell with the highest score.

    Parameters
    ----------
    dataframeRow : mapping (e.g. pandas.Series)
        ``{'<family> ...': '<gsaLen>/<modelLen>=<score>'}``; every value must
        follow that format.
    truthLabel : str
        Ground-truth family, used when breaking ties.

    Returns
    -------
    str
        The winning cell. On equal scores the cell of ``truthLabel`` replaces
        the current winner, otherwise a cell with a larger ``gsaLen`` does.
        ``""`` is returned when the row is empty or no score reaches 0.

    Notes
    -----
    NOTE(review): a later tied cell with a larger ``gsaLen`` can still replace
    a previously selected truth cell; confirm that this is intended.
    """
    maxScore = 0
    outputScore = ""
    for colName in dataframeRow.keys():
        colVal = dataframeRow[colName]
        score = float(colVal.split('=')[1])
        gsaLen = int(colVal.split('/')[0])
        if score > maxScore:
            maxScore = score
            outputScore = colVal
        elif score == maxScore:
            familyName = colName.split(' ')[0]
            if outputScore == "" or familyName == truthLabel:
                # No winner yet (a score of 0 ties the initial maximum) or the
                # truth family ties: take this cell. Without the first check
                # the gsaLen comparison below would try to parse "".
                outputScore = colVal
            elif gsaLen > int(outputScore.split('/')[0]):
                outputScore = colVal
    return outputScore
def getProcessTruthScore(dataframeRow, truthLabel):
    """Return the value of the first column whose name, cut at the first
    space, equals ``truthLabel``; ``None`` when there is no such column."""
    truthCells = (dataframeRow[name] for name in dataframeRow.keys()
                  if name.split(' ')[0] == truthLabel)
    return next(truthCells, None)

## Create Confusion Matrix and Do Evaluation
##### Calculate 'Precision', 'Recall', and 'F1'

In [None]:
def evaluationMethodCalculate(matchResultTable):
    """Compute per-family confusion counts and precision / recall / F1.

    Parameters
    ----------
    matchResultTable : pandas.DataFrame
        One row per sample with columns ``'PredictFamily'``, ``'GroundTruth'``
        and ``'MatchType'``.

    Returns
    -------
    tuple
        ``(evaTable, precisions, recalls, fs, family_confusion)`` where
        ``evaTable`` is a DataFrame indexed by family (sorted) with columns
        ``Precision``, ``Recall``, ``F1``; the three lists hold the same values
        as ``'%.2f'`` strings in sorted-family order; ``family_confusion`` is
        ``{family: {'TP', 'FP', 'TN', 'FN'}}``.

    Notes
    -----
    An ``'Undecided'`` sample counts as FN for its true family and never as FP.
    Families are taken from the ground-truth column plus any family that was
    predicted in a decided row, so a wrong prediction of a family without
    ground-truth samples is counted as FP instead of failing. Metrics whose
    denominator is zero are reported as 0.0.
    """
    family_confusion = dict()
    for family in matchResultTable['GroundTruth'].values:
        if family not in family_confusion:
            family_confusion[family] = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}
    # families that only ever show up as a (decided) prediction need a matrix
    # too, otherwise the FP increment below has nothing to update
    for matchType, predicted in zip(matchResultTable['MatchType'].values,
                                    matchResultTable['PredictFamily'].values):
        if matchType != 'Undecided' and predicted not in family_confusion:
            family_confusion[predicted] = {'TP': 0, 'FP': 0, 'TN': 0, 'FN': 0}

    ## create confusion matrix for each family
    for sample, row in matchResultTable.iterrows():
        predictLabel = row['PredictFamily']
        truthLabel = row['GroundTruth']
        matchType = row['MatchType']

        if matchType == 'Undecided':
            family_confusion[truthLabel]['FN'] += 1
        elif predictLabel == truthLabel:
            family_confusion[truthLabel]['TP'] += 1
        else:
            family_confusion[truthLabel]['FN'] += 1
            family_confusion[predictLabel]['FP'] += 1

        for famName in family_confusion.keys():
            if famName != truthLabel and famName != predictLabel:
                family_confusion[famName]['TN'] += 1

    families = sorted(family_confusion.keys())
    recalls = []
    precisions = []
    fs = []
    for famName in families:
        matrix = family_confusion[famName]
        tp = matrix['TP']
        fp = matrix['FP']
        fn = matrix['FN']

        precision = tp / (tp + fp) if (tp + fp) != 0 else 0.0
        # tp + fn is zero only for families without ground-truth samples
        recall = tp / (tp + fn) if (tp + fn) != 0 else 0.0
        if recall != 0 and precision != 0:
            f1 = 2 / ((1 / recall) + (1 / precision))  # harmonic mean
        else:
            f1 = 0.0

        recalls.append("{0:.2f}".format(recall))
        precisions.append("{0:.2f}".format(precision))
        fs.append("{0:.2f}".format(f1))

    ### Create a dataframe table (dashboard)
    evaTable = pd.DataFrame(columns=['Precision', 'Recall', 'F1'])
    for idx, famName in enumerate(families):
        evaTable.loc[famName] = (precisions[idx], recalls[idx], fs[idx])

    return evaTable, precisions, recalls, fs, family_confusion