# Evaluation Result

In [1]:
from sklearn import metrics
from pylib.tlsh_lib import *
import numpy as np
import pandas as pd
import time
import argparse
import sys
import warnings
warnings.filterwarnings("ignore")

In [2]:
# List of Function
def containNan(labelList):
    for lable in labelList:
        if lable == 'n/a':
            haveNan = True
            return haveNan

def removeNan(hashlist, labelList):
    count = -1
    newhashlist = []
    newlabelList = []

    for lable in labelList:
        count += 1
        if lable != 'n/a':
            newhashlist.append(hashlist[count])
            newlabelList.append(labelList[count])
    return newhashlist, newlabelList

In [3]:
def getResult(hashType, clusterType, labelList, clusterNumber):
    d = {word: key for key, word in enumerate(set(labelList))}
    labelList_id = [d[word] for word in labelList]
    a = np.array(labelList_id).reshape(-1, 1)

    # decimal place
    dp = 4

    homo = round(metrics.homogeneity_score(labelList_id, clusterNumber), dp)
    silh = round(metrics.silhouette_score(a, clusterNumber, metric='euclidean'), dp)
    cali = round(metrics.calinski_harabasz_score(a, clusterNumber), dp)
    dav = round(metrics.davies_bouldin_score(a, clusterNumber), dp)

    # print("Homogeneity score is " + str(homo))
    # print("Silhouette score is " + str(silh))
    # print("Calinski harabasz score is " + str(cali))
    # print("Davies bouldin score is " + str(dav))

    result = {"File": str(filename),
              "nSample": int(len(tlist)),
              "Hash": str(hashType),
              "Cluster": str(clusterType),
              "Has_n/a": bool(containNan(labelList)),
              "nLabel": int(nlable),
              "nCluster": int(max(clusterNumber)),
              "Time(s)": float(end),
              "Homo.": float(homo),
              "Sil.": float(silh),
              "Cal.": float(cali),
              "Dav.": float(dav)}
    return result

In [4]:
# start of main program

parser = argparse.ArgumentParser(prog='readcsv')  # provides a convenient interface to handle command-line arguments.
parser.add_argument('-f', help='fname', type=str, default="")  # the extra part need to run file
args = parser.parse_args()
datafile = args.f

datafile = "dataDir/mb_1K.csv" #<-----Change this file size

if (datafile == ""):
    print("you must provide a datafile name (-f)\n")
    sys.exit()
# end if

In [5]:
tic = time.perf_counter()  # experiment time counter
haveNan = False
df = pd.DataFrame()

(path, file) = datafile.split("/")  # save file path
(filename, filetype) = file.split(".")  # save file type

(tlist, [labelList, dateList, slist]) = tlsh_csvfile(datafile)  # return (tlist, [labelList, dateList, hashList])
# (tlist, labelList) = tlsh_csvfile(datafile)

# remove Nan Value
#(tlist, labelList) = removeNan(tlist, labelList)

hashList = tlist
print("Number of samples is " + str(len(hashList)))
print("Number of Unique Lable is " + str(len(set(labelList))))
print(hashList[0:10])
nlable = len(set(labelList))
haveNan = containNan(labelList)

Number of samples is 1000
Number of Unique Lable is 56
['T134A4C0203AFA9015F1B3AFB98ED575969B6EF7633603E51E2490038B0613F81DE8157E', 'T1B4D423DF86D730F97196CFCE23860A7E845A27BDA7313AC50205FBE574A141122B8E5E', 'T16B45EC38ED0A22B7EBA5D334E0BA581AA1F51CBB33308D1DD9D6764E1937603709636D', 'T1DA7579A92363AC2AF31B04BB1473B00BD5617B8DDC14B7652D39367A51AE371B1F4B82', 'T149A42311209DA1BCFE81886F17D7AD62F3CA2E5426B67F40824427B82FD8E564FF494D', 'T1F61302834697D5F638BB97C9781842BC88A4F370BA19FB11E5F801935F45294FA884FC', 'T180A4C0243AFE511DF173AF794AE475D69ABEBB333B06D45D1451038A4A22B81CEC063B', 'T18CD4F1120FDE441AC78FB0BCD7199CCB76A6CD4C5B5845F2127D76AA2E34232E38817A', 'T1BDA42311209DA1BCFE81886F17D39D62F3CA2E5526B67F40824427B82FD8E564FF454D', 'T1052512A82710AA66DADE0F73C1234589D330A5665742F40F79D6A2D73EB2ACFB5050CF']


# TLSH Agglomerative Clustering

In [6]:
try:
    start = time.perf_counter()
    res = assignCluster(hashList, nlable)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "hac", labelList, res.labels_)
    df = pd.concat([df, pd.DataFrame(dict, index=[0])])
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")
except Exception as e:
    print("Agglomerative Clustering didn't work.")
    print(e)

hac
Code ran in 0.7462 seconds


# TLSH HAC-T

In [7]:
from pylib.hac_lib import *

try:
    hac_resetDistCalc()

    start = time.perf_counter()
    res = HAC_T(datafile, CDist=30, step3=0, outfname="tmp.txt", cenfname="tmp2.txt")
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "hac-t", labelList, res)
    df = pd.concat([df, pd.DataFrame(dict, index=[0])])
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")

    nclusters = max(res)
    nDistCalc = hac_lookupDistCalc()
    print("nclusters is " + str(nclusters))
    print("nDistCalc is " + str(nDistCalc))
except Exception as e:
    print("HAC-T didn't work.")
    print(e)

hac-t
Code ran in 0.3445 seconds
nclusters is 85
nDistCalc is 227245


# TLSH DBSCAN

In [8]:
from pylib.hac_lib import *

try:
    resetDistCalc()

    start = time.perf_counter()
    res = runDBSCAN(hashList, eps=30, min_samples=2, algorithm='auto')
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "dbscan", labelList, res.labels_)
    df = pd.concat([df, pd.DataFrame(dict, index=[0])])
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")

    nclusters = max(res.labels_)
    nDistCalc = lookupDistCalc()
    print("nclusters is " + str(nclusters))
    print("nDistCalc is " + str(nDistCalc))
except Exception as e:
    print("DBSCAN didn't work.")
    print(e)

dbscan
Code ran in 2.2475 seconds
nclusters is 71
nDistCalc is 961443


# TLSH KMeans

In [9]:
try:
    start = time.perf_counter()
    res = runKMean(hashList, nlable)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "kmeans", labelList, res.labels_)
    df = pd.concat([df, pd.DataFrame(dict, index=[0])])
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")
except Exception as e:
    print("KMean didn't work.")
    print(e)

kmeans
Code ran in 0.5338 seconds


# Affinity Propagation

In [10]:
try:
    start = time.perf_counter()
    res = runAffinityPropagation(hashList, 5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ap", labelList, res.labels_)
    df = pd.concat([df, pd.DataFrame(dict, index=[0])])
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")
except Exception as e:
    print("Affinity Propagation didn't work.")
    print(e)

Affinity Propagation didn't work.
Affinity must be 'precomputed' or 'euclidean'. Got <function sim_affinity at 0x00000173C9998310> instead


# Mean Shift

In [11]:
try:
    start = time.perf_counter()
    res = runMeanShift(hashList, 5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ms", labelList, res.labels_)
    df = pd.concat([df, pd.DataFrame(dict, index=[0])])
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")
except Exception as e:
    print("Mean Shift didn't work.")
    print(e)

ms
Code ran in 1.5852 seconds


# Spectral Clustering

In [12]:
try:
    start = time.perf_counter()
    res = runMeanShift(hashList, 5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "sp", labelList, res.labels_)
    df = pd.concat([df, pd.DataFrame(dict, index=[0])])
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")
except Exception as e:
    print("Spectral Clustering didn't work.")
    print(e)

sp
Code ran in 1.5126 seconds


# OPTICS

In [24]:
try:
    start = time.perf_counter()
    res = runOPTICS(hashList, min_samples=2)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "optics", labelList, res.labels_)
    df = pd.concat([df, pd.DataFrame([dict])])
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")
except Exception as e:
    print("OPTICS didn't work.")
    print(e)

  ratio = reachability_plot[:-1] / reachability_plot[1:]


optics
Code ran in 6.8752 seconds


# BIRCH

In [41]:
try:
    start = time.perf_counter()
    res = runBIRCH(hashList,nlable)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "birch", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)
    print(dict.get('Cluster'))
    print("Code ran in " + str(end) + " seconds")
except Exception as e:
    print("BIRCH didn't work.")
    print(e)

birch
Code ran in 0.1237 seconds


In [36]:
#res.labels_

# Output

In [37]:
outfile = path + "/output/" + filename + "_Tlsh_result.csv"
df.to_csv(outfile, index=False)

In [38]:
toc = round(time.perf_counter() - tic, 4)
print("All code ran in " + str(toc) + " seconds")

All code ran in 324.4388 seconds


In [39]:
df

Unnamed: 0,File,nSample,Hash,Cluster,Has_n/a,nLabel,nCluster,Time(s),Homo.,Sil.,Cal.,Dav.
0,mb_1K,1000,tlsh,hac,True,56,55,0.7462,0.4868,-0.617,13.4908,172.4432
1,mb_1K,1000,tlsh,hac-t,True,56,85,0.3445,0.3453,-0.7216,7.1279,67.8209
2,mb_1K,1000,tlsh,dbscan,True,56,71,2.2475,0.3336,-0.7254,8.2254,22.0264
3,mb_1K,1000,tlsh,kmeans,True,56,55,0.5338,0.4641,-0.6967,9.9843,178.4013
4,mb_1K,1000,tlsh,ms,True,56,165,1.5852,0.6466,-0.6811,4.4257,139.9304
5,mb_1K,1000,tlsh,sp,True,56,165,1.5126,0.6466,-0.6811,4.4257,139.9304
6,mb_1K,1000,tlsh,optics,True,56,200,6.7965,0.5552,-0.6593,3.7966,39.7486
7,mb_1K,1000,tlsh,birch,True,56,55,0.2693,0.4522,-0.7003,8.283,176.406
8,mb_1K,1000,tlsh,birch,True,56,55,0.1569,0.4522,-0.7003,8.283,176.406
9,mb_1K,1000,tlsh,birch,True,56,55,0.2208,0.4522,-0.7003,8.283,176.406
