In [1]:
import argparse
import sys
import time

import pandas as pd
from sklearn import metrics
import numpy as np

from pylib.tlsh_lib import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
# List of Function
def getResult(hashType, clusterType, labelList, clusterNumber):
    data = tlist2cdata(hashList)
    
    d = {word: key for key, word in enumerate(set(labelList))}
    labelList_id = [d[word] for word in labelList]
    
    outlierRemoveLabel = []
    outlierRemoveID = []
    outlierRemoveData = []
    
    for i in range(len(clusterNumber)):
        if clusterNumber[i] >= 0:
            outlierRemoveLabel.append(clusterNumber[i])
            outlierRemoveID.append(labelList_id[i])
            outlierRemoveData.append(data[i])
            
    #print("labelList_id=", labelList_id)
    #print("cluster labels=",clusterNumber)
    #print("outlierRemoveLabel =", outlierRemoveLabel)
    #print("outlierRemoveID =", outlierRemoveID)
    
    # Number of decimal place for score
    dp = 4 
    
    homo = round(metrics.homogeneity_score(outlierRemoveID, outlierRemoveLabel), dp)
    silh1 = round(metrics.silhouette_score(data, clusterNumber, metric=sim), dp)
    silh2 = round(metrics.silhouette_score(outlierRemoveData, outlierRemoveLabel, metric=sim), dp)
    cali = round(metrics.calinski_harabasz_score(outlierRemoveData, outlierRemoveLabel), dp)
    dav = round(metrics.davies_bouldin_score(outlierRemoveData, outlierRemoveLabel), dp)
    
    print(clusterType + " ran in " + str(end) + " seconds")
    print("Homogeneity score =",homo)
    print("Silhouette score =",silh1)
    print("Silhouette score with Outlier Remove =",silh2)
    print("Calinski harabasz score =",cali)
    print("Davies bouldin score =",dav)
    #print(metrics.silhouette_samples(outlierRemoveData, outlierRemoveLabel, metric=sim))
    print()
    
    result = {"nSample": int(len(tlist)),
              "Hash": str(hashType),
              "Cluster": str(clusterType),
              "nLabel": int(nlabel),
              "nCluster": int(max(clusterNumber)),
              "Time(s)": float(end),
              "Homo.": float(homo),
              "Sil.": float(silh2),
              "Cal.": float(cali),
              "Dav.": float(dav)}
    return result

In [3]:
datafile = "dataDir/mb_1000.csv" #<-----Change this file size
if (datafile == ""):
    print("you must provide a datafile name (-f)\n")
    sys.exit()

tic = time.perf_counter()  # experiment time counter
df = pd.DataFrame() #Result Table

(path, file) = datafile.split("/")  # save file path
(filename, filetype) = file.split(".")  # save file type

(tlist, [labelList, dateList, slist]) = tlsh_csvfile(datafile)  # return (tlshList, [labelList, dateList, ssdeepList])

hashList = tlist

print("Number of samples is " + str(len(hashList)))
print("Number of Unique Label is " + str(len(set(labelList))))
print("Example hash: " + str(hashList[0]))
nlabel = len(set(labelList))
nClusters = [nlabel]

Number of samples is 1000
Number of Unique Label is 63
Example hash: T10263F782BC80EA22C7C01677FE6F518E331567D8E1EA32429D155FA07A8FC1B0D5B786


In [4]:
# Agglomerative Clustering
try:
    start = time.perf_counter()
    res = assignCluster(hashList, nlabel)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ac", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Agglomerative Clustering didn't work.")
    print(e)

ac ran in 0.572 seconds
Homogeneity score = 0.5708
Silhouette score = 0.3126
Silhouette score with Outlier Remove = 0.3126
Calinski harabasz score = 2.3056
Davies bouldin score = 188.1731



In [5]:
# DBSCAN
from pylib.hac_lib import *
try:
    resetDistCalc()

    start = time.perf_counter()
    res = runDBSCAN(hashList, eps=30, min_samples=2, algorithm='auto')
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "dbscan", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

    nclusters = max(res.labels_)
    nDistCalc = lookupDistCalc()
    #print("nclusters is " + str(nclusters))
    #print("nDistCalc is " + str(nDistCalc))

    nClusters.append(nclusters)

    #outfile = path + "/output/" + filename + "_dbscan_out.txt"
    #outputClusters(outfile, hashList, res.labels_, labelList, quiet=True)
    
except Exception as e:
    print("DBSCAN didn't work.")
    print(e)

dbscan ran in 0.9103 seconds
Homogeneity score = 0.7968
Silhouette score = -0.0341
Silhouette score with Outlier Remove = 0.731
Calinski harabasz score = 4.3307
Davies bouldin score = 153.8919



In [6]:
# HAC-T
from pylib.hac_lib import *
try:
    hac_resetDistCalc()

    start = time.perf_counter()
    res = HAC_T(datafile, CDist=30, step3=0, outfname="tmp.txt", cenfname="tmp2.txt")
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "hac-t", labelList, res)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

    nclusters = max(res)
    nDistCalc = hac_lookupDistCalc()
    #print("nclusters is " + str(nclusters))
    #print("nDistCalc is " + str(nDistCalc))

    nClusters.append(nclusters)

    #outfile = path + "/output/" + filename + "_hac-t_out.txt"
    #outputClusters(outfile, hashList, res, labelList, quiet=True)
    
except Exception as e:
    print("HAC-T didn't work.")
    print(e)

hac-t ran in 0.1965 seconds
Homogeneity score = 0.8542
Silhouette score = -0.1316
Silhouette score with Outlier Remove = 0.5195
Calinski harabasz score = 7.7455
Davies bouldin score = 147.5003

HAC-T didn't work.
string index out of range


In [7]:
# OPTICS
try:
    start = time.perf_counter()
    res = runOPTICS(hashList, min_samples=2)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "optics", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("OPTICS didn't work.")
    print(e)

optics ran in 1.9921 seconds
Homogeneity score = 0.8776
Silhouette score = 0.0311
Silhouette score with Outlier Remove = 0.4754
Calinski harabasz score = 2.876
Davies bouldin score = 372.3439



In [8]:
# KMeans
for i in nClusters:
    try:
        start = time.perf_counter()
        res = runKMean(hashList, n_clusters=i)
        end = round(time.perf_counter() - start, 4)

        dict = getResult("tlsh", "kmeans", labelList, res.labels_)
        df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)
        
    except Exception as e:
        print("KMeans didn't work.")
        print(e)

kmeans ran in 0.1937 seconds
Homogeneity score = 0.4831
Silhouette score = -0.2625
Silhouette score with Outlier Remove = -0.2625
Calinski harabasz score = 57657.9806
Davies bouldin score = 0.499

KMeans didn't work.
string index out of range
kmeans ran in 0.1678 seconds
Homogeneity score = 0.4638
Silhouette score = -0.256
Silhouette score with Outlier Remove = -0.256
Calinski harabasz score = 52899.8946
Davies bouldin score = 0.4993

KMeans didn't work.
string index out of range
kmeans ran in 0.2165 seconds
Homogeneity score = 0.5224
Silhouette score = -0.2768
Silhouette score with Outlier Remove = -0.2768
Calinski harabasz score = 67619.3416
Davies bouldin score = 0.4986

KMeans didn't work.
string index out of range


In [9]:
# BIRCH
for i in nClusters:
    try:
        start = time.perf_counter()
        res = runBIRCH(hashList, n_clusters=i)
        end = round(time.perf_counter() - start, 4)

        dict = getResult("tlsh", "birch", labelList, res.labels_)
        df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

    except Exception as e:
        print("BIRCH didn't work.")
        print(e)

birch ran in 0.06 seconds
Homogeneity score = 0.4906
Silhouette score = -0.2509
Silhouette score with Outlier Remove = -0.2509
Calinski harabasz score = 59610.2702
Davies bouldin score = 0.5

birch ran in 0.0519 seconds
Homogeneity score = 0.4771
Silhouette score = -0.2477
Silhouette score with Outlier Remove = -0.2477
Calinski harabasz score = 43657.9944
Davies bouldin score = 0.5

birch ran in 0.052 seconds
Homogeneity score = 0.5123
Silhouette score = -0.3033
Silhouette score with Outlier Remove = -0.3033
Calinski harabasz score = 57697.5069
Davies bouldin score = 0.5



In [10]:
# Affinity Propagation
try:
    start = time.perf_counter()
    res = runAffinityPropagation(hashList, random_state=5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ap", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Affinity Propagation didn't work.")
    print(e)

Affinity Propagation didn't work.
Affinity must be 'precomputed' or 'euclidean'. Got <function sim_affinity at 0x000001B76AC7DCA0> instead


In [11]:
# Mean Shift
try:
    start = time.perf_counter()
    res = runMeanShift(hashList, bandwidth=5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ms", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Mean Shift didn't work.")
    print(e)

ms ran in 0.5594 seconds
Homogeneity score = 0.6545
Silhouette score = -0.3743
Silhouette score with Outlier Remove = -0.3743
Calinski harabasz score = 142585.2298
Davies bouldin score = 0.5



In [12]:
# Spectral Clustering
try:
    start = time.perf_counter()
    res = runSpectral(hashList, n_clusters=nlabel)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "sp", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Spectral Clustering didn't work.")
    print(e)

sp ran in 0.6531 seconds
Homogeneity score = 0.4931
Silhouette score = -0.2604
Silhouette score with Outlier Remove = -0.2604
Calinski harabasz score = 59873.9614
Davies bouldin score = 0.4998



In [13]:
# Output
outfile = path + "/output/" + filename + "_Tlsh_result.csv"
df.to_csv(outfile, index=False)

toc = round(time.perf_counter() - tic, 4)

print("All code ran in " + str(toc) + " seconds")
df

All code ran in 18.1912 seconds


Unnamed: 0,nSample,Hash,Cluster,nLabel,nCluster,Time(s),Homo.,Sil.,Cal.,Dav.
0,1000,tlsh,ac,63,62,0.572,0.5708,0.3126,2.3056,188.1731
1,1000,tlsh,dbscan,63,57,0.9103,0.7968,0.731,4.3307,153.8919
2,1000,tlsh,hac-t,63,75,0.1965,0.8542,0.5195,7.7455,147.5003
3,1000,tlsh,optics,63,197,1.9921,0.8776,0.4754,2.876,372.3439
4,1000,tlsh,kmeans,63,62,0.1937,0.4831,-0.2625,57657.9806,0.499
5,1000,tlsh,kmeans,63,56,0.1678,0.4638,-0.256,52899.8946,0.4993
6,1000,tlsh,kmeans,63,74,0.2165,0.5224,-0.2768,67619.3416,0.4986
7,1000,tlsh,birch,63,62,0.06,0.4906,-0.2509,59610.2702,0.5
8,1000,tlsh,birch,63,56,0.0519,0.4771,-0.2477,43657.9944,0.5
9,1000,tlsh,birch,63,74,0.052,0.5123,-0.3033,57697.5069,0.5
