In [1]:
import argparse
import sys
import time

import pandas as pd
from sklearn import metrics
import numpy as np

from pylib.tlsh_lib import *
import warnings
warnings.filterwarnings("ignore")

In [2]:
# List of Function
def getResult(hashType, clusterType, labelList, clusterNumber):
    data = tlist2cdata(hashList)
    
    d = {word: key for key, word in enumerate(set(labelList))}
    labelList_id = [d[word] for word in labelList]
    
    outlierRemoveLabel = []
    outlierRemoveID = []
    outlierRemoveData = []
    
    # Number of decimal place for score
    dp = 4 
    
    for i in range(len(clusterNumber)):
        if clusterNumber[i] >= 0:
            outlierRemoveLabel.append(clusterNumber[i])
            outlierRemoveID.append(labelList_id[i])
            outlierRemoveData.append(data[i])
            
    #print("labelList_id=", labelList_id)
    #print("cluster labels=",clusterNumber)
    #print("outlierRemoveLabel =", outlierRemoveLabel)
    #print("outlierRemoveData =", outlierRemoveData)
    
    homo = round(metrics.homogeneity_score(outlierRemoveID, outlierRemoveLabel), dp)
    silh1 = round(metrics.silhouette_score(data, clusterNumber, metric=sim), dp)
    silh2 = round(metrics.silhouette_score(outlierRemoveData, outlierRemoveLabel, metric=sim), dp)
    #cali = round(metrics.calinski_harabasz_score(outlierRemoveData, outlierRemoveLabel), dp)
    #dav = round(metrics.davies_bouldin_score(outlierRemoveData, outlierRemoveLabel), dp)
    
    coverage = len(outlierRemoveLabel)*100/len(clusterNumber)
    coverage = round(coverage, dp)
    
    print(clusterType + " ran in " + str(end) + " seconds")
    print("Homogeneity score =",homo)
    print("Silhouette score =",silh1)
    print("Silhouette score with Outlier Remove =",silh2)
    #print("Calinski harabasz score =",cali)
    #print("Davies bouldin score =",dav)
    #print(metrics.silhouette_samples(outlierRemoveData, outlierRemoveLabel, metric=sim))
    print("% of coverage =",coverage)
    print()
    
    result = {"nSample": int(len(tlist)),
              "Hash": str(hashType),
              "Cluster": str(clusterType),
              "nLabel": int(nlabel),
              "nCluster": int(max(clusterNumber)),
              "Time(s)": float(end),
              "Homogeneity": float(homo),
              "Silhouette": float(silh2),
              #"Cal.": float(cali),
              #"Dav.": float(dav),
              "Coverage(%)": float(coverage)
             }
    return result

In [3]:
datafile = "dataDir/mb_1000.csv" #<-----Change this file size
if (datafile == ""):
    print("you must provide a datafile name (-f)\n")
    sys.exit()

tic = time.perf_counter()  # experiment time counter
df = pd.DataFrame() #Result Table

(path, file) = datafile.split("/")  # save file path
(filename, filetype) = file.split(".")  # save file type

(tlist, [labelList, dateList, slist]) = tlsh_csvfile(datafile)  # return (tlshList, [labelList, dateList, ssdeepList])

hashList = tlist

print("Number of samples is " + str(len(hashList)))
print("Number of Unique Label is " + str(len(set(labelList))))
print("Example hash: " + str(hashList[0]))
nlabel = len(set(labelList))
nClusters = [nlabel]

Number of samples is 1000
Number of Unique Label is 63
Example hash: T10263F782BC80EA22C7C01677FE6F518E331567D8E1EA32429D155FA07A8FC1B0D5B786


In [4]:
# Agglomerative Clustering
try:
    start = time.perf_counter()
    res = assignCluster(hashList, nlabel)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ac", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Agglomerative Clustering didn't work.")
    print(e)

ac ran in 2.0144 seconds
Homogeneity score = 0.5708
Silhouette score = 0.3126
Silhouette score with Outlier Remove = 0.3126
% of coverage = 100.0



In [5]:
# DBSCAN
from pylib.hac_lib import *
try:
    resetDistCalc()

    start = time.perf_counter()
    res = runDBSCAN(hashList, eps=30, min_samples=2, algorithm='auto')
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "dbscan", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

    nclusters = max(res.labels_)
    nDistCalc = lookupDistCalc()
    #print("nclusters is " + str(nclusters))
    #print("nDistCalc is " + str(nDistCalc))

    nClusters.append(nclusters)

    #outfile = path + "/output/" + filename + "_dbscan_out.txt"
    #outputClusters(outfile, hashList, res.labels_, labelList, quiet=True)
    
except Exception as e:
    print("DBSCAN didn't work.")
    print(e)

dbscan ran in 4.101 seconds
Homogeneity score = 0.7968
Silhouette score = -0.0341
Silhouette score with Outlier Remove = 0.731
% of coverage = 40.9



In [6]:
# HAC-T
from pylib.hac_lib import *
try:
    hac_resetDistCalc()

    start = time.perf_counter()
    res = HAC_T(datafile, CDist=30, step3=0, outfname="tmp.txt", cenfname="tmp2.txt")
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "hac-t", labelList, res)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

    nclusters = max(res)
    nDistCalc = hac_lookupDistCalc()
    #print("nclusters is " + str(nclusters))
    #print("nDistCalc is " + str(nDistCalc))

    nClusters.append(nclusters)

    #outfile = path + "/output/" + filename + "_hac-t_out.txt"
    #outputClusters(outfile, hashList, res, labelList, quiet=True)
    
except Exception as e:
    print("HAC-T didn't work.")
    print(e)

hac-t ran in 0.8246 seconds
Homogeneity score = 0.8542
Silhouette score = -0.1316
Silhouette score with Outlier Remove = 0.5195
% of coverage = 40.3



In [7]:
# OPTICS
try:
    start = time.perf_counter()
    res = runOPTICS(hashList, min_samples=2)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "optics", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("OPTICS didn't work.")
    print(e)

optics ran in 7.0606 seconds
Homogeneity score = 0.8776
Silhouette score = 0.0311
Silhouette score with Outlier Remove = 0.4754
% of coverage = 62.2



In [8]:
"""
# KMeans
for i in nClusters:
    try:
        start = time.perf_counter()
        res = runKMean(hashList, n_clusters=i)
        end = round(time.perf_counter() - start, 4)

        dict = getResult("tlsh", "kmeans", labelList, res.labels_)
        df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)
        
    except Exception as e:
        print("KMeans didn't work.")
        print(e)
"""

'\n# KMeans\nfor i in nClusters:\n    try:\n        start = time.perf_counter()\n        res = runKMean(hashList, n_clusters=i)\n        end = round(time.perf_counter() - start, 4)\n\n        dict = getResult("tlsh", "kmeans", labelList, res.labels_)\n        df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)\n        \n    except Exception as e:\n        print("KMeans didn\'t work.")\n        print(e)\n'

In [9]:
"""
# BIRCH
for i in nClusters:
    try:
        start = time.perf_counter()
        res = runBIRCH(hashList, n_clusters=i)
        end = round(time.perf_counter() - start, 4)

        dict = getResult("tlsh", "birch", labelList, res.labels_)
        df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

    except Exception as e:
        print("BIRCH didn't work.")
        print(e)
"""

'\n# BIRCH\nfor i in nClusters:\n    try:\n        start = time.perf_counter()\n        res = runBIRCH(hashList, n_clusters=i)\n        end = round(time.perf_counter() - start, 4)\n\n        dict = getResult("tlsh", "birch", labelList, res.labels_)\n        df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)\n\n    except Exception as e:\n        print("BIRCH didn\'t work.")\n        print(e)\n'

In [10]:
"""
# Affinity Propagation
try:
    start = time.perf_counter()
    res = runAffinityPropagation(hashList, random_state=5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ap", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Affinity Propagation didn't work.")
    print(e)
"""

'\n# Affinity Propagation\ntry:\n    start = time.perf_counter()\n    res = runAffinityPropagation(hashList, random_state=5)\n    end = round(time.perf_counter() - start, 4)\n\n    dict = getResult("tlsh", "ap", labelList, res.labels_)\n    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)\n\nexcept Exception as e:\n    print("Affinity Propagation didn\'t work.")\n    print(e)\n'

In [11]:
"""
# Mean Shift
try:
    start = time.perf_counter()
    res = runMeanShift(hashList, bandwidth=5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ms", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Mean Shift didn't work.")
    print(e)
"""

'\n# Mean Shift\ntry:\n    start = time.perf_counter()\n    res = runMeanShift(hashList, bandwidth=5)\n    end = round(time.perf_counter() - start, 4)\n\n    dict = getResult("tlsh", "ms", labelList, res.labels_)\n    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)\n\nexcept Exception as e:\n    print("Mean Shift didn\'t work.")\n    print(e)\n'

In [12]:
"""
# Spectral Clustering
try:
    start = time.perf_counter()
    res = runSpectral(hashList, n_clusters=nlabel)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "sp", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Spectral Clustering didn't work.")
    print(e)
"""

'\n# Spectral Clustering\ntry:\n    start = time.perf_counter()\n    res = runSpectral(hashList, n_clusters=nlabel)\n    end = round(time.perf_counter() - start, 4)\n\n    dict = getResult("tlsh", "sp", labelList, res.labels_)\n    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)\n\nexcept Exception as e:\n    print("Spectral Clustering didn\'t work.")\n    print(e)\n'

In [13]:
# Output
outfile = path + "/output/" + filename + "_Tlsh_result.csv"
df.to_csv(outfile, index=False)

toc = round(time.perf_counter() - tic, 4)

print("All code ran in " + str(toc) + " seconds")
df

All code ran in 25.8635 seconds


Unnamed: 0,nSample,Hash,Cluster,nLabel,nCluster,Time(s),Homogeneity,Silhouette,Coverage(%)
0,1000,tlsh,ac,63,62,2.0144,0.5708,0.3126,100.0
1,1000,tlsh,dbscan,63,57,4.101,0.7968,0.731,40.9
2,1000,tlsh,hac-t,63,75,0.8246,0.8542,0.5195,40.3
3,1000,tlsh,optics,63,197,7.0606,0.8776,0.4754,62.2
