In [1]:
import argparse
import os
import sys
import time
import warnings

import numpy as np
import pandas as pd
from sklearn import metrics

from pylib.tlsh_lib import *
warnings.filterwarnings("ignore")

In [2]:
# List of functions
def getResult(hashType, clusterType, labelList, clusterNumber):
    """Score one clustering run, print the scores and return them as a result row.

    This function reads several notebook-level names that must already be
    defined when it is called:

    * ``hashList`` - the hashes that were clustered,
    * ``end``      - elapsed seconds of the run being scored (set by the caller
                     cell right before calling this function),
    * ``nlabel``   - number of unique ground-truth labels,
    * ``tlist2cdata`` / ``sim`` - helpers that are not defined in this notebook
                     (presumably provided by the ``pylib`` star imports).

    Parameters
    ----------
    hashType : value stored (as ``str``) in the "Hash" column of the result.
    clusterType : short algorithm name; used in the printed timing line and
        stored in the "Cluster" column.
    labelList : ground-truth label per sample, indexed in step with
        ``clusterNumber``.
    clusterNumber : cluster id per sample. Negative ids are treated as
        outliers and excluded from the "outlier removed" scores.

    Returns
    -------
    dict
        Keys: "nSample", "Hash", "Cluster", "nLabel", "nCluster", "Time(s)",
        "Homogeneity", "Silhouette" (the outlier-removed silhouette score).
    """
    data = tlist2cdata(hashList)

    # Map every distinct ground-truth label to an integer id.
    label_to_id = {label: idx for idx, label in enumerate(set(labelList))}
    labelList_id = [label_to_id[label] for label in labelList]

    # Keep only the samples that were assigned to a cluster (id >= 0).
    outlierRemoveLabel = []
    outlierRemoveID = []
    outlierRemoveData = []
    for i, cluster_id in enumerate(clusterNumber):
        if cluster_id >= 0:
            outlierRemoveLabel.append(cluster_id)
            outlierRemoveID.append(labelList_id[i])
            outlierRemoveData.append(data[i])

    # Number of decimal places for the scores
    dp = 4

    homo = round(metrics.homogeneity_score(outlierRemoveID, outlierRemoveLabel), dp)
    # Silhouette over all samples (outliers included) and over clustered samples only.
    silh1 = round(metrics.silhouette_score(data, clusterNumber, metric=sim), dp)
    silh2 = round(metrics.silhouette_score(outlierRemoveData, outlierRemoveLabel, metric=sim), dp)

    print(clusterType + " ran in " + str(end) + " seconds")
    print("Homogeneity score =",homo)
    print("Silhouette score =",silh1)
    print("Silhouette score with Outlier Remove =",silh2)
    print()

    # Count the distinct non-outlier cluster ids. Using max(clusterNumber)
    # undercounts by one whenever ids start at 0 and is wrong for sparse ids.
    n_clusters = len(set(outlierRemoveLabel))

    result = {"nSample": int(len(hashList)),  # the same list that was scored above
              "Hash": str(hashType),
              "Cluster": str(clusterType),
              "nLabel": int(nlabel),
              "nCluster": int(n_clusters),
              "Time(s)": float(end),
              "Homogeneity": float(homo),
              "Silhouette": float(silh2)
             }
    return result

In [3]:
# Input dataset; change this path to run the experiment on a different file.
datafile = "dataDir/mb_323425.csv"
if not datafile:
    print("you must provide a datafile name (-f)\n")
    sys.exit()

tic = time.perf_counter()  # experiment time counter
df = pd.DataFrame()  # result table, one row appended per clustering run

# Split on the LAST "/" and the LAST "." so that nested directories
# ("a/b/c.csv"), bare file names ("c.csv") and dotted names ("c.v2.csv") all
# work; a plain two-way split() unpack raises ValueError for those.
path, sep, file = datafile.rpartition("/")
if not sep:
    path = "."  # no directory component: use the current directory
filename, dot, filetype = file.rpartition(".")
if not dot:
    filename, filetype = file, ""  # no extension present

# tlsh_csvfile returns (tlshList, [labelList, dateList, ssdeepList])
(tlist, [labelList, dateList, slist]) = tlsh_csvfile(datafile)

hashList = tlist

nlabel = len(set(labelList))
# Candidate cluster counts; later cells append the counts they find.
nClusters = [nlabel]

print("Number of samples is " + str(len(hashList)))
print("Number of Unique Label is " + str(nlabel))
print("Example hash: " + str(hashList[0]))

Number of samples is 323410
Number of Unique Label is 537
Example hash: T10263F782BC80EA22C7C01677FE6F518E331567D8E1EA32429D155FA07A8FC1B0D5B786


In [4]:
# Agglomerative Clustering
try:
    start = time.perf_counter()
    res = assignCluster(hashList, nlabel)
    # `end` is a notebook-level name that getResult() reads as the run time.
    end = round(time.perf_counter() - start, 4)

    # Not named `dict`: that would shadow the builtin for the rest of the session.
    ac_result = getResult("tlsh", "ac", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([ac_result])), ignore_index=True)

except Exception as e:
    # Best effort: report the failure and let the remaining cells run.
    print("Agglomerative Clustering didn't work.")
    print(e)

Agglomerative Clustering didn't work.
Unable to allocate 779. GiB for an array with shape (323410, 323410) and data type float64


In [5]:
# DBSCAN
from pylib.hac_lib import *
try:
    resetDistCalc()

    start = time.perf_counter()
    res = runDBSCAN(hashList, eps=30, min_samples=2, algorithm='auto')
    # `end` is a notebook-level name that getResult() reads as the run time.
    end = round(time.perf_counter() - start, 4)

    # Not named `dict`: that would shadow the builtin for the rest of the session.
    dbscan_result = getResult("tlsh", "dbscan", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dbscan_result])), ignore_index=True)

    # Count distinct non-negative labels (negative ids are outliers, as in
    # getResult). max(labels) undercounts by one when ids start at 0.
    nclusters = len({label for label in res.labels_ if label >= 0})
    nDistCalc = lookupDistCalc()
    #print("nclusters is " + str(nclusters))
    #print("nDistCalc is " + str(nDistCalc))

    # Note: re-running this cell appends another entry to nClusters.
    nClusters.append(nclusters)

    #outfile = path + "/output/" + filename + "_dbscan_out.txt"
    #outputClusters(outfile, hashList, res.labels_, labelList, quiet=True)

except Exception as e:
    # Best effort: report the failure and let the remaining cells run.
    print("DBSCAN didn't work.")
    print(e)

dbscan ran in 56910.678 seconds
Homogeneity score = 0.9383
Silhouette score = 0.4256
Silhouette score with Outlier Remove = 0.7866



In [6]:
# HAC-T
from pylib.hac_lib import *
try:
    hac_resetDistCalc()

    start = time.perf_counter()
    # HAC_T is given the data file path (not hashList) plus two output file names.
    res = HAC_T(datafile, CDist=30, step3=0, outfname="tmp.txt", cenfname="tmp2.txt")
    # `end` is a notebook-level name that getResult() reads as the run time.
    end = round(time.perf_counter() - start, 4)

    # Not named `dict`: that would shadow the builtin for the rest of the session.
    hac_result = getResult("tlsh", "hac-t", labelList, res)
    df = pd.concat((df, pd.DataFrame([hac_result])), ignore_index=True)

    # Count distinct non-negative labels (negative ids are outliers, as in
    # getResult). max(labels) undercounts by one when ids start at 0.
    nclusters = len({label for label in res if label >= 0})
    nDistCalc = hac_lookupDistCalc()
    #print("nclusters is " + str(nclusters))
    #print("nDistCalc is " + str(nDistCalc))

    # Note: re-running this cell appends another entry to nClusters.
    nClusters.append(nclusters)

    #outfile = path + "/output/" + filename + "_hac-t_out.txt"
    #outputClusters(outfile, hashList, res, labelList, quiet=True)

except Exception as e:
    # Best effort: report the failure and let the remaining cells run.
    print("HAC-T didn't work.")
    print(e)

hac-t ran in 3697.8187 seconds
Homogeneity score = 0.9561
Silhouette score = 0.4176
Silhouette score with Outlier Remove = 0.7818



In [None]:
# OPTICS
try:
    start = time.perf_counter()
    res = runOPTICS(hashList, min_samples=2)
    # `end` is a notebook-level name that getResult() reads as the run time.
    end = round(time.perf_counter() - start, 4)

    # Not named `dict`: that would shadow the builtin for the rest of the session.
    optics_result = getResult("tlsh", "optics", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([optics_result])), ignore_index=True)

except Exception as e:
    # Best effort: report the failure and let the remaining cells run.
    print("OPTICS didn't work.")
    print(e)

In [None]:
# Disabled cell: the KMeans run below is wrapped in a bare triple-quoted string,
# so none of it executes. When enabled it runs once per entry in nClusters.
"""
# KMeans
for i in nClusters:
    try:
        start = time.perf_counter()
        res = runKMean(hashList, n_clusters=i)
        end = round(time.perf_counter() - start, 4)

        dict = getResult("tlsh", "kmeans", labelList, res.labels_)
        df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)
        
    except Exception as e:
        print("KMeans didn't work.")
        print(e)
"""

In [None]:
# Disabled cell: the BIRCH run below is wrapped in a bare triple-quoted string,
# so none of it executes. When enabled it runs once per entry in nClusters.
"""
# BIRCH
for i in nClusters:
    try:
        start = time.perf_counter()
        res = runBIRCH(hashList, n_clusters=i)
        end = round(time.perf_counter() - start, 4)

        dict = getResult("tlsh", "birch", labelList, res.labels_)
        df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

    except Exception as e:
        print("BIRCH didn't work.")
        print(e)
"""

In [None]:
# Disabled cell: the Affinity Propagation run below is wrapped in a bare
# triple-quoted string, so none of it executes.
"""
# Affinity Propagation
try:
    start = time.perf_counter()
    res = runAffinityPropagation(hashList, random_state=5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ap", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Affinity Propagation didn't work.")
    print(e)
"""

In [None]:
# Disabled cell: the Mean Shift run below is wrapped in a bare triple-quoted
# string, so none of it executes.
"""
# Mean Shift
try:
    start = time.perf_counter()
    res = runMeanShift(hashList, bandwidth=5)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "ms", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Mean Shift didn't work.")
    print(e)
"""

In [None]:
# Disabled cell: the Spectral Clustering run below is wrapped in a bare
# triple-quoted string, so none of it executes.
"""
# Spectral Clustering
try:
    start = time.perf_counter()
    res = runSpectral(hashList, n_clusters=nlabel)
    end = round(time.perf_counter() - start, 4)

    dict = getResult("tlsh", "sp", labelList, res.labels_)
    df = pd.concat((df, pd.DataFrame([dict])), ignore_index=True)

except Exception as e:
    print("Spectral Clustering didn't work.")
    print(e)
"""

In [None]:
# Output
# Create the output directory first: to_csv() does not create missing parent
# directories, and failing here would discard the results of the whole run.
out_dir = path + "/output"
os.makedirs(out_dir, exist_ok=True)

outfile = out_dir + "/" + filename + "_Tlsh_result.csv"
df.to_csv(outfile, index=False)

# Total wall-clock time since `tic` was set in the data-loading cell.
toc = round(time.perf_counter() - tic, 4)

print("All code ran in " + str(toc) + " seconds")
df