In [2]:
import pandas as pd
from sklearn.metrics import silhouette_score, calinski_harabasz_score, davies_bouldin_score
from ast import literal_eval

# Evaluation for K-Means

In [8]:
# Load Clustering results
kMeansDF = pd.read_csv("./Output Clusters/k-means-3-cluster-output.csv" , index_col=[0], converters={"Transaction": literal_eval, "Vector":literal_eval})
kMeansDF.head()

Unnamed: 0,Transaction,Vector,Cluster
0,"[salty snack, pastry, whole milk]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
1,"[yogurt, semi-finished bread, whole milk, saus...","[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",2
2,"[soda, pickled vegetables]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3,"[misc. beverages, canned beer]","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",1
4,"[hygiene articles, sausage]","[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",1


In [10]:
# Getting Sihoulette Score
silhouette_score(list(kMeansDF["Vector"]), list(kMeansDF["Cluster"]))

0.08738963332039071

In [17]:
# Getting Calinski Harabasz Score
calinski_harabasz_score(list(kMeansDF["Vector"]), list(kMeansDF["Cluster"]))

697.3075224829968

In [13]:
# Getting Sihoulette Score
davies_bouldin_score(list(kMeansDF["Vector"]), list(kMeansDF["Cluster"]))

3.2287140484389583

In [33]:
sil_dict = {}
cal_dict = {}
dav_dict = {}

for i in range(5,10,1):
    sil_dict["book-{}-k-means".format(i)] = {}
    cal_dict["book-{}-k-means".format(i)] = {}
    dav_dict["book-{}-k-means".format(i)] = {}
    for j in range(3,16,3):
        
        # Read Output CSV
        KMeansDF = pd.read_csv("./Output Clusters/Book/k_means_0.0{}_{}_output.csv".format(i,j) , index_col=[0], names=["Description", "Vector", "Cluster"], converters={"Description": literal_eval, "Vector":literal_eval})

        try:
            # Getting Sihoulette Score
            sil_dict["book-{}-k-means".format(i)][str(j)] = silhouette_score(list(KMeansDF["Vector"]), list(KMeansDF["Cluster"]))

            # Getting Calinski Harabasz Score
            cal_dict["book-{}-k-means".format(i)][str(j)] = calinski_harabasz_score(list(KMeansDF["Vector"]), list(KMeansDF["Cluster"]))

            # Getting Sihoulette Score
            dav_dict["book-{}-k-means".format(i)][str(j)] = davies_bouldin_score(list(KMeansDF["Vector"]), list(KMeansDF["Cluster"]))
        
        except ValueError as e:
            sil_dict["book-{}-k-means".format(i)][str(j)] = "-"
            cal_dict["book-{}-k-means".format(i)][str(j)] = "-"
            dav_dict["book-{}-k-means".format(i)][str(j)] = "-"

In [34]:
pd.DataFrame(sil_dict).transpose()

Unnamed: 0,3,6,9,12,15
book-5-k-means,0.243541,0.218641,0.130394,0.148731,0.094476
book-6-k-means,0.256968,0.228965,0.12996,0.064824,0.124476
book-7-k-means,0.272556,0.251505,0.227168,0.188123,0.157513
book-8-k-means,0.287085,0.254424,0.206226,0.190647,0.177563
book-9-k-means,0.30361,0.285995,0.224013,0.195805,0.114281


In [35]:
pd.DataFrame(cal_dict).transpose()

Unnamed: 0,3,6,9,12,15
book-5-k-means,233.086029,145.915909,114.668814,96.198089,84.767701
book-6-k-means,274.075852,174.192823,138.471386,116.960668,102.383978
book-7-k-means,329.936637,213.055326,167.187322,142.147842,124.43316
book-8-k-means,367.196393,241.367191,192.649397,163.830556,145.014854
book-9-k-means,436.05125,290.168884,236.417468,203.02987,180.383478


In [36]:
pd.DataFrame(dav_dict).transpose()

Unnamed: 0,3,6,9,12,15
book-5-k-means,3.589646,3.604397,3.573086,3.280994,3.612359
book-6-k-means,3.310797,3.30684,3.329126,3.106905,3.135875
book-7-k-means,3.091545,2.96792,2.792711,2.84254,2.710633
book-8-k-means,2.926753,2.826358,2.716446,2.627282,2.568417
book-9-k-means,2.672216,2.504513,2.390347,2.537327,2.23513


### Finding ground-truth scores

In [54]:
"".join(cat_df["categories"][0])

'Fiction'

In [56]:
cat_df = pd.read_csv("../Apriori/Dataset/book_category.csv", converters={"categories": literal_eval} )
cat_df["categories"] = cat_df["categories"].apply(lambda x: "".join(x))
cat_df.head()

Unnamed: 0,categories
0,Fiction
1,Architecture
2,History
3,Fiction
4,Fiction


In [90]:
kMeansDF = pd.read_csv("./Output Clusters/Book/k_means_0.09_15_output.csv" , index_col=[0], names=["Description", "Vector", "Cluster"], converters={"Transaction": literal_eval, "Vector":literal_eval})
kMeansDF = pd.concat([kMeansDF, cat_df], axis=1)
kMeansDF.head()

Unnamed: 0,Description,Vector,Cluster,categories
0,"['rug', 'sackett', 'famili', 'stood', 'breed',...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,Fiction
1,"['allinclus', 'guid', 'design', 'vacat', 'retr...","[0, 0, 1, 1, 1, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",11,Architecture
2,"['world', 'war', 'two', 'luftwaff', 'aircraft'...","[0, 0, 0, 2, 0, 0, 1, 1, 3, 0, 0, 0, 0, 0, 0, ...",13,History
3,"['fiction', 'histori', 'fourthcenturi', 'irish...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, ...",11,Fiction
4,"['1941', 'killakeet', 'island', 'windswept', '...","[0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 2, 1, 1, 1, 1, ...",14,Fiction


In [67]:
cluster_mapping = {}
for cluster in kMeansDF["Cluster"].unique().tolist():
    cluster_mapping[cluster] = kMeansDF[kMeansDF["Cluster"] == cluster]["categories"].value_counts()

categories
Fiction                      1074
Juvenile Fiction              486
Religion                      341
History                       247
Biography & Autobiography     212
Health & Fitness               69
Family & Relationships         68
Art                            57
Sports & Recreation            49
Foreign Language Study         38
Architecture                   19
Gardening                      18
Name: count, dtype: int64

In [113]:
kMeansDF[kMeansDF["Cluster"] == 9]["categories"].value_counts()#.index

categories
Fiction                      106
Biography & Autobiography     25
Juvenile Fiction               7
Sports & Recreation            3
Religion                       3
History                        3
Health & Fitness               2
Name: count, dtype: int64

In [3]:
# Accuracy Calculator
import pandas as pd

def get_accuracy(file_1, file_2, isBook):
    """Calculate accuracy by comparing cluster labels of two files"""
    if isBook:
        directory = "./Output Clusters/Book/"
    else:
        directory = "./Output Clusters/Basket/"
    
    # Read CSVs
    file_1_df = pd.read_csv( directory + file_1, index_col=[0],names=["Description", "Vector", "Cluster 1"])
    file_2_df = pd.read_csv( directory + file_2, index_col=[0],names=["Description", "Vector", "Cluster 2"])
    
    concat_df = pd.concat([file_1_df, file_2_df], axis=1)
    
    # Calculate Accuracy based on Cluster Column
    concat_df["Accuracy"] = concat_df["Cluster 1"] == concat_df["Cluster 2"]
    
    # Return Accuracy Score
    return concat_df["Accuracy"].value_counts(normalize=True)

In [30]:
book_accuracy_dict = {}
basket_accuracy_dict = {}
for cluster in range(3,16,3):
    book_accuracy_dict[str(cluster)] = {}
    basket_accuracy_dict[str(cluster)] = {}
    for min_supp in range(5,16,1): # BOOK
        min_supp /= 100
        book_accuracy_dict[str(cluster)][str(min_supp)] = get_accuracy("k_means_{}_{}_output.csv".format(min_supp, cluster), "k_means_full_{}_output.csv".format(cluster),True).to_dict()
    for min_supp in range(1,6,1) : # BASKET
        min_supp /= 100
        basket_accuracy_dict[str(cluster)][str(min_supp)] =get_accuracy("k_means_{}_{}_output.csv".format(min_supp, cluster), "k_means_full_{}_output.csv".format(cluster),False).to_dict()

In [31]:
book_accuracy_dict['3']['0.05'][True]

0.0766

In [32]:
accuracy = {}
# for cluster in book_accuracy_dict:
#     accuracy[cluster] = {}
#     accuracy[cluster]["0"] = [(book_accuracy_dict[cluster]["0.05"][True]+book_accuracy_dict[cluster]["0.06"][True]+book_accuracy_dict[cluster]["0.07"][True]+book_accuracy_dict[cluster]["0.08"][True])/4,
#                               (book_accuracy_dict[cluster]["0.05"][False]+book_accuracy_dict[cluster]["0.06"][False]+book_accuracy_dict[cluster]["0.07"][False]+book_accuracy_dict[cluster]["0.08"][False])/4]
#     accuracy[cluster]["1"] = [(book_accuracy_dict[cluster]["0.09"][True]+book_accuracy_dict[cluster]["0.1"][True]+book_accuracy_dict[cluster]["0.11"][True]+book_accuracy_dict[cluster]["0.12"][True])/4,
#                               (book_accuracy_dict[cluster]["0.09"][False]+book_accuracy_dict[cluster]["0.1"][False]+book_accuracy_dict[cluster]["0.11"][False]+book_accuracy_dict[cluster]["0.12"][False])/4]
#     accuracy[cluster]["2"] = [(book_accuracy_dict[cluster]["0.13"][True]+book_accuracy_dict[cluster]["0.14"][True]+book_accuracy_dict[cluster]["0.15"][True])/3,
#                               (book_accuracy_dict[cluster]["0.13"][False]+book_accuracy_dict[cluster]["0.14"][False]+book_accuracy_dict[cluster]["0.15"][False])/3]

for cluster in basket_accuracy_dict:
    accuracy[cluster] = {}
    accuracy[cluster]["0"] = [(basket_accuracy_dict[cluster]["0.01"][True]+basket_accuracy_dict[cluster]["0.02"][True])/2,
                                (basket_accuracy_dict[cluster]["0.01"][False]+basket_accuracy_dict[cluster]["0.02"][False])/2]
    accuracy[cluster]["1"] = [(basket_accuracy_dict[cluster]["0.03"][True]+basket_accuracy_dict[cluster]["0.04"][True])/2,
                                (basket_accuracy_dict[cluster]["0.03"][False]+basket_accuracy_dict[cluster]["0.04"][False])/2]
    accuracy[cluster]["2"] = [basket_accuracy_dict[cluster]["0.05"][True],basket_accuracy_dict[cluster]["0.05"][False]]
print(accuracy)

{'3': {'0': [0.4261511728931364, 0.5738488271068636], '1': [0.49675867138942725, 0.5032413286105728], '2': [0.11187596070306757, 0.8881240392969324]}, '6': {'0': [0.12089821559847624, 0.8791017844015238], '1': [0.3375325803649001, 0.6624674196350999], '2': [0.06636369711956158, 0.9336363028804384]}, '9': {'0': [0.10693042839002874, 0.8930695716099712], '1': [0.23177170353538729, 0.7682282964646128], '2': [0.0463142417964312, 0.9536857582035688]}, '12': {'0': [0.05700728463543407, 0.942992715364566], '1': [0.12480785938648667, 0.8751921406135132], '2': [0.028470226558845152, 0.9715297734411549]}, '15': {'0': [0.050591458932032346, 0.9494085410679676], '1': [0.07077457729065027, 0.9292254227093497], '2': [0.06014836596939117, 0.9398516340306088]}}


In [29]:
basket_accuracy_dict

{'3': {'0.01': {True: 0.7460402325736818, False: 0.25395976742631826},
  '0.02': {True: 0.7460402325736818, False: 0.25395976742631826},
  '0.03': {True: 0.7460402325736818, False: 0.25395976742631826},
  '0.04': {True: 0.7460402325736818, False: 0.25395976742631826},
  '0.05': {True: 0.7460402325736818, False: 0.25395976742631826}},
 '6': {'0.01': {False: 0.9433937044710285, True: 0.056606295528971465},
  '0.02': {False: 0.9433937044710285, True: 0.056606295528971465},
  '0.03': {False: 0.9433937044710285, True: 0.056606295528971465},
  '0.04': {False: 0.9433937044710285, True: 0.056606295528971465},
  '0.05': {False: 0.9433937044710285, True: 0.056606295528971465}},
 '9': {'0.01': {False: 0.9127180378266391, True: 0.08728196217336096},
  '0.02': {False: 0.9127180378266391, True: 0.08728196217336096},
  '0.03': {False: 0.9127180378266391, True: 0.08728196217336096},
  '0.04': {False: 0.9127180378266391, True: 0.08728196217336096},
  '0.05': {False: 0.9127180378266391, True: 0.08728196

# Evaluation for DBScan

In [18]:
# Load Clustering results
dbScanDF = pd.read_csv("./Output Clusters/Book/dbscan_0.0{}_{}_output.csv".format(5,0.5) , index_col=[0], names=["Description", "Vector", "Cluster"], converters={"Description": literal_eval, "Vector":literal_eval})
dbScanDF.head()

Unnamed: 0,Description,Vector,Cluster
0,"[rug, sackett, famili, stood, breed, apart, fe...","[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[allinclus, guid, design, vacat, retreat, hosp...","[0, 0, 2, 1, 2, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, ...",0
2,"[world, war, two, luftwaff, aircraft, pilot, s...","[0, 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, ...",0
3,"[fiction, histori, fourthcenturi, irish, monk,...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
4,"[1941, killakeet, island, windswept, outer, ba...","[0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",-1


In [19]:
# Getting Sihoulette Score
silhouette_score(list(dbScanDF["Vector"]), list(dbScanDF["Cluster"]))

-0.04678420607482126

In [14]:
# Getting Calinski Harabasz Score
calinski_harabasz_score(list(dbScanDF["Vector"]), list(dbScanDF["Cluster"]))

5.765933014135346

In [15]:
# Getting Sihoulette Score
davies_bouldin_score(list(dbScanDF["Vector"]), list(dbScanDF["Cluster"]))

2.4151104569121853

In [29]:
sil_dict = {}
cal_dict = {}
dav_dict = {}

for i in range(5,10,1):
    sil_dict["book-{}".format(i)] = {}
    cal_dict["book-{}".format(i)] = {}
    dav_dict["book-{}".format(i)] = {}
    for j in range(10,51,5):
        j /= 100
        
        # Read Output CSV
        dbScanDF = pd.read_csv("./Output Clusters/Book/dbscan_0.0{}_{}_output.csv".format(i,j) , index_col=[0], names=["Description", "Vector", "Cluster"], converters={"Description": literal_eval, "Vector":literal_eval})

        try:
            # Getting Sihoulette Score
            sil_dict["book-{}".format(i)][str(j)] = silhouette_score(list(dbScanDF["Vector"]), list(dbScanDF["Cluster"]))

            # Getting Calinski Harabasz Score
            cal_dict["book-{}".format(i)][str(j)] = calinski_harabasz_score(list(dbScanDF["Vector"]), list(dbScanDF["Cluster"]))

            # Getting Sihoulette Score
            dav_dict["book-{}".format(i)][str(j)] = davies_bouldin_score(list(dbScanDF["Vector"]), list(dbScanDF["Cluster"]))
        
        except ValueError as e:
            sil_dict["book-{}".format(i)][str(j)] = "-"
            cal_dict["book-{}".format(i)][str(j)] = "-"
            dav_dict["book-{}".format(i)][str(j)] = "-"

In [30]:
pd.DataFrame(sil_dict).transpose()

Unnamed: 0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5
book-5,-0.343905,-0.344695,-0.33342,-0.328482,-0.272339,-0.260714,-0.26238,-0.161783,-0.065702
book-6,-0.343668,-0.341067,-0.323939,-0.312285,-0.105451,-0.271215,-0.220687,-0.086474,-0.242031
book-7,-0.325775,-0.322066,-0.329626,-0.319768,0.018018,-0.016076,-0.10832,-0.254966,-0.283248
book-8,-0.29781,-0.298917,-0.327635,-0.324965,-0.009781,-0.109162,-0.237618,-0.26296,-0.26296
book-9,-0.281084,-0.306494,-0.321878,-0.213905,-0.166446,-0.216568,-0.220111,-0.220111,-0.220111


In [31]:
pd.DataFrame(cal_dict).transpose()

Unnamed: 0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5
book-5,3.400456,2.968857,2.985354,5.525633,12.808625,12.786376,6.838096,5.063745,9.395574
book-6,2.381124,2.809465,5.309863,5.606625,18.463977,7.575156,8.094107,8.666053,17.573602
book-7,2.814564,3.781388,3.771143,5.35451,27.144224,16.266177,22.406561,32.667293,34.797558
book-8,2.89236,4.792751,4.41882,4.134588,8.637037,26.457468,43.687259,47.556761,47.556761
book-9,5.865369,6.963983,5.42042,8.330591,61.106767,77.462961,78.239265,78.239265,78.239265


In [32]:
pd.DataFrame(dav_dict).transpose()

Unnamed: 0,0.1,0.15,0.2,0.25,0.3,0.35,0.4,0.45,0.5
book-5,2.121804,2.35252,2.371653,2.372556,2.979637,4.322082,4.479357,9.146329,11.644301
book-6,2.220535,2.257321,2.375476,2.484931,5.522135,4.762072,10.245109,6.826574,3.904613
book-7,2.052546,2.149731,2.412308,2.579195,11.722062,13.154044,7.126223,2.906185,2.354735
book-8,1.977967,2.12394,2.342516,2.801202,10.088414,6.82031,2.777111,2.255888,2.255888
book-9,1.938843,2.214234,3.030593,5.97348,3.432303,2.175628,2.101011,2.101011,2.101011


# Evaluation for Hierarchical Clustering

In [80]:
# Accuracy Calculator
import pandas as pd
import numpy as np
import os

def get_accuracy(file_1, file_2):
    """Calculate accuracy by comparing cluster labels of two files"""
    if "book" in file_1 and "book" in file_2:
        directory = "./Output Clusters/Book/"
    elif "basket" in file_1 and "basket" in file_2:
        directory = "./Output Clusters/Basket/"
    else:
        raise Exception("File names do not match")
    
    # Read CSVs
    file_1_df = pd.read_csv( directory + file_1, index_col=[0])
    file_2_df = pd.read_csv( directory + file_2, index_col=[0])
    
    # Rename Columns
    file_1_df.columns = ["Description", "Vector", "Cluster 1"]
    file_2_df.columns = ["Description", "Vector", "Cluster 2"]
    
    concat_df = pd.concat([file_1_df, file_2_df], axis=1)
    
    # Calculate Accuracy
    concat_df["Same"] = concat_df["Cluster 1"] == concat_df["Cluster 2"]
    
    # iterate same column and calculate accuracy
    accuracy = 0
    for i in concat_df["Same"]:
        if i:
            accuracy += 1
    accuracy /= len(concat_df)
    return accuracy

def get_all_accuracy(data_type):
    """Get all accuracy of the same data type and save as csv"""
    
    if data_type == "book":
        dir = "./Dataset/Book/"
        output_dir = "./Output Clusters/Book/"
    elif data_type == "basket":
        dir = "./Dataset/Basket/"
        output_dir = "./Output Clusters/Basket/"
    else:
        raise Exception("Data Type not found")
    
    accuracy_list = []
    
    # Get all files in directory
    freq_list = []
    orig_list = []
    
    files = sorted(os.listdir(output_dir))
    for f in files:
        if "freq_vector_hc_" in f:
            freq_list.append(f)
        elif "vector_hc_" in f:
            orig_list.append(f)
            
    # Sort orig_list on accending order of num_clusters 
    orig_list = sorted(orig_list, key=lambda x: int(x.split("_")[3].split(".")[0]))
    
    print(freq_list)
    print(orig_list)
    
    df = pd.DataFrame(accuracy_list, columns=["Original", "Freq", "Accuracy"])
    for f in orig_list:
        f_config = f.split("_")[-1].split(".")[0]
        # print(f_config)
        accuracy_list = []
        for f2 in freq_list:
            f2_config = f2.split("_")[-1].split(".")[0]
            # print(f2_config)
            if f_config == f2_config:
                accuracy = get_accuracy(f, f2)
                accuracy_list.append([f, f2, accuracy])
        df = df.append(accuracy_list, ignore_index=True)
    print(accuracy_list)
    
    df.to_csv("{}/{}_accuracy.csv".format(output_dir, data_type))
    return df.head()
    
get_all_accuracy("basket") 
get_all_accuracy("book")
# get_accuracy("basket_vector_hc_3.csv", "basket_0.05_freq_vector_hc_15.csv")

['basket_0.01_freq_vector_hc_12.csv', 'basket_0.01_freq_vector_hc_15.csv', 'basket_0.01_freq_vector_hc_3.csv', 'basket_0.01_freq_vector_hc_6.csv', 'basket_0.01_freq_vector_hc_9.csv', 'basket_0.02_freq_vector_hc_12.csv', 'basket_0.02_freq_vector_hc_15.csv', 'basket_0.02_freq_vector_hc_3.csv', 'basket_0.02_freq_vector_hc_6.csv', 'basket_0.02_freq_vector_hc_9.csv', 'basket_0.03_freq_vector_hc_12.csv', 'basket_0.03_freq_vector_hc_15.csv', 'basket_0.03_freq_vector_hc_3.csv', 'basket_0.03_freq_vector_hc_6.csv', 'basket_0.03_freq_vector_hc_9.csv', 'basket_0.04_freq_vector_hc_12.csv', 'basket_0.04_freq_vector_hc_15.csv', 'basket_0.04_freq_vector_hc_3.csv', 'basket_0.04_freq_vector_hc_6.csv', 'basket_0.04_freq_vector_hc_9.csv', 'basket_0.05_freq_vector_hc_12.csv', 'basket_0.05_freq_vector_hc_15.csv', 'basket_0.05_freq_vector_hc_3.csv', 'basket_0.05_freq_vector_hc_6.csv', 'basket_0.05_freq_vector_hc_9.csv']
['basket_vector_hc_3.csv', 'basket_vector_hc_6.csv', 'basket_vector_hc_9.csv', 'basket_ve

  df = df.append(accuracy_list, ignore_index=True)
  df = df.append(accuracy_list, ignore_index=True)
  df = df.append(accuracy_list, ignore_index=True)
  df = df.append(accuracy_list, ignore_index=True)
  df = df.append(accuracy_list, ignore_index=True)


[['basket_vector_hc_15.csv', 'basket_0.01_freq_vector_hc_15.csv', 0.034351400120296735], ['basket_vector_hc_15.csv', 'basket_0.02_freq_vector_hc_15.csv', 0.45345184789146564], ['basket_vector_hc_15.csv', 'basket_0.03_freq_vector_hc_15.csv', 0.4275880505246274], ['basket_vector_hc_15.csv', 'basket_0.04_freq_vector_hc_15.csv', 0.28623939049655817], ['basket_vector_hc_15.csv', 'basket_0.05_freq_vector_hc_15.csv', 0.024527167011962843]]
['book_0.05_freq_vector_hc_12.csv', 'book_0.05_freq_vector_hc_15.csv', 'book_0.05_freq_vector_hc_3.csv', 'book_0.05_freq_vector_hc_6.csv', 'book_0.05_freq_vector_hc_9.csv', 'book_0.06_freq_vector_hc_12.csv', 'book_0.06_freq_vector_hc_15.csv', 'book_0.06_freq_vector_hc_3.csv', 'book_0.06_freq_vector_hc_6.csv', 'book_0.06_freq_vector_hc_9.csv', 'book_0.07_freq_vector_hc_12.csv', 'book_0.07_freq_vector_hc_15.csv', 'book_0.07_freq_vector_hc_3.csv', 'book_0.07_freq_vector_hc_6.csv', 'book_0.07_freq_vector_hc_9.csv', 'book_0.08_freq_vector_hc_12.csv', 'book_0.08_

  df = df.append(accuracy_list, ignore_index=True)
  df = df.append(accuracy_list, ignore_index=True)
  df = df.append(accuracy_list, ignore_index=True)
  df = df.append(accuracy_list, ignore_index=True)


[['book_vector_hc_15.csv', 'book_0.05_freq_vector_hc_15.csv', 0.1116], ['book_vector_hc_15.csv', 'book_0.06_freq_vector_hc_15.csv', 0.053], ['book_vector_hc_15.csv', 'book_0.07_freq_vector_hc_15.csv', 0.0044], ['book_vector_hc_15.csv', 'book_0.08_freq_vector_hc_15.csv', 0.0948], ['book_vector_hc_15.csv', 'book_0.09_freq_vector_hc_15.csv', 0.022], ['book_vector_hc_15.csv', 'book_0.11_freq_vector_hc_15.csv', 0.0458], ['book_vector_hc_15.csv', 'book_0.12_freq_vector_hc_15.csv', 0.034], ['book_vector_hc_15.csv', 'book_0.13_freq_vector_hc_15.csv', 0.0152], ['book_vector_hc_15.csv', 'book_0.14_freq_vector_hc_15.csv', 0.044], ['book_vector_hc_15.csv', 'book_0.15_freq_vector_hc_15.csv', 0.216], ['book_vector_hc_15.csv', 'book_0.1_freq_vector_hc_15.csv', 0.0252]]


  df = df.append(accuracy_list, ignore_index=True)


Unnamed: 0,Original,Freq,Accuracy,0,1,2
0,,,,book_vector_hc_3.csv,book_0.05_freq_vector_hc_3.csv,0.0994
1,,,,book_vector_hc_3.csv,book_0.06_freq_vector_hc_3.csv,0.1004
2,,,,book_vector_hc_3.csv,book_0.07_freq_vector_hc_3.csv,0.1432
3,,,,book_vector_hc_3.csv,book_0.08_freq_vector_hc_3.csv,0.0978
4,,,,book_vector_hc_3.csv,book_0.09_freq_vector_hc_3.csv,0.0476


In [9]:
# Load Clustering results
hierDF3 = pd.read_csv("./Output Clusters/hierarchical_3_clusters.csv" , index_col=[0], converters={"Transaction": literal_eval, "Vector":literal_eval})
hierDF3.head()

Unnamed: 0,Transaction,Vector,Cluster
0,"[salty snack, pastry, whole milk]","[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
1,"[yogurt, semi-finished bread, whole milk, saus...","[0, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",0
2,"[soda, pickled vegetables]","[0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1
3,"[misc. beverages, canned beer]","[0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, ...",0
4,"[hygiene articles, sausage]","[0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, ...",0


In [8]:
# # Getting Sihoulette Score
# silhouette_score(list(hierDF["Vector"]), list(hierDF["Cluster"]))

# # Getting Calinski Harabasz Score
# calinski_harabasz_score(list(hierDF["Vector"]), list(hierDF["Cluster"]))

# # Getting Davies Bouldin Score
# davies_bouldin_score(list(hierDF["Vector"]), list(hierDF["Cluster"]))

print("silhouette_score: ", silhouette_score(list(hierDF["Vector"]), list(hierDF["Cluster"])))
print("calinski_harabasz_score: ", calinski_harabasz_score(list(hierDF["Vector"]), list(hierDF["Cluster"])))
print("davies_bouldin_score: ", davies_bouldin_score(list(hierDF["Vector"]), list(hierDF["Cluster"])))

silhouette_score:  -0.03153850100284397
calinski_harabasz_score:  224.23528231574852
davies_bouldin_score:  2.503086108335078
