### K-Means Clustering on a Multi-Class and Multi-Label Data Set


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics
from sklearn.cluster import KMeans
from sklearn.metrics import hamming_loss
import random
import statistics

In [2]:
# Read the frog-call MFCC table from the working directory and show its
# (rows, columns) shape as the cell output.
dataset = pd.read_csv(filepath_or_buffer="Frogs_MFCCs.csv")
dataset.shape

(7195, 26)

In [3]:
# Remove the RecordID column; the cells below use only the first 22 columns
# as features and Family/Genus/Species as labels.
# errors='ignore' keeps this cell idempotent: because `dataset` is rebound
# here, re-running the cell would otherwise raise KeyError once RecordID
# has already been dropped.
dataset = dataset.drop(columns='RecordID', errors='ignore')
dataset.shape

(7195, 25)

In [4]:
# Feature matrix: the first 22 columns of the table (positional slice).
X = dataset.iloc[:, 0:22]
X.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_13,MFCCs_14,MFCCs_15,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.156436,0.082245,0.135752,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.254341,0.022786,0.16332,0.012022,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.237384,0.050791,0.207338,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.317084,-0.011567,0.100413,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,...,-0.298524,0.037439,0.219153,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244


In [5]:
# Label frame: one column per taxonomic level used as a target.
label_selection = ['Family', 'Genus', 'Species']
Y = dataset.loc[:, label_selection]
Y.head()

Unnamed: 0,Family,Genus,Species
0,Leptodactylidae,Adenomera,AdenomeraAndre
1,Leptodactylidae,Adenomera,AdenomeraAndre
2,Leptodactylidae,Adenomera,AdenomeraAndre
3,Leptodactylidae,Adenomera,AdenomeraAndre
4,Leptodactylidae,Adenomera,AdenomeraAndre


In [6]:
# Distinct Family values and how many rows carry each of them.
unique_family = dataset['Family'].unique()
count_family = dataset['Family'].value_counts()
print(unique_family, count_family, sep='\n')

['Leptodactylidae' 'Dendrobatidae' 'Hylidae' 'Bufonidae']
Leptodactylidae    4420
Hylidae            2165
Dendrobatidae       542
Bufonidae            68
Name: Family, dtype: int64


In [7]:
# Distinct Genus values and how many rows carry each of them.
unique_genus = dataset['Genus'].unique()
count_genus = dataset['Genus'].value_counts()
print(unique_genus, count_genus, sep='\n')

['Adenomera' 'Ameerega' 'Dendropsophus' 'Hypsiboas' 'Leptodactylus'
 'Osteocephalus' 'Rhinella' 'Scinax']
Adenomera        4150
Hypsiboas        1593
Ameerega          542
Dendropsophus     310
Leptodactylus     270
Scinax            148
Osteocephalus     114
Rhinella           68
Name: Genus, dtype: int64


In [8]:
# Distinct Species values and how many rows carry each of them.
unique_species = dataset['Species'].unique()
count_species = dataset['Species'].value_counts()
print(unique_species, count_species, sep='\n')

['AdenomeraAndre' 'Ameeregatrivittata' 'AdenomeraHylaedactylus'
 'HylaMinuta' 'HypsiboasCinerascens' 'HypsiboasCordobae'
 'LeptodactylusFuscus' 'OsteocephalusOophagus' 'Rhinellagranulosa'
 'ScinaxRuber']
AdenomeraHylaedactylus    3478
HypsiboasCordobae         1121
AdenomeraAndre             672
Ameeregatrivittata         542
HypsiboasCinerascens       472
HylaMinuta                 310
LeptodactylusFuscus        270
ScinaxRuber                148
OsteocephalusOophagus      114
Rhinellagranulosa           68
Name: Species, dtype: int64


In [9]:
def clustering_montecarlo(X, Y, n_iterations=50, k_values=range(2, 50), seed=None):
    """Repeat K-Means clustering and score majority-vote labels per cluster.

    Each iteration:
      1. fits K-Means once for every k in ``k_values`` (with a freshly drawn
         ``random_state``) and keeps the k with the highest silhouette score;
      2. refits K-Means with that k;
      3. takes, for every cluster, the most frequent Family, Genus and
         Species among its members (the "majority triplet");
      4. predicts that triplet for every member of the cluster and computes
         the Hamming loss per label column, averaged over the three columns.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature rows passed to K-Means.
    Y : pandas.DataFrame
        Must contain the columns 'Family', 'Genus' and 'Species' and share
        its index with ``X`` (the two are concatenated column-wise).
    n_iterations : int, default 50
        Number of Monte-Carlo repetitions.
    k_values : iterable of int, default range(2, 50)
        Candidate cluster counts for the silhouette search. Must be non-empty.
    seed : int or None, default None
        Seed for the generator that draws the K-Means ``random_state``
        values. ``None`` leaves the run unseeded, as before.

    Returns
    -------
    tuple
        ``(best_k, final_hammingScore, family_majority_triplet,
        genus_majority_triplet, species_majority_triplet,
        final_hammingLoss, final_hammingDistance)`` where

        * ``best_k`` is the silhouette-selected k of the LAST iteration
          (``None`` if ``n_iterations`` is 0);
        * the three ``*_majority_triplet`` dicts map the 1-based iteration
          number to the list of majority labels, one entry per cluster;
        * ``final_hammingLoss`` holds the mean per-column Hamming loss of
          each iteration, rounded to 6 decimals;
        * ``final_hammingScore`` holds ``1 - loss`` (unrounded);
        * ``final_hammingDistance`` holds ``3 * loss`` (unrounded), i.e. the
          loss summed over the three label columns.
    """
    # Private generator so an optional seed makes the run repeatable without
    # touching the global `random` state.
    rng = random.Random(seed)

    final_hammingDistance = []
    final_hammingLoss = []
    final_hammingScore = []

    iterations = range(1, n_iterations + 1)
    family_majority_triplet = {p: [] for p in iterations}
    genus_majority_triplet = {p: [] for p in iterations}
    species_majority_triplet = {p: [] for p in iterations}

    # Defined up front so the return statement is valid even with 0 iterations.
    best_k = None

    for cls in iterations:
        # --- 1. choose k by silhouette score --------------------------------
        silhouette_avg = {}
        for k in k_values:
            random_value = rng.randint(0, 900)
            k_means = KMeans(n_clusters=k, init='k-means++',
                             random_state=random_value).fit(X)
            silhouette_avg[k] = metrics.silhouette_score(X, k_means.labels_)
        best_k = max(silhouette_avg, key=silhouette_avg.get)

        # --- 2. final clustering with the selected k ------------------------
        # The original hard-coded n_clusters=4 here, so the silhouette search
        # above never influenced the result.
        random_value = rng.randint(0, 900)
        k_means_f = KMeans(n_clusters=best_k, random_state=random_value).fit(X)
        cluster_labels = k_means_f.labels_

        clusters = pd.concat([X, Y], axis=1)
        # Positional assignment: avoids the index misalignment that
        # concatenating a fresh RangeIndex-ed frame causes when X/Y carry a
        # non-default index.
        clusters['labels'] = cluster_labels

        # --- 3. majority triplet per cluster --------------------------------
        # Iterating over the labels that actually occur (sorted ascending)
        # guarantees every visited cluster has at least one member.
        maj_trip = {}
        for k in np.unique(cluster_labels):
            members = clusters[clusters['labels'] == k]
            family = members['Family'].value_counts().index[0]
            genus = members['Genus'].value_counts().index[0]
            species = members['Species'].value_counts().index[0]
            maj_trip[k] = [family, genus, species]
            family_majority_triplet[cls].append(family)
            genus_majority_triplet[cls].append(genus)
            species_majority_triplet[cls].append(species)

        # --- 4. predict the majority triplet for every cluster member -------
        clusters['family_pred'] = clusters['labels'].map(
            {k: trip[0] for k, trip in maj_trip.items()})
        clusters['genus_pred'] = clusters['labels'].map(
            {k: trip[1] for k, trip in maj_trip.items()})
        clusters['species_pred'] = clusters['labels'].map(
            {k: trip[2] for k, trip in maj_trip.items()})

        fam_s = hamming_loss(clusters['Family'], clusters['family_pred'])
        gen_s = hamming_loss(clusters['Genus'], clusters['genus_pred'])
        spec_s = hamming_loss(clusters['Species'], clusters['species_pred'])

        ham_loss_s = (fam_s + gen_s + spec_s) / 3

        # Only the loss is rounded; score and distance keep full precision
        # (kept as in the original so reported numbers stay comparable).
        final_hammingLoss.append(np.round(ham_loss_s, 6))
        final_hammingScore.append(1 - ham_loss_s)
        final_hammingDistance.append(ham_loss_s * 3)

    return (best_k, final_hammingScore, family_majority_triplet,
            genus_majority_triplet, species_majority_triplet,
            final_hammingLoss, final_hammingDistance)

 

In [10]:
# Run the Monte-Carlo clustering once and unpack its seven results.
(
    best_k,
    final_hammingScore,
    family_majority_triplet,
    genus_majority_triplet,
    species_majority_triplet,
    final_hammingLoss,
    final_hammingDistance,
) = clustering_montecarlo(X, Y)

### 2) a) Mean and standard deviation of the Hamming loss, distance and score over the 50 runs

In [11]:
# Sample standard deviation and mean of each per-iteration metric list.
loss_std = statistics.stdev(final_hammingLoss)
loss_mean = statistics.mean(final_hammingLoss)
distance_std = statistics.stdev(final_hammingDistance)
distance_mean = statistics.mean(final_hammingDistance)
score_std = statistics.stdev(final_hammingScore)
score_mean = statistics.mean(final_hammingScore)

print(f"Standart Deviation of 50 Hamming Loss : {loss_std}")
print(f"Average of the 50 Hamming Loss : {loss_mean}")
print(f"Standart Deviation of 50 Hamming Distance : {distance_std}")
print(f"Average of the 50 Hamming distance : {distance_mean}")
print(f"Standart Deviation of 50 Hamming score : {score_std}")
print(f"Average of the 50 Hamming Score : {score_mean}")



Standart Deviation of 50 Hamming Loss : 0.01240419855213614
Average of the 50 Hamming Loss : 0.22603656
Standart Deviation of 50 Hamming Distance : 0.037212680161971746
Average of the 50 Hamming distance : 0.6781097984711605
Standart Deviation of 50 Hamming score : 0.012404226720657238
Average of the 50 Hamming Score : 0.7739634005096132


In [12]:
# One entry per iteration: (family labels, genus labels, species labels),
# each a list with one majority label per cluster.
fin = list(zip(
    family_majority_triplet.values(),
    genus_majority_triplet.values(),
    species_majority_triplet.values(),
))

# Re-zip each iteration into per-cluster (Family, Genus, Species) triplets,
# keyed by 0-based iteration position. Enumerating `fin` instead of a
# hard-coded range(0, 50) keeps this correct for any number of iterations.
major_trip = {
    j: list(zip(families, genera, species))
    for j, (families, genera, species) in enumerate(fin)
}
        

In [13]:
# Assemble one summary row per Monte-Carlo iteration (numbered 1..50).
# `best_k` is a single value and is broadcast to every row.
i = range(1, 51)
summary_columns = {
    "Iteration": i,
    "Best K": best_k,
    "Hamming Score": final_hammingScore,
    "Hamming Loss": final_hammingLoss,
    "Hamming Distance": final_hammingDistance,
    "Majority triplets (Family,Genus,Species) for every cluster": [*major_trip.values()],
}
final = pd.DataFrame(summary_columns)
final

Unnamed: 0,Iteration,Best K,Hamming Score,Hamming Loss,Hamming Distance,"Majority triplets (Family,Genus,Species) for every cluster"
0,1,4,0.719991,0.280009,0.840028,"[(Hylidae, Hypsiboas, HypsiboasCordobae), (Hyl..."
1,2,4,0.777577,0.222423,0.667269,"[(Leptodactylidae, Adenomera, AdenomeraHylaeda..."
2,3,4,0.777577,0.222423,0.667269,"[(Dendrobatidae, Ameerega, Ameeregatrivittata)..."
3,4,4,0.754737,0.245263,0.735789,"[(Hylidae, Hypsiboas, HypsiboasCordobae), (Lep..."
4,5,4,0.777577,0.222423,0.667269,"[(Dendrobatidae, Ameerega, Ameeregatrivittata)..."
5,6,4,0.777577,0.222423,0.667269,"[(Leptodactylidae, Adenomera, AdenomeraHylaeda..."
6,7,4,0.778226,0.221774,0.665323,"[(Hylidae, Hypsiboas, HypsiboasCinerascens), (..."
7,8,4,0.777577,0.222423,0.667269,"[(Leptodactylidae, Adenomera, AdenomeraHylaeda..."
8,9,4,0.754876,0.245124,0.735372,"[(Hylidae, Hypsiboas, HypsiboasCordobae), (Lep..."
9,10,4,0.777855,0.222145,0.666435,"[(Hylidae, Hypsiboas, HypsiboasCinerascens), (..."
