<font size=6>**name: Yonatan Juarez<br>
<font size=5> github: yonatan-juarez-5<br>
HW7: Multi-class and multi-label classification using SVM**

In [31]:
import pandas as pd
import random
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import classification_report, confusion_matrix, f1_score, hamming_loss, silhouette_samples, silhouette_score
from sklearn.svm import SVC, LinearSVC
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.cluster import KMeans
import warnings
warnings.filterwarnings('ignore')

**(1a) Download the Anuram Calls data set**

In [2]:
data_path = '../data/Frogs_MFCCs.csv'
train_data = pd.read_csv(data_path)
train_data

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


In [3]:
train_data_shuffled = train_data.sample(frac=1, random_state=42)

In [4]:
train_data, test_data = train_test_split(train_data_shuffled, test_size=0.3, random_state=42)
x_train = train_data.iloc[:,:-4]
train_family = train_data['Family']
train_genus= train_data['Genus']
train_species= train_data['Species']

x_test = test_data.iloc[:,:-4]
test_family = test_data['Family']
test_genus= test_data['Genus']
test_species= test_data['Species']
train_data.shape

(5036, 26)

**(1b) Each instance has 3 labels: familes, genus, species<br>
Train a classifier for each label (binary relevance)**

**(1bi) Research exact match and hamming score/ loss methods for evaluating multi- label classification and use them in evaluating the classifiers in this problem.**

- In multi-label classification, exact match provides a strict measure of accuracy, requiring all predicted labels to match the true labels for a sample.<br>
- Hamming loss is more relaxed compared to exact match because it considers each label independently. It accounts for situations where some of the<br>
predicted labels may be correct even if not all of them match the true labels. 

**(1bii) Train a SVM for each of the labels, using Gaussian kenrls and one vs all classifiers. Determine the weight of the SVM penalty and the width of the <br>
Gaussian Kernel using 10-fold cross validation.**

In [5]:
def train_svm(classifier, kwargs, x_train, y_train, x_test, y_test):
    model = GridSearchCV(estimator=classifier, **kwargs, n_jobs=-1)
    model.fit(x_train, y_train)

    results = {}

    for res in model.cv_results_['params']:
        for key,value in res.items():
            if key not in results:
                results[key] = []
            results[key].append(value)

    
    mean = np.round(np.array(model.cv_results_['mean_test_score']), 6)
    results['mean_test_score'] = mean

    results_df = pd.DataFrame(results)
    print(f"Best parameter setting: {model.best_params_}")
    print(f"Classification report:\n{classification_report(y_test, model.predict(x_test))}")
    return [results_df, model]

In [6]:
param_grid = { 'C': np.logspace(1, 4,4),
    'gamma': np.logspace(-3,6, 10)}
kwargs ={'scoring':'accuracy',
         'param_grid':param_grid , 
         'cv':StratifiedKFold(10, random_state=5036, shuffle=True)
        }

In [7]:
print("SVM with Gaussian Kernels")
print("Label: Families")
family_result, family_model = train_svm(SVC(kernel='rbf'), kwargs, x_train, train_family, x_test, test_family)
family_result

SVM with Gaussian Kernels
Label: Families
Best parameter setting: {'C': 100.0, 'gamma': 1.0}
Classification report:
                 precision    recall  f1-score   support

      Bufonidae       0.89      0.94      0.91        17
  Dendrobatidae       0.99      1.00      1.00       173
        Hylidae       0.99      0.98      0.99       660
Leptodactylidae       0.99      0.99      0.99      1309

       accuracy                           0.99      2159
      macro avg       0.97      0.98      0.97      2159
   weighted avg       0.99      0.99      0.99      2159



Unnamed: 0,C,gamma,mean_test_score
0,10.0,0.001,0.864975
1,10.0,0.01,0.929306
2,10.0,0.1,0.971404
3,10.0,1.0,0.993249
4,10.0,10.0,0.986696
5,10.0,100.0,0.803217
6,10.0,1000.0,0.63066
7,10.0,10000.0,0.618944
8,10.0,100000.0,0.617753
9,10.0,1000000.0,0.617753


In [8]:
print("Label: Genus")
genus_result, genus_model = train_svm(SVC(kernel='rbf'), kwargs, x_train, train_genus, x_test, test_genus)
genus_result

Label: Genus
Best parameter setting: {'C': 10.0, 'gamma': 1.0}
Classification report:
               precision    recall  f1-score   support

    Adenomera       0.99      1.00      0.99      1219
     Ameerega       0.99      1.00      1.00       173
Dendropsophus       0.95      0.95      0.95        85
    Hypsiboas       0.99      0.99      0.99       496
Leptodactylus       1.00      0.98      0.99        90
Osteocephalus       0.88      0.85      0.86        41
     Rhinella       0.94      0.88      0.91        17
       Scinax       0.97      0.97      0.97        38

     accuracy                           0.99      2159
    macro avg       0.96      0.95      0.96      2159
 weighted avg       0.99      0.99      0.99      2159



Unnamed: 0,C,gamma,mean_test_score
0,10.0,0.001,0.799248
1,10.0,0.01,0.931096
2,10.0,0.1,0.972995
3,10.0,1.0,0.990469
4,10.0,10.0,0.98074
5,10.0,100.0,0.762317
6,10.0,1000.0,0.593923
7,10.0,10000.0,0.5834
8,10.0,100000.0,0.58201
9,10.0,1000000.0,0.58201


In [9]:
print("Label: Species")
species_result, species_model = train_svm(SVC(kernel='rbf'), kwargs, x_train, train_species, x_test, test_species)
species_result

Label: Species
Best parameter setting: {'C': 10.0, 'gamma': 1.0}
Classification report:
                        precision    recall  f1-score   support

        AdenomeraAndre       0.97      0.98      0.97       183
AdenomeraHylaedactylus       1.00      1.00      1.00      1036
    Ameeregatrivittata       1.00      1.00      1.00       173
            HylaMinuta       0.95      0.95      0.95        85
  HypsiboasCinerascens       0.97      0.97      0.97       152
     HypsiboasCordobae       0.99      0.99      0.99       344
   LeptodactylusFuscus       1.00      0.98      0.99        90
 OsteocephalusOophagus       0.85      0.83      0.84        41
     Rhinellagranulosa       0.94      0.88      0.91        17
           ScinaxRuber       0.95      0.97      0.96        38

              accuracy                           0.99      2159
             macro avg       0.96      0.96      0.96      2159
          weighted avg       0.99      0.99      0.99      2159



Unnamed: 0,C,gamma,mean_test_score
0,10.0,0.001,0.845906
1,10.0,0.01,0.942015
2,10.0,0.1,0.978555
3,10.0,1.0,0.992453
4,10.0,10.0,0.976568
5,10.0,100.0,0.699761
6,10.0,1000.0,0.496229
7,10.0,10000.0,0.4863
8,10.0,100000.0,0.484909
9,10.0,1000000.0,0.484909


In [10]:
def multilabel_results(x_test, y_test, classifiers):
    y_pred = pd.DataFrame(columns=y_test.columns)
    for label in y_test.columns:
        model = classifiers[label]
        test_pred = model.predict(x_test)
        y_pred.loc[:,label] = test_pred
    
    misclassification = 0
    exact_match = 0
  
    for truth, pred in zip(y_test.values, y_pred.values):
        misclassification += np.sum(truth!= pred)

    hamming = misclassification/(y_test.shape[0]* y_test.shape[1])

    for truth, pred in zip(y_test.values, y_pred.values):
        if sum(truth == pred) == y_test.shape[1]:
            exact_match += 1
    exact_match /= y_test.shape[0]

    print(f"Hamming score: {hamming:.6f}")
    print(f"Exact match: {exact_match:.6f}")

    return [hamming, exact_match]


In [11]:
gaussian_results = {
    'Family': family_model,
    'Genus': genus_model,
    'Species': species_model
}
print("Evaluation summary:")
gauss_eval = multilabel_results(x_test, test_data.iloc[:,-4:-1], gaussian_results)

Evaluation summary:
Hamming score: 0.011579
Exact match: 0.982399


**(1biii) Repeat (1bii) with L1-penalized SVMs. Standardize the attributes.<br>
Determine the weight of the SVM penalty using 10-fold cross validation.**

In [12]:
param_grid = { 'C': np.logspace(1, 5,10)}
kwargs ={'scoring':'accuracy',
         'param_grid':param_grid , 
         'cv':StratifiedKFold(10, random_state=5036, shuffle=True)
        }

In [13]:
# standardize data
std = StandardScaler()
std_x_train = std.fit_transform(x_train)
std_x_test = std.fit_transform(x_test)

In [14]:
print("SVM with L1-Penalized")
print("Label: Family")
l1_family_result, l1_family_model = train_svm(LinearSVC(penalty='l1', dual=False, max_iter=5000),
                kwargs ,std_x_train, train_family, std_x_test, test_family)

l1_family_result

SVM with L1-Penalized
Label: Family
Best parameter setting: {'C': 27.825594022071243}
Classification report:
                 precision    recall  f1-score   support

      Bufonidae       0.00      0.00      0.00        17
  Dendrobatidae       0.89      0.90      0.89       173
        Hylidae       0.92      0.90      0.91       660
Leptodactylidae       0.95      0.97      0.96      1309

       accuracy                           0.94      2159
      macro avg       0.69      0.69      0.69      2159
   weighted avg       0.93      0.94      0.93      2159



Unnamed: 0,C,mean_test_score
0,10.0,0.935265
1,27.825594,0.935464
2,77.426368,0.935464
3,215.443469,0.935464
4,599.48425,0.935464
5,1668.100537,0.935464
6,4641.588834,0.935464
7,12915.49665,0.935464
8,35938.136638,0.935464
9,100000.0,0.935464


In [15]:
print("Label: Genus")
l1_genus_result, l1_genus_model = train_svm(LinearSVC(penalty='l1', dual=False, max_iter=5000),
                kwargs ,std_x_train, train_genus, std_x_test, test_genus)

l1_genus_result

Label: Genus




Best parameter setting: {'C': 10.0}
Classification report:
               precision    recall  f1-score   support

    Adenomera       0.96      0.99      0.97      1219
     Ameerega       0.95      0.97      0.96       173
Dendropsophus       0.98      0.66      0.79        85
    Hypsiboas       0.92      0.98      0.95       496
Leptodactylus       1.00      0.93      0.97        90
Osteocephalus       1.00      0.27      0.42        41
     Rhinella       1.00      0.29      0.45        17
       Scinax       0.90      0.95      0.92        38

     accuracy                           0.95      2159
    macro avg       0.96      0.75      0.80      2159
 weighted avg       0.95      0.95      0.94      2159



Unnamed: 0,C,mean_test_score
0,10.0,0.95115
1,27.825594,0.950356
2,77.426368,0.950157
3,215.443469,0.950356
4,599.48425,0.950157
5,1668.100537,0.950157
6,4641.588834,0.949959
7,12915.49665,0.950157
8,35938.136638,0.950157
9,100000.0,0.950157


In [16]:
print("Label: Species")
l1_species_result, l1_species_model = train_svm(LinearSVC(penalty='l1', dual=False, max_iter=5000),
                kwargs ,std_x_train, train_species, std_x_test, test_species)
l1_species_result

Label: Species




Best parameter setting: {'C': 10.0}
Classification report:
                        precision    recall  f1-score   support

        AdenomeraAndre       0.89      0.95      0.92       183
AdenomeraHylaedactylus       0.99      1.00      0.99      1036
    Ameeregatrivittata       0.94      0.95      0.95       173
            HylaMinuta       0.94      0.68      0.79        85
  HypsiboasCinerascens       0.93      0.92      0.93       152
     HypsiboasCordobae       0.92      0.97      0.94       344
   LeptodactylusFuscus       1.00      0.93      0.97        90
 OsteocephalusOophagus       1.00      0.46      0.63        41
     Rhinellagranulosa       0.80      0.94      0.86        17
           ScinaxRuber       0.88      0.97      0.93        38

              accuracy                           0.95      2159
             macro avg       0.93      0.88      0.89      2159
          weighted avg       0.96      0.95      0.95      2159



Unnamed: 0,C,mean_test_score
0,10.0,0.959093
1,27.825594,0.958894
2,77.426368,0.958695
3,215.443469,0.958695
4,599.48425,0.958695
5,1668.100537,0.958695
6,4641.588834,0.958695
7,12915.49665,0.958695
8,35938.136638,0.958695
9,100000.0,0.958695


In [17]:
l1_pen_results = {
    'Family': l1_family_model,
    'Genus': l1_genus_model,
    'Species': l1_species_model
}
print("Evaluation summary:")
l1_eval = multilabel_results(std_x_test, test_data.iloc[:,-4:-1], l1_pen_results)

Evaluation summary:
Hamming score: 0.052957
Exact match: 0.913386


**(1biv) Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.**

In [18]:
def run_smote(classifier, kwargs, x_train, y_train, x_test, y_test):
    naive_model = Pipeline([ ('sampling', SMOTE()),
        ('classification', classifier)
    ])
    results, model = train_svm(naive_model, kwargs, x_train, y_train, x_test, y_test)
    
    return results, model

In [19]:
param_grid = { 'classification__C': np.logspace(1, 5,10)}
kwargs ={'scoring':'accuracy',
         'param_grid':param_grid , 
         'cv':StratifiedKFold(10, random_state=5036, shuffle=True)
        }

In [20]:
print("SVM with L1-penalized and SMOTE:")
smote_family_result, smote_family_model = run_smote(LinearSVC(penalty='l1', dual=False, max_iter=5000),
                                                    kwargs, std_x_train, train_family, std_x_test, test_family)
smote_family_result

SVM with L1-penalized and SMOTE:
Best parameter setting: {'classification__C': 4641.588833612777}
Classification report:
                 precision    recall  f1-score   support

      Bufonidae       0.28      0.94      0.43        17
  Dendrobatidae       0.76      0.99      0.86       173
        Hylidae       0.93      0.87      0.90       660
Leptodactylidae       0.97      0.94      0.95      1309

       accuracy                           0.92      2159
      macro avg       0.73      0.93      0.78      2159
   weighted avg       0.94      0.92      0.93      2159



Unnamed: 0,classification__C,mean_test_score
0,10.0,0.919186
1,27.825594,0.920971
2,77.426368,0.920179
3,215.443469,0.920378
4,599.48425,0.918191
5,1668.100537,0.921173
6,4641.588834,0.921965
7,12915.49665,0.921768
8,35938.136638,0.919383
9,100000.0,0.921569


In [21]:
smote_genus_result, smote_genus_model = run_smote(LinearSVC(penalty='l1', dual=False, max_iter=10000),
                                                    kwargs, std_x_train, train_genus, std_x_test, test_genus)
smote_genus_result



Best parameter setting: {'classification__C': 215.44346900318823}
Classification report:
               precision    recall  f1-score   support

    Adenomera       0.98      0.91      0.95      1219
     Ameerega       0.85      0.98      0.91       173
Dendropsophus       0.62      0.87      0.72        85
    Hypsiboas       0.96      0.93      0.94       496
Leptodactylus       0.95      0.93      0.94        90
Osteocephalus       0.53      0.73      0.61        41
     Rhinella       0.36      0.94      0.52        17
       Scinax       0.78      0.95      0.86        38

     accuracy                           0.92      2159
    macro avg       0.75      0.90      0.81      2159
 weighted avg       0.94      0.92      0.92      2159



Unnamed: 0,classification__C,mean_test_score
0,10.0,0.916991
1,27.825594,0.917583
2,77.426368,0.915401
3,215.443469,0.917585
4,599.48425,0.916394
5,1668.100537,0.916195
6,4641.588834,0.916791
7,12915.49665,0.916196
8,35938.136638,0.917187
9,100000.0,0.915798


In [22]:
smote_species_result, smote_species_model = run_smote(LinearSVC(penalty='l1', dual=False, max_iter=10000),
                                                    kwargs, std_x_train, train_species, std_x_test, test_species)
smote_species_result



Best parameter setting: {'classification__C': 599.4842503189409}
Classification report:
                        precision    recall  f1-score   support

        AdenomeraAndre       0.90      0.91      0.91       183
AdenomeraHylaedactylus       1.00      0.99      0.99      1036
    Ameeregatrivittata       0.94      0.94      0.94       173
            HylaMinuta       0.82      0.82      0.82        85
  HypsiboasCinerascens       0.94      0.90      0.92       152
     HypsiboasCordobae       0.97      0.94      0.96       344
   LeptodactylusFuscus       0.94      0.93      0.94        90
 OsteocephalusOophagus       0.76      0.68      0.72        41
     Rhinellagranulosa       0.47      0.94      0.63        17
           ScinaxRuber       0.82      0.95      0.88        38

              accuracy                           0.95      2159
             macro avg       0.86      0.90      0.87      2159
          weighted avg       0.95      0.95      0.95      2159



Unnamed: 0,classification__C,mean_test_score
0,10.0,0.958896
1,27.825594,0.958297
2,77.426368,0.958895
3,215.443469,0.958299
4,599.48425,0.959093
5,1668.100537,0.958299
6,4641.588834,0.958497
7,12915.49665,0.957901
8,35938.136638,0.957505
9,100000.0,0.957702


In [23]:
l1_pen_smote_results = {
    'Family': smote_family_model,
    'Genus': smote_genus_model,
    'Species': smote_species_model
}
print("Evaluation summary:")
l1__smote_eval = multilabel_results(std_x_test, test_data.iloc[:,-4:-1], l1_pen_smote_results)

Evaluation summary:
Hamming score: 0.070866
Exact match: 0.864752


In [28]:
evaluation_dict = {
    'Gaussian SVC (raw)':gauss_eval,
    'SVC L1-Penalized': l1_eval,
    'SVC SMOTE L1-Penalized': l1__smote_eval
}
eval_df = pd.DataFrame(evaluation_dict)
eval_df.index = ['Hamming score', 'Exact match']
print("\tConclusions about classifiers trained:")
eval_df

	Conclusions about classifiers trained:


Unnamed: 0,Gaussian SVC (raw),SVC L1-Penalized,SVC SMOTE L1-Penalized
Hamming score,0.011579,0.052957,0.070866
Exact match,0.982399,0.913386,0.864752


**- Exact match is the highest for Gaussian kernels svm.<br>**
**- L1-penalized svm has a better exact match and higher hamming score<br>**
**- When using SMOTE L1-penalized, exact match is the lowest**

**(2) K-means clustering on a multi-class & multi-label data set**

**Monte-carlo simulation: Perform the following procedures 50 times, and report the avg and standard deviation of the 50 hamming distances that you calculate.**

**(2a) Use k-means clustering on the whole anuran calls (MFCCS). Choose k = {1,2,...,50}**

In [29]:
data_path = '../data/Frogs_MFCCs.csv'
data = pd.read_csv(data_path)

In [35]:
def get_best_k(n, x_train, rand_state):
    best_k = 1
    max_score = 0
    for n in range(2, n+1):
        model = KMeans(n_clusters=n, random_state=rand_state)
        labels = model.fit_predict(x_train)
        avg = silhouette_score(x_train, labels)

        if avg > max_score:
            best_k = n
            max_score = avg
    print(f"Best K: {best_k}")
    return best_k

def get_major_labels(best_k, labels, y_train):
    major = pd.DataFrame(columns=y_train.columns)
    for k in range(best_k):
        i, = np.where(labels == k)
        samples = y_train.iloc[i,:]
        row = []
        for label in y_train.columns:
            current_major = samples.loc[:,label].value_counts().index[0]
            row.append(current_major)
        major.loc[k] = row
    return major

def eval(major, labels, y):
    misclassification_labels = 0
    for i in range(len(major)):
        idx, = np.where(labels == i)
        for label in y.loc[idx].values:
            miss = (label != major.loc[i].values)
            misclassification_labels += np.sum(miss)

    hamming_distance = misclassification_labels/y.shape[0]
    hamming_loss = misclassification_labels/(y.shape[0]*y.shape[1])

    return hamming_distance, hamming_loss


In [36]:
# monte carlo
x = data.iloc[:,:-4]
y = data.iloc[:,-4:-1]
hamming_distance = []
hamming_loss = []
k_val = 50
for k in range(k_val):
    print(f'N = {k}')
    best_k = get_best_k(50, x, k)
    model = KMeans(n_clusters=best_k, random_state=k)
    labels = model.fit_predict(x)

    major_labels = get_major_labels(best_k, labels, y)

    print(major_labels)

    curr_distance, curr_loss = eval(major_labels, labels, y)
    hamming_distance.append(curr_distance)
    hamming_loss.append(curr_loss)

    print(f"Hamming distance: {curr_distance}")
    print(f"Hamming loss: {curr_loss}")
    

N = 0
Best K: 4
            Family      Genus                 Species
0  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
1          Hylidae  Hypsiboas       HypsiboasCordobae
2    Dendrobatidae   Ameerega      Ameeregatrivittata
3          Hylidae  Hypsiboas    HypsiboasCinerascens
Hamming distance: 0.66726893676164
Hamming loss: 0.2224229789205467
N = 1
Best K: 4
            Family      Genus                 Species
0          Hylidae  Hypsiboas       HypsiboasCordobae
1  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
2    Dendrobatidae   Ameerega      Ameeregatrivittata
3          Hylidae  Hypsiboas    HypsiboasCinerascens
Hamming distance: 0.66726893676164
Hamming loss: 0.2224229789205467
N = 2
Best K: 4
            Family      Genus                 Species
0  Leptodactylidae  Adenomera  AdenomeraHylaedactylus
1  Leptodactylidae  Adenomera          AdenomeraAndre
2          Hylidae  Hypsiboas       HypsiboasCordobae
3          Hylidae  Hypsiboas       HypsiboasCordobae
Hammin

In [38]:
print("K-Means results:")
print(f"Avg Hamming distance: {np.mean(hamming_distance):.6f}")
print(f"Avg Hamming loss: {np.mean(hamming_loss):.6f}")
print(f"Avg Hamming score: {1-np.mean(hamming_loss):.6f}")

K-Means results:
Avg Hamming distance: 0.671889
Avg Hamming loss: 0.223963
Avg Hamming score: 0.776037
