In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import numpy as np
import seaborn as sns

from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs
from sklearn.decomposition import PCA
from sklearn.metrics import classification_report, f1_score, accuracy_score, precision_score, recall_score, silhouette_samples, silhouette_score, hamming_loss
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_validate
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import MultiLabelBinarizer, LabelEncoder, StandardScaler
from sklearn.svm import SVC, LinearSVC

from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline

## Multi-class and Multi-Label Classification Using Support Vector Machines

### (a) Download the Anuran Calls (MFCCs) Data Set
Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data randomly as the training set.

In [3]:
data = pd.read_csv("data/Frogs_MFCCs.csv")
df = pd.DataFrame(data)
df = df.drop(columns='RecordID') # remove unnecessary col
col_names = df.columns.tolist()
y_vars = ['Family', 'Genus', 'Species']
X_vars = [item for item in col_names if item not in y_vars]

print(col_names)
print(df)

['MFCCs_ 1', 'MFCCs_ 2', 'MFCCs_ 3', 'MFCCs_ 4', 'MFCCs_ 5', 'MFCCs_ 6', 'MFCCs_ 7', 'MFCCs_ 8', 'MFCCs_ 9', 'MFCCs_10', 'MFCCs_11', 'MFCCs_12', 'MFCCs_13', 'MFCCs_14', 'MFCCs_15', 'MFCCs_16', 'MFCCs_17', 'MFCCs_18', 'MFCCs_19', 'MFCCs_20', 'MFCCs_21', 'MFCCs_22', 'Family', 'Genus', 'Species']
      MFCCs_ 1  MFCCs_ 2  MFCCs_ 3  MFCCs_ 4  MFCCs_ 5  MFCCs_ 6  MFCCs_ 7  \
0          1.0  0.152936 -0.105586  0.200722  0.317201  0.260764  0.100945   
1          1.0  0.171534 -0.098975  0.268425  0.338672  0.268353  0.060835   
2          1.0  0.152317 -0.082973  0.287128  0.276014  0.189867  0.008714   
3          1.0  0.224392  0.118985  0.329432  0.372088  0.361005  0.015501   
4          1.0  0.087817 -0.068345  0.306967  0.330923  0.249144  0.006884   
...        ...       ...       ...       ...       ...       ...       ...   
7190       1.0 -0.554504 -0.337717  0.035533  0.034511  0.443451  0.093889   
7191       1.0 -0.517273 -0.370574  0.030673  0.068097  0.402890  0.096628   
719

### Train a classifier for each label

**Exact match and hamming score/ loss methods for evaluating multi-label classification** Exact Match requires that all predicted labels perfectly match the predicted labels in a sample to be considered correct. In this case, the family, genus, and species must all be matched correctly for the data point for exact match to consider it a correct label. On the other hand, hamming score focuses on the porportion of correctly predicted labels, which allows for partial correctness. For example, if family and genus are correct, but species is incorrect, this method would allow us to measure accuracy using partial correctness for a data point. This can be useful for this dataset because if the family and genus are being predicted correctly, this can still be useful by providing us with the interpretibility needed to determine what our model needs work on. We can also use this information to potentially gather more data to feed the model based on what it is getting incorrect. Exact match can give us a good overview and allow us to compare various models, but can sometimes be too strict when dealing with nousy data or when some misclassification is tolerable. (sources 1, 2)

#### Train a SVM for each of the labels
Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation. You are welcome to try to solve the problem with both standardized and raw attributes and report the results.

In [3]:
# chose to use libsvm because we are using Gaussian kernels
# sklearn.svm.SCV wraps libsvm

# Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers 
# Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation
# You can try to solve the problem with both standardized (2) and raw attributes and report the results

#  hierarchical multiclass classification 

# split data
def split_data(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)

    return X_train, X_test, y_train, y_test


def train_svm(df, label, mod, scale=False):

    # prep data - convert to numerical labels, split, scale if needed
    X, y = df[X_vars], df[label]

    # svm needs numerical labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    # try to solve the problem with standardized (2) and raw attributes
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)

    X_train, X_test, y_train, y_test = split_data(X, y_encoded)

    # Determine hyperparameters using 10 fold cross validation
    # parameter C - low, more regularization, some misclassification - high, less regularization, penalizes misclassification heavy, fits data closely with complex boundary, less misclassification
    # parameter gamma - low, far reach, smooth boundary - high, close reach, points affect local region, more complex boundaries
    if mod == "l1":
        param_grid = {
            "C": np.logspace(-3 ,6, num=10),
        }
        grid_search = GridSearchCV(
            LinearSVC(penalty='l1', dual=False, max_iter=15000), 
            param_grid, scoring="f1_macro", cv=10, n_jobs=-1
        )
    elif mod == "rbf":
        param_grid = {
            "C": np.logspace(-3 ,6, num=10),
            "gamma": np.arange(.1, 2.1, .1)
        }
        grid_search = GridSearchCV(
            SVC(kernel='rbf'), 
            param_grid, scoring="f1_macro", cv=10, n_jobs=-1
        )
    else:
        return "Please enter a supported model. You can enter either 'rbf' or 'l1'."

    # train a SVM for each of the labels
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # get cv scores
    best_params = grid_search.best_params_
    f1_cv = grid_search.best_score_

    # get test scores
    f1_test, accuracy_test, precision_test, recall_test, class_report = assess_model(best_model, X_test, y_test, le)

    # return model data
    results = [[mod, label, scale, best_params, f1_cv, f1_test, accuracy_test, precision_test, recall_test]]

    results_df = pd.DataFrame(results, columns=["model", "label", "scaled", "best_params", "f1_CV", "f1_test", "accuracy_test", "precision_test", "recall_test"])
    return results_df, class_report


def assess_model(model, X_test, y_test, le):

    # make predictions using best model
    y_pred = model.predict(X_test)
    f1_test = f1_score(y_test, y_pred, average='macro')
    accuracy_test = accuracy_score(y_test, y_pred)
    precision_test = precision_score(y_test, y_pred, average='macro')
    recall_test = recall_score(y_test, y_pred, average='macro')

    class_report = classification_report(y_test, y_pred, target_names=le.classes_, digits=3)

    return f1_test, accuracy_test, precision_test, recall_test, class_report

In [None]:
print("Unscaled Data")
unscaled_gaussian_svm = []
for label in y_vars:
    results_df, class_report = train_svm(df, label, "rbf")
    print(f"{label} Class Reports - One v. All")
    print(class_report)
    unscaled_gaussian_svm.append(results_df)

unscaled_gaussian_svm_df = pd.concat(unscaled_gaussian_svm, ignore_index=True)
print(unscaled_gaussian_svm_df)

Unscaled Data
Family Class Reports - One v. All
                 precision    recall  f1-score   support

      Bufonidae      1.000     0.909     0.952        22
  Dendrobatidae      0.988     0.994     0.991       162
        Hylidae      0.984     0.990     0.987       626
Leptodactylidae      0.995     0.993     0.994      1349

       accuracy                          0.991      2159
      macro avg      0.992     0.971     0.981      2159
   weighted avg      0.991     0.991     0.991      2159

Genus Class Reports - One v. All
               precision    recall  f1-score   support

    Adenomera      0.993     0.994     0.994      1248
     Ameerega      0.994     0.994     0.994       162
Dendropsophus      0.962     0.944     0.953       108
    Hypsiboas      0.989     0.986     0.988       441
Leptodactylus      0.990     0.980     0.985       101
Osteocephalus      0.848     0.966     0.903        29
     Rhinella      1.000     0.909     0.952        22
       Scinax      

In [None]:
print("Unscaled Data")
scaled_gaussian_svm = []
for label in y_vars:
    results_df, class_report = train_svm(df, label, "rbf", scale=True)
    print(f"{label} Class Reports - One v. All")
    print(class_report)
    scaled_gaussian_svm.append(results_df)

scaled_gaussian_svm_df = pd.concat(scaled_gaussian_svm, ignore_index=True)
print(scaled_gaussian_svm_df)

Unscaled Data
Family Class Reports - One v. All
                 precision    recall  f1-score   support

      Bufonidae      1.000     0.909     0.952        22
  Dendrobatidae      0.994     1.000     0.997       162
        Hylidae      0.978     0.997     0.987       626
Leptodactylidae      0.998     0.990     0.994      1349

       accuracy                          0.992      2159
      macro avg      0.992     0.974     0.983      2159
   weighted avg      0.992     0.992     0.992      2159

Genus Class Reports - One v. All
               precision    recall  f1-score   support

    Adenomera      0.994     0.993     0.994      1248
     Ameerega      1.000     1.000     1.000       162
Dendropsophus      0.971     0.944     0.958       108
    Hypsiboas      0.967     0.991     0.979       441
Leptodactylus      0.980     0.960     0.970       101
Osteocephalus      0.867     0.897     0.881        29
     Rhinella      1.000     0.864     0.927        22
       Scinax      

#### Repeat with L1-penalized SVMs

In [None]:
print("Scaled Data")
scaled_l1_svm = []
for label in y_vars:
    results_df, class_report = train_svm(df, label, "l1", scale=True)
    print(f"{label} Class Reports - One v. All")
    print(class_report)
    scaled_l1_svm.append(results_df)

scaled_l1_svm_df = pd.concat(scaled_l1_svm, ignore_index=True)
print(scaled_l1_svm_df)

Scaled Data


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Family Class Reports - One v. All
                 precision    recall  f1-score   support

      Bufonidae      0.000     0.000     0.000        22
  Dendrobatidae      0.902     0.907     0.905       162
        Hylidae      0.914     0.879     0.896       626
Leptodactylidae      0.943     0.975     0.959      1349

       accuracy                          0.932      2159
      macro avg      0.690     0.690     0.690      2159
   weighted avg      0.922     0.932     0.927      2159

Genus Class Reports - One v. All
               precision    recall  f1-score   support

    Adenomera      0.963     0.989     0.975      1248
     Ameerega      0.926     0.932     0.929       162
Dendropsophus      0.946     0.648     0.769       108
    Hypsiboas      0.909     0.973     0.940       441
Leptodactylus      0.989     0.911     0.948       101
Osteocephalus      0.875     0.483     0.622        29
     Rhinella      0.923     0.545     0.686        22
       Scinax      0.935     0.89

#### Repeat using SMOTE

Looking at the outcomes of our models, as shown in the 'all_svm_df' dataframe, it is clear that the relationships in the data are nonlinear, so the RBF Gaussian kernels are working best for the data. The RBF SVM consistently outperformed all other models across Family, Genus, and Species classifications, with F1 scores above 96%. 

Some methods to improve my results included standardizing the data and dealing with class imbalance. For the rbf models, scaling had minimal impact on the model performance, only leading to a slight imporovement in model assessment. However, scaling was important with the linear models and allowed the models to work properly without returning errors. For example, in the SMOTE model, unscaled data produced many convergence warnings and took almost 4 times longer to complete than the SMOTE model using scaled data. I also got an undefined metric warning when using the L1 penalized model, which often occurs when data is imbalanced or if the model has low recall for some labels. To address this, we used SMOTE class balancing technique. While this did improve F1 scores of the L1 penalized models, it still could not match the overall performance of RBF kernels, suggesting nonlinear boundaries in the data. For best results, RBF SVM models should be used. However, we can also consider scaled SMOTE model, which did not produce any errors or warnings, but the accuracy scores are much lower in this model, so it may not be the best option. 

In [None]:
def train_svm_smote(df, label, mod, scale=False):

    # prep data - convert to numerical labels, split, scale if needed
    X, y = df[X_vars], df[label]

    # svm needs numerical labels
    le = LabelEncoder()
    y_encoded = le.fit_transform(y)

    pipeline_steps = []

    # try to solve the problem with standardized (2) and raw attributes
    if scale:
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
        pipeline_steps.append(('scaler', StandardScaler()))

    X_train, X_test, y_train, y_test = split_data(X, y_encoded)

    pipeline_steps.extend([
        ('smote', SMOTE(random_state=13)),
        ('clf', LinearSVC(penalty='l1', dual=False, max_iter=15000))
    ])

    pipe = Pipeline(pipeline_steps)

    # Hyperparameter grid (for the classifier step)
    param_grid = {
        'clf__C': np.logspace(-3, 6, num=10)
    }

    grid_search = GridSearchCV(
        pipe,
        param_grid,
        scoring="f1_macro",
        cv=10,
        n_jobs=-1
    )

    grid_search.fit(X_train, y_train)

    # train a SVM for each of the labels
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_

    # get cv scores
    best_params = grid_search.best_params_
    f1_cv = grid_search.best_score_

    # get test scores
    f1_test, accuracy_test, precision_test, recall_test, class_report = assess_model(best_model, X_test, y_test, le)

    # return model data
    results = [[mod, label, scale, best_params, f1_cv, f1_test, accuracy_test, precision_test, recall_test]]

    results_df = pd.DataFrame(results, columns=["model", "label", "scaled", "best_params", "f1_CV", "f1_test", "accuracy_test", "precision_test", "recall_test"])
    return results_df, class_report

In [15]:
print("Unscaled Data")
unscaled_smote_l1_svm = []
for label in y_vars:
    results_df, class_report = train_svm_smote(df, label, "l1 - smote", scale=False)
    print(f"{label} Class Reports - One v. All")
    print(class_report)
    unscaled_smote_l1_svm.append(results_df)

unscaled_smote_l1_svm_df = pd.concat(unscaled_smote_l1_svm, ignore_index=True)
print(unscaled_smote_l1_svm_df)

Unscaled Data




Family Class Reports - One v. All
                 precision    recall  f1-score   support

      Bufonidae      0.301     1.000     0.463        22
  Dendrobatidae      0.748     0.969     0.844       162
        Hylidae      0.916     0.839     0.876       626
Leptodactylidae      0.964     0.931     0.947      1349

       accuracy                          0.908      2159
      macro avg      0.732     0.935     0.783      2159
   weighted avg      0.927     0.908     0.914      2159





Genus Class Reports - One v. All
               precision    recall  f1-score   support

    Adenomera      0.984     0.905     0.943      1248
     Ameerega      0.834     0.963     0.894       162
Dendropsophus      0.677     0.815     0.739       108
    Hypsiboas      0.965     0.925     0.944       441
Leptodactylus      0.951     0.960     0.956       101
Osteocephalus      0.417     0.862     0.562        29
     Rhinella      0.375     0.955     0.538        22
       Scinax      0.870     0.979     0.922        48

     accuracy                          0.913      2159
    macro avg      0.759     0.920     0.812      2159
 weighted avg      0.936     0.913     0.920      2159





Species Class Reports - One v. All
                        precision    recall  f1-score   support

        AdenomeraAndre      0.966     0.947     0.956       208
AdenomeraHylaedactylus      0.994     0.990     0.992      1040
    Ameeregatrivittata      0.927     0.944     0.936       162
            HylaMinuta      0.888     0.806     0.845       108
  HypsiboasCinerascens      0.912     0.919     0.916       136
     HypsiboasCordobae      0.956     0.928     0.942       305
   LeptodactylusFuscus      0.951     0.960     0.956       101
 OsteocephalusOophagus      0.750     0.828     0.787        29
     Rhinellagranulosa      0.579     1.000     0.733        22
           ScinaxRuber      0.902     0.958     0.929        48

              accuracy                          0.956      2159
             macro avg      0.883     0.928     0.899      2159
          weighted avg      0.959     0.956     0.957      2159

        model    label  scaled        best_params     f1_CV   f1_t



In [20]:
print("Scaled Data")
scaled_smote_l1_svm = []
for label in y_vars:
    results_df, class_report = train_svm_smote(df, label, "l1 - smote", scale=True)
    print(f"{label} Class Reports - One v. All")
    print(class_report)
    scaled_smote_l1_svm.append(results_df)

scaled_smote_l1_svm_df = pd.concat(scaled_smote_l1_svm, ignore_index=True)
print(scaled_smote_l1_svm_df)

Scaled Data
Family Class Reports - One v. All
                 precision    recall  f1-score   support

      Bufonidae      0.292     0.955     0.447        22
  Dendrobatidae      0.755     0.969     0.849       162
        Hylidae      0.915     0.839     0.875       626
Leptodactylidae      0.964     0.933     0.948      1349

       accuracy                          0.908      2159
      macro avg      0.731     0.924     0.780      2159
   weighted avg      0.927     0.908     0.914      2159

Genus Class Reports - One v. All
               precision    recall  f1-score   support

    Adenomera      0.985     0.907     0.945      1248
     Ameerega      0.843     0.963     0.899       162
Dendropsophus      0.688     0.815     0.746       108
    Hypsiboas      0.960     0.921     0.940       441
Leptodactylus      0.951     0.970     0.961       101
Osteocephalus      0.419     0.897     0.571        29
     Rhinella      0.375     0.955     0.538        22
       Scinax      0.

In [21]:
# comparison of all models
all_svm = [unscaled_gaussian_svm_df, scaled_gaussian_svm_df, scaled_l1_svm_df, unscaled_smote_l1_svm_df, scaled_smote_l1_svm_df]

all_svm_df = pd.concat(all_svm, ignore_index=True)
print(all_svm_df)
all_svm_df.to_csv("svm_model_assessment.csv")

         model    label  scaled                               best_params  \
0          rbf   Family   False  {'C': 10.0, 'gamma': 1.2000000000000002}   
1          rbf    Genus   False                 {'C': 10.0, 'gamma': 1.3}   
2          rbf  Species   False                {'C': 100.0, 'gamma': 2.0}   
3          rbf   Family    True                 {'C': 10.0, 'gamma': 0.1}   
4          rbf    Genus    True                 {'C': 10.0, 'gamma': 0.1}   
5          rbf  Species    True                 {'C': 10.0, 'gamma': 0.1}   
6           l1   Family    True                              {'C': 100.0}   
7           l1    Genus    True                               {'C': 10.0}   
8           l1  Species    True                                {'C': 1.0}   
9   l1 - smote   Family   False                         {'clf__C': 100.0}   
10  l1 - smote    Genus   False                           {'clf__C': 1.0}   
11  l1 - smote  Species   False                          {'clf__C': 10.0}   

## K-Means Clustering on a Multi-Class and Multi-Label Data Set
Monte-Carlo Simulation - Perform the following procedures 50 times, and report the average and standard deviation of the 50 Hamming Distances that you calculate.

After performing the procedures 50 times, the average Hamming Distance was 0.6989881862404448. The Hamming Distance Standard Deviation was 0.13357457314443455. With a moderate standard deviation, we can conclude that most runs of the monte carlo simulation produced fairly similar and consistent results. However, the average hamming distance suggests that the original points are not too close to the predicted points in their cluster, meaning that they may belong to a different cluster. This allows us to assess the reliability of our clusters.

### Use k-means clustering, determine which family is the majority, and calculate the average Hamming distance, Hamming score, and Hamming loss

In [None]:
# k ∈ {1, 2, . . . , 50}
# Choose k automatically based on method provided in slides 
# (CH or Gap Statistics or scree plots or Silhouettes)

def get_best_k(X, k_list):
    # k is an integer ≥ 2 (silhouette score needs at least 2 clusters)
    silhouette_scores = []

    # for k in k_list:
    # kmeans = KMeans(n_clusters=4, random_state=13, n_init="auto").fit(X)
    for k in k_list:

        # Initialize the clusterer with n_clusters value
        clusterer = KMeans(n_clusters=k, random_state=13)
        cluster_labels = clusterer.fit_predict(X)

        # silhouette_score gives average value for all the samples
        # gives a perspective into the density and separation of the formed clusters
        silhouette_avg = silhouette_score(X, cluster_labels)
        silhouette_scores.append([k, silhouette_avg])

    silhouette_score_df = pd.DataFrame(silhouette_scores, columns=["K", "Average Silhouette Score"])
    silhouette_score_df_sorted = silhouette_score_df.sort_values(by='Average Silhouette Score', ascending=False).reset_index(drop=True)
    best_k = int(silhouette_score_df_sorted.iloc[0,0])
    best_silhouette = silhouette_score_df_sorted.iloc[0,1]

    return best_k, best_silhouette

In [None]:
def get_hamming(df):
    # prep data - split X vars and convert to numpy array
    X, y = df[X_vars].to_numpy(), df[y_vars].to_numpy()

    # k is an integer ≥ 2 (silhouette score needs at least 2 clusters)
    k_list= np.arange(2, 51, 1)
    best_k, best_silhouette = get_best_k(X, k_list)

    # In each cluster, determine which family is the majority by reading true labels
    # Repeat for genus and species
    best_clusterer = KMeans(n_clusters=best_k, random_state=13)
    best_cluster_labels = best_clusterer.fit_predict(X)
    df_with_clusters = df.copy()
    df_with_clusters['Cluster'] = best_cluster_labels

    majority_feats = []

    for feat in y_vars:
        family_counts = (
            df_with_clusters.groupby(['Cluster', feat])
            .size()
            .reset_index(name='Count')
        )
        
        majority_with_counts = (
            family_counts
            .sort_values('Count', ascending=False)
            .groupby('Cluster')
            .first()
            .reset_index()
        )
        
        majority_with_counts.insert(1, "Feature", feat)
        majority_with_counts.columns = ["Cluster", "Feature", "Majority", "Count"] 
        majority_feats.append(majority_with_counts)

    majority_feats_df = pd.concat(majority_feats, ignore_index=True)
    majority_feats_by_cluster_df = majority_feats_df.sort_values(by='Cluster', ascending=True).reset_index(drop=True)

    # Now for each cluster you have a majority label triplet (family, genus, species). 
    # Calculate the average Hamming distance, Hamming score, and Hamming loss between the true labels and the labels assigned by clusters

    # hamming metrics used to evaluate model - offers more balanced view than accuracy

    # access the majority labels from the df
    # map clusters to majority and create predicted labels
    cluster_to_majority = (
        majority_feats_by_cluster_df
        .pivot(index='Cluster', columns='Feature', values='Majority')
    )

    # map each data point to its clusters predicted label based on majority
    predicted_labels = df_with_clusters['Cluster'].map(cluster_to_majority.to_dict('index'))
    pred_df = pd.DataFrame(predicted_labels.tolist(), columns=y_vars)

    # to compute hamming data, need numerical labels of true and predicted vals
    y_true = np.empty_like(df[y_vars].values, dtype=int)
    y_pred = np.empty_like(pred_df.values, dtype=int)

    for i, col in enumerate(y_vars):
        le = LabelEncoder()
        y_true[:, i] = le.fit_transform(df[col])
        y_pred[:, i] = le.transform(pred_df[col])

    # get hamming metrics
    n_samples, n_labels = y_true.shape

    hl = (y_true != y_pred).sum() / (n_samples * n_labels) # hamming loss - fraction of labels incorrectly predicted
    hd = (y_true != y_pred).sum(axis=1).mean() # hamming distance - total numb of differing labels between true and predicted
    hs = (y_true == y_pred).all(axis=1).mean() # hamming score (a.k.a. subset accuracy) - proportion of label sets that are exactly matched

    results = [best_k, best_silhouette, hl, hd, hs]

    return results

def monte_carlo(df, n_simulations):
    results = []
    np.random.seed(15)

    for i in range(n_simulations):
        df_shuffled = df.sample(frac=1).reset_index(drop=True)

        # run model + evaluate
        result = get_hamming(df_shuffled)
        results.append(result)

    monte_carlo_simulation_results = pd.DataFrame(results, columns=["Best K", "Silhouette Score", "Hamming Loss", "Hamming Distance", "Hamming Score"])

    return monte_carlo_simulation_results


In [13]:
monte_carlo_results_df = monte_carlo(df, 50)
print(monte_carlo_results_df)

    Best K  Silhouette Score  Hamming Loss  Hamming Distance  Hamming Score
0        2          0.348678      0.298541          0.895622       0.636136
1        4          0.378634      0.222469          0.667408       0.755386
2        4          0.383873      0.234051          0.702154       0.727033
3        5          0.369951      0.199398          0.598193       0.773593
4        4          0.384611      0.247533          0.742599       0.685893
5        4          0.383873      0.234051          0.702154       0.727033
6        4          0.374710      0.280009          0.840028       0.688534
7        4          0.378751      0.222423          0.667269       0.755525
8        6          0.380286      0.187306          0.561918       0.796386
9        2          0.348678      0.298541          0.895622       0.636136
10       4          0.377931      0.186055          0.558165       0.760250
11       6          0.380012      0.188001          0.564003       0.763586
12       3  

In [14]:
# report the average and standard deviation of the 50 Hamming Distances that you calculate
average_hm = monte_carlo_results_df["Hamming Distance"].mean()
std_hm = monte_carlo_results_df["Hamming Distance"].std()

print(f"Hamming Distance Avg: {average_hm}")
print(f"Hamming Distance Standard Deviation: {std_hm}")


Hamming Distance Avg: 0.6989881862404448
Hamming Distance Standard Deviation: 0.13357457314443455
