In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC, LinearSVC
from sklearn.metrics import hamming_loss, accuracy_score, silhouette_score
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy.stats import mode
from collections import Counter
import warnings
warnings.filterwarnings("ignore")

# 1(a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics. uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data randomly as the training set.

In [2]:
mfcc_df = pd.read_csv("../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv")
mfcc_df

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.118680,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.056510,-0.035303,0.020140,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.023590,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.172700,0.266434,...,-0.048885,-0.053074,-0.088550,-0.031346,0.108610,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7190,1.0,-0.554504,-0.337717,0.035533,0.034511,0.443451,0.093889,-0.100753,0.037087,0.081075,...,0.069430,0.071001,0.021591,0.052449,-0.021860,-0.079860,Hylidae,Scinax,ScinaxRuber,60
7191,1.0,-0.517273,-0.370574,0.030673,0.068097,0.402890,0.096628,-0.116460,0.063727,0.089034,...,0.061127,0.068978,0.017745,0.046461,-0.015418,-0.101892,Hylidae,Scinax,ScinaxRuber,60
7192,1.0,-0.582557,-0.343237,0.029468,0.064179,0.385596,0.114905,-0.103317,0.070370,0.081317,...,0.082474,0.077771,-0.009688,0.027834,-0.000531,-0.080425,Hylidae,Scinax,ScinaxRuber,60
7193,1.0,-0.519497,-0.307553,-0.004922,0.072865,0.377131,0.086866,-0.115799,0.056979,0.089316,...,0.051796,0.069073,0.017963,0.041803,-0.027911,-0.096895,Hylidae,Scinax,ScinaxRuber,60


In [3]:
train_df, test_df = train_test_split(mfcc_df, train_size=0.7, random_state=24)
train_df_x = train_df.iloc[:, :-4]
test_df_x = test_df.iloc[:, :-4]
y_train_df = train_df[["Family", "Genus","Species"]]
y_test_df = test_df[["Family", "Genus","Species"]]

# (b) Each instance has three labels: Families, Genus, and Species. Each of the labels has multiple classes. We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-label classification is to train a classifier for each label (binary relevance). We first try this approach:


# i. Research exact match and hamming score/ loss methods for evaluating multi- label classification and use them in evaluating the classifiers in this problem.

Hamming Loss refers to the fraction of labels that were incorrectly predicted relative to the total amount of labels that were predicted. The hamming score refers to the opposite of this as (i.e.) fractions of labels that were correctly predicted relative to the total amount of labels predicted.


The exact match method refers to the fraction of correctly predicted exact set of labels. Exact match does not take into account labels that were partially correct in a multiclass problem.

# ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation.1 You are welcome to try to solve the problem with both standardized 2 and raw attributes and report the results.

In [4]:
exact_match_gaussian = []
loss_gaussian = []
for label in ["Family","Genus","Species"]:
    parameters = {'C': np.logspace(-3, 3, 7), 'gamma': np.logspace(-4, 4, 9)}
    model = SVC(kernel='rbf',decision_function_shape='ovr')
    gaussian_svm = GridSearchCV(model, parameters, cv=10, scoring='accuracy',n_jobs=-1)
    gaussian_svm.fit(train_df_x, y_train_df[[label]].values.ravel())
    predictions = gaussian_svm.predict(test_df_x)
    cv_results = gaussian_svm.cv_results_
    loss_value = hamming_loss(y_test_df[[label]].values.ravel(), predictions)
    exact_match_ratio = 1-loss_value
    exact_match_gaussian.append(exact_match_ratio)
    loss_gaussian.append(loss_value)
    
    result_df = pd.DataFrame(cv_results["params"])
    result_df["mean_test_score"] = cv_results["mean_test_score"]
    
    pd.set_option('display.max_rows', None) 
    print("Results for",label)
    display(result_df)

    print("Best Parameters:", gaussian_svm.best_params_)
    print("Best Test Score:", gaussian_svm.best_score_)
    print("Hamming Loss",loss_value)
    print("Exact Match",exact_match_ratio)
    print("------------------------------")

Results for Family


Unnamed: 0,C,gamma,mean_test_score
0,0.001,0.0001,0.606434
1,0.001,0.001,0.606434
2,0.001,0.01,0.606434
3,0.001,0.1,0.606434
4,0.001,1.0,0.606434
5,0.001,10.0,0.606434
6,0.001,100.0,0.606434
7,0.001,1000.0,0.606434
8,0.001,10000.0,0.606434
9,0.01,0.0001,0.606434


Best Parameters: {'C': 100.0, 'gamma': 1.0}
Best Test Score: 0.9928516204361134
Hamming Loss 0.009263547938860583
Exact Match 0.9907364520611395
------------------------------
Results for Genus


Unnamed: 0,C,gamma,mean_test_score
0,0.001,0.0001,0.56811
1,0.001,0.001,0.56811
2,0.001,0.01,0.56811
3,0.001,0.1,0.56811
4,0.001,1.0,0.56811
5,0.001,10.0,0.56811
6,0.001,100.0,0.56811
7,0.001,1000.0,0.56811
8,0.001,10000.0,0.56811
9,0.01,0.0001,0.56811


Best Parameters: {'C': 10.0, 'gamma': 1.0}
Best Test Score: 0.9902710719808135
Hamming Loss 0.012042612320518759
Exact Match 0.9879573876794813
------------------------------
Results for Species


Unnamed: 0,C,gamma,mean_test_score
0,0.001,0.0001,0.475377
1,0.001,0.001,0.475377
2,0.001,0.01,0.475377
3,0.001,0.1,0.475377
4,0.001,1.0,0.475377
5,0.001,10.0,0.475377
6,0.001,100.0,0.475377
7,0.001,1000.0,0.475377
8,0.001,10000.0,0.475377
9,0.01,0.0001,0.475377


Best Parameters: {'C': 10.0, 'gamma': 1.0}
Best Test Score: 0.9904690902205813
Hamming Loss 0.010189902732746642
Exact Match 0.9898100972672533
------------------------------


Exact match values for each label seem to be very high indicating there is a high proportion of predictions that were exactly matched to their samples. The greatest was seen for Family.

Although not a metric of interest, I am utilizing average of hamming loss and exact match score for each label within each model to compare model performances.

In [5]:
average_hamming_gaussian = np.mean(loss_value)
average_exact_match = np.mean(exact_match_gaussian)

# iii. Repeat 1(b)ii with L1-penalized SVMs.3 Remember to standardize4 the at- tributes. Determine the weight of the SVM penalty using 10 fold cross vali- dation.

In [6]:
exact_match_l1 = []
loss_l1 = []
for label in ["Family","Genus","Species"]:
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(train_df_x)
    x_test_scaled = scaler.transform(test_df_x)

    parameters = {'C': np.logspace(-2, 6, 9)}
    model = LinearSVC(penalty='l1', multi_class="ovr", dual=False, random_state=1,max_iter=10000)
    l1_svm = GridSearchCV(model, parameters, cv=10, scoring='accuracy')
    l1_svm.fit(x_train_scaled, y_train_df[[label]].values.ravel() )
    predictions = l1_svm.predict(x_test_scaled)
    cv_results = l1_svm.cv_results_
    loss = hamming_loss(y_test_df[[label]].values.ravel(), predictions)
    exact_match_ratio = 1 - loss

    exact_match_l1.append(exact_match_ratio)
    loss_l1.append(loss)
    
    result_df = pd.DataFrame(cv_results["params"])
    result_df["mean_test_score"] = cv_results["mean_test_score"]
    
    pd.set_option('display.max_rows', None) 
    print("Results for",label)
    display(result_df)

    print("Best Parameters:", l1_svm.best_params_)
    print("Best Score:", l1_svm.best_score_)
    print("Hamming Loss:",loss)
    print("Exact Match Ratio:",exact_match_ratio)
    print("---------------------------------")

Results for Family


Unnamed: 0,C,mean_test_score
0,0.01,0.922161
1,0.1,0.929109
2,1.0,0.929706
3,10.0,0.929705
4,100.0,0.929705
5,1000.0,0.929705
6,10000.0,0.929705
7,100000.0,0.929705
8,1000000.0,0.929705


Best Parameters: {'C': 1.0}
Best Score: 0.929705891634321
Hamming Loss: 0.058823529411764705
Exact Match Ratio: 0.9411764705882353
---------------------------------
Results for Genus


Unnamed: 0,C,mean_test_score
0,0.01,0.910646
1,0.1,0.938246
2,1.0,0.950953
3,10.0,0.951944
4,100.0,0.95254
5,1000.0,0.95254
6,10000.0,0.95254
7,100000.0,0.95254
8,1000000.0,0.95254


Best Parameters: {'C': 100.0}
Best Score: 0.9525395247562247
Hamming Loss: 0.04539138490041686
Exact Match Ratio: 0.9546086150995832
---------------------------------
Results for Species


Unnamed: 0,C,mean_test_score
0,0.01,0.911837
1,0.1,0.949565
2,1.0,0.956116
3,10.0,0.957306
4,100.0,0.957902
5,1000.0,0.957901
6,10000.0,0.957901
7,100000.0,0.957901
8,1000000.0,0.957901


Best Parameters: {'C': 100.0}
Best Score: 0.9579017955757518
Hamming Loss: 0.037054191755442334
Exact Match Ratio: 0.9629458082445577
---------------------------------


In [7]:
average_loss_l1 = np.mean(loss_l1)
average_exact_match_l1 = np.mean(exact_match_l1)

# iv. Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

In [8]:
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import make_pipeline

exact_match_l1_imbalance = []
loss_l1_imbalance = []
for label in ["Family","Genus","Species"]:
    scaler = StandardScaler()
    x_train_scaled = scaler.fit_transform(train_df_x)
    x_test_scaled = scaler.transform(test_df_x)

    parameters = {'linearsvc__C': np.logspace(-2, 6, 9)}
    model = LinearSVC(penalty='l1', multi_class='ovr', dual=False, random_state=1, max_iter=20000)
    smote_pipeline = make_pipeline(SMOTE(random_state=42,sampling_strategy='minority'), model)
    l1_smote_svm = GridSearchCV(smote_pipeline, parameters, cv=10, scoring='accuracy',n_jobs=-1)
    l1_smote_svm.fit(x_train_scaled, y_train_df[[label]].values.ravel())
    predictions_smote = l1_smote_svm.predict(x_test_scaled)
    loss_smote = hamming_loss(y_test_df[[label]].values.ravel(), predictions_smote)
    exact_match = 1 - loss_smote

    exact_match_l1_imbalance.append(exact_match)
    loss_l1_imbalance.append(loss)
    
    print("Results for",label)
    print("Best Parameters:", l1_smote_svm.best_params_)
    print("Best Score:", l1_smote_svm.best_score_)
    print("Hamming Loss:", loss_smote)
    print("Exact Match Ratio:", exact_match)
    print("--------------------------------")


Results for Family
Best Parameters: {'linearsvc__C': 100.0}
Best Score: 0.929305910568336
Hamming Loss: 0.059286706808707734
Exact Match Ratio: 0.9407132931912923
--------------------------------




Results for Genus
Best Parameters: {'linearsvc__C': 1.0}
Best Score: 0.947178831771277
Hamming Loss: 0.05048633626679018
Exact Match Ratio: 0.9495136637332098
--------------------------------




Results for Species
Best Parameters: {'linearsvc__C': 10.0}
Best Score: 0.9535343494588027
Hamming Loss: 0.041222788327929596
Exact Match Ratio: 0.9587772116720704
--------------------------------


In [9]:
average_loss_l1_imbalance = np.mean(loss_l1_imbalance)
average_exact_match_l1_imbalance = np.mean(exact_match_l1_imbalance)

# Comparison

In [10]:
comparison_df = pd.DataFrame([{"Avg Gaussian Hamming Loss":average_hamming_gaussian,
                              "Avg L1 SVM Hamming Loss": average_loss_l1,
                              "Avg L1 SVM with SMOTE Hamming Loss":average_loss_l1_imbalance}])
comparison_df 

Unnamed: 0,Avg Gaussian Hamming Loss,Avg L1 SVM Hamming Loss,Avg L1 SVM with SMOTE Hamming Loss
0,0.01019,0.04709,0.037054


On average, the hamming loss was the least for the Gaussian SVM when compared to L1 Penalty SVM and L1 Penalty SVM using SMOTE

In [11]:
comparison_df = pd.DataFrame([{"Avg Gaussian Exact Match":average_exact_match,
                              "Avg L1 SVM Exact Match": average_exact_match_l1,
                              "Avg L1 SVM with SMOTE Exact Match":average_exact_match_l1_imbalance}])
comparison_df 

Unnamed: 0,Avg Gaussian Exact Match,Avg L1 SVM Exact Match,Avg L1 SVM with SMOTE Exact Match
0,0.989501,0.95291,0.949668


On avergae, the exact match ratio was highest for the Guassian SVM as well when compared ot the L1 SVM and L1 SVM with smote values.

# 2. K-Means Clustering on a Multi-Class and Multi-Label Data Set Monte-Carlo Simulation: Perform the following procedures 50 times, and report the average and standard deviation of the 50 Hamming Distances that you calculate.

# (a) Use k-means clustering on the whole Anuran Calls (MFCCs) Data Set (do not split the data into train and test, as we are not performing supervised learning in this exercise). Choose k ∈ {1, 2, . . . , 50} automatically based on one of the methods provided in the slides (CH or Gap Statistics or scree plots or Silhouettes) or any other method you know.

# (b) In each cluster, determine which family is the majority by reading the true labels. Repeat for genus and species.

# (c) Now for each cluster you have a majority label triplet (family, genus, species). Calculate the average Hamming distance, Hamming score, and Hamming loss5 between the true labels and the labels assigned by clusters.


The below code was combined to answer the above questions. The best K found was 4.

In [None]:
mfcc_df_full = pd.read_csv("../data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv")
overall_loss = []
overall_distance = []
overall_score = []
k_values = range(2, 51)
for repetition in range(50):
    features_mfcc = mfcc_df_full.iloc[:, :-4] 
    silhouette_scores = []
    for k in k_values:
        kmeans = KMeans(n_clusters=k, random_state=1)
        clusters = kmeans.fit_predict(features_mfcc)
        sil_avg = silhouette_score(features_mfcc, clusters)
        silhouette_scores.append(sil_avg)
    best_k = k_values[np.argmax(silhouette_scores)]
    print(f"Iteration {repetition + 1}: Best k is {best_k}")
    
    kmeans = KMeans(n_clusters=best_k)
    clusters = kmeans.fit_predict(features_mfcc)
    mfcc_df = mfcc_df_full.copy() 
    mfcc_df['Cluster'] = clusters

    maj_family = {}
    maj_genus = {}
    maj_species = {}
    for cluster in range(best_k):
        cluster_subset = mfcc_df[mfcc_df['Cluster'] == cluster]
        fam_counts = Counter(cluster_subset['Family'])
        maj_family[cluster] = fam_counts.most_common(1)[0][0]
        genus_counts = Counter(cluster_subset['Genus'])
        maj_genus[cluster] = genus_counts.most_common(1)[0][0]
        
        species_counts = Counter(cluster_subset['Species'])
        maj_species[cluster] = species_counts.most_common(1)[0][0]
    
        print(f"Cluster {cluster}:")
        print(f"Majority Family: {maj_family[cluster]}")
        print(f"Majority Genus: {maj_genus[cluster]}")
        print(f"Majority Species: {maj_species[cluster]}\n")

    assigned_labels = pd.DataFrame(index=mfcc_df.index, columns=['Family','Genus','Species'])
    for cluster in range(best_k):
        ind = mfcc_df[mfcc_df['Cluster'] == cluster].index
        assigned_labels.loc[ind,'Family'] = maj_family[cluster]
        assigned_labels.loc[ind,'Genus'] = maj_genus[cluster]
        assigned_labels.loc[ind,'Species'] = maj_species[cluster]

    hamming_score = np.mean(np.equal(mfcc_df[['Family','Genus','Species']].values, assigned_labels.values))
    print(f"Hamming Score: {hamming_score}")
    
    overall_score.append(hamming_score)
    
    hamming_loss = 1 - hamming_score
    print(f"Hamming Loss: {hamming_loss}")
    overall_loss.append(hamming_loss)

    hamming_distance = hamming_loss * 3
    print(f"Hamming Distance: {hamming_distance}\n")
    overall_distance.append(hamming_distance)

average_loss = np.mean(overall_loss)
average_distance = np.mean(overall_distance)
average_score = np.mean(overall_score)
print("The Average Hamming Distance is",average_distance)
print("The Average Hamming Loss is",average_loss)
print("The Average Hamming Score is",average_score)

# References

https://www.w3schools.com/python/python_ml_k-means.asp

https://scikit-learn.org/stable/modules/svm.html#classification

https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html#sklearn.svm.LinearSVC