# Multi-class and Multi-Label Classification Using Support Vector Machines

## a. Download the Anuran Calls (MFCCs) Data Set
from: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data at random as the training set.

In [105]:
import numpy as np
import pandas as pd
import seaborn as sb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV

from math import log10

In [106]:
# Read the Anuran Calls MFCC table from disk and preview its first rows.
csv_path = "data/Anuran Calls (MFCCs)/Frogs_MFCCs.csv"
anuran_df = pd.read_csv(csv_path)
anuran_df.head()

Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species,RecordID
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre,1
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre,1
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre,1
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre,1
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,...,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre,1


In [107]:
# Keep 70% of the rows for training and hold out the rest for testing.
# A fixed seed makes the split (and every result derived from it) reproducible
# under Restart Kernel -> Run All.
train_df, test_df = train_test_split(anuran_df, train_size=0.7, random_state=42)



## b. Train a classifier for each label
### ii. Train an SVM for each label, using Gaussian kernels and one-versus-all classifiers.

In [124]:
from sklearn.svm import SVC
from sklearn import metrics

In [109]:
def tune_GaussianSVC(X_train, y_train, min_train_score=0.9, random_state=None):
    """Tune ``C`` and ``gamma`` of an RBF-kernel ``SVC``: coarse screen, then grid-search CV.

    Step 1 (screening): fit one SVC per coarse candidate ``C`` (gamma left at its
    default) and one per coarse candidate ``gamma`` (C left at its default). For
    each parameter, keep the span between the first and the last candidate whose
    *training* accuracy is above ``min_train_score``.

    Step 2 (search): build a 10-point log-spaced grid over each span and run
    ``GridSearchCV`` with 10-fold shuffled ``StratifiedKFold`` and accuracy scoring.

    Parameters
    ----------
    X_train : feature matrix accepted by ``SVC.fit``.
    y_train : target labels accepted by ``SVC.fit``.
    min_train_score : float, default 0.9
        Training-accuracy threshold used by the screening step.
    random_state : int or None, default None
        Seed forwarded to ``StratifiedKFold`` so the folds can be reproduced.
        ``None`` keeps the original unseeded behaviour.

    Returns
    -------
    GridSearchCV
        The fitted search object (``best_params_``, ``best_estimator_``, ...).
    """

    def _screen(candidates, build_svc):
        # Training accuracy of one freshly built model per candidate value.
        train_scores = np.array([
            build_svc(value).fit(X_train, y_train).score(X_train, y_train)
            for value in candidates
        ])
        passing = np.flatnonzero(train_scores > min_train_score)
        if passing.size == 0:
            # No candidate cleared the threshold: search the whole coarse range
            # rather than crashing with an IndexError on an empty index array.
            return candidates[0], candidates[-1]
        return candidates[passing[0]], candidates[passing[-1]]

    def _fine_grid(low, high, num=10):
        # When low == high, logspace yields `num` identical values; np.unique
        # collapses them so the same model is not cross-validated repeatedly.
        return np.unique(np.logspace(log10(low), log10(high), num))

    # Determine the range for C.
    c_low, c_high = _screen(
        np.logspace(-5, 8, 10),
        lambda c: SVC(kernel='rbf', C=c),
    )

    # Determine the range for gamma.
    gamma_candidates = np.append(np.logspace(-4, -1, 10), np.logspace(0, 2, 10))
    gamma_low, gamma_high = _screen(
        gamma_candidates,
        lambda g: SVC(kernel='rbf', gamma=g),
    )

    # Cross-validated search over the refined grid.
    param_grid = {
        'C': _fine_grid(c_low, c_high),
        'gamma': _fine_grid(gamma_low, gamma_high),
    }

    kf = StratifiedKFold(n_splits=10, shuffle=True, random_state=random_state)
    # Use a clean estimator instead of whichever model the screening loop
    # happened to fit last; the grid supplies C and gamma.
    grid = GridSearchCV(SVC(kernel='rbf'), param_grid, cv=kf, scoring='accuracy')
    grid.fit(X_train, y_train)

    return grid
    
    

In [110]:
labels = ['Family', 'Genus', 'Species']

# The last four columns are Family, Genus, Species and RecordID; everything
# before them is an MFCC feature. The feature matrices do not depend on the
# label being predicted, so build them once rather than on every iteration.
X_train = train_df.iloc[:, :-4]
X_test = test_df.iloc[:, :-4]

# One column of test-set predictions per label.
pred = pd.DataFrame()

for label in labels:
    y_train = train_df[label]

    print("Train SVM for label '" + str(label) + "':\n")

    gridCV = tune_GaussianSVC(X_train, y_train)
    print('The best parameters are', gridCV.best_params_)
    svc = gridCV.best_estimator_

    pred[label] = svc.predict(X_test)

Train SVM for label 'Family':

The best parameters are {'C': 38.04056104782507, 'gamma': 2.3462288481422626}
Train SVM for label 'Genus':

The best parameters are {'C': 38.04056104782507, 'gamma': 3.3000347911252854}
Train SVM for label 'Species':

The best parameters are {'C': 5.994842503189409, 'gamma': 1.406527242105237}


In [111]:
# `pred` was already created as a DataFrame in the training cell; re-wrapping it
# here keeps the same columns and values.
pred = pd.DataFrame(pred)

In [133]:
# Show the predicted Family / Genus / Species (one column per label) for the test rows.
print("prediction for test data:\n", pred)

prediction for test data:
                Family          Genus                 Species
0     Leptodactylidae      Adenomera  AdenomeraHylaedactylus
1             Hylidae  Dendropsophus              HylaMinuta
2     Leptodactylidae      Adenomera  AdenomeraHylaedactylus
3             Hylidae      Hypsiboas       HypsiboasCordobae
4     Leptodactylidae      Adenomera  AdenomeraHylaedactylus
5     Leptodactylidae      Adenomera  AdenomeraHylaedactylus
6             Hylidae      Hypsiboas       HypsiboasCordobae
7       Dendrobatidae       Ameerega      Ameeregatrivittata
8     Leptodactylidae      Adenomera  AdenomeraHylaedactylus
9     Leptodactylidae      Adenomera          AdenomeraAndre
10    Leptodactylidae      Adenomera  AdenomeraHylaedactylus
11    Leptodactylidae      Adenomera  AdenomeraHylaedactylus
12    Leptodactylidae      Adenomera  AdenomeraHylaedactylus
13    Leptodactylidae  Leptodactylus     LeptodactylusFuscus
14    Leptodactylidae      Adenomera  AdenomeraHylaedactyl

In [140]:
def compute_hamming_loss(pred, y_true=None, label_names=None):
    """Return the Hamming loss of multi-label predictions, averaged over the labels.

    For each label, the per-column loss is obtained from ``metrics.hamming_loss``
    by comparing the true column with the predicted column; the mean of those
    per-label losses is returned.

    Parameters
    ----------
    pred : indexable by label name (e.g. a DataFrame)
        Predicted values, one column per label, rows in the same order as ``y_true``.
    y_true : indexable by label name, optional
        True values. Defaults to the notebook-level ``test_df``.
    label_names : iterable of str, optional
        Labels to evaluate. Defaults to the notebook-level ``labels``.

    Returns
    -------
    float
        Mean of the per-label Hamming losses.
    """
    # Fall back to the notebook globals so existing one-argument calls keep working.
    if y_true is None:
        y_true = test_df
    if label_names is None:
        label_names = labels

    per_label_loss = [
        metrics.hamming_loss(y_true[name], pred[name]) for name in label_names
    ]
    return np.mean(per_label_loss)


print("The hamming loss for Gaussian kernel SVM is", compute_hamming_loss(pred))

The hamming loss for Gaussian kernel SVM is 0.008491585610622202


### iii. Train L1-penalized SVMs.

In [136]:
def tune_linearSVC(X_train, y_train, min_train_score=0.9, random_state=None):
    '''
    Tune the penalty parameter C of an L1-penalized LinearSVC.

    A coarse screen fits one model per C in 1e-5 .. 1e5 and keeps the span between
    the first and last C whose *training* accuracy exceeds `min_train_score`.
    A 20-point log-spaced grid over that span is then searched with shuffled
    stratified k-fold CV (10 folds, or 5 when there are 10 or fewer samples),
    scored by accuracy. The best C and its CV accuracy are printed.

    Parameters:
        X_train: feature matrix accepted by LinearSVC.fit
        y_train: target labels accepted by LinearSVC.fit
        min_train_score: training-accuracy threshold for the coarse screen
            (default 0.9)
        random_state: seed forwarded to StratifiedKFold for reproducible folds;
            None (default) keeps the original unseeded behaviour

    Returns:
        the best estimator chosen by CV
    '''
    coarse_C = np.logspace(-5, 5, 11)
    train_scores = np.array([
        LinearSVC(penalty='l1', C=c, dual=False)
        .fit(X_train, y_train)
        .score(X_train, y_train)
        for c in coarse_C
    ])
    passing = np.flatnonzero(train_scores > min_train_score)
    if passing.size == 0:
        # No candidate cleared the threshold: search the whole coarse range
        # rather than crashing with an IndexError on an empty index array.
        low, high = coarse_C[0], coarse_C[-1]
    else:
        low, high = coarse_C[passing[0]], coarse_C[passing[-1]]

    # Fine grid for cross validation. np.unique collapses the grid to a single
    # value when low == high, so the same model is not evaluated 20 times.
    C_vals = np.unique(np.logspace(log10(low), log10(high), 20))
    param_grid = {'C': C_vals}

    svc = LinearSVC(penalty='l1', dual=False)
    n_splits = 10 if (len(X_train) > 10) else 5

    kf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)
    clf = GridSearchCV(svc, param_grid, cv=kf, scoring='accuracy')
    clf.fit(X_train, y_train)
    print('\tThe best C is', clf.best_params_['C'], ', CV-ed accurracy is', clf.best_score_, '\n')
    return clf.best_estimator_

In [137]:
# The last four columns are Family, Genus, Species and RecordID; everything
# before them is an MFCC feature. The feature matrices do not depend on the
# label being predicted, so build them once rather than on every iteration.
X_train = train_df.iloc[:, :-4]
X_test = test_df.iloc[:, :-4]

# One column of test-set predictions per label, from the L1-penalized linear SVMs.
pred2 = pd.DataFrame()

for label in labels:
    y_train = train_df[label]

    print("Train linear SVM for label '" + str(label) + "':\n")

    ln_svc = tune_linearSVC(X_train, y_train)

    pred2[label] = ln_svc.predict(X_test)

Train linear SVM for label 'Family':

	The best C is 16.23776739188721 , CV-ed accurracy is 0.9326846703733122 

Train linear SVM for label 'Genus':

	The best C is 297.6351441631316 , CV-ed accurracy is 0.9507545671167593 

Train linear SVM for label 'Species':

	The best C is 297.6351441631316 , CV-ed accurracy is 0.9571088165210484 



In [141]:
# Average per-label Hamming loss of the linear-SVM predictions on the test set.
linear_svm_hamming_loss = compute_hamming_loss(pred2)
print("The hamming loss for linear SVM is", linear_svm_hamming_loss)

The hamming loss for linear SVM is 0.050331943801142504
