In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# No Missing Values in the Dataset.
# Loading the dataset

path = "/Users/sumanthkakani/Documents/EE559-Mathematical-Pattern-Recognition/Anuran Calls Multi Label Multi Class Classification"
Dataset = pd.read_csv(path+"/Anuran Calls (MFCCs)/" + "Frogs_MFCCs.csv")
col_ind = Dataset.columns.to_numpy()

# Splitting (Randomly) the dataset into trainset and testset:
# Trainset: 70% of data
# Testset: 30 % of data

train,test = train_test_split(Dataset,train_size = 0.7, test_size= 0.3)



In [7]:


from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn import preprocessing
from sklearn.multiclass import OneVsRestClassifier

## Multi-Label Multi-Class Classification
# Multi-Class Classification: One Vs All Classifier

# For Multi Label Classification, Binary Relavance method is implemented. We train the data for each label seperately

# For the label "Family" 

le1 = preprocessing.LabelEncoder()
fam_lab    = le1.fit(train['Family'])
fam_labels = le1.transform(train['Family']);
    

# SVM with RBF Kernel is being implemented.
print("RBF Kernel:")
print("***********")
    
print("Training the SVC Classifier for 'Family' classification")


# Gridsearch strategy is implemented for choosing the parameters 'C' and 'Gamma'.

# C and Gamma values
C =[0.001, 10, 100, 1000, 10000, 100000] 
gamma = [0.0001,,0.01, 1 , 10, 100,10000]


parameters = {'gamma':(gamma), 'C': (C)}
svc1 = SVC(kernel = 'rbf')

print("Finding C and Gamma for the classifier")
clf1 = GridSearchCV(svc1, parameters, cv=10).fit(train.iloc[:,0:22], fam_labels)

print("Best C and Gamma for 'Family' label");
print(clf1.best_params_)

print("Training Score Obtained for 'Family' Classifier")
print(clf1.best_score_)

# Multi Class Classification
svc1 = OneVsRestClassifier(clf1.best_estimator_).fit(train.iloc[:,0:22], fam_labels)

print("******************************")


RBF Kernel:
***********
Training the SVC Classifier for 'Family' classification
Finding C and Gamma for the classifier
Best C and Gamma for 'Family' label
{'C': 100, 'gamma': 1}
Training Score Obtained for 'Family' Classifier
0.9930500397140588
******************************


In [8]:
## Now the classifiers are trained for other two labels



# For Genus Classification
le2 = preprocessing.LabelEncoder()
genus_lab    = le2.fit(train['Genus'])
genus_labels = le2.transform(train['Genus']);

parameters = {'gamma':(gamma), 'C': (C)}
svc2 = SVC(kernel = 'rbf')

print("Training the SVC Classifier for 'Genus' classification")

print("Finding C and Gamma for the classifier...")
clf2 = GridSearchCV(svc2, parameters, cv=10).fit(train.iloc[:,0:22], genus_labels)

print("Best C and Gamma for 'Genus' label");
print(clf2.best_params_)

print("Training Score Obtained for 'Genus' Classifier")
print(clf2.best_score_)

svc2 = OneVsRestClassifier(clf2.best_estimator_).fit(train.iloc[:,0:22], genus_labels)

print("******************************")


## For Species Classification

le3 = preprocessing.LabelEncoder()
species_lab    = le3.fit(train['Species'])
species_labels = le3.transform(train['Species']);

print("Training the SVC Classifier for 'Species' classification")
    
parameters = {'gamma':(gamma), 'C': (C)}
svc3 = SVC(kernel = 'rbf')

print("Finding C and Gamma for the classifier...")

clf3 = GridSearchCV(svc3, parameters, cv=10).fit(train.iloc[:,0:22], species_labels)


print("Best C and Gamma for 'Species' label");
print(clf3.best_params_)

print("Training Score Obtained for 'Species' Classifier")
print(clf3.best_score_)

svc3 = OneVsRestClassifier(clf3.best_estimator_).fit(train.iloc[:,0:22], species_labels)

print("*******************")



Training the SVC Classifier for 'Genus' classification
Finding C and Gamma for the classifier...
Best C and Gamma for 'Genus' label
{'C': 1000, 'gamma': 1}
Training Score Obtained for 'Genus' Classifier
0.9912629070691025
******************************
Training the SVC Classifier for 'Species' classification
Finding C and Gamma for the classifier...
Best C and Gamma for 'Species' label
{'C': 100, 'gamma': 1}
Training Score Obtained for 'Species' Classifier
0.9914614773629865
*******************


In [9]:
#Evaluation of the Model on the Testing set

## Multi Label Classification Models can be evaluated in two ways

# 1) Exact Match :  (#samples for which all the three labels are correctly classified)/(#samples)

# 2) Hamming Loss : (This loss counts all the misclassied lab)

family_test_labels = le1.transform(test['Family'])
genus_test_labels =  le2.transform(test['Genus'])
species_test_labels = le3.transform(test['Species'])

pred1 = clf1.predict(test.iloc[:,0:22])
pred2 = clf2.predict(test.iloc[:,0:22])
pred3 = clf3.predict(test.iloc[:,0:22])

# Exact Loss Implementation

count =0
for i in list(range(1,np.shape(test)[0])):
    if (pred1[i] == family_test_labels[i] and pred2[i] == genus_test_labels[i]  and pred3[i] == species_test_labels[i]):
        count = count+1;
Exact_Match = (np.shape(test)[0]-count)/2159
print("Exact Match:",Exact_Match)

# Hamming Loss Implementation

Score1 = clf1.score(test.iloc[:,0:22],family_test_labels )
Score2 = clf2.score(test.iloc[:,0:22],genus_test_labels)
Score3 = clf3.score(test.iloc[:,0:22],species_test_labels )
Hamming_Loss  = 1 - (Score1+Score2+Score3)/3
print("Hamming Loss:",Hamming_Loss)


Exact Match: 0.015284854099119963
Hamming Loss: 0.010498687664041939


In [105]:
# L1 Penalized SVMS. 

# It has only one parameter, C.

# Multi Label Strategy: Binary Relavance
# Multi Class: One vs Rest

from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')

print("Linear Kernel SVMs")
C =[0.00001,0.001,1,10,1000,100000]
parameters = {'C': (C)}

svc4 = LinearSVC(penalty='l1',dual= False,multi_class='ovr')

print("Training the SVC Classifier for 'Family' classification")
print("Finding C for the classifier...")
clf4 = GridSearchCV(svc4, parameters, cv=10).fit(train.iloc[:,0:22], fam_labels)

print("Best Parameter")
print(clf4.best_params_)
print("Best Score")
print(clf4.best_score_)

clf4 = clf4.best_estimator_.fit(train.iloc[:,0:22], fam_labels)

print("*******************")

print("Training the SVC Classifier for 'Genus' classification")
print("Finding C for the classifier...")


C =[1,10,1000,100000,1000000,10000000]

clf5 = GridSearchCV(svc4, parameters, cv=10).fit(train.iloc[:,0:22], genus_labels)

print("Best Parameter")
print(clf5.best_params_)
print("Best Score")
print(clf5.best_score_)

clf5 = clf5.best_estimator_.fit(train.iloc[:,0:22], genus_labels)

print("*******************")

print("Training the SVC Classifier for 'Species' classification")
print("Finding C for the classifier...")


C =[0.00001,0.001,1,10,1000,100000]


clf6 = GridSearchCV(svc4, parameters, cv=10).fit(train.iloc[:,0:22], species_labels)

print("Best Parameter")
print(clf6.best_params_)
print("Best Score")
print(clf6.best_score_)

clf6 = clf6.best_estimator_.fit(train.iloc[:,0:22], species_labels)

print("*******************")


Linear Kernel SVMs
Training the SVC Classifier for 'Family' classification
Finding C for the classifier...
Best Parameter
{'C': 10}
Best Score
0.9382446386020651
*******************
Training the SVC Classifier for 'Genus' classification
Finding C for the classifier...
Best Parameter
{'C': 1000}
Best Score
0.954527402700556
*******************
Training the SVC Classifier for 'Species' classification
Finding C for the classifier...
Best Parameter
{'C': 10}
Best Score
0.960087370929309
*******************


In [103]:
# Test Scores for L1 penalised SVMS

pred4 = clf4.predict(test.iloc[:,0:22])
pred5 = clf5.predict(test.iloc[:,0:22])
pred6 = clf6.predict(test.iloc[:,0:22])

count =0
for i in list(range(1,np.shape(test)[0])):
    if (pred4[i] == family_test_labels[i] and pred5[i] == genus_test_labels[i]  and pred6[i] == species_test_labels[i]):
        count = count+1;

Exact_Match = (np.shape(test)[0]-count)/np.shape(test)[0]
print("Exact Match:",Exact_Match)

Score4 = clf4.score(test.iloc[:,0:22],family_test_labels )
Score5 = clf5.score(test.iloc[:,0:22],genus_test_labels)
Score6 = clf6.score(test.iloc[:,0:22],species_test_labels )
Hamming_Loss  = 1 - (Score4+Score5+Score6)/3

print("Hamming Loss:",Hamming_Loss)

Exact Match: 0.08244557665585919
Hamming Loss: 0.0520302609232669


In [11]:
# 

#SMOTE
from sklearn.svm import LinearSVC
import warnings
warnings.filterwarnings('ignore')
from imblearn.over_sampling import SMOTE

print("Linear Kernel SVMs with SMOTE")


sm = SMOTE(random_state=42) 
train_res1, fam_labels_res1 = sm.fit_resample(train.iloc[:,0:22], fam_labels)

C =[0.001, 10, 100, 1000, 10000, 100000] 

parameters = {'C': (C)}

svc7 = LinearSVC(penalty='l1',dual= False,multi_class='ovr')

print("Training the SVC Classifier for 'Family' classification")
print("Finding C for the classifier...")
clf7 = GridSearchCV(svc7, parameters, cv=10).fit(train_res1[:,0:22], fam_labels_res1)

print("Best Parameter")
print(clf7.best_params_)
print("Best Score")
print(clf7.best_score_)

clf7 = clf7.best_estimator_.fit(train_res1[:,0:22], fam_labels_res1)

print("*******************")


Linear Kernel SVMs with SMOTE
Training the SVC Classifier for 'Family' classification
Finding C for the classifier...
Best Parameter
{'C': 1000}
Best Score
0.948078165374677
*******************


In [115]:

train_res2, genus_labels_res = sm.fit_resample(train.iloc[:,0:22], genus_labels)

C =[0.00001,0.001,1,10,1000,100000]

parameters = {'C': (C)}

svc8 = LinearSVC(penalty='l1',dual= False,multi_class='ovr')

print("Training the SVC Classifier for 'Genus' classification")
print("Finding C for the classifier...")
clf8 = GridSearchCV(svc8, parameters, cv=10).fit(train_res2[:,0:22], genus_labels_res)

print("Best Parameter")
print(clf8.best_params_)
print("Best Score")
print(clf8.best_score_)

clf8 = clf8.best_estimator_.fit(train_res2[:,0:22], genus_labels_res)

print("*******************")

Training the SVC Classifier for 'Genus' classification
Finding C for the classifier...
Best Parameter
{'C': 100000}
Best Score
0.9606625258799172
*******************


In [13]:
train_res3, species_labels_res = sm.fit_resample(train.iloc[:,0:22], species_labels)


C =[0.00001,0.001,1,10,1000,100000]

parameters = {'C': (C)}

svc9 = LinearSVC(penalty='l1',dual= False,multi_class='ovr')

print("Training the SVC Classifier for 'Species' classification")
print("Finding C for the classifier...")
clf9 = GridSearchCV(svc9, parameters, cv=10).fit(train_res3[:,0:22], species_labels_res)

print("Best Parameter")
print(clf9.best_params_)
print("Best Score")
print(clf9.best_score_)

clf9 = clf9.best_estimator_.fit(train_res3[:,0:22], species_labels_res)

print("*******************")

Training the SVC Classifier for 'Species' classification
Finding C for the classifier...
Best Parameter
{'C': 1000}
Best Score
0.956984842277755
*******************


In [15]:
# Test Scores for L1 penalised SVMS

pred7 = clf7.predict(test.iloc[:,0:22])
pred8 = clf8.predict(test.iloc[:,0:22])
pred9 = clf9.predict(test.iloc[:,0:22])

count =0
for i in list(range(1,np.shape(test)[0])):
    if (pred7[i] == family_test_labels[i] and pred8[i] == genus_test_labels[i]  and pred9[i] == species_test_labels[i]):
        count = count+1;

Exact_Match = (np.shape(test)[0]-count)/np.shape(test)[0]
print("Exact Match:",Exact_Match)

Score7 = clf7.score(test.iloc[:,0:22],family_test_labels )
Score8 = clf8.score(test.iloc[:,0:22],genus_test_labels)
Score9 = clf9.score(test.iloc[:,0:22],species_test_labels )
Hamming_Loss  = 1 - (Score7+Score8+Score9)/3

print("Hamming Loss:",Hamming_Loss)

Exact Match: 0.1440481704492821
Hamming Loss: 0.0765786629612476


In [None]:
'''

# Conclusions:

1) SVM with RBF Kernel have outperformed the Linear Kernel SVM in both Hamming Loss and Exact Match Metrics.
2) Grid Search used to tune the hyper-parameters is computationally expensive.
3) It can also be seen the Hamming Loss is smaller than Exact Match metric for any classifier here. 
This is because Exact Match is more a strict metric than Hamming Loss as it requires all the 3 labels to be 
correctly classifed.
4) Linear Classifier may have Class Imbalance during Multi Class Classification.
5) SMOTE has been implemented to increase the accuracy


'''
