# MULTI-CLASS AND MULTI-LABEL CLASSIFICATION USING SVM

### Part (a) - To download the dataset. 

### Initiliazation of libraries 

In [2]:
#Iniatialization of libraries 

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import warnings
from sklearn.metrics import accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import hamming_loss
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
warnings.filterwarnings('ignore')  # Used to avoid any warnings in the output 


### Creating pandas Dataframe from the Dataset CSV

In [3]:
data = pd.read_csv('Frogs_MFCCs.csv', usecols = range(1,26))
df= pd.DataFrame(data)
X = df.iloc[:, 0:21]
y = df.iloc[:, 21:24]

# Encoding the class in each label i.e. allocating int values to string values of class in each label
le = LabelEncoder()
y_f= le.fit_transform(y.iloc[:,0])
y_g = le.fit_transform(y.iloc[:,1])
y_s = le.fit_transform(y.iloc[:,2])

# Making a new dataframe after LabelEncoder function. 
y_new = pd.DataFrame({'Family': y_f, 'Genus': y_g, 'Species': y_s})
X_train, X_test, y_train, y_test = train_test_split(X, y_new, test_size=0.3, random_state=42)

### Part (b)ii  - To train model for each label using SVM RBF kernel. 

### Here, the parameters i.e. weight of SVM penalty and width of gaussian kernel(gamma) is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100 and the gamma varies from 0.05 to 2. 

### We use OneVsRest classifier to train SVM for each of the label. 

### The label classified here is "FAMILY"

In [3]:
parameters = {'C': np.linspace(0.1, 100), 'gamma': np.linspace(0.05,2)}
#parameters = {'C': np.linspace(0.1, 100, 100), 'gamma': np.linspace(0.05,0.1,2)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(SVC(kernel='rbf'), parameters, cv = 10)
gs.fit(X_train, y_train.loc[:,'Family'])

#Instanciating of the classifier OneVsRest to train SVC with kernel RBF. 
estimator = SVC(C = gs.best_params_.get('C'), kernel = 'rbf', gamma = gs.best_params_.get('gamma'))
ovr = OneVsRestClassifier(estimator)
ovr.fit(X_train, y_train.loc[:,'Family'])
y_pred_f = ovr.predict(X_test)   # Calculates the y_pred value. 
loss_f = hamming_loss(y_test.loc[:,'Family'], y_pred_f)  #The indiviadual hamming loss is calculated. 

print("CLASSIFICATION USING SVM WITH RBF KERNEL")
print("LABEL - FAMILY")
print("")
print("Test Score is :", ovr.score(X_test, y_test.loc[:,'Family'], sample_weight=None))
print("Hamming loss is :", loss_f)
print("Best Penalty is:", gs.best_params_.get('C'), "and Best Gamma is :", gs.best_params_.get('gamma'))

CLASSIFICATION USING SVM WITH RBF KERNEL
LABEL - FAMILY

Test Score is : 0.9944418712366836
Hamming loss is : 0.00555812876331635
Best Penalty is: 10.29387755102041 and Best Gamma is : 1.8806122448979592


### Conclusion(for FAMILY label) - We see that the test accuracy score is 0.99 which is expectionally good. Almost total classification. The classifier is working properly for the unseen test data. The best penalty is aroud 10.29 and best gamma is 1.88 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

### Part (b)ii  - To train model for each label using SVM RBF kernel. 

### Here, the parameters i.e. weight of SVM penalty and width of gaussian kernel(gamma) is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100 and the gamma varies from 0.05 to 2. 

### We use OneVsRest classifier to train SVM for each of the label. 

### The label classified here is "GENUS"

In [4]:
parameters = {'C': np.linspace(0.1, 100), 'gamma': np.linspace(0.05,2)}
#parameters = {'C': np.linspace(0.1, 100, 100), 'gamma': np.linspace(0.05,0.1,2)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(SVC(kernel='rbf'), parameters, cv = 10)
gs.fit(X_train, y_train.loc[:,'Genus'])

#Instanciating of the classifier OneVsRest to train SVC with kernel RBF. 
estimator = SVC(C = gs.best_params_.get('C'), kernel = 'rbf', gamma = gs.best_params_.get('gamma'))
ovr = OneVsRestClassifier(estimator)
ovr.fit(X_train, y_train.loc[:,'Genus'])
y_pred_g = ovr.predict(X_test)     # Calculates the y_pred value. 
loss_g = hamming_loss(y_test.loc[:,'Genus'], y_pred_g)     #The indiviadual hamming loss is calculated. 

print("CLASSIFICATION USING SVM WITH RBF KERNEL")
print("LABEL - GENUS")
print("")
print("Test Score is :", ovr.score(X_test, y_test.loc[:,'Genus'], sample_weight=None))
print("Hamming loss is :", loss_g)
print("Best Penalty is:", gs.best_params_.get('C'), "and Best Gamma is :", gs.best_params_.get('gamma'))

CLASSIFICATION USING SVM WITH RBF KERNEL
LABEL - GENUS

Test Score is : 0.9898100972672533
Hamming loss is : 0.010189902732746642
Best Penalty is: 12.332653061224491 and Best Gamma is : 1.920408163265306


### CONCLUSION(For label - GENUS) - We see that the test accuracy score is 0.98 which is expectionally good. Almost total classification. The classifier is working properly for the unseen test data. The best penalty is aroud 12.33 and best gamma is 1.92 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

### Part (b)ii  - To train model for each label using SVM RBF kernel. 

### Here, the parameters i.e. weight of SVM penalty and width of gaussian kernel(gamma) is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100 and the gamma varies from 0.05 to 2. 

### We use OneVsRest classifier to train SVM for each of the label. 

### The label classified here is "SPECIES"

In [5]:
parameters = {'C': np.linspace(0.1, 100), 'gamma': np.linspace(0.05,2)}
#parameters = {'C': np.linspace(0.1, 100, 100), 'gamma': np.linspace(0.05,0.1,2)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(SVC(kernel='rbf'), parameters, cv = 10)
gs.fit(X_train, y_train.loc[:,'Species'])

#Instanciating of the classifier OneVsRest to train SVC with kernel RBF. 
estimator = SVC(C = gs.best_params_.get('C'), kernel = 'rbf', gamma = gs.best_params_.get('gamma'))
ovr = OneVsRestClassifier(estimator)
ovr.fit(X_train, y_train.loc[:,'Species'])
y_pred_s = ovr.predict(X_test)
loss_s = hamming_loss(y_test.loc[:,'Species'], y_pred_s)

print("CLASSIFICATION USING SVM WITH RBF KERNEL")
print("LABEL - SPECIES")
print("")
print("Test Score is :", ovr.score(X_test, y_test.loc[:,'Species'], sample_weight=None))
print("Hamming loss is :", loss_s)
print("Best Penalty is:", gs.best_params_.get('C'), "and Best Gamma is :", gs.best_params_.get('gamma'))

CLASSIFICATION USING SVM WITH RBF KERNEL
LABEL - SPECIES

Test Score is : 0.9893469198703103
Hamming loss is : 0.010653080129689671
Best Penalty is: 6.216326530612245 and Best Gamma is : 0.9255102040816326


### CONCLUSION(For label - SPECIES) - We see that the test accuracy score is 0.989 which is expectionally good. Almost total classification. The classifier is working properly for the unseen test data. The best penalty is aroud 6.216 and best gamma is 0.9255 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

## EXACT MATCH LOSS AND HAMMING LOSS FOR RBF KEREL

### Exact match loss is basically calculated by comparing the y_true and y_pred. If all 3 classifier predict correctly we count it as 1 and if one of them misclasssifies, we count it as 0. 

### Hamming loss is calculated by taking the average of all individual hamming loss. Here we have 3 labels, so we divide the total hamming loss by 3. 

In [6]:
rbf_loss = 0

# Loop for comparing the y_true and y_pred. If they match, we increament. 
for i in range(len(X_test)):
    if y_test.loc[:,'Family'].values[i] == y_pred_f[i] and y_test.loc[:,'Genus'].values[i] == y_pred_g[i] and  y_test.loc[:,'Species'].values[i] == y_pred_s[i]:
        rbf_loss = rbf_loss + 1
    else:
        rbf_loss = rbf_loss 
        
emloss_rbf = rbf_loss/len(X_test)   # The total is divied by total no of X_test data. 
hloss_rbf = (loss_f + loss_g + loss_s)/3   # Total Hamming loss by taking average of all the three classifiers

print("EXACT MATCH LOSS AND HAMMING LOSS FOR RBF KERNEL")
print("")
print("Exact Match loss :", 1 - emloss_rbf)
print("Total Hamming Loss :", hloss_rbf)

EXACT MATCH LOSS AND HAMMING LOSS FOR RBF KERNEL

Exact Match loss : 0.014358499305233918
Total Hamming Loss : 0.008800370541917554


### CONCLUSION - We see that the total hamming loss is less than the exact match loss and which is ideally the case. 

## Final Conclusion about the SVC model with RBF Kernel -- We saw that in every label we got perfect fit for the model. The accuracy turned out to be almost 99% and the parameters for gamma remained between 0.9 to 1.8. This implies that we got a good seperability among classes in each label using RBF kernel. 

### Part (b)iii  - To train model for each label using SVM linear kernel and L1 penalized. 

### Here, the parameters i.e. weight of SVM penalty is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100.  

### We use LinearSVC classifier to train SVM for each of the label. The loss is sqaured-hinge. 

### The label classified here is "FAMILY"

In [7]:
parameters = {'C': np.linspace(0.1, 100)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(LinearSVC(), parameters, cv = 10)
gs.fit(X_train, y_train.loc[:,'Family'])

#Instanciating of the classifier LinearSVC to train SVC with kernel Linear and l1-penalized with loss = squared hinge loss. 
svc = LinearSVC(penalty='l1', loss='squared_hinge', dual = False, C = gs.best_params_.get('C'), multi_class='ovr')
svc.fit(X_train, y_train.loc[:,'Family'])
y_pred_f_l1 = svc.predict(X_test)
loss_f_l1 = hamming_loss(y_test.loc[:,'Family'], y_pred_f_l1)

print("CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL")
print("LABEL - FAMILY")
print("")
print("Test Score is :", svc.score(X_test, y_test.loc[:,'Family'], sample_weight=None))
print("Hamming loss is :", loss_f_l1)
print("Best Penalty is:", gs.best_params_.get('C'))

CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL
LABEL - FAMILY

Test Score is : 0.9249652616952293
Hamming loss is : 0.07503473830477073
Best Penalty is: 14.371428571428572


### CONCLUSION(For label - FAMILY) - We see that the test accuracy score is 0.92 which is good. Prone to missclassification. The classifier is working properly for the unseen test data. The best penalty is aroud 14.37 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

### Part (b)iii  - To train model for each label using SVM linear kernel and L1 penalized. 

### Here, the parameters i.e. weight of SVM penalty is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100.  

### We use LinearSVC classifier to train SVM for each of the label. The loss is sqaured-hinge. 

### The label classified here is "GENUS"

In [8]:
parameters = {'C': np.linspace(0.1, 100)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(LinearSVC(), parameters, cv = 10)
gs.fit(X_train, y_train.loc[:,'Genus'])

#Instanciating of the classifier LinearSVC to train SVC with kernel Linear and l1-penalized with loss = squared hinge loss. 
svc = LinearSVC(penalty='l1', loss='squared_hinge', dual = False, C = gs.best_params_.get('C'), multi_class='ovr')
svc.fit(X_train, y_train.loc[:,'Genus'])
y_pred_g_l1 = svc.predict(X_test)
loss_g_l1 = hamming_loss(y_test.loc[:,'Genus'], y_pred_g_l1)

print("CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL")
print("LABEL - GENUS")
print("")
print("Test score is :", svc.score(X_test, y_test.loc[:,'Genus'], sample_weight=None))
print("Hamming loss is :", loss_g_l1)
print("Best Penalty is:", gs.best_params_.get('C'))

CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL
LABEL - GENUS

Test score is : 0.9407132931912923
Hamming loss is : 0.059286706808707734
Best Penalty is: 97.96122448979592


### CONCLUSION(For label - GENUS) - We see that the test accuracy score is 0.94 which is good. Prone to missclassification. The classifier is working properly for the unseen test data. The best penalty is aroud 97.96 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

### Part (b)iii  - To train model for each label using SVM linear kernel and L1 penalized. 

### Here, the parameters i.e. weight of SVM penalty is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100.  

### We use LinearSVC classifier to train SVM for each of the label. The loss is sqaured-hinge. 

### The label classified here is "SPECIES"

In [9]:
parameters = {'C': np.linspace(0.1, 100)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(LinearSVC(), parameters, cv = 10)
gs.fit(X_train, y_train.loc[:,'Species'])

#Instanciating of the classifier LinearSVC to train SVC with kernel Linear and l1-penalized with loss = squared hinge loss. 
svc = LinearSVC(penalty='l1', loss='squared_hinge', dual = False, C = gs.best_params_.get('C'), multi_class='ovr')
svc.fit(X_train, y_train.loc[:,'Species'])
y_pred_s_l1 = svc.predict(X_test)
loss_s_l1 = hamming_loss(y_test.loc[:,'Species'], y_pred_s_l1)

print("CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL")
print("LABEL - SPECIES")
print("")
print("Test score is :", svc.score(X_test, y_test.loc[:,'Species'], sample_weight=None))
print("Hamming loss is :", loss_s_l1)
print("Best Penalty is:", gs.best_params_.get('C'))

CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL
LABEL - SPECIES

Test score is : 0.9592403890690134
Hamming loss is : 0.04075961093098657
Best Penalty is: 61.263265306122456


### CONCLUSION(For label - SPECIES) - We see that the test accuracy score is 0.95 which is good. Prone to missclassification. The classifier is working properly for the unseen test data. The best penalty is aroud 61.26 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

## EXACT MATCH LOSS AND HAMMING LOSS FOR L1 PENALIZED 

### Exact match loss is basically calculated by comparing the y_true and y_pred. If all 3 classifier predict correctly we count it as 1 and if one of them misclasssifies, we count it as 0. 

### Hamming loss is calculated by taking the average of all individual hamming loss. Here we have 3 labels, so we divide the total hamming loss by 3. 

In [10]:
l1_loss = 0

# Loop for comparing the y_true and y_pred. If they match, we increament. 
for i in range(len(X_test)):
    if y_test.loc[:,'Family'].values[i] == y_pred_f_l1[i] and y_test.loc[:,'Genus'].values[i] == y_pred_g_l1[i] and y_test.loc[:,'Species'].values[i] == y_pred_s_l1[i]:
        l1_loss = l1_loss + 1
    else:
        l1_loss = l1_loss 
emloss_l1 = l1_loss/len(X_test)   # The total is divied by total no of X_test data.
hloss_l1 = (loss_f_l1 + loss_g_l1 + loss_s_l1)/3   # Total Hamming loss by taking average of all the three classifiers

print("EXACT MATCH LOSS AND HAMMING LOSS FOR L1 PENALIZED")
print("")
print("Exact Match loss :", 1 - emloss_l1)
print("Total Hamming Loss :", hloss_l1)

EXACT MATCH LOSS AND HAMMING LOSS FOR L1 PENALIZED

Exact Match loss : 0.09078276980083377
Total Hamming Loss : 0.058360352014821676


### CONCLUSION - We see that the total hamming loss is less than the exact match loss and which is ideally the case. 

## Final Conclusion - 
### We see that with with linear kernel and L1 regularization, the error increased and the accuracy decreased when compared to the RBF kernel. We know that L1 penalty provides sparse and feature elimination. In this case we have large number of datapoints and when L1 penalty eliminates feature, it increases error. 

### Part (b)iv  - To train model for each label using SVM linear kernel and L1 penalized and SMOTE for class imbalance. 

### Here, the parameters i.e. weight of SVM penalty is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100.  

### We use LinearSVC classifier to train SVM for each of the label. The loss is sqaured-hinge. 

### The label classified here is "FAMILY"

In [5]:
sm = SMOTE(random_state= 10)

# Using SMOTE to deal with class imbalance. Creating individual dataframe and fitting it. 
# This is for training data. 
X_train_f_smote, y_train_f_smote = sm.fit_resample(X_train, y_train.loc[:,'Family'])
X_train_g_smote, y_train_g_smote = sm.fit_resample(X_train, y_train.loc[:,'Genus'])
X_train_s_smote, y_train_s_smote = sm.fit_resample(X_train, y_train.loc[:,'Species'])

In [11]:
parameters = {'C': np.linspace(0.1, 100)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(LinearSVC(), parameters, cv = 10)
gs.fit(X_train_f_smote, y_train_f_smote)

#Instanciating of the classifier LinearSVC to train SVC with kernel Linear and l1-penalized with loss = squared hinge loss. 
svc = LinearSVC(penalty='l1', loss='squared_hinge', dual = False, C = gs.best_params_.get('C'), multi_class='ovr')
svc.fit(X_train_f_smote, y_train_f_smote)
y_pred_f_l1_smote = svc.predict(X_test)
loss_f_l1_smote = hamming_loss(y_test.loc[:,'Family'], y_pred_f_l1_smote)

print("CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL(SMOTE)")
print("LABEL - FAMILY")
print("")
print("Test Score is :", svc.score(X_test, y_test.loc[:,'Family'], sample_weight=None))
print("Hamming loss is :", loss_f_l1_smote)
print("Best penalty is:", gs.best_params_.get('C'))

CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL(SMOTE)
LABEL - FAMILY

Test Score is : 0.9092172301991662
Hamming loss is : 0.09078276980083372
Best penalty is: 97.96122448979592


### CONCLUSION(For label - FAMILY) - We see that the test accuracy score is 0.90 which is good. Prone to missclassification. The classifier is working properly for the unseen test data. The best penalty is aroud 97.96 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

### Part (b)iv  - To train model for each label using SVM linear kernel and L1 penalized and SMOTE for class imbalance. 

### Here, the parameters i.e. weight of SVM penalty is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100.  

### We use LinearSVC classifier to train SVM for each of the label. The loss is sqaured-hinge. 

### The label classified here is "GENUS"

In [7]:
parameters = {'C': np.linspace(0.1, 100)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(LinearSVC(), parameters, cv = 10)
gs.fit(X_train_g_smote, y_train_g_smote)

#Instanciating of the classifier LinearSVC to train SVC with kernel Linear and l1-penalized with loss = squared hinge loss. 
svc = LinearSVC(penalty='l1', loss='squared_hinge', dual = False, C = gs.best_params_.get('C'), multi_class='ovr')
svc.fit(X_train_g_smote, y_train_g_smote)
y_pred_g_l1_smote = svc.predict(X_test)
loss_g_l1_smote = hamming_loss(y_test.loc[:,'Genus'], y_pred_g_l1_smote)

print("CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL(SMOTE)")
print("LABEL - GENUS")
print("")
print("Test score is :", svc.score(X_test, y_test.loc[:,"Genus"], sample_weight=None))
print("Hamming loss is :", loss_g_l1_smote)
print("Best Penalty is:", gs.best_params_.get('C'))

CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL(SMOTE)
LABEL - GENUS

Test score is : 0.9069013432144511
Hamming loss is : 0.09309865678554886
Best Penalty is: 79.61224489795919


### CONCLUSION(For label - GENUS) - We see that the test accuracy score is 0.90 which is good. Prone to missclassification. The classifier is working properly for the unseen test data. The best penalty is aroud 79.61 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

### Part (b)iv  - To train model for each label using SVM linear kernel and L1 penalized and SMOTE for class imbalance. 

### Here, the parameters i.e. weight of SVM penalty is selected using gridsearchCV with 10 fold CV. The SVM penalty varies from 0.1 to 100.  

### We use LinearSVC classifier to train SVM for each of the label. The loss is sqaured-hinge. 

### The label classified here is "SPECIES"

In [8]:
parameters = {'C': np.linspace(0.1, 100)}

# GridsearchCV used to identify the best parameters from the range declared above. 
gs = GridSearchCV(LinearSVC(), parameters, cv = 10)
gs.fit(X_train_s_smote, y_train_s_smote)

#Instanciating of the classifier LinearSVC to train SVC with kernel Linear and l1-penalized with loss = squared hinge loss. 
svc = LinearSVC(penalty='l1', loss='squared_hinge', dual = False, C = gs.best_params_.get('C'), multi_class='ovr')
svc.fit(X_train_s_smote, y_train_s_smote)
y_pred_s_l1_smote = svc.predict(X_test)
loss_s_l1_smote = hamming_loss(y_test.loc[:,'Species'], y_pred_s_l1_smote)

print("CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL(SMOTE)")
print("LABEL - SPECIES")
print("")
print("Test score is :", svc.score(X_test, y_test.loc[:,'Species'], sample_weight=None))
print("Hamming loss is :", loss_s_l1_smote)
print("Best Penalty is:", gs.best_params_.get('C'))

CLASSIFICATION USING L1 PENALIZED SVM WITH LINEAR KERNEL(SMOTE)
LABEL - SPECIES

Test score is : 0.9587772116720704
Hamming loss is : 0.041222788327929596
Best Penalty is: 69.41836734693878


### CONCLUSION(For label - SPECIES) - We see that the test accuracy score is 0.95 which is good. Prone to missclassification. The classifier is working properly for the unseen test data. The best penalty is aroud 69.41 as calculated by the gridsearchCV and as a result be got a very good accuracy in test data. 

## EXACT MATCH LOSS AND HAMMING LOSS FOR L1 PENALIZED WITH SMOTE

### Exact match loss is basically calculated by comparing the y_true and y_pred. If all 3 classifier predict correctly we count it as 1 and if one of them misclasssifies, we count it as 0. 

### Hamming loss is calculated by taking the average of all individual hamming loss. Here we have 3 labels, so we divide the total hamming loss by 3. 

In [12]:
smote_loss = 0

# Loop for comparing the y_true and y_pred. If they match, we increament. 
for i in range(len(X_test)):
    if y_test.loc[:,'Family'].values[i] == y_pred_f_l1_smote[i] and y_test.loc[:,'Genus'].values[i] == y_pred_g_l1_smote[i] and y_test.loc[:,'Species'].values[i] == y_pred_s_l1_smote[i]:
        smote_loss = smote_loss + 1
    else:
        smote_loss = smote_loss 
emloss_smote = smote_loss/len(X_test)   # The total is divied by total no of X_test data.
hloss_smote = (loss_f_l1_smote + loss_g_l1_smote + loss_s_l1_smote)/3     # Total Hamming loss by taking average of all the three classifiers

print("EXACT MATCH LOSS AND HAMMING LOSS FOR L1 PENALIZED WITH SMOTE")
print("")
print("Exact Match loss :", 1- emloss_smote)
print("Total Hamming Loss :", hloss_smote)

EXACT MATCH LOSS AND HAMMING LOSS FOR L1 PENALIZED WITH SMOTE

Exact Match loss : 0.14126910606762388
Total Hamming Loss : 0.07503473830477071


### CONCLUSION - We see that the total hamming loss is less than the exact match loss and which is ideally the case. 

## Final Conclusion - 
### To match the class imbalance, we used SMOTE on the training data. We see that the error got increased than the normal L1 penalized SVM. This is probably because SMOTE increased misclassfication by increasing datapoints. Since we are using the same L1 penalized SVM model, it eliminated features on such huge dataset which in turn made the model more prone to missclassification. 