In [69]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.svm import SVC, LinearSVC
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from imblearn.pipeline import Pipeline as imbpipeline
from imblearn.over_sampling import SMOTE
from warnings import simplefilter
from sklearn.exceptions import ConvergenceWarning
simplefilter("ignore", category=ConvergenceWarning)

# 1. Multi-class and Multi-Label Classification Using Support Vector Machines

## (a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics. uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. Choose 70% of the data randomly as the training set.

In [2]:
df = pd.read_csv("../data/Frogs_MFCCs.csv")
df.reset_index(drop=True, inplace = True)
df.drop(['RecordID'], axis = 1, inplace = True)
print("Dataset Shape ",df.shape)
train_df, test_df = train_test_split(df, test_size = 0.3, shuffle = True, random_state = 42)
train_df.reset_index(drop=True, inplace = True)
test_df.reset_index(drop=True, inplace = True)
df.head(10)

Dataset Shape  (7195, 25)


Unnamed: 0,MFCCs_ 1,MFCCs_ 2,MFCCs_ 3,MFCCs_ 4,MFCCs_ 5,MFCCs_ 6,MFCCs_ 7,MFCCs_ 8,MFCCs_ 9,MFCCs_10,...,MFCCs_16,MFCCs_17,MFCCs_18,MFCCs_19,MFCCs_20,MFCCs_21,MFCCs_22,Family,Genus,Species
0,1.0,0.152936,-0.105586,0.200722,0.317201,0.260764,0.100945,-0.150063,-0.171128,0.124676,...,-0.024017,-0.108351,-0.077623,-0.009568,0.057684,0.11868,0.014038,Leptodactylidae,Adenomera,AdenomeraAndre
1,1.0,0.171534,-0.098975,0.268425,0.338672,0.268353,0.060835,-0.222475,-0.207693,0.170883,...,0.012022,-0.090974,-0.05651,-0.035303,0.02014,0.082263,0.029056,Leptodactylidae,Adenomera,AdenomeraAndre
2,1.0,0.152317,-0.082973,0.287128,0.276014,0.189867,0.008714,-0.242234,-0.219153,0.232538,...,0.083536,-0.050691,-0.02359,-0.066722,-0.025083,0.099108,0.077162,Leptodactylidae,Adenomera,AdenomeraAndre
3,1.0,0.224392,0.118985,0.329432,0.372088,0.361005,0.015501,-0.194347,-0.098181,0.270375,...,-0.050224,-0.136009,-0.177037,-0.130498,-0.054766,-0.018691,0.023954,Leptodactylidae,Adenomera,AdenomeraAndre
4,1.0,0.087817,-0.068345,0.306967,0.330923,0.249144,0.006884,-0.265423,-0.1727,0.266434,...,0.062837,-0.048885,-0.053074,-0.08855,-0.031346,0.10861,0.079244,Leptodactylidae,Adenomera,AdenomeraAndre
5,1.0,0.099704,-0.033408,0.349895,0.344535,0.247569,0.022407,-0.213767,-0.127916,0.277353,...,0.055242,-0.080487,-0.130089,-0.171478,-0.071569,0.077643,0.064903,Leptodactylidae,Adenomera,AdenomeraAndre
6,1.0,0.021676,-0.062075,0.318229,0.380439,0.179043,-0.041667,-0.2523,-0.167117,0.220027,...,0.064853,-0.04662,-0.055146,-0.085972,-0.009127,0.06563,0.04404,Leptodactylidae,Adenomera,AdenomeraAndre
7,1.0,0.14513,-0.03366,0.284166,0.279537,0.175211,0.005791,-0.183329,-0.158483,0.192567,...,0.075654,-0.055978,-0.048219,-0.056637,-0.022419,0.070085,0.021419,Leptodactylidae,Adenomera,AdenomeraAndre
8,1.0,0.271326,0.027777,0.375738,0.385432,0.272457,0.098192,-0.17373,-0.157857,0.207181,...,-0.032167,-0.120723,-0.112607,-0.156933,-0.118527,-0.002471,0.002304,Leptodactylidae,Adenomera,AdenomeraAndre
9,1.0,0.120565,-0.107235,0.316555,0.364437,0.307757,0.025992,-0.294179,-0.223236,0.268435,...,0.053436,-0.051073,-0.052568,-0.111338,-0.040014,0.090204,0.088025,Leptodactylidae,Adenomera,AdenomeraAndre


In [3]:
print("Training Set Shape ",train_df.shape)
print("Testing Set Shape ",test_df.shape)
data_cols = set(train_df.columns)

Training Set Shape  (5036, 25)
Testing Set Shape  (2159, 25)


In [10]:
print(train_df.isna().any().any())
print(test_df.isna().any().any())

False
False


## b) Each instance has three labels: Families, Genus, and Species. Each of the labels has multiple classes. We wish to solve a multi-class and multi-label problem. One of the most important approaches to multi-label classification is to train a classifier for each label (binary relevance). We first try this approach:

### i. Research exact match and hamming score/ loss methods for evaluating multi- label classification and use them in evaluating the classifiers in this problem.

> https://stats.stackexchange.com/questions/233275/multilabel-classification-metrics-on-scikit
> https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hamming_loss.html

### ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all classifiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation.1 You are welcome to try to solve the problem with both standardized 2 and raw attributes and report the results.

array([1.00000000e-03, 1.45634848e-03, 2.12095089e-03, 3.08884360e-03,
       4.49843267e-03, 6.55128557e-03, 9.54095476e-03, 1.38949549e-02,
       2.02358965e-02, 2.94705170e-02, 4.29193426e-02, 6.25055193e-02,
       9.10298178e-02, 1.32571137e-01, 1.93069773e-01, 2.81176870e-01,
       4.09491506e-01, 5.96362332e-01, 8.68511374e-01, 1.26485522e+00,
       1.84206997e+00, 2.68269580e+00, 3.90693994e+00, 5.68986603e+00,
       8.28642773e+00, 1.20679264e+01, 1.75751062e+01, 2.55954792e+01,
       3.72759372e+01, 5.42867544e+01, 7.90604321e+01, 1.15139540e+02,
       1.67683294e+02, 2.44205309e+02, 3.55648031e+02, 5.17947468e+02,
       7.54312006e+02, 1.09854114e+03, 1.59985872e+03, 2.32995181e+03,
       3.39322177e+03, 4.94171336e+03, 7.19685673e+03, 1.04811313e+04,
       1.52641797e+04, 2.22299648e+04, 3.23745754e+04, 4.71486636e+04,
       6.86648845e+04, 1.00000000e+05])

In [39]:
#For Label Family

train_df_ova = train_df.drop(['Genus','Species'], axis=1)
test_df_ova = test_df.drop(['Genus','Species'], axis=1)
steps = [('scaler', StandardScaler()),
         ('SVM',  SVC(kernel='rbf',decision_function_shape='ovr'))]
pipeline = Pipeline(steps)
parameters = {'SVM__C':[1, 10, 100,1000],
              'SVM__gamma':[0.1, 0.01, 0.001, 0.0001]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)

In [40]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")

Pipeline(steps=[('scaler', StandardScaler()), ('SVM', SVC(C=1000, gamma=0.1))])
0.990667108460349
{'SVM__gamma': 0.1, 'SVM__C': 1000}
0.16438984870910645 seconds


In [41]:
#For Label Genus

train_df_ova = train_df.drop(['Family','Species'], axis=1)
test_df_ova = test_df.drop(['Family','Species'], axis=1)
steps = [('scaler', StandardScaler()),
         ('SVM',  SVC(kernel='rbf',decision_function_shape='ovr'))]
pipeline = Pipeline(steps)
parameters = {'SVM__C':[1, 10, 100,1000],
              'SVM__gamma':[0.1, 0.01, 0.001, 0.0001]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)

In [42]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")

Pipeline(steps=[('scaler', StandardScaler()), ('SVM', SVC(C=1000, gamma=0.1))])
0.9874897440752312
{'SVM__gamma': 0.1, 'SVM__C': 1000}
0.2445511817932129 seconds


In [48]:
#For Label Species

train_df_ova = train_df.drop(['Family','Genus'], axis=1)
test_df_ova = test_df.drop(['Family','Genus'], axis=1)
steps = [('scaler', StandardScaler()),
         ('SVM',  SVC(kernel='rbf',decision_function_shape='ovr'))]
pipeline = Pipeline(steps)
parameters = {'SVM__C':[1, 10, 100,1000],
              'SVM__gamma':[0.1, 0.01, 0.001, 0.0001]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)

In [49]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")

Pipeline(steps=[('scaler', StandardScaler()), ('SVM', SVC(C=1000, gamma=0.01))])
0.9882837893275269
{'SVM__gamma': 0.01, 'SVM__C': 1000}
0.07602310180664062 seconds


### iii. Repeat 1(b)ii with L1-penalized SVMs.3 Remember to standardize4 the at- tributes. Determine the weight of the SVM penalty using 10 fold cross vali- dation.

In [54]:
#For Label Family

train_df_ova = train_df.drop(['Genus','Species'], axis=1)
test_df_ova = test_df.drop(['Genus','Species'], axis=1)
steps = [('scaler', StandardScaler()),
         ('SVM',  LinearSVC(penalty='l1',multi_class='ovr',dual=False))]
pipeline = Pipeline(steps)
parameters = {'SVM__C':[1, 10, 100,1000]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)



In [56]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")
print(cv.score(X_test_ova, y_test_ova))

Pipeline(steps=[('scaler', StandardScaler()),
                ('SVM', LinearSVC(C=1, dual=False, penalty='l1'))])
0.9406288459717882
{'SVM__C': 1}
1.1750059127807617 seconds
0.9286706808707735


In [57]:
#For Label Genus

train_df_ova = train_df.drop(['Family','Species'], axis=1)
test_df_ova = test_df.drop(['Family','Species'], axis=1)
steps = [('scaler', StandardScaler()),
         ('SVM',  LinearSVC(penalty='l1',multi_class='ovr',dual=False))]
pipeline = Pipeline(steps)
parameters = {'SVM__C':[1, 10, 100,1000]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)



In [58]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")
print(cv.score(X_test_ova, y_test_ova))

Pipeline(steps=[('scaler', StandardScaler()),
                ('SVM', LinearSVC(C=10, dual=False, penalty='l1'))])
0.9525438638013192
{'SVM__C': 10}
1.7682170867919922 seconds
0.9416396479851783


In [59]:
#For Label Species

train_df_ova = train_df.drop(['Family','Genus'], axis=1)
test_df_ova = test_df.drop(['Family','Genus'], axis=1)
steps = [('scaler', StandardScaler()),
         ('SVM',  LinearSVC(penalty='l1',multi_class='ovr',dual=False))]
pipeline = Pipeline(steps)
parameters = {'SVM__C':[1, 10, 100,1000]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)



In [60]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")
print(cv.score(X_test_ova, y_test_ova))

Pipeline(steps=[('scaler', StandardScaler()),
                ('SVM', LinearSVC(C=10, dual=False, penalty='l1'))])
0.9602847202499291
{'SVM__C': 10}
1.744386911392212 seconds
0.9592403890690134


### iv. Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classifiers you trained.

In [63]:
#For Label Family

train_df_ova = train_df.drop(['Genus','Species'], axis=1)
test_df_ova = test_df.drop(['Genus','Species'], axis=1)

pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['scaler', StandardScaler()],
                                ['SVM', LinearSVC(penalty='l1',multi_class='ovr',dual=False)]])
parameters = {'SVM__C':[1, 10, 100,1000]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)



In [64]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")
print(cv.score(X_test_ova, y_test_ova))

Pipeline(steps=[('smote', SMOTE(random_state=11)), ('scaler', StandardScaler()),
                ['SVM', LinearSVC(C=1, dual=False, penalty='l1')]])
0.9215666319543059
{'SVM__C': 1}
3.584486961364746 seconds
0.9106067623899954


In [65]:
#For Label Genus

train_df_ova = train_df.drop(['Family','Species'], axis=1)
test_df_ova = test_df.drop(['Family','Species'], axis=1)

pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['scaler', StandardScaler()],
                                ['SVM', LinearSVC(penalty='l1',multi_class='ovr',dual=False)]])
parameters = {'SVM__C':[1, 10, 100,1000]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)



In [66]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")
print(cv.score(X_test_ova, y_test_ova))

Pipeline(steps=[('smote', SMOTE(random_state=11)), ('scaler', StandardScaler()),
                ['SVM', LinearSVC(C=1, dual=False, penalty='l1')]])
0.9106436776168387
{'SVM__C': 1}
10.563685894012451 seconds
0.904585456229736


In [67]:
#For Label Species

train_df_ova = train_df.drop(['Family','Genus'], axis=1)
test_df_ova = test_df.drop(['Family','Genus'], axis=1)

pipeline = imbpipeline(steps = [['smote', SMOTE(random_state=11)],
                                ['scaler', StandardScaler()],
                                ['SVM', LinearSVC(penalty='l1',multi_class='ovr',dual=False)]])
parameters = {'SVM__C':[1, 10, 100,1000]}

X_train_ova = train_df_ova.iloc[:,:-1]
y_train_ova = train_df_ova.iloc[:,-1]
X_test_ova = test_df_ova.iloc[:,:-1]
y_test_ova = test_df_ova.iloc[:,-1]

cv = RandomizedSearchCV(pipeline, param_distributions = parameters, cv = 10, scoring='accuracy',return_train_score=True, refit=True)
cv.fit(X_train_ova, y_train_ova)



In [68]:
print(cv.best_estimator_)
print(cv.best_score_)
print(cv.best_params_)
print(str(cv.refit_time_)+ " seconds")
print(cv.score(X_test_ova, y_test_ova))

Pipeline(steps=[('smote', SMOTE(random_state=11)), ('scaler', StandardScaler()),
                ['SVM', LinearSVC(C=1, dual=False, penalty='l1')]])
0.9549252106409165
{'SVM__C': 1}
12.96825909614563 seconds
0.9606299212598425
