In [1]:
#Multi-class Multi-label classification

In [1]:
import numpy as np
import pandas as pd
from sklearn.svm import SVC

data=pd.read_csv('Frogs_MFCCs.csv')

In [2]:
from sklearn.model_selection import train_test_split
data=data.sample(frac=1) # shuffle the data
train, test = train_test_split(data, test_size=0.3)


In [27]:
# bi)
def exact_match_score(prediction,true): #assume inputs are dataframe and contain only relevant data
    num=len(true)
    correct=0
    for i in range (num):
        compare=prediction.iloc[i]==true.iloc[i]
        if compare[0]==compare[1]==compare[2]==True:correct=correct+1 
    accuracy=correct/num
    return (accuracy)


from sklearn.metrics import hamming_loss
def h_loss(prediction,true):
    h_loss=0
    for i in range(len(true)):
        h_loss=h_loss+hamming_loss(true.iloc[i],prediction.iloc[i])
    h_loss=h_loss/len(prediction)
    return (h_loss)

In [None]:
#We try c from 10^-3 to 10^8 and gamma from 0.1 to 2.0 with a step of 0.1. 
#We select those parameters resulting in score larger than 0.8

In [14]:
#bii)
from sklearn.model_selection import GridSearchCV

def svm_modelling(train_data,label):
    C=[10**(-3),10**(-2),10**(-1),10**(0),10**(1),10**(2),10**(3),10**(4),10**(5),10**(6),10**(7),10**(8)]
    gamma=np.arange(0.1, 2.1, 0.1)
    
    param=np.zeros((1,2))
    for c in C:
        svm=SVC(C=c,kernel='rbf',gamma='auto')
        svm.fit(train_data.iloc[:,0:22], train_data.loc[:,label])
        score=svm.score(train_data.iloc[:,0:22], train_data.loc[:,label])
        c_score=np.array([c,score]).reshape((1,2))
        param=np.concatenate((param,c_score),axis=0)
    param=param[1:]
    param=np.array([a for a in param if a[1] >0.8])
    C_list=param[:,0]

    param=np.zeros((1,2))
    for g in gamma:
        svm=SVC(kernel='rbf',gamma=g)
        svm.fit(train_data.iloc[:,0:22], train_data.loc[:,label])
        score=svm.score(train_data.iloc[:,0:22], train_data.loc[:,label])
        g_score=np.array([g,score]).reshape((1,2))
        param=np.concatenate((param,g_score),axis=0)
    param=param[1:]
    param=np.array([a for a in param if a[1] >0.8])
    g_list=param[:,0]
    
    svm=SVC()

    param_grid = [{'C': C_list, 'gamma': g_list, 'kernel': ['rbf']}]

    svm_cv = GridSearchCV(svm, param_grid,cv=10,return_train_score=True,refit=True)
    svm_cv.fit(train_data.iloc[:,0:22], train_data.loc[:,label])
    print('Best estimator:',svm_cv.best_estimator_)
    print('Best score:',svm_cv.best_score_ )
    return (svm_cv)
    
    

In [301]:
family_clf=svm_modelling(train,'Family')

Best estimator: SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.2000000000000002,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best score: 0.992454328832


In [302]:
genus_clf=svm_modelling(train,'Genus')

Best estimator: SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.9000000000000001,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best score: 0.9902700556


In [303]:
species_clf=svm_modelling(train,'Species')

Best estimator: SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.2000000000000002,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best score: 0.988482922955


In [305]:
# Test classifiers on test set
from sklearn.metrics import accuracy_score
f=family_clf.predict(test.iloc[:,0:22])
g=genus_clf.predict(test.iloc[:,0:22])
s=species_clf.predict(test.iloc[:,0:22])
d = {'Family': f, 'Genus': g,'Species':s}
pred= pd.DataFrame(data=d)
#accuracy_score(test.loc[:,['Family','Genus','Species']],pred)

In [306]:
from sklearn.metrics import hamming_loss
print('Exact match score:',exact_match_score(pred,test.loc[:,['Family','Genus','Species']]))
print('Hamming loss:',h_loss(test.loc[:,['Family','Genus','Species']],pred))

Exact match score: 0.9888837424733673
Hamming loss: 0.0067932684885


In [None]:
#biii)
from sklearn.model_selection import cross_validate
from sklearn.svm import LinearSVC
def linear_svm_modelling(train_data,label):
    C=[10**(-3),10**(-2),10**(-1),10**(0),10**(1),10**(2),10**(3),10**(4),10**(5),10**(6),10**(7),10**(8)]
    #gamma=np.arange(0.1, 2.1, 0.1)
    
    param=np.zeros((1,2))
    for c in C:
        linear_svm=LinearSVC(C=c,penalty='l1',dual=False)
        linear_svm.fit(train_data.iloc[:,0:22], train_data.loc[:,label])
        score=linear_svm.score(train_data.iloc[:,0:22], train_data.loc[:,label])
        c_score=np.array([c,score]).reshape((1,2))
        param=np.concatenate((param,c_score),axis=0)
    param=param[1:]
    param=np.array([a for a in param if a[1] >0.8])
    C_list=param[:,0]

    
    param_grid = [{'C': C_list}]
    linear_svm=LinearSVC(penalty='l1',dual=False)
    linear_svm_cv = GridSearchCV(linear_svm, param_grid,cv=10,return_train_score=True,refit=True)
    linear_svm_cv.fit(train_data.iloc[:,0:22], train_data.loc[:,label])
    print('Best estimator:',linear_svm_cv.best_estimator_)
    print('Best score:',linear_svm_cv.best_score_ )
    return (linear_svm_cv)

In [322]:
family_clf_l1=linear_svm_modelling(train,'Family')

Best estimator: LinearSVC(C=100000.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)
Best score: 0.932684670373


In [323]:
genus_clf_l1=linear_svm_modelling(train,'Genus')

Best estimator: LinearSVC(C=10.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)
Best score: 0.948768864178


In [324]:
species_clf_l1=linear_svm_modelling(train,'Species')

Best estimator: LinearSVC(C=1000.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)
Best score: 0.957903097697


In [325]:
f_l1=family_clf_l1.predict(test.iloc[:,0:22])
g_l1=genus_clf_l1.predict(test.iloc[:,0:22])
s_l1=species_clf_l1.predict(test.iloc[:,0:22])
d_l1 = {'Family': f_l1, 'Genus': g_l1,'Species':s_l1}
pred_l1= pd.DataFrame(data=d_l1)

print('Exact match score:',exact_match_score(pred_l1,test.loc[:,['Family','Genus','Species']]))
print('Hamming loss:',h_loss(test.loc[:,['Family','Genus','Species']],pred_l1))

Exact match score: 0.9161648911533117
Hamming loss: 0.0501775513355


In [8]:
# biv)
from imblearn.over_sampling import SMOTE

In [327]:
X_resampled, y_resampled = SMOTE().fit_sample(train.iloc[:,0:22], train.loc[:,'Family'])
y_resampled=np.resize(y_resampled,(len(y_resampled),1))
train_smote_f=np.concatenate((X_resampled,y_resampled),axis=1)
train_smote_f=pd.DataFrame(train_smote_f)
train_smote_f.rename(columns={22:'Family'}, inplace=True )

X_resampled, y_resampled = SMOTE().fit_sample(train.iloc[:,0:22], train.loc[:,'Genus'])
y_resampled=np.resize(y_resampled,(len(y_resampled),1))
train_smote_g=np.concatenate((X_resampled,y_resampled),axis=1)
train_smote_g=pd.DataFrame(train_smote_g)
train_smote_g.rename(columns={22:'Genus'}, inplace=True )

X_resampled, y_resampled = SMOTE().fit_sample(train.iloc[:,0:22], train.loc[:,'Species'])
y_resampled=np.resize(y_resampled,(len(y_resampled),1))
train_smote_s=np.concatenate((X_resampled,y_resampled),axis=1)
train_smote_s=pd.DataFrame(train_smote_s)
train_smote_s.rename(columns={22:'Species'}, inplace=True )

In [328]:
# Use SMOTE and retrain L1 penalized Linear SVC
family_clf_l1_smote=linear_svm_modelling(train_smote_f,'Family')
genus_clf_l1_smote=linear_svm_modelling(train_smote_g,'Genus')
species_clf_l1_smote=linear_svm_modelling(train_smote_s,'Species')

Best estimator: LinearSVC(C=100.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)
Best score: 0.945137555982
Best estimator: LinearSVC(C=100000000.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)
Best score: 0.954831575366
Best estimator: LinearSVC(C=1000.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l1', random_state=None, tol=0.0001,
     verbose=0)
Best score: 0.954242793341


In [329]:
f_l1_smote=family_clf_l1_smote.predict(test.iloc[:,0:22])
g_l1_smote=genus_clf_l1_smote.predict(test.iloc[:,0:22])
s_l1_smote=species_clf_l1_smote.predict(test.iloc[:,0:22])
d_l1_smote = {'Family': f_l1_smote, 'Genus': g_l1_smote,'Species':s_l1_smote}
pred_l1_smote= pd.DataFrame(data=d_l1_smote)

print('Exact match score:',exact_match_score(pred_l1_smote,test.loc[:,['Family','Genus','Species']]))
print('Hamming loss:',h_loss(test.loc[:,['Family','Genus','Species']],pred_l1_smote))

Exact match score: 0.8582677165354331
Hamming loss: 0.0722556739231


From the above results, we can see that SVM with Gaussian kernel has the best performance. Even though using SMOTE solve the issue of class imbalance, linear SVC cannot generate best results.

In [None]:
#bv) Chain Classifier 
#Using SMOTE to generate balanced dataset
# it is known that family>genus>species and therefore the order of chain classifier is determined 

In [9]:
# convert string to int in the training set 
import copy
train2=copy.copy(train)
test2=copy.copy(test)

temp_f = train2.loc[:,'Family'].astype('category')
temp_f = temp_f.cat.codes
temp_g = train2.loc[:,'Genus'].astype('category')
temp_g = temp_g.cat.codes
temp_s = train2.loc[:,'Species'].astype('category')
temp_s = temp_s.cat.codes

test_temp_f = test2.loc[:,'Family'].astype('category')
test_temp_f = test_temp_f.cat.codes
test_temp_g = test2.loc[:,'Genus'].astype('category')
test_temp_g = test_temp_g.cat.codes
test_temp_s = test2.loc[:,'Species'].astype('category')
test_temp_s = test_temp_s.cat.codes

In [10]:
train2.loc[:,'Family']=temp_f
train2.loc[:,'Genus']=temp_g
train2.loc[:,'Species']=temp_s

test2.loc[:,'Family']=test_temp_f
test2.loc[:,'Genus']=test_temp_g
test2.loc[:,'Species']=test_temp_s

In [11]:
X_resampled, y_resampled = SMOTE().fit_sample(train2.iloc[:,0:22], train2.loc[:,'Family'])
y_resampled=np.resize(y_resampled,(len(y_resampled),1))
cc_train2_smote_f=np.concatenate((X_resampled,y_resampled),axis=1)
cc_train2_smote_f=pd.DataFrame(cc_train2_smote_f)
cc_train2_smote_f.rename(columns={22:'Family'}, inplace=True )

X_resampled, y_resampled = SMOTE().fit_sample(train2.iloc[:,0:23], train2.loc[:,'Genus'])
y_resampled=np.resize(y_resampled,(len(y_resampled),1))
cc_train2_smote_g=np.concatenate((X_resampled,y_resampled),axis=1)
cc_train2_smote_g=pd.DataFrame(cc_train2_smote_g)
cc_train2_smote_g.rename(columns={23:'Genus'}, inplace=True )

X_resampled, y_resampled = SMOTE().fit_sample(train2.iloc[:,0:24], train2.loc[:,'Species'])
y_resampled=np.resize(y_resampled,(len(y_resampled),1))
cc_train2_smote_s=np.concatenate((X_resampled,y_resampled),axis=1)
cc_train2_smote_s=pd.DataFrame(cc_train2_smote_s)
cc_train2_smote_s.rename(columns={24:'Species'}, inplace=True )

In [17]:
# chain classifier 
def chain_svm_modelling(train_data,label):
    C=[10**(-3),10**(-2),10**(-1),10**(0),10**(1),10**(2),10**(3),10**(4),10**(5),10**(6),10**(7),10**(8)]
    gamma=np.arange(0.1, 2.1, 0.1)
    if label=='Family':index=22
    elif label=='Genus':index=23
    elif label=='Species':index=24
    param=np.zeros((1,2))
    for c in C:
        svm=SVC(C=c,kernel='rbf',gamma='auto')
        svm.fit(train_data.iloc[:,:index], train_data.loc[:,label])
        score=svm.score(train_data.iloc[:,:index], train_data.loc[:,label])
        c_score=np.array([c,score]).reshape((1,2))
        param=np.concatenate((param,c_score),axis=0)
    param=param[1:]
    param=np.array([a for a in param if a[1] >0.8])
    C_list=param[:,0]

    param=np.zeros((1,2))
    for g in gamma:
        svm=SVC(kernel='rbf',gamma=g)
        svm.fit(train_data.iloc[:,:index], train_data.loc[:,label])
        score=svm.score(train_data.iloc[:,:index], train_data.loc[:,label])
        g_score=np.array([g,score]).reshape((1,2))
        param=np.concatenate((param,g_score),axis=0)
    param=param[1:]
    param=np.array([a for a in param if a[1] >0.8])
    g_list=param[:,0]
    
    svm=SVC()

    param_grid = [{'C': C_list, 'gamma': g_list, 'kernel': ['rbf']}]

    svm_cv = GridSearchCV(svm, param_grid,cv=10,return_train_score=True,refit=True,n_jobs=-1)
    svm_cv.fit(train_data.iloc[:,:index], train_data.loc[:,label])
    print('Best estimator:',svm_cv.best_estimator_)
    print('Best score:',svm_cv.best_score_ )
    return (svm_cv)
    

In [336]:
chain_family_clf=chain_svm_modelling(cc_train2_smote_f,'Family')

Best estimator: SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.6000000000000001,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best score: 0.998080614203


In [337]:
chain_genus_clf=chain_svm_modelling(cc_train2_smote_g,'Genus')

Best estimator: SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.7000000000000002,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best score: 0.999787342634


In [18]:
chain_species_clf=chain_svm_modelling(cc_train2_smote_s,'Species')

KeyboardInterrupt: 

It is too time-consuming to train a svm classfier using smote.

Use original dataset for training instead

In [22]:
chain_family_clf2=chain_svm_modelling(test2,'Family')
chain_genus_clf2=chain_svm_modelling(test2,'Genus')
chain_species_clf2=chain_svm_modelling(test2,'Species')

Best estimator: SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=1.8000000000000003,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best score: 0.989810097267
Best estimator: SVC(C=100.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.40000000000000002,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best score: 0.997684113015
Best estimator: SVC(C=10.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.20000000000000001,
  kernel='rbf', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)
Best score: 0.999073645206


In [28]:
test3=copy.copy(test2)
f_chain=chain_family_clf2.predict(test3.iloc[:,0:22])
test3.loc[:,'Family']=f_chain
g_chain=chain_genus_clf2.predict(test3.iloc[:,0:23])
test3.loc[:,'Genus']=g_chain
s_chain=chain_species_clf2.predict(test3.iloc[:,0:24])
test3.loc[:,'Species']=s_chain
d_chain = {'Family': f_chain, 'Genus': g_chain,'Species':s_chain}
pred_chain= pd.DataFrame(data=d_chain)

print('Exact match score:',exact_match_score(pred_chain,test2.loc[:,['Family','Genus','Species']]))
print('Hamming loss:',h_loss(test2.loc[:,['Family','Genus','Species']],pred_chain))

Exact match score: 0.999536822603057
Hamming loss: 0.000154392465648


We can see that using a chain classfier can give more accurate result!