In [1]:
import numpy as np
import pandas as pd
from sklearn.multioutput import MultiOutputClassifier
from sklearn.svm import SVC,LinearSVC
from sklearn.model_selection import train_test_split,GridSearchCV,KFold,StratifiedKFold
from sklearn.preprocessing import LabelEncoder
from sklearn.cluster import KMeans
from sklearn.metrics import calinski_harabasz_score,accuracy_score
from sklearn.multiclass import OneVsOneClassifier
import warnings
warnings.filterwarnings('ignore')
from tqdm import tqdm

(a) Download the Anuran Calls (MFCCs) Data Set from: https://archive.ics.uci.edu/ml/datasets/Anuran+Calls+%28MFCCs%29. 
Choose 70% of the data randomly as the training set.

In [2]:
data = pd.read_csv('../Data/Frogs_MFCCs.csv')
labels = data.loc[:,['Family','Genus','Species']]
data = data.drop(['Family','Genus','Species'],axis=1)

In [3]:
train_x,test_x,train_y,test_y = train_test_split(data, labels,test_size=0.3)

In [4]:
train_x.index = range(len(train_x))
train_y.index = range(len(train_y))
test_x.index = range(len(test_x))
test_y.index = range(len(test_y))

In [5]:
for i in train_y:
    le = LabelEncoder().fit(train_y[i])
    train_y.loc[:,i] = le.transform(train_y[i])
    test_y.loc[:,i] = le.transform(test_y[i])

i. Research exact match and hamming score/ loss methods for evaluating multi-label classication and use them in evaluating the classiers in this problem.

Exact match method is all predict labels should be exactly same with the true label.

$EMR = \frac{1}{n}\sum^n_{i=1}I(Y_i =  Z_i)$

where i represent the ith instance.

Hamming loss is the fraction of wrong labels to the total numbers of labels, which is more tolerant because it consider patrial mis-classification.

$HL = \frac{1}{NL}\sum^N_{i=1}\sum^L_{j=1}I(\hat{y_{ij}}\neq y_{ij})$

Hamming score is the fraction of the set of correctly predict labels to the set of total labels.

$HS = \frac{1}{n}\sum^n_{i=1}\frac{|Y_i\cap Z_i|}{|Y_i\cup Z_i|}$

ii. Train a SVM for each of the labels, using Gaussian kernels and one versus all classiers. Determine the weight of the SVM penalty and the width of the Gaussian Kernel using 10 fold cross validation. You are welcome to try to solve the problem with both standardized and raw attributes and report the results.

In [6]:
def scores(true_y,pred_y):
    exact_match = 0
    hamming_loss = 0
    for i in range(len(true_y)):
        if all(true_y[i] == pred_y[i]):
            exact_match += 1
            hamming_loss += 4
        else:
            for j in range(3):
                if true_y[i][j] == pred_y[i][j]:
                    hamming_loss += 1
    return exact_match/len(true_y),1-(hamming_loss/(4*len(true_y)))
        

In [7]:

kf = KFold(10,shuffle=False)
tmp = {}
for train_index, test_index in kf.split(train_x,train_y):
    cv_train, cv_test = train_x.loc[train_index], train_x.loc[test_index]
    cvy_train, cvy_test = train_y.loc[train_index], train_y.loc[test_index]
    for l in labels:
        tmp[l] = tmp.get(l,{})
        ttrain_y = cvy_train[l]
        ttest_y = cvy_test[l]
        C = [0.01,0.1,1,10,100]
        gamma = ['auto','scale',0.01,0.1,1,10]
        for i in C:
            for j in gamma:
                tmp[l][(i,j)] = tmp[l].get((i,j),[])
                svc = SVC(kernel='rbf',C=i,gamma=j)
                ovr = OneVsOneClassifier(svc)
                ovr.fit(cv_train,ttrain_y)
                pred_test = ovr.predict(cv_test)
                s = accuracy_score(np.array(ttest_y),np.array(pred_test))
                tmp[l][(i,j)].append(s)
best_param = {}
for i in tmp:
    best_param[i] = sorted(tmp[i].items(),key=lambda x: np.mean(x[1]),reverse=True)[0][0]

In [8]:
pred_res = {}
for i in best_param:
    ttrain_y = train_y[i]
    ttest_y = test_y[i]
    svc = SVC(kernel='rbf',C=best_param[i][0],gamma=best_param[i][1]).fit(train_x,ttrain_y)
    pred_res[i] = svc.predict(test_x)
em,hl = scores(test_y.values,pd.DataFrame(pred_res).values)

In [9]:
print('exact matching rate is:',em,'hamming loss is:',hl)

exact matching rate is: 0.999536822603057 hamming loss is: 0.0004631773969430153


In [10]:
best_param

{'Family': (1, 0.1), 'Genus': (1, 'auto'), 'Species': (1, 'auto')}

In [11]:
'''
pred_res = {}
for i in labels:
    ttrain_y = train_y[i]
    ttest_y = test_y[i]
    parameters = {'C':[0.01,0.1,1,10,100],'gamma':('auto','scale',0.01,0.1,1,10)}
    svc = SVC(kernel='rbf',decision_function_shape='ovr')
    cv = GridSearchCV(svc,parameters,cv=10)
    cv.fit(train_x,ttrain_y)
    pred_res[i] = cv.predict(test_x)
    print("model:",i,"\n","best params:",cv.best_params_)
'''

'\npred_res = {}\nfor i in labels:\n    ttrain_y = train_y[i]\n    ttest_y = test_y[i]\n    parameters = {\'C\':[0.01,0.1,1,10,100],\'gamma\':(\'auto\',\'scale\',0.01,0.1,1,10)}\n    svc = SVC(kernel=\'rbf\',decision_function_shape=\'ovr\')\n    cv = GridSearchCV(svc,parameters,cv=10)\n    cv.fit(train_x,ttrain_y)\n    pred_res[i] = cv.predict(test_x)\n    print("model:",i,"\n","best params:",cv.best_params_)\n'

In [12]:
'''
pred_res = {}
for i in labels:
    ttrain_y = train_y[i]
    ttest_y = test_y[i]
    parameters = {'C':[0.01,0.1,1,10,100],'gamma':('auto','scale',0.01,0.1,1,10)}
    svc = SVC(kernel='rbf',decision_function_shape='ovr')
    cv = GridSearchCV(svc,parameters,cv=10)
    cv.fit(train_x,ttrain_y)
    pred_res[i] = cv.predict(test_x)
    print("model:",i,"\n","best params:",cv.best_params_)
'''

'\npred_res = {}\nfor i in labels:\n    ttrain_y = train_y[i]\n    ttest_y = test_y[i]\n    parameters = {\'C\':[0.01,0.1,1,10,100],\'gamma\':(\'auto\',\'scale\',0.01,0.1,1,10)}\n    svc = SVC(kernel=\'rbf\',decision_function_shape=\'ovr\')\n    cv = GridSearchCV(svc,parameters,cv=10)\n    cv.fit(train_x,ttrain_y)\n    pred_res[i] = cv.predict(test_x)\n    print("model:",i,"\n","best params:",cv.best_params_)\n'

In [13]:
#em,hl = scores(test_y.values,pd.DataFrame(pred_res).values)
#print('exact matching rate is:',em,'hamming loss is:',hl)

The best weights of SVM penalty and the width of the Gaussian Kernel are:

||weight of SVM panalty|Width of Gaussian Kernel|
|------|----------------------|-----------------------|
|Family|10|auto|
|Genus|10|auto|
|Species|10|auto|

in which, 'auto' means $\frac{1}{n_{features}}$, which is 0.043 in this dataset.

The exact matching rate is 1, and the hamming loss is 0.

iii. Repeat 1(b)ii with L1-penalized SVMs. Remember to standardize the attributes. Determine the weight of the SVM penalty using 10 fold cross validation.

In [14]:

kf = KFold(10,shuffle=False)
tmp = {}
for train_index, test_index in kf.split(train_x):
    cv_train, cv_test = train_x.loc[train_index], train_x.loc[test_index]
    cvy_train, cvy_test = train_y.loc[train_index], train_y.loc[test_index]
    for l in labels:
        tmp[l] = tmp.get(l,{})
        ttrain_y = cvy_train[l]
        ttest_y = cvy_test[l]
        C = [0.01,0.1,1,10,100,1000,10000]
        for i in C:
            tmp[l][i] = tmp[l].get(i,[])
            lsvc = LinearSVC(penalty='l1',dual=False,C=i)
            ovr = OneVsOneClassifier(lsvc)
            ovr.fit(cv_train,ttrain_y)
            pred_test = ovr.predict(cv_test)
            s = accuracy_score(np.array(ttest_y),np.array(pred_test))
            tmp[l][i].append(s)
best_param2 = {}
for i in tmp:
    best_param2[i] = sorted(tmp[i].items(),key=lambda x: np.mean(x[1]),reverse=True)[0][0]
pred_res2 = {}
for i in best_param2:
    ttrain_y = train_y[i]
    ttest_y = test_y[i]
    lsvc = LinearSVC(penalty='l1',dual=False,C=best_param2[i]).fit(train_x,ttrain_y)
    pred_res2[i] = lsvc.predict(test_x)
em2,hl2 = scores(test_y.values,pd.DataFrame(pred_res2).values)
print('exact matching rate is:',round(em2,4),'hamming loss is:',round(hl2,4))

exact matching rate is: 0.9453 hamming loss is: 0.0336


In [15]:
for i in tmp:
    for j in tmp[i]:
        tmp[i][j] = np.mean(tmp[i][j])

In [16]:
best_param2

{'Family': 10, 'Genus': 10, 'Species': 10}

In [17]:
'''
pred_res2 = {}
for i in labels:
    ttrain_y = train_y[i]
    ttest_y = test_y[i]
    parameters = {'C':[0.01,0.1,1,10,100,1000,10000]}
    lsvc = LinearSVC(penalty='l1',dual=False)
    cv = GridSearchCV(lsvc,parameters,cv=10)
    cv.fit(train_x,ttrain_y)
    pred_res2[i] = cv.predict(test_x)
    print("model:",i,"\n","best params:",cv.best_params_)
'''

'\npred_res2 = {}\nfor i in labels:\n    ttrain_y = train_y[i]\n    ttest_y = test_y[i]\n    parameters = {\'C\':[0.01,0.1,1,10,100,1000,10000]}\n    lsvc = LinearSVC(penalty=\'l1\',dual=False)\n    cv = GridSearchCV(lsvc,parameters,cv=10)\n    cv.fit(train_x,ttrain_y)\n    pred_res2[i] = cv.predict(test_x)\n    print("model:",i,"\n","best params:",cv.best_params_)\n'

In [18]:
#em2,hl2 = scores(test_y.values,pd.DataFrame(pred_res2).values)
#print('exact matching rate is:',round(em2,2),'hamming loss is:',round(hl2,2))

The best weights of the SVM penalty are:
||weight of SVM panalty|
|------|----------------------|
|Family|10|
|Genus|1|
|Species|1|

The exact matching rate is 0.95, and the hamming loss is 0.03.


iv. Repeat 1(b)iii by using SMOTE or any other method you know to remedy class imbalance. Report your conclusions about the classi ers you trained.

Give classes different weights to remedy the class imbalance.

In [19]:

kf = KFold(10,shuffle=False)
tmp = {}
for train_index, test_index in kf.split(train_x,train_y):
    cv_train, cv_test = train_x.loc[train_index], train_x.loc[test_index]
    cvy_train, cvy_test = train_y.loc[train_index], train_y.loc[test_index]
    for l in labels:
        tmp[l] = tmp.get(l,{})
        ttrain_y = cvy_train[l]
        ttest_y = cvy_test[l]
        C = [0.01,0.1,1,10,100]
        gamma = ['auto','scale',0.01,0.1,1,10]
        for i in C:
            for j in gamma:
                tmp[l][(i,j)] = tmp[l].get((i,j),[])
                svc = SVC(kernel='rbf',C=i,gamma=j,class_weight='balanced')
                ovr = OneVsOneClassifier(svc)
                ovr.fit(cv_train,ttrain_y)
                pred_test = ovr.predict(cv_test)
                s = accuracy_score(np.array(ttest_y),np.array(pred_test))
                tmp[l][(i,j)].append(s)
best_param3 = {}
for i in tmp:
    best_param3[i] = sorted(tmp[i].items(),key=lambda x: np.mean(x[1]),reverse=True)[0][0]
pred_res3 = {}
for i in best_param3:
    ttrain_y = train_y[i]
    ttest_y = test_y[i]
    svc = SVC(kernel='rbf',C=best_param3[i][0],gamma=best_param3[i][1],class_weight='balanced').fit(train_x,ttrain_y)
    pred_res3[i] = svc.predict(test_x)
em,hl = scores(test_y.values,pd.DataFrame(pred_res3).values)
print('exact matching rate is:',em,'hamming loss is:',hl)

exact matching rate is: 0.999536822603057 hamming loss is: 0.00023158869847150765


In [20]:
best_param3

{'Family': (1, 0.1), 'Genus': (1, 'auto'), 'Species': (1, 'auto')}

In [21]:
'''
pred_res3 = {}
for i in labels:
    ttrain_y = train_y[i]
    ttest_y = test_y[i]
    parameters = {'C':[0.01,0.1,1,10,100],'gamma':('auto','scale',0.01,0.1,1,10)}
    svc = SVC(kernel='rbf',decision_function_shape='ovr',class_weight='balanced')
    cv = GridSearchCV(svc,parameters,cv=10)
    cv.fit(train_x,ttrain_y)
    pred_res3[i] = cv.predict(test_x)
    print("model:",i,"\n","best params:",cv.best_params_)
'''

'\npred_res3 = {}\nfor i in labels:\n    ttrain_y = train_y[i]\n    ttest_y = test_y[i]\n    parameters = {\'C\':[0.01,0.1,1,10,100],\'gamma\':(\'auto\',\'scale\',0.01,0.1,1,10)}\n    svc = SVC(kernel=\'rbf\',decision_function_shape=\'ovr\',class_weight=\'balanced\')\n    cv = GridSearchCV(svc,parameters,cv=10)\n    cv.fit(train_x,ttrain_y)\n    pred_res3[i] = cv.predict(test_x)\n    print("model:",i,"\n","best params:",cv.best_params_)\n'

In [22]:
#em3,hl3 = scores(test_y.values,pd.DataFrame(pred_res3).values)
#print('exact matching rate is:',round(em3,2),'hamming loss is:',round(hl3,2))

The best weights of SVM penalty and the width of the Gaussian Kernel are:

||weight of SVM panalty|Width of Gaussian Kernel|
|------|----------------------|-----------------------|
|Family|10|auto|
|Genus|1|0.1|
|Species|1|0.1|

in which, 'auto' means $\frac{1}{n_{features}}$, which is 0.043 in this dataset.

The exact matching rate is 1, and the hamming loss is 0.



Q2

Monte-Carlo Simulation: Perform the following procedures 50 times, and report the average and standard deviation of the 50 Hamming Distances that you calculate.

In [23]:
def hammings(res):
    distance = 0
    for i in range(len(res)):
        if res.loc[i]['Family_true'] != res.loc[i]['Family_pred']:
            distance += 1
        if res.loc[i]['Genus_true'] != res.loc[i]['Genus_pred']:
            distance += 1
        if res.loc[i]['Species_true'] != res.loc[i]['Species_pred']:
            distance += 1
    loss = distance/(3*len(res))
    score = (3*len(res)-distance)/(3*len(res)+distance)
    return {'distance':distance,'score':score,'loss':loss}

(a) Use k-means clustering on the whole Anuran Calls (MFCCs) Data Set (do not split the data into train and test, as we are not performing supervised learning in this exercise). Choose k ∈ {1, 2, . . . , 50} automatically based on one of the methods provided in the slides (CH or Gap Statistics or scree plots or Silhouettes) or any other method you know.

(b) In each cluster, determine which family is the majority by reading the true labels. Repeat for genus and species.

(c) Now for each cluster you have a majority label triplet (family, genus, species). Calculate the average Hamming distance, Hamming score, and Hamming loss between the true labels and the labels assigned by clusters.

Ans:
THe program choose k automatically based on max CH.

In [24]:
distance_list = []
for i in range(50):
    cluster_score = []
    for k in range(2,51):
        cluster_res = KMeans(n_clusters=k).fit_predict(data)
        cluster_score.append(calinski_harabasz_score(data,cluster_res))

    print('best k:',np.array(cluster_score).argmax()+2)

    labels['class'] = cluster_res
    class_pred = labels.groupby('class').apply(pd.DataFrame.mode)
    class_pred.index = range(len(class_pred))
    class_pred = pd.merge(labels,class_pred,on='class',suffixes=['_true','_pred'])
    hamming = hammings(class_pred)
    distance_list.append(hamming['distance'])
    print(hamming)

best k: 49
{'distance': 3, 'score': 0.999722067815453, 'loss': 0.0001389854065323141}
best k: 48
{'distance': 3, 'score': 0.999722067815453, 'loss': 0.0001389854065323141}
best k: 49
{'distance': 84, 'score': 0.9922469887858231, 'loss': 0.003891591382904795}
best k: 45
{'distance': 3, 'score': 0.999722067815453, 'loss': 0.0001389854065323141}
best k: 47
{'distance': 39, 'score': 0.996392896781354, 'loss': 0.0018068102849200835}
best k: 48
{'distance': 84, 'score': 0.9922469887858231, 'loss': 0.003891591382904795}
best k: 45
{'distance': 84, 'score': 0.9922469887858231, 'loss': 0.003891591382904795}
best k: 49
{'distance': 39, 'score': 0.996392896781354, 'loss': 0.0018068102849200835}
best k: 50
{'distance': 39, 'score': 0.996392896781354, 'loss': 0.0018068102849200835}
best k: 48
{'distance': 48, 'score': 0.9955623353210373, 'loss': 0.0022237665045170257}
best k: 45
{'distance': 84, 'score': 0.9922469887858231, 'loss': 0.003891591382904795}
best k: 50
{'distance': 3, 'score': 0.9997220

In [25]:
print('The mean of Monte-Carlo Hamming distance:', round(np.mean(distance_list),2))

The mean of Monte-Carlo Hamming distance: 50.88


In [26]:
print('The standard deviation of Monte-Carlo Hamming distance:', round(np.std(distance_list),2))

The standard deviation of Monte-Carlo Hamming distance: 33.45


The mean of 50 Hamming distances is 50.88 and the variance is 33.45.

3. ISLR 12.6.2

![1](3_1.jpeg)

![2](3_2.jpeg)

【reference】
1. Sklearn api reference: https://scikit-learn.org/stable/modules/classes.html
2. Pandas api reference: https://pandas.pydata.org/pandas-docs/stable/reference/index.html
3. GroupBy and select most common value: https://stackoverflow.com/questions/15222754/groupby-pandas-dataframe-and-select-most-common-value
4. Hamming score/loss: https://stackoverflow.com/questions/32239577/getting-the-accuracy-for-multi-label-prediction-in-scikit-learn
