# COMP47590: Advanced Machine Learning
# Assignment 1: Multi-label Classification

Name(s): Hanyuxi Zhou

Student Number(s): 19204163

## Import Packages Etc

In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import math

from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_array, check_is_fitted
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import hamming_loss
from sklearn.metrics import classification_report
from sklearn.metrics import multilabel_confusion_matrix
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from imblearn.under_sampling import RandomUnderSampler

# import other useful packages

## Task 0: Load the Yeast Dataset

In [60]:
# Write your code here
# Load the yeast data set: the last 14 columns are the binary labels,
# every column before them is a numeric feature.
dataset = pd.read_csv("yeast.csv")
X = dataset.iloc[:, :-14].values
Y = dataset.iloc[:, -14:].values

# Show the raw matrices first, then their shapes.
for matrix in (X, Y):
    print(matrix)
for matrix in (X, Y):
    print(matrix.shape)

# Class balance of each label: the distinct values and how often each occurs.
for label_column in Y.T[:14]:
    unique, counts = np.unique(label_column, return_counts=True)
    print(unique, counts)

[[ 0.004168 -0.170975 -0.156748 ... -0.018312  0.030126  0.124722]
 [-0.103956  0.011879 -0.098986 ... -0.041471 -0.079758  0.017161]
 [ 0.509949  0.401709  0.293799 ...  0.02671  -0.066565 -0.122352]
 ...
 [ 0.082526 -0.095571 -0.022019 ... -0.066957  0.260121 -0.125303]
 [-0.13083   0.008868 -0.009457 ... -0.122332 -0.022453  0.001953]
 [-0.171578 -0.066536  0.168206 ... -0.083342 -0.063135  0.01881 ]]
[[0 0 0 ... 1 1 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]]
(2417, 103)
(2417, 14)
[0 1] [1655  762]
[0 1] [1379 1038]
[0 1] [1434  983]
[0 1] [1555  862]
[0 1] [1695  722]
[0 1] [1820  597]
[0 1] [1989  428]
[0 1] [1937  480]
[0 1] [2239  178]
[0 1] [2164  253]
[0 1] [2128  289]
[0 1] [ 601 1816]
[0 1] [ 618 1799]
[0 1] [2383   34]


## Task 1: Implement the Binary Relevance Algorithm

In [125]:
# Write your code here
# Create a new classifier based on the scikit-learn BaseEstimator and ClassifierMixin classes
class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):
    """Binary relevance multi-label classifier.

    Trains one independent binary classifier per label column and stacks
    their outputs column-wise at prediction time.

    Parameters
    ----------
    base_estimator : {'None', 'DecisionTree', 'LogisticRegression'} or None, \
default='None'
        Name of the per-label classifier. ``None``, ``'None'`` and
        ``'DecisionTree'`` select ``DecisionTreeClassifier(max_depth=5)``;
        ``'LogisticRegression'`` selects ``LogisticRegression()``.
    n_estimators : int, default=14
        Accepted for backward compatibility only and not used: one estimator
        is always fitted per column of ``Y``.

    Attributes
    ----------
    n_labels_ : int
        Number of label columns seen in ``fit``.
    estimators_ : list
        The fitted per-label classifiers, in label-column order.
    base_estimator_ : str
        Canonical name of the base estimator resolved in ``fit``.
    n_estimators_ : int
        Number of fitted estimators (equal to ``n_labels_``).
    """

    def __init__(self, base_estimator='None', n_estimators=14):
        # scikit-learn reads constructor arguments back via getattr(self, name)
        # in get_params()/clone()/repr(), so each one must be stored unmodified
        # under its own name. Trailing-underscore attributes are set in fit().
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators

    def _resolve_base_estimator(self):
        """Return ``(canonical_name, factory)`` for the configured estimator.

        Raises
        ------
        ValueError
            If ``base_estimator`` is not one of the supported names.
        """
        name = self.base_estimator
        if name is None or name in ('None', 'DecisionTree'):
            return 'DecisionTree', lambda: DecisionTreeClassifier(max_depth=5)
        if name == 'LogisticRegression':
            return 'LogisticRegression', LogisticRegression
        raise ValueError(
            "base_estimator must be None, 'None', 'DecisionTree' or "
            "'LogisticRegression'; got %r" % (name,)
        )

    @staticmethod
    def _positive_proba(estimator, proba):
        """Return P(label == 1) from one estimator's ``predict_proba`` output."""
        if proba.shape[1] > 1:
            # Two classes were seen in training; for 0/1 labels column 1
            # belongs to class 1.
            return proba[:, 1]
        # Only one class was seen in training, so there is a single column:
        # the positive probability is 1 if that class was 1, otherwise 0.
        return np.full(proba.shape[0], float(estimator.classes_[0] == 1))

    def fit(self, X, Y):
        """Fit one binary classifier per label column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        Y : array-like of shape (n_samples, n_labels)
            Binary indicator matrix; column ``i`` is the target of the
            ``i``-th classifier.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``Y`` is not two-dimensional or ``base_estimator`` is unknown.
        """
        X = check_array(X)
        # np.asarray lets DataFrames and nested lists be column-sliced below.
        Y = np.asarray(Y)
        if Y.ndim != 2:
            raise ValueError(
                "Y must be a 2-D indicator matrix of shape "
                "(n_samples, n_labels); got %d dimension(s)" % Y.ndim
            )

        self.base_estimator_, make_estimator = self._resolve_base_estimator()
        self.n_labels_ = Y.shape[1]

        self.estimators_ = [make_estimator() for _ in range(self.n_labels_)]
        for label_index, estimator in enumerate(self.estimators_):
            estimator.fit(X, Y[:, label_index])
        self.n_estimators_ = len(self.estimators_)

        return self

    def predict(self, X):
        """Predict the label matrix for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        predictions : ndarray of shape (n_samples, n_labels)
            For every label, the class with the highest predicted probability.
        """
        # Check that fit has been called by confirming n_labels_ is set
        check_is_fitted(self, ['n_labels_'])
        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        predictions = []
        for estimator in self.estimators_:
            proba = estimator.predict_proba(X)
            predictions.append(estimator.classes_[np.argmax(proba, axis=1)])

        return np.array(predictions).T

    def predict_proba(self, X):
        """Predict, for every label, the probability that it is present.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_labels)
            Entry ``[s, i]`` is the probability that label ``i`` equals 1 for
            sample ``s``.
        """
        # Check that fit has been called by confirming n_labels_ is set
        check_is_fitted(self, ['n_labels_'])
        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        proba = [
            self._positive_proba(estimator, estimator.predict_proba(X))
            for estimator in self.estimators_
        ]
        return np.array(proba).T

In [126]:
# Smoke test on the training data itself: fit the default (decision tree)
# binary relevance model, then show predictions, true labels and scores.
binary_rele = BinaryRelevanceClassifier().fit(X, Y)
pred, proba = binary_rele.predict(X), binary_rele.predict_proba(X)
for matrix in (pred, Y, proba):
    print(matrix)

[[0 0 1 ... 1 1 0]
 [0 0 1 ... 0 0 0]
 [0 0 1 ... 1 1 0]
 ...
 [0 1 1 ... 1 1 0]
 [0 1 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]]
[[0 0 0 ... 1 1 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]]
[[0.22145329 0.065      0.72380952 ... 0.86419753 0.83972912 0.01084689]
 [0.31404959 0.22875817 0.57971014 ... 0.4822695  0.4822695  0.01084689]
 [0.22145329 0.415      0.5483871  ... 0.77040816 0.76020408 0.01084689]
 ...
 [0.22145329 0.73611111 0.54285714 ... 0.6825054  0.66738661 0.01084689]
 [0.29378531 0.61087866 0.14953271 ... 0.6825054  0.66738661 0.01084689]
 [0.22145329 0.85046729 0.54285714 ... 0.86419753 0.83972912 0.01084689]]


## Task 2: Implement the Binary Relevance Algorithm with Under-Sampling

In [136]:
# Write your code here
# Create a new classifier based on the scikit-learn BaseEstimator and ClassifierMixin classes
class BinaryRelevanceClassifier(BaseEstimator, ClassifierMixin):
    """Binary relevance multi-label classifier with optional under-sampling.

    Trains one independent binary classifier per label column and stacks
    their outputs column-wise at prediction time. When ``under_sample`` is
    set, each label's training set is balanced with random under-sampling
    before its classifier is fitted.

    Parameters
    ----------
    base_estimator : {'None', 'DecisionTree', 'LogisticRegression'} or None, \
default='None'
        Name of the per-label classifier. ``None``, ``'None'`` and
        ``'DecisionTree'`` select ``DecisionTreeClassifier(max_depth=5)``;
        ``'LogisticRegression'`` selects ``LogisticRegression()``.
    n_estimators : int, default=14
        Accepted for backward compatibility only and not used: one estimator
        is always fitted per column of ``Y``.
    under_sample : bool, default=False
        Whether to apply ``RandomUnderSampler(random_state=0)`` to each
        label's training data.

    Attributes
    ----------
    n_labels_ : int
        Number of label columns seen in ``fit``.
    estimators_ : list
        The fitted per-label classifiers, in label-column order.
    base_estimator_ : str
        Canonical name of the base estimator resolved in ``fit``.
    under_sample_ : bool
        Whether under-sampling was applied in the last ``fit``.
    n_estimators_ : int
        Number of fitted estimators (equal to ``n_labels_``).
    """

    def __init__(self, base_estimator='None', n_estimators=14, under_sample=False):
        # scikit-learn reads constructor arguments back via getattr(self, name)
        # in get_params()/clone()/repr(), so each one must be stored unmodified
        # under its own name. Trailing-underscore attributes are set in fit().
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.under_sample = under_sample

    def _resolve_base_estimator(self):
        """Return ``(canonical_name, factory)`` for the configured estimator.

        Raises
        ------
        ValueError
            If ``base_estimator`` is not one of the supported names.
        """
        name = self.base_estimator
        if name is None or name in ('None', 'DecisionTree'):
            return 'DecisionTree', lambda: DecisionTreeClassifier(max_depth=5)
        if name == 'LogisticRegression':
            return 'LogisticRegression', LogisticRegression
        raise ValueError(
            "base_estimator must be None, 'None', 'DecisionTree' or "
            "'LogisticRegression'; got %r" % (name,)
        )

    @staticmethod
    def _positive_proba(estimator, proba):
        """Return P(label == 1) from one estimator's ``predict_proba`` output."""
        if proba.shape[1] > 1:
            # Two classes were seen in training; for 0/1 labels column 1
            # belongs to class 1.
            return proba[:, 1]
        # Only one class was seen in training, so there is a single column:
        # the positive probability is 1 if that class was 1, otherwise 0.
        return np.full(proba.shape[0], float(estimator.classes_[0] == 1))

    def fit(self, X, Y):
        """Fit one binary classifier per label column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        Y : array-like of shape (n_samples, n_labels)
            Binary indicator matrix; column ``i`` is the target of the
            ``i``-th classifier.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``Y`` is not two-dimensional or ``base_estimator`` is unknown.
        """
        X = check_array(X)
        # np.asarray lets DataFrames and nested lists be column-sliced below.
        Y = np.asarray(Y)
        if Y.ndim != 2:
            raise ValueError(
                "Y must be a 2-D indicator matrix of shape "
                "(n_samples, n_labels); got %d dimension(s)" % Y.ndim
            )

        self.base_estimator_, make_estimator = self._resolve_base_estimator()
        self.under_sample_ = bool(self.under_sample)
        self.n_labels_ = Y.shape[1]

        self.estimators_ = [make_estimator() for _ in range(self.n_labels_)]
        for label_index, estimator in enumerate(self.estimators_):
            y = Y[:, label_index]
            if self.under_sample_:
                # Balance this label's classes; the sampler is re-created per
                # label because each label has its own class distribution.
                sampler = RandomUnderSampler(random_state=0)
                X_fit, y_fit = sampler.fit_resample(X, y)
            else:
                X_fit, y_fit = X, y
            estimator.fit(X_fit, y_fit)
        self.n_estimators_ = len(self.estimators_)

        return self

    def predict(self, X):
        """Predict the label matrix for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        predictions : ndarray of shape (n_samples, n_labels)
            For every label, the class with the highest predicted probability.
        """
        # Check that fit has been called by confirming n_labels_ is set
        check_is_fitted(self, ['n_labels_'])
        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        predictions = []
        for estimator in self.estimators_:
            proba = estimator.predict_proba(X)
            predictions.append(estimator.classes_[np.argmax(proba, axis=1)])

        return np.array(predictions).T

    def predict_proba(self, X):
        """Predict, for every label, the probability that it is present.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_labels)
            Entry ``[s, i]`` is the probability that label ``i`` equals 1 for
            sample ``s``.
        """
        # Check that fit has been called by confirming n_labels_ is set
        check_is_fitted(self, ['n_labels_'])
        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        proba = [
            self._positive_proba(estimator, estimator.predict_proba(X))
            for estimator in self.estimators_
        ]
        return np.array(proba).T

In [129]:
# Smoke test of the under-sampled decision tree variant on the training data:
# show predictions, true labels and positive-class scores.
binary_rele = BinaryRelevanceClassifier(under_sample=True).fit(X, Y)
pred, proba = binary_rele.predict(X), binary_rele.predict_proba(X)
for matrix in (pred, Y, proba):
    print(matrix)


[[0 0 1 ... 0 0 1]
 [0 0 1 ... 0 0 1]
 [0 1 1 ... 0 0 0]
 ...
 [0 1 1 ... 0 0 0]
 [0 0 0 ... 0 1 1]
 [0 1 1 ... 0 0 0]]
[[0 0 0 ... 1 1 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]]
[[0.         0.33834586 0.91071429 ... 0.44270833 0.30434783 1.        ]
 [0.2625     0.33834586 0.90147783 ... 0.3699115  0.28706625 1.        ]
 [0.24200913 0.6375     0.53448276 ... 0.3699115  0.49586777 0.        ]
 ...
 [0.24200913 0.6375     0.72207084 ... 0.3699115  0.28706625 0.        ]
 [0.16216216 0.33834586 0.22105263 ... 0.3699115  0.65853659 1.        ]
 [0.24200913 0.87096774 0.90147783 ... 0.3699115  0.49586777 0.        ]]


In [130]:
# Smoke test of the under-sampled logistic regression variant on the training
# data: show predictions, true labels and positive-class scores.
binary_rele = BinaryRelevanceClassifier(
    base_estimator="LogisticRegression",
    under_sample=True,
).fit(X, Y)
pred, proba = binary_rele.predict(X), binary_rele.predict_proba(X)
for matrix in (pred, Y, proba):
    print(matrix)

[[0 0 0 ... 1 1 0]
 [0 0 1 ... 0 0 1]
 [0 1 0 ... 0 0 0]
 ...
 [1 1 1 ... 0 0 0]
 [1 1 0 ... 0 0 1]
 [0 1 1 ... 1 1 1]]
[[0 0 0 ... 1 1 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]]
[[0.47050366 0.42130722 0.27015383 ... 0.64266192 0.63275494 0.44447845]
 [0.15148865 0.37464066 0.87206866 ... 0.29911845 0.2333348  0.5877415 ]
 [0.4973723  0.5302785  0.35498791 ... 0.40057232 0.3575375  0.34963665]
 ...
 [0.54620083 0.58971551 0.55140682 ... 0.35482016 0.29863313 0.43422876]
 [0.80717333 0.60707851 0.126138   ... 0.32074876 0.31107938 0.51618535]
 [0.410651   0.72063298 0.77062793 ... 0.5545015  0.6260146  0.61099592]]


## Task 3: Compare the Performance of Different Binary Relevance Approaches

In [113]:
#X_train, X_test, y_train, y_test = train_test_split(X, Y, random_state=0, test_size = 0.30, train_size = 0.7)
# 10-fold cross-validation splitter shared by all evaluation cells below.
n_splits = 10
kf = KFold(n_splits=n_splits)

# Display the row indices assigned to the training and test part of each fold.
for train_index, test_index in kf.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]


TRAIN: [ 242  243  244 ... 2414 2415 2416] TEST: [  0   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17
  18  19  20  21  22  23  24  25  26  27  28  29  30  31  32  33  34  35
  36  37  38  39  40  41  42  43  44  45  46  47  48  49  50  51  52  53
  54  55  56  57  58  59  60  61  62  63  64  65  66  67  68  69  70  71
  72  73  74  75  76  77  78  79  80  81  82  83  84  85  86  87  88  89
  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 105 106 107
 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125
 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143
 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161
 162 163 164 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179
 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197
 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215
 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233
 2

In [142]:
# Cross-validate binary relevance with the default (decision tree) estimator.
binary_rele = BinaryRelevanceClassifier()

hamming, f1_macro = [], []
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]

    binary_rele.fit(X_train, y_train)
    pred = binary_rele.predict(X_test)
    proba = binary_rele.predict_proba(X_test)

    # Score this fold: Hamming loss plus the macro-averaged F1 from the report.
    h = hamming_loss(y_test, pred)
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    fold_f1 = report['macro avg']['f1-score']
    print("hamming loss:", h)
    print("macro avg f1-score:", fold_f1)

    hamming.append(h)
    f1_macro.append(fold_f1)

# Average both metrics over the folds.
print("\nOverall Average:")
print("Hamming Loss:", sum(hamming) / n_splits)
print("Macro Averaged F1 Score:", sum(f1_macro) / n_splits)

hamming loss: 0.22933884297520662
macro avg f1-score: 0.38306137433858906
hamming loss: 0.21782762691853602
macro avg f1-score: 0.36676302925399457
hamming loss: 0.2231404958677686
macro avg f1-score: 0.36219518687334734
hamming loss: 0.21487603305785125
macro avg f1-score: 0.35744320799403345
hamming loss: 0.21517119244391972
macro avg f1-score: 0.373906600830188
hamming loss: 0.22992916174734357
macro avg f1-score: 0.3349752745934652
hamming loss: 0.22786304604486424
macro avg f1-score: 0.3503488497489843
hamming loss: 0.22673384706579727
macro avg f1-score: 0.35036876176202086
hamming loss: 0.22851215174866626
macro avg f1-score: 0.36132709755570724
hamming loss: 0.2148784825133373
macro avg f1-score: 0.3623360926567663

Overall Average:
Hamming Loss: 0.2228270880383291
Macro Averaged F1 Score: 0.3602725475607097


In [137]:
# Write your code here
# Cross-validate binary relevance (decision tree) with random under-sampling.
binary_rele = BinaryRelevanceClassifier(under_sample=True)

hamming, f1_macro = [], []
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]

    binary_rele.fit(X_train, y_train)
    pred = binary_rele.predict(X_test)
    proba = binary_rele.predict_proba(X_test)

    # Score this fold: Hamming loss plus the macro-averaged F1 from the report.
    h = hamming_loss(y_test, pred)
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    fold_f1 = report['macro avg']['f1-score']
    print("hamming loss:", h)
    print("macro avg f1-score:", fold_f1)

    hamming.append(h)
    f1_macro.append(fold_f1)

# Average both metrics over the folds.
print("\nOverall Average:")
print("Hamming Loss:", sum(hamming) / n_splits)
print("Macro Averaged F1 Score:", sum(f1_macro) / n_splits)

hamming loss: 0.43358913813459266
macro avg f1-score: 0.4104368542961573
hamming loss: 0.4155844155844156
macro avg f1-score: 0.4185500752305084
hamming loss: 0.41706021251475794
macro avg f1-score: 0.4099473602783191
hamming loss: 0.4203069657615112
macro avg f1-score: 0.3993736668610673
hamming loss: 0.4359504132231405
macro avg f1-score: 0.3805858979938211
hamming loss: 0.4253246753246753
macro avg f1-score: 0.42162423261917076
hamming loss: 0.4085005903187721
macro avg f1-score: 0.41610658191663197
hamming loss: 0.4149377593360996
macro avg f1-score: 0.423554273260137
hamming loss: 0.44902193242442207
macro avg f1-score: 0.39269957880455647
hamming loss: 0.42323651452282157
macro avg f1-score: 0.3864433575974561

Overall Average:
Hamming Loss: 0.42435126171452087
Macro Averaged F1 Score: 0.40593218788578256


Comparing the two binary relevance classifiers above, the Hamming loss increases substantially and the macro-averaged F1-score increases slightly once under-sampling is applied. This is because the data used in this assignment is imbalanced. Because the labels are imbalanced, Hamming loss on its own is not a suitable measure for evaluating the models trained without under-sampling.

Without under-sampling, our models overfit, which is why the Hamming loss is lower. With random under-sampling, the balanced training data lets the models generalise better, which leads to the increase in F1-score.

In [133]:
# Write your code here
# Cross-validate binary relevance with logistic regression, no under-sampling.
binary_rele = BinaryRelevanceClassifier(base_estimator='LogisticRegression')

hamming, f1_macro = [], []
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]

    binary_rele.fit(X_train, y_train)
    pred = binary_rele.predict(X_test)
    proba = binary_rele.predict_proba(X_test)

    # Score this fold: Hamming loss plus the macro-averaged F1 from the report.
    h = hamming_loss(y_test, pred)
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    fold_f1 = report['macro avg']['f1-score']
    print("hamming loss:", h)
    print("macro avg f1-score:", fold_f1)

    hamming.append(h)
    f1_macro.append(fold_f1)

# Average both metrics over the folds.
print("\nOverall Average:")
print("Hamming Loss:", sum(hamming) / n_splits)
print("Macro Averaged F1 Score:", sum(f1_macro) / n_splits)

hamming loss: 0.20543093270365997
macro avg f1-score: 0.3451838521744275
hamming loss: 0.20484061393152303
macro avg f1-score: 0.3427805515669951
hamming loss: 0.19539551357733176
macro avg f1-score: 0.3415257969031909
hamming loss: 0.1894923258559622
macro avg f1-score: 0.34717073278934585
hamming loss: 0.18181818181818182
macro avg f1-score: 0.3626381142340496
hamming loss: 0.20690672963400236
macro avg f1-score: 0.3336461045204027
hamming loss: 0.19480519480519481
macro avg f1-score: 0.36508216713278296
hamming loss: 0.2074688796680498
macro avg f1-score: 0.3452604659058683
hamming loss: 0.2021339656194428
macro avg f1-score: 0.34971718445036654
hamming loss: 0.1982809721398933
macro avg f1-score: 0.3481433178742445

Overall Average:
Hamming Loss: 0.19865733097532418
Macro Averaged F1 Score: 0.34811482875516736


Logistic regression takes less time to train than the default estimator (decision tree), and it obtains both a lower F1-score and a lower Hamming loss than the decision tree. So the decision tree generalises better than logistic regression in this case.

In [118]:
# Cross-validate binary relevance with logistic regression and under-sampling.
binary_rele = BinaryRelevanceClassifier(
    base_estimator='LogisticRegression',
    under_sample=True,
)

hamming, f1_macro = [], []
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]

    binary_rele.fit(X_train, y_train)
    pred = binary_rele.predict(X_test)
    proba = binary_rele.predict_proba(X_test)

    # Score this fold: Hamming loss plus the macro-averaged F1 from the report.
    h = hamming_loss(y_test, pred)
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    fold_f1 = report['macro avg']['f1-score']
    print("hamming loss:", h)
    print("macro avg f1-score:", fold_f1)

    hamming.append(h)
    f1_macro.append(fold_f1)

# Average both metrics over the folds.
print("\nOverall Average:")
print("Hamming Loss:", sum(hamming) / n_splits)
print("Macro Averaged F1 Score:", sum(f1_macro) / n_splits)

hamming loss: 0.38016528925619836
macro avg f1-score: 0.44505864350502883
hamming loss: 0.3730814639905549
macro avg f1-score: 0.4465598290238567
hamming loss: 0.3739669421487603
macro avg f1-score: 0.43995666720085325
hamming loss: 0.36216056670602126
macro avg f1-score: 0.44405242765149505
hamming loss: 0.35360094451003543
macro avg f1-score: 0.45682417596997354
hamming loss: 0.3695395513577332
macro avg f1-score: 0.4640286038145886
hamming loss: 0.36068476977567887
macro avg f1-score: 0.462328032701984
hamming loss: 0.3755186721991701
macro avg f1-score: 0.45403175151363967
hamming loss: 0.36751630112625966
macro avg f1-score: 0.4501079308606724
hamming loss: 0.37196206283343214
macro avg f1-score: 0.44637579066345856

Overall Average:
Hamming Loss: 0.3688196563903844
Macro Averaged F1 Score: 0.45093238529055507


Binary relevance with logistic regression and under-sampling gives a better F1-score and a lower Hamming loss than the decision tree with under-sampling, which means logistic regression does better not only in generalisation but also in accuracy when under-sampling is applied.

Overall, without under-sampling the decision tree performs better than logistic regression, but with under-sampling the opposite is true.

## Task 4: Implement the Classifier Chains Algorithm

In [139]:
# Write your code here
# Create a new classifier based on the scikit-learn BaseEstimator and ClassifierMixin classes
class ClassifierChains(BaseEstimator, ClassifierMixin):
    """Classifier chain multi-label classifier with optional under-sampling.

    Label ``i`` is predicted from the input features plus labels
    ``0 .. i-1``. During ``fit`` the true preceding labels are appended to the
    features; during prediction the labels predicted earlier in the chain are
    appended instead. The chain follows the column order of ``Y``.

    Parameters
    ----------
    base_estimator : {'None', 'DecisionTree', 'LogisticRegression'} or None, \
default='None'
        Name of the per-label classifier. ``None``, ``'None'`` and
        ``'DecisionTree'`` select ``DecisionTreeClassifier(max_depth=5)``;
        ``'LogisticRegression'`` selects ``LogisticRegression()``.
    n_estimators : int, default=14
        Accepted for backward compatibility only and not used: one estimator
        is always fitted per column of ``Y``.
    under_sample : bool, default=False
        Whether to apply ``RandomUnderSampler(random_state=0)`` to each
        label's training data.

    Attributes
    ----------
    n_labels_ : int
        Number of label columns seen in ``fit``.
    estimators_ : list
        The fitted per-label classifiers, in chain order.
    base_estimator_ : str
        Canonical name of the base estimator resolved in ``fit``.
    under_sample_ : bool
        Whether under-sampling was applied in the last ``fit``.
    n_estimators_ : int
        Number of fitted estimators (equal to ``n_labels_``).
    """

    def __init__(self, base_estimator='None', n_estimators=14, under_sample=False):
        # scikit-learn reads constructor arguments back via getattr(self, name)
        # in get_params()/clone()/repr(), so each one must be stored unmodified
        # under its own name. Trailing-underscore attributes are set in fit().
        self.base_estimator = base_estimator
        self.n_estimators = n_estimators
        self.under_sample = under_sample

    def _resolve_base_estimator(self):
        """Return ``(canonical_name, factory)`` for the configured estimator.

        Raises
        ------
        ValueError
            If ``base_estimator`` is not one of the supported names.
        """
        name = self.base_estimator
        if name is None or name in ('None', 'DecisionTree'):
            return 'DecisionTree', lambda: DecisionTreeClassifier(max_depth=5)
        if name == 'LogisticRegression':
            return 'LogisticRegression', LogisticRegression
        raise ValueError(
            "base_estimator must be None, 'None', 'DecisionTree' or "
            "'LogisticRegression'; got %r" % (name,)
        )

    @staticmethod
    def _positive_proba(estimator, proba):
        """Return P(label == 1) from one estimator's ``predict_proba`` output."""
        if proba.shape[1] > 1:
            # Two classes were seen in training; for 0/1 labels column 1
            # belongs to class 1.
            return proba[:, 1]
        # Only one class was seen in training, so there is a single column:
        # the positive probability is 1 if that class was 1, otherwise 0.
        return np.full(proba.shape[0], float(estimator.classes_[0] == 1))

    def _run_chain(self, X):
        """Run the fitted chain over ``X``.

        Returns
        -------
        labels : ndarray of shape (n_samples, n_labels)
            Predicted class of every label.
        scores : ndarray of shape (n_samples, n_labels)
            Predicted probability that every label equals 1.
        """
        # Check that fit has been called by confirming n_labels_ is set
        check_is_fitted(self, ['n_labels_'])
        # Check that the input features match the type and shape of the training features
        X = check_array(X)

        labels, scores = [], []
        chain_input = X
        for estimator in self.estimators_:
            proba = estimator.predict_proba(chain_input)
            pred = estimator.classes_[np.argmax(proba, axis=1)]
            labels.append(pred)
            scores.append(self._positive_proba(estimator, proba))
            # The next link sees the features plus every label predicted so far.
            chain_input = np.c_[chain_input, pred]

        return np.array(labels).T, np.array(scores).T

    def fit(self, X, Y):
        """Fit the chain of binary classifiers.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The training input samples.
        Y : array-like of shape (n_samples, n_labels)
            Binary indicator matrix; column ``i`` is the target of the
            ``i``-th link of the chain.

        Returns
        -------
        self : object

        Raises
        ------
        ValueError
            If ``Y`` is not two-dimensional or ``base_estimator`` is unknown.
        """
        X = check_array(X)
        # np.asarray lets DataFrames and nested lists be column-sliced below.
        Y = np.asarray(Y)
        if Y.ndim != 2:
            raise ValueError(
                "Y must be a 2-D indicator matrix of shape "
                "(n_samples, n_labels); got %d dimension(s)" % Y.ndim
            )

        self.base_estimator_, make_estimator = self._resolve_base_estimator()
        self.under_sample_ = bool(self.under_sample)
        self.n_labels_ = Y.shape[1]

        self.estimators_ = [make_estimator() for _ in range(self.n_labels_)]
        for label_index, estimator in enumerate(self.estimators_):
            y = Y[:, label_index]
            # Features for this link: X plus the true values of all earlier labels.
            x = np.c_[X, Y[:, :label_index]]
            if self.under_sample_:
                # Balance this label's classes; the sampler is re-created per
                # label because each label has its own class distribution.
                sampler = RandomUnderSampler(random_state=0)
                x, y = sampler.fit_resample(x, y)
            estimator.fit(x, y)
        self.n_estimators_ = len(self.estimators_)

        return self

    def predict(self, X):
        """Predict the label matrix for the input samples.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        predictions : ndarray of shape (n_samples, n_labels)
            For every label, the class with the highest predicted probability.
        """
        labels, _ = self._run_chain(X)
        return labels

    def predict_proba(self, X):
        """Predict, for every label, the probability that it is present.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            The input samples.

        Returns
        -------
        proba : ndarray of shape (n_samples, n_labels)
            Entry ``[s, i]`` is the probability that label ``i`` equals 1 for
            sample ``s``, given the labels predicted earlier in the chain.
        """
        _, scores = self._run_chain(X)
        return scores

In [75]:
# Smoke test of the logistic regression classifier chain on the training data:
# show predictions, true labels and positive-class scores.
CC = ClassifierChains(base_estimator="LogisticRegression").fit(X, Y)
pred, proba = CC.predict(X), CC.predict_proba(X)
for matrix in (pred, Y, proba):
    print(matrix)


[[0 0 0 ... 1 1 0]
 [0 0 1 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 ...
 [0 0 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 1 1 ... 1 1 0]]
[[0 0 0 ... 1 1 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]]
[[3.08791912e-01 1.55615313e-01 1.96510323e-01 ... 9.53132243e-01
  9.95353301e-01 5.66921610e-04]
 [5.15711182e-02 3.03474626e-01 7.37509588e-01 ... 6.12125406e-01
  9.92144130e-01 5.21859799e-02]
 [2.72429887e-01 2.72789365e-01 1.30354009e-01 ... 7.06011728e-01
  9.86026481e-01 4.62885701e-04]
 ...
 [3.36162679e-01 2.31410098e-01 3.70094188e-01 ... 4.88129905e-01
  6.43892392e-03 1.12598358e-03]
 [6.46892087e-01 8.45303185e-01 8.92985868e-02 ... 4.21264118e-01
  1.21756532e-02 3.88178391e-04]
 [2.44111785e-01 5.35477273e-01 9.66202244e-01 ... 8.02198064e-01
  9.92460437e-01 2.15122835e-03]]


In [110]:
# Smoke test of the under-sampled logistic regression classifier chain on the
# training data: show predictions, true labels and positive-class scores.
CC = ClassifierChains(
    base_estimator="LogisticRegression",
    under_sample=True,
).fit(X, Y)
pred, proba = CC.predict(X), CC.predict_proba(X)
for matrix in (pred, Y, proba):
    print(matrix)

[[0 0 0 ... 1 1 0]
 [0 0 1 ... 0 0 1]
 [0 0 0 ... 0 0 0]
 ...
 [1 1 0 ... 0 0 0]
 [1 1 0 ... 0 0 0]
 [0 1 1 ... 1 1 0]]
[[0 0 0 ... 1 1 0]
 [0 0 1 ... 0 0 0]
 [0 1 1 ... 1 1 0]
 ...
 [0 0 0 ... 1 1 0]
 [0 0 0 ... 1 1 0]
 [0 1 1 ... 1 1 0]]
[[0.47050366 0.19753709 0.24725622 ... 0.77035627 0.99129992 0.18121228]
 [0.15148865 0.4134751  0.80519207 ... 0.3821041  0.01320216 0.85315683]
 [0.4973723  0.30309845 0.20547767 ... 0.48943311 0.00680423 0.05388088]
 ...
 [0.54620083 0.91836182 0.46776808 ... 0.23790745 0.00712741 0.05368065]
 [0.80717333 0.86150763 0.11812734 ... 0.19625232 0.01320025 0.06460527]
 [0.410651   0.57346218 0.96739784 ... 0.57168147 0.94062326 0.07736906]]


## Task 5: Evaluate the Performance of the Classifier Chains Algorithm

In [140]:
# Cross-validate the classifier chain with the default (decision tree) estimator.
CC = ClassifierChains()

hamming, f1_macro = [], []
for train_index, test_index in kf.split(X):
    X_train, y_train = X[train_index], Y[train_index]
    X_test, y_test = X[test_index], Y[test_index]

    CC.fit(X_train, y_train)
    pred = CC.predict(X_test)
    proba = CC.predict_proba(X_test)

    # Score this fold: Hamming loss plus the macro-averaged F1 from the report.
    h = hamming_loss(y_test, pred)
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    fold_f1 = report['macro avg']['f1-score']
    print("hamming loss:", h)
    print("macro avg f1-score:", fold_f1)

    hamming.append(h)
    f1_macro.append(fold_f1)

# Average both metrics over the folds.
print("\nOverall Average:")
print("Hamming Loss:", sum(hamming) / n_splits)
print("Macro Averaged F1 Score:", sum(f1_macro) / n_splits)

hamming loss: 0.22756788665879574
macro avg f1-score: 0.39486739724861764
hamming loss: 0.22136953955135774
macro avg f1-score: 0.38622604627135854
hamming loss: 0.22609208972845338
macro avg f1-score: 0.35409874515091877
hamming loss: 0.22727272727272727
macro avg f1-score: 0.37743185147214614
hamming loss: 0.21782762691853602
macro avg f1-score: 0.3635909589505668
hamming loss: 0.2423258559622196
macro avg f1-score: 0.3495898417160115
hamming loss: 0.23140495867768596
macro avg f1-score: 0.3603434781027738
hamming loss: 0.23740367516301125
macro avg f1-score: 0.3542006400386482
hamming loss: 0.23977474807350327
macro avg f1-score: 0.34508249562918697
hamming loss: 0.21250740960284528
macro avg f1-score: 0.4170259280466703

Overall Average:
Hamming Loss: 0.22835465176091357
Macro Averaged F1 Score: 0.3702457382626899


Classifier chains with a decision tree base estimator performs similarly to binary relevance with a decision tree, in both F1-score and Hamming loss.

In [120]:
# Cross-validated evaluation of ClassifierChains using the
# "LogisticRegression" base estimator (no under-sampling argument passed).
# `kf`, `X` and `Y` are expected to exist already from earlier notebook cells.
CC = ClassifierChains(base_estimator="LogisticRegression")

hamming = []
f1_macro = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # Refit on this fold's training rows only, then score the held-out rows.
    CC.fit(X_train, y_train)
    pred = CC.predict(X_test)
    # Not used by the metrics below; kept so the notebook-level `proba`
    # variable is still assigned by this cell as before.
    proba = CC.predict_proba(X_test)

    h = hamming_loss(y_test, pred)
    # zero_division=0 reports 0 (without a warning) for labels whose
    # precision/recall would otherwise divide by zero in this fold.
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    fold_f1 = report['macro avg']['f1-score']
    print("hamming loss:", h)
    print("macro avg f1-score:", fold_f1)
    hamming.append(h)
    f1_macro.append(fold_f1)

# Average over the folds actually scored instead of the separate `n_splits`
# global, which can be stale when notebook cells are run out of order.
print("\nOverall Average:")
print("Hamming Loss:", sum(hamming) / len(hamming))
print("Macro Averaged F1 Score:", sum(f1_macro) / len(f1_macro))

hamming loss: 0.21074380165289255
macro avg f1-score: 0.39334739019811044
hamming loss: 0.2190082644628099
macro avg f1-score: 0.3645588640140102
hamming loss: 0.21517119244391972
macro avg f1-score: 0.36810735199379935
hamming loss: 0.20690672963400236
macro avg f1-score: 0.3769673459623609
hamming loss: 0.19303423848878395
macro avg f1-score: 0.4113057031756825
hamming loss: 0.21959858323494688
macro avg f1-score: 0.39392438988715545
hamming loss: 0.22107438016528927
macro avg f1-score: 0.3913571102473553
hamming loss: 0.22673384706579727
macro avg f1-score: 0.3763334979731222
hamming loss: 0.2110254890337878
macro avg f1-score: 0.390022446396581
hamming loss: 0.2107291049199763
macro avg f1-score: 0.3826091227063218

Overall Average:
Hamming Loss: 0.2134025631102206
Macro Averaged F1 Score: 0.38485332225545


When using logistic regression, classifier chains performs better than binary relevance, but logistic regression without undersampling generalizes worst with both classifier chains and binary relevance.

In [141]:
# Cross-validated evaluation of ClassifierChains with the default base
# estimator and under_sample switched on.
# `kf`, `X` and `Y` are expected to exist already from earlier notebook cells.
CC = ClassifierChains(under_sample=True)

hamming = []
f1_macro = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # Refit on this fold's training rows only, then score the held-out rows.
    CC.fit(X_train, y_train)
    pred = CC.predict(X_test)
    # Not used by the metrics below; kept so the notebook-level `proba`
    # variable is still assigned by this cell as before.
    proba = CC.predict_proba(X_test)

    h = hamming_loss(y_test, pred)
    # zero_division=0 reports 0 (without a warning) for labels whose
    # precision/recall would otherwise divide by zero in this fold.
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    fold_f1 = report['macro avg']['f1-score']
    print("hamming loss:", h)
    print("macro avg f1-score:", fold_f1)
    hamming.append(h)
    f1_macro.append(fold_f1)

# Average over the folds actually scored instead of the separate `n_splits`
# global, which can be stale when notebook cells are run out of order.
print("\nOverall Average:")
print("Hamming Loss:", sum(hamming) / len(hamming))
print("Macro Averaged F1 Score:", sum(f1_macro) / len(f1_macro))

hamming loss: 0.3819362455726092
macro avg f1-score: 0.39316581817477714
hamming loss: 0.36835891381345925
macro avg f1-score: 0.3803217333621572
hamming loss: 0.39728453364817
macro avg f1-score: 0.3836560228769058
hamming loss: 0.37662337662337664
macro avg f1-score: 0.3935884510081394
hamming loss: 0.37987012987012986
macro avg f1-score: 0.40265541981121655
hamming loss: 0.3574380165289256
macro avg f1-score: 0.3928407628266441
hamming loss: 0.3639315230224321
macro avg f1-score: 0.4210663115941138
hamming loss: 0.3663307646710136
macro avg f1-score: 0.4112174059159886
hamming loss: 0.4045643153526971
macro avg f1-score: 0.38293807748065667
hamming loss: 0.3790752815649081
macro avg f1-score: 0.39817598978873964

Overall Average:
Hamming Loss: 0.37754131006677216
Macro Averaged F1 Score: 0.39596259928393385


Binary relevance with a decision tree and undersampling performs a little better than classifier chains with the same settings, but undersampling does not give any obvious benefit to the decision tree.

In [122]:
# Cross-validated evaluation of ClassifierChains using the
# "LogisticRegression" base estimator with under_sample switched on.
# `kf`, `X` and `Y` are expected to exist already from earlier notebook cells.
CC = ClassifierChains(base_estimator="LogisticRegression", under_sample=True)

hamming = []
f1_macro = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = Y[train_index], Y[test_index]

    # Refit on this fold's training rows only, then score the held-out rows.
    CC.fit(X_train, y_train)
    pred = CC.predict(X_test)
    # Not used by the metrics below; kept so the notebook-level `proba`
    # variable is still assigned by this cell as before.
    proba = CC.predict_proba(X_test)

    h = hamming_loss(y_test, pred)
    # zero_division=0 reports 0 (without a warning) for labels whose
    # precision/recall would otherwise divide by zero in this fold.
    report = classification_report(y_test, pred, output_dict=True, zero_division=0)
    fold_f1 = report['macro avg']['f1-score']
    print("hamming loss:", h)
    print("macro avg f1-score:", fold_f1)
    hamming.append(h)
    f1_macro.append(fold_f1)

# Average over the folds actually scored instead of the separate `n_splits`
# global, which can be stale when notebook cells are run out of order.
print("\nOverall Average:")
print("Hamming Loss:", sum(hamming) / len(hamming))
print("Macro Averaged F1 Score:", sum(f1_macro) / len(f1_macro))

hamming loss: 0.3122786304604486
macro avg f1-score: 0.43299407258044403
hamming loss: 0.31286894923258557
macro avg f1-score: 0.4157864134643515
hamming loss: 0.3051948051948052
macro avg f1-score: 0.4235062388067331
hamming loss: 0.28187721369539553
macro avg f1-score: 0.46322308806312806
hamming loss: 0.30283353010625735
macro avg f1-score: 0.4323738840295679
hamming loss: 0.30844155844155846
macro avg f1-score: 0.4451529322727036
hamming loss: 0.3025383707201889
macro avg f1-score: 0.4352010633436255
hamming loss: 0.3275044457617072
macro avg f1-score: 0.4026163455158386
hamming loss: 0.3070539419087137
macro avg f1-score: 0.4258403570641513
hamming loss: 0.3020154119739182
macro avg f1-score: 0.42545784100177403

Overall Average:
Hamming Loss: 0.3062606857495579
Macro Averaged F1 Score: 0.43021522361423176


Logistic regression with undersampling generalizes best with both classifier chains and binary relevance, but it performs a little better with binary relevance.

## Task 6: Reflect on the Performance of the Different Models Evaluated

I used only two different base estimators in this assignment. Logistic regression usually runs faster than the decision tree (the default base estimator), and with undersampling it performs much better than the decision tree. A decision tree without a depth limit tends to overfit easily and generally takes more time, and undersampling does not improve it much either. So I think logistic regression with undersampling is the most suitable choice in this case.

As for the differences between binary relevance and classifier chains, the former does fewer calculations than the latter, and undersampling helps it generalize more when it is used with binary relevance. So binary relevance can generalize well with very few calculations.

All evaluation data is listed below.
#### BinaryRelevanceClassifier()
        no max_depth:
    Hamming Loss: 0.42216144850999615    Macro Averaged F1 Score: 0.4020360662730146
        max_depth=5:
    Hamming Loss: 0.2222665301503476     Macro Averaged F1 Score: 0.35877640276481165
#### BinaryRelevanceClassifier(under_sample=True)
        no max_depth:
    Hamming Loss: 0.421598563639303     Macro Averaged F1 Score: 0.4030625459701939
        max_depth=5:
    Hamming Loss: 0.42435126171452087   Macro Averaged F1 Score: 0.40593218788578256
#### BinaryRelevanceClassifier(base_estimator='LogisticRegression')
    Hamming Loss: 0.19865733097532418    Macro Averaged F1 Score: 0.34811482875516736
#### BinaryRelevanceClassifier(base_estimator='LogisticRegression',under_sample=True)
    Hamming Loss: 0.3688196563903844     Macro Averaged F1 Score: 0.45093238529055507
#### ClassifierChains()
        no max_depth:
    Hamming Loss: 0.2775019963062211     Macro Averaged F1 Score: 0.39470795897322425
        max_depth=5:
    Hamming Loss: 0.22835465176091357    Macro Averaged F1 Score: 0.3702457382626899
#### ClassifierChains(under_sample = True)
        no max_depth:
    Hamming Loss: 0.39796560474606496    Macro Averaged F1 Score: 0.39514994507419504
        max_depth=5:
    Hamming Loss: 0.37754131006677216    Macro Averaged F1 Score: 0.39596259928393385
#### ClassifierChains(base_estimator="LogisticRegression")
    Hamming Loss: 0.2134025631102206     Macro Averaged F1 Score: 0.38485332225545
#### ClassifierChains(base_estimator="LogisticRegression",under_sample = True)
    Hamming Loss: 0.3062606857495579     Macro Averaged F1 Score: 0.43021522361423176