In [16]:
from skmultilearn.dataset import load_dataset
X_train, y_train, feature_names, label_names = load_dataset('emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')

emotions:train - exists, not redownloading
emotions:test - exists, not redownloading


In [17]:
import copy
import numpy as np

from scipy.sparse import hstack, issparse, lil_matrix
from skmultilearn.problem_transform import BinaryRelevance
from imblearn.under_sampling import RandomUnderSampler

class BinaryRelevanceUnderSampling(BinaryRelevance):
    def __init__(self, classifier=None, require_dense=None, seed=1):
        super(BinaryRelevanceUnderSampling, self).__init__(
            classifier, require_dense)
        self.seed = seed

    def fit(self, X, y):
        """Fits classifier to training data
        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        Returns
        -------
        self
            fitted instance of self
        Notes
        -----
        .. note :: Input matrices are converted to sparse format internally if a numpy representation is passed
        """
        X = self._ensure_input_format(
            X, sparse_format='csr', enforce_sparse=True)
        y = self._ensure_output_format(
            y, sparse_format='csc', enforce_sparse=True)

        self.classifiers_ = []
        self._generate_partition(X, y)
        self._label_count = y.shape[1]

        rus = RandomUnderSampler(random_state=self.seed)

        for i in range(self.model_count_):
            print("Binary Relevance: Creating Model ", i)
            classifier = copy.deepcopy(self.classifier)
            y_subset = self._generate_data_subset(
                y, self.partition_[i], axis=1)
            y_to_resample = y_subset.toarray()
            flattenVer = y_to_resample.flatten()
            uniqueElementLength = len(np.unique(flattenVer))
            if uniqueElementLength > 1:
                X_resampled, y_resampled = rus.fit_resample(X,y_to_resample)
            else:
                X_resampled, y_resampled = X, y_to_resample
            if issparse(y_resampled) and y_resampled.ndim > 1 and y_resampled.shape[1] == 1:
                y_resampled = np.ravel(y_resampled.toarray())
            classifier.fit(self._ensure_input_format(
                X_resampled), self._ensure_output_format(y_resampled))
            self.classifiers_.append(classifier)
        return self


In [18]:
from skmultilearn.base import ProblemTransformationBase
# from BinaryRelevanceUnderSampling import BinaryRelevanceUnderSampling
from CocoaTripleClassTransformation import CocoaTripleClassTransformation
from imblearn.datasets import make_imbalance
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
import numpy as np
import random
import copy

paramXGBoost = {
    "eta":0.2,
    "gamma":0,
    "min_child_weight": 1,
    "max_depth": 3,
    "colsample_bytree": 0.7
}

paramXGBoostMulticlass = {
    "eta":0.2,
    "gamma":0,
    "min_child_weight": 1,
    "max_depth": 3,
    "colsample_bytree": 0.7,
    "objective": "multi:softmax"
}


class CocoaXGBoostUndersampling(ProblemTransformationBase):
    def __init__(self, numMaxCouples = 10, underSamplingPercent = 1.0, seed = 1):
        super(CocoaXGBoostUndersampling, self).__init__(XGBClassifier(**paramXGBoost), None)
        self.multiclassClassifier = XGBClassifier(**paramXGBoostMulticlass)
        self.numMaxCouples = numMaxCouples
        self.underSamplingPercent = underSamplingPercent
        self.seed = seed
        self.numCouples = None

    def getNumMaxCouples(self):
        return self.numMaxCouples
    
    def getNumCouples(self):
        return self.numCouples
    
    def getUnderSamplingPercent(self):
        return self.underSamplingPercent

    def setUnderSamplingPercent(self, underSamplingPercent):
        self.underSamplingPercent = underSamplingPercent
    
    def getSeed(self):
        return self.seed

    def setSeed(self, seed):
        self.seed = seed

    def _generate_partition(self, X, y):
        """Partitions the label space into singletons
        Sets `self.partition_` (list of single item lists) and `self.model_count_` (equal to number of labels).
        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            not used, only for API compatibility
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `int`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        """
        self.partition_ = list(range(y.shape[1]))
        self.labelIndices = list(range(y.shape[1]))
        self.model_count_ = y.shape[1]
        self._label_count = y.shape[1]

    def fit(self, X, y):
        """Fits classifier to training data
        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        Returns
        -------
        self
            fitted instance of self
        Notes
        -----
        .. note :: Input matrices are converted to sparse format internally if a numpy representation is passed
        """
        self._generate_partition(X, y)
        self.numCouples = min(self.getNumMaxCouples(), self._label_count-1)
        self.brus = BinaryRelevanceUnderSampling(self.classifier, seed=self.seed)
        self.trt = CocoaTripleClassTransformation(y)
        self.triLabelIndices = []
        self.triClassifiers = []
        for i in range(self._label_count):
            self.triLabelIndices.append([]) # Actually init indices
            self.triClassifiers.append([]) # Actually init classifier
            for j in range(self.numCouples):
                self.triClassifiers[i].append(copy.deepcopy(self.multiclassClassifier)) 
        self.thresholds = [-1]*self._label_count #Init threshold list
        
        self.brus.fit(X, y)
		
        labelIndexList = []
        for i in range(self._label_count):
            labelIndexList.append(self.labelIndices[i])

        rnd = random.Random(self.seed)
        for i in range(self._label_count):
            rnd.shuffle(labelIndexList)
            self.triLabelIndices[i] = self.selectedLabelIndices(labelIndexList, self.labelIndices[i])
            for j in range(self.numCouples):
                print("Coupling: ", i, " and ", j)
                yTriClassIns = self.trt.transformLabels(self.labelIndices[i], self.triLabelIndices[i][j])
                xUsTriClassIns, YUsTriClassIns = self.TrirandomUnderSampling(X, yTriClassIns)
                self.triClassifiers[i][j].fit(xUsTriClassIns, YUsTriClassIns)
       	self.calculateThresholds(X, y)

    def selectedLabelIndices(self, labelIndexList, currentLabelIndex):
        result = []
        i_list = 0
        i_array = 0
        while i_array<self.numCouples:
            l=labelIndexList[i_list]
            if l!=currentLabelIndex:
                result.append(l)
                i_array+=1
            i_list+=1
        return result

    def TrirandomUnderSampling(self, X, y): 
        """
        y : numpy array
        """
        result = []
        unique_elements, counts_elements = np.unique(y, return_counts=True)
#         dictCount = dict(zip(unique_elements, counts_elements))
        numClass = len(unique_elements)
        c = [0]*numClass
        nData = len(y)
        minVal = counts_elements.min()
        sample_strategy = dict()
        for i in range(numClass):
            if i in unique_elements:
                sample_strategy[i] = minVal
        Xres, yres = make_imbalance(X,y,sample_strategy)
        return Xres, yres

    def makePredictionforThreshold(self, xData):
        confidences = [0]*self._label_count
        X = np.asarray([xData])
        yPredProba = self.brus.predict_proba(X).toarray()[0]

        for i in range(self._label_count):
            confidences+=yPredProba[i]

        for j in range(self._label_count):
            for k in range(self.numCouples):
                d = self.triClassifiers[j][k].predict_proba(np.asarray([xData]))
                confidences[j] += d[0][2]
            confidences[j] /= (self.numCouples+1)
        return confidences

    def calculateThresholds(self, X, y):
        nData = y.shape[0]
        nLabel = y.shape[1]
        predictConfidences = []
        for i in range(nData):
            predictConfidences.append(self.makePredictionforThreshold(X[i]))

        for j in range(self._label_count):
            maxVal = -1000000000000.0
            trueLabels = [ data[j]==1 for data in y]
            
            d = 0.05

            while d<1:
                predictLabels = [predictConfidences[i][j]>=d for i in list(range(nData))]
                #Using Fmeasure
                print("Calculate Threshold Label ", j, "with d=",d)
                value = f1_score(trueLabels, predictLabels, average='macro')
                if value > maxVal:
                    maxVal = value
                    self.thresholds[j] = d
                d+=0.05 

    def predict(self, X):
        nData = len(X)
        result = []
        for i in range(nData):
            bipartition, confidences = self.makePredictionSingleData(X[i])
            result.append(bipartition)
        return np.asarray(result)

    def makePredictionSingleData(self, x1):
        confidences = self.makePredictionforThreshold(x1)
        bipartition = [0]*self._label_count
        for j in range(self._label_count):
            bipartition[j] = int(confidences[j] > self.thresholds[j])

        return bipartition, confidences


In [19]:
clf = CocoaXGBoostUndersampling()

In [20]:
clf.fit(X_train.toarray(), y_train.toarray())

Binary Relevance: Creating Model  0
Binary Relevance: Creating Model  1
Binary Relevance: Creating Model  2
Binary Relevance: Creating Model  3
Binary Relevance: Creating Model  4
Binary Relevance: Creating Model  5
Coupling:  0  and  0
Coupling:  0  and  1
Coupling:  0  and  2
Coupling:  0  and  3
Coupling:  0  and  4
Coupling:  1  and  0
Coupling:  1  and  1
Coupling:  1  and  2
Coupling:  1  and  3
Coupling:  1  and  4
Coupling:  2  and  0
Coupling:  2  and  1
Coupling:  2  and  2
Coupling:  2  and  3
Coupling:  2  and  4
Coupling:  3  and  0
Coupling:  3  and  1
Coupling:  3  and  2
Coupling:  3  and  3
Coupling:  3  and  4
Coupling:  4  and  0
Coupling:  4  and  1
Coupling:  4  and  2
Coupling:  4  and  3
Coupling:  4  and  4
Coupling:  5  and  0
Coupling:  5  and  1
Coupling:  5  and  2
Coupling:  5  and  3
Coupling:  5  and  4
Calculate Threshold Label  0 with d= 0.05
Calculate Threshold Label  0 with d= 0.1
Calculate Threshold Label  0 with d= 0.15000000000000002
Calculate Thre

In [None]:
ypred = clf.predict(X_test.toarray())

In [None]:
ypred

In [11]:
from sklearn.metrics import classification_report, hamming_loss, f1_score
hamloss = hamming_loss(y_test,ypred)
hamloss

0.2070957095709571

In [15]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

# initialize Binary Relevance multi-label classifier
# with an SVM classifier
# SVM in scikit only supports the X matrix in sparse representation

classifier = BinaryRelevance(
    classifier = XGBClassifier(**paramXGBoost),
    require_dense = [False, True]
)

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

In [16]:
hamloss = hamming_loss(y_test,predictions)
hamloss

0.20544554455445543

In [18]:
print(classification_report(y_test,predictions, digits=4))

              precision    recall  f1-score   support

           0     0.6600    0.6111    0.6346        54
           1     0.6000    0.3559    0.4468        59
           2     0.6827    0.7396    0.7100        96
           3     0.8364    0.7797    0.8070        59
           4     0.8478    0.5342    0.6555        73
           5     0.7500    0.5172    0.6122        58

   micro avg     0.7273    0.6015    0.6584       399
   macro avg     0.7295    0.5896    0.6444       399
weighted avg     0.7301    0.6015    0.6510       399
 samples avg     0.6568    0.5965    0.5931       399



In [19]:
print(classification_report(y_test,ypred, digits=4))

              precision    recall  f1-score   support

           0     0.6600    0.6111    0.6346        54
           1     0.4915    0.4915    0.4915        59
           2     0.6796    0.7292    0.7035        96
           3     0.7424    0.8305    0.7840        59
           4     0.8070    0.6301    0.7077        73
           5     0.7959    0.6724    0.7290        58

   micro avg     0.6927    0.6667    0.6794       399
   macro avg     0.6961    0.6608    0.6751       399
weighted avg     0.6986    0.6667    0.6792       399
 samples avg     0.6502    0.6617    0.6254       399



In [30]:
from sklearn.datasets import load_iris
data = load_iris()
X, y = data.data, data.target
X_res, y_res = make_imbalance(X, y,sampling_strategy={0: 10, 1: 20, 2: 30},random_state=42)

In [39]:
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [46]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
X, y = make_classification(n_classes=2, class_sep=2,weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
y2 = [[d] for d in y]
y2

[[0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [0],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0]

In [52]:
rus = RandomUnderSampler(random_state=42)
X1, y1 = rus.fit_resample(X,y2)
np.unique(y1.flatten())

array([0, 1])

In [40]:
X1, y1 = rus.fit_resample(X,y)
y1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [22]:
X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [28]:
neigh.kneighbors([[1]])

(array([[0., 1., 1.]]), array([[1, 0, 2]]))

In [29]:
neigh.predict([[1]])

array([0])