In [1]:
from skmultilearn.dataset import load_dataset
X_train, y_train, feature_names, label_names = load_dataset('emotions', 'train')
X_test, y_test, _, _ = load_dataset('emotions', 'test')

emotions:train - exists, not redownloading
emotions:test - exists, not redownloading


In [13]:
import copy
import numpy as np

from scipy.sparse import hstack, issparse, lil_matrix
from skmultilearn.problem_transform import BinaryRelevance
from imblearn.under_sampling import RandomUnderSampler

class BinaryRelevanceUnderSampling(BinaryRelevance):
    def __init__(self, classifier=None, require_dense=None, seed=1):
        super(BinaryRelevanceUnderSampling, self).__init__(
            classifier, require_dense)
        self.seed = seed

    def fit(self, X, y):
        """Fits classifier to training data
        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        Returns
        -------
        self
            fitted instance of self
        Notes
        -----
        .. note :: Input matrices are converted to sparse format internally if a numpy representation is passed
        """
        X = self._ensure_input_format(
            X, sparse_format='csr', enforce_sparse=True)
        y = self._ensure_output_format(
            y, sparse_format='csc', enforce_sparse=True)

        self.classifiers_ = []
        self._generate_partition(X, y)
        self._label_count = y.shape[1]

        rus = RandomUnderSampler(random_state=self.seed)

        for i in range(self.model_count_):
            print("Binary Relevance: Creating Model ", i)
            classifier = copy.deepcopy(self.classifier)
            y_subset = self._generate_data_subset(
                y, self.partition_[i], axis=1)
            y_to_resample = y_subset.toarray()
            flattenVer = y_to_resample.flatten()
            uniqueElementLength = len(np.unique(flattenVer))
            if uniqueElementLength > 1:
                X_resampled, y_resampled = rus.fit_resample(X,y_to_resample)
            else:
                X_resampled, y_resampled = X, y_to_resample
            if issparse(y_resampled) and y_resampled.ndim > 1 and y_resampled.shape[1] == 1:
                y_resampled = np.ravel(y_resampled.toarray())
            classifier.fit(self._ensure_input_format(
                X_resampled), self._ensure_output_format(y_resampled))
            self.classifiers_.append(classifier)
        return self


In [14]:
from skmultilearn.base import ProblemTransformationBase
# from BinaryRelevanceUnderSampling import BinaryRelevanceUnderSampling
from CocoaTripleClassTransformation import CocoaTripleClassTransformation
from imblearn.datasets import make_imbalance
from sklearn.metrics import f1_score
from xgboost import XGBClassifier
import numpy as np
import random
import copy

paramXGBoost = {
    "eta":0.2,
    "gamma":0,
    "min_child_weight": 1,
    "max_depth": 3,
    "colsample_bytree": 0.7
}

paramXGBoostMulticlass = {
    "eta":0.2,
    "gamma":0,
    "min_child_weight": 1,
    "max_depth": 3,
    "colsample_bytree": 0.7,
    "objective": "multi:softmax"
}


class CocoaXGBoostUndersampling(ProblemTransformationBase):
    def __init__(self, numMaxCouples = 10, underSamplingPercent = 1.0, seed = 1):
        super(CocoaXGBoostUndersampling, self).__init__(XGBClassifier(**paramXGBoost), None)
        self.multiclassClassifier = XGBClassifier(**paramXGBoostMulticlass)
        self.numMaxCouples = numMaxCouples
        self.underSamplingPercent = underSamplingPercent
        self.seed = seed
        self.numCouples = None

    def getNumMaxCouples(self):
        return self.numMaxCouples
    
    def getNumCouples(self):
        return self.numCouples
    
    def getUnderSamplingPercent(self):
        return self.underSamplingPercent

    def setUnderSamplingPercent(self, underSamplingPercent):
        self.underSamplingPercent = underSamplingPercent
    
    def getSeed(self):
        return self.seed

    def setSeed(self, seed):
        self.seed = seed

    def _generate_partition(self, X, y):
        """Partitions the label space into singletons
        Sets `self.partition_` (list of single item lists) and `self.model_count_` (equal to number of labels).
        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            not used, only for API compatibility
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `int`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        """
        self.partition_ = list(range(y.shape[1]))
        self.labelIndices = list(range(y.shape[1]))
        self.model_count_ = y.shape[1]
        self._label_count = y.shape[1]

    def fit(self, X, y):
        """Fits classifier to training data
        Parameters
        ----------
        X : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix, shape=(n_samples, n_features)
            input feature matrix
        y : `array_like`, :class:`numpy.matrix` or :mod:`scipy.sparse` matrix of `{0, 1}`, shape=(n_samples, n_labels)
            binary indicator matrix with label assignments
        Returns
        -------
        self
            fitted instance of self
        Notes
        -----
        .. note :: Input matrices are converted to sparse format internally if a numpy representation is passed
        """
        self._generate_partition(X, y)
        self.numCouples = min(self.getNumMaxCouples(), self._label_count-1)
        self.brus = BinaryRelevanceUnderSampling(self.classifier, seed=self.seed)
        self.trt = CocoaTripleClassTransformation(y)
        self.triLabelIndices = []
        self.triClassifiers = []
        for i in range(self._label_count):
            self.triLabelIndices.append([]) # Actually init indices
            self.triClassifiers.append([]) # Actually init classifier
            for j in range(self.numCouples):
                self.triClassifiers[i].append(copy.deepcopy(self.multiclassClassifier)) 
        self.thresholds = [-1]*self._label_count #Init threshold list
        
        self.brus.fit(X, y)
		
        labelIndexList = []
        for i in range(self._label_count):
            labelIndexList.append(self.labelIndices[i])

        rnd = random.Random(self.seed)
        for i in range(self._label_count):
            rnd.shuffle(labelIndexList)
            self.triLabelIndices[i] = self.selectedLabelIndices(labelIndexList, self.labelIndices[i])
            for j in range(self.numCouples):
                print("Coupling: ", i, " and ", j)
                yTriClassIns = self.trt.transformLabels(self.labelIndices[i], self.triLabelIndices[i][j])
                xUsTriClassIns, YUsTriClassIns = self.TrirandomUnderSampling(X, yTriClassIns)
                self.triClassifiers[i][j].fit(xUsTriClassIns, YUsTriClassIns)
       	self.calculateThresholds(X, y)

    def selectedLabelIndices(self, labelIndexList, currentLabelIndex):
        result = []
        i_list = 0
        i_array = 0
        while i_array<self.numCouples:
            l=labelIndexList[i_list]
            if l!=currentLabelIndex:
                result.append(l)
                i_array+=1
            i_list+=1
        return result

    def TrirandomUnderSampling(self, X, y): 
        """
        y : numpy array
        """
        result = []
        unique_elements, counts_elements = np.unique(y, return_counts=True)
#         dictCount = dict(zip(unique_elements, counts_elements))
        numClass = len(unique_elements)
        c = [0]*numClass
        nData = len(y)
        minVal = counts_elements.min()
        sample_strategy = dict()
        for i in range(numClass):
            if i in unique_elements:
                sample_strategy[i] = minVal
        Xres, yres = make_imbalance(X,y,sample_strategy)
        return Xres, yres

    def makePredictionforThreshold(self, xData):
        confidences = [0]*self._label_count
        X = np.asarray([xData])
        yPredProba = self.brus.predict_proba(X).toarray()[0]

        for i in range(self._label_count):
            confidences+=yPredProba[i]

        for j in range(self._label_count):
            for k in range(self.numCouples):
                d = self.triClassifiers[j][k].predict_proba(np.asarray([xData]))
                confidences[j] += d[0][2]
            confidences[j] /= (self.numCouples+1)
        return confidences

    def calculateThresholds(self, X, y):
        nData = y.shape[0]
        nLabel = y.shape[1]
        predictConfidences = []
        for i in range(nData):
            predictConfidences.append(self.makePredictionforThreshold(X[i]))

        for j in range(self._label_count):
            maxVal = -1000000000000.0
            trueLabels = [ data[j]==1 for data in y]
            
            d = 0.05

            while d<1:
                predictLabels = [predictConfidences[i][j]>=d for i in list(range(nData))]
                #Using Fmeasure
                print("Calculate Threshold Label ", j, "with d=",d)
                value = f1_score(trueLabels, predictLabels, average='macro')
                if value > maxVal:
                    maxVal = value
                    self.thresholds[j] = d
                d+=0.05 

    def predict(self, X):
        nData = len(X)
        result = []
        for i in range(nData):
            bipartition, confidences = self.makePredictionSingleData(X[i])
            result.append(bipartition)
        return np.asarray(result)

    def predict_proba(self, X):
        nData = len(X)
        result = []
        for i in range(nData):
            confidences = self.makePredictionforThreshold(X[i])
            result.append(confidences)
        return np.asarray(result) 
    
    def makePredictionSingleData(self, x1):
        confidences = self.makePredictionforThreshold(x1)
        bipartition = [0]*self._label_count
        for j in range(self._label_count):
            bipartition[j] = int(confidences[j] > self.thresholds[j])

        return bipartition, confidences


In [16]:
clf = CocoaXGBoostUndersampling()

In [20]:
clf.fit(X_train.toarray(), y_train.toarray())

Binary Relevance: Creating Model  0
Binary Relevance: Creating Model  1
Binary Relevance: Creating Model  2
Binary Relevance: Creating Model  3
Binary Relevance: Creating Model  4
Binary Relevance: Creating Model  5
Coupling:  0  and  0
Coupling:  0  and  1
Coupling:  0  and  2
Coupling:  0  and  3
Coupling:  0  and  4
Coupling:  1  and  0
Coupling:  1  and  1
Coupling:  1  and  2
Coupling:  1  and  3
Coupling:  1  and  4
Coupling:  2  and  0
Coupling:  2  and  1
Coupling:  2  and  2
Coupling:  2  and  3
Coupling:  2  and  4
Coupling:  3  and  0
Coupling:  3  and  1
Coupling:  3  and  2
Coupling:  3  and  3
Coupling:  3  and  4
Coupling:  4  and  0
Coupling:  4  and  1
Coupling:  4  and  2
Coupling:  4  and  3
Coupling:  4  and  4
Coupling:  5  and  0
Coupling:  5  and  1
Coupling:  5  and  2
Coupling:  5  and  3
Coupling:  5  and  4
Calculate Threshold Label  0 with d= 0.05
Calculate Threshold Label  0 with d= 0.1
Calculate Threshold Label  0 with d= 0.15000000000000002
Calculate Thre

In [None]:
ypred = clf.predict(X_test.toarray())

In [None]:
ypred

In [39]:
from sklearn.metrics import classification_report, hamming_loss, f1_score
hamloss = hamming_loss(y_test,ypred)
hamloss

NameError: name 'ypred' is not defined

In [45]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

# initialize Binary Relevance multi-label classifier
# with an SVM classifier
# SVM in scikit only supports the X matrix in sparse representation

classifier = BinaryRelevance(
    classifier = XGBClassifier(**paramXGBoost),
    require_dense = [False, True]
)

# train
classifier.fit(X_train, y_train)

# predict
predictions = classifier.predict(X_test)

In [46]:
hamloss = hamming_loss(y_test,predictions)
hamloss

0.20544554455445543

In [47]:
print(classification_report(y_test,predictions, digits=4))

              precision    recall  f1-score   support

           0     0.6600    0.6111    0.6346        54
           1     0.6000    0.3559    0.4468        59
           2     0.6827    0.7396    0.7100        96
           3     0.8364    0.7797    0.8070        59
           4     0.8478    0.5342    0.6555        73
           5     0.7500    0.5172    0.6122        58

   micro avg     0.7273    0.6015    0.6584       399
   macro avg     0.7295    0.5896    0.6444       399
weighted avg     0.7301    0.6015    0.6510       399
 samples avg     0.6568    0.5965    0.5931       399



In [19]:
print(classification_report(y_test,ypred, digits=4))

              precision    recall  f1-score   support

           0     0.6600    0.6111    0.6346        54
           1     0.4915    0.4915    0.4915        59
           2     0.6796    0.7292    0.7035        96
           3     0.7424    0.8305    0.7840        59
           4     0.8070    0.6301    0.7077        73
           5     0.7959    0.6724    0.7290        58

   micro avg     0.6927    0.6667    0.6794       399
   macro avg     0.6961    0.6608    0.6751       399
weighted avg     0.6986    0.6667    0.6792       399
 samples avg     0.6502    0.6617    0.6254       399



In [30]:
from sklearn.datasets import load_iris
data = load_iris()
X, y = data.data, data.target
X_res, y_res = make_imbalance(X, y,sampling_strategy={0: 10, 1: 20, 2: 30},random_state=42)

In [39]:
print(y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [46]:
from collections import Counter
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler # doctest: +NORMALIZE_WHITESPACE
X, y = make_classification(n_classes=2, class_sep=2,weights=[0.1, 0.9], n_informative=3, n_redundant=1, flip_y=0,n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
y2 = [[d] for d in y]
y2

[[0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [0],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0],
 [1],
 [1],
 [1],
 [1],
 [1],
 [1],
 [0]

In [52]:
rus = RandomUnderSampler(random_state=42)
X1, y1 = rus.fit_resample(X,y2)
np.unique(y1.flatten())

array([0, 1])

In [40]:
X1, y1 = rus.fit_resample(X,y)
y1

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1])

In [64]:
X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3,algorithm='brute')
neigh.fit(X, y)

KNeighborsClassifier(algorithm='brute', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=3, p=2,
                     weights='uniform')

In [69]:
list(neigh.kneighbors([[2]], return_distance=False)[0])

[2, 1, 3]

In [62]:
neigh.predict([[2]])

array([1])

In [181]:
# from imblearn.over_sampling.base import BaseOverSampler
import random
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
class InstanceType:
    SAFE =  0
    BORDERLINE = 1
    RARE = 2
    OUTLIER = 3
    MAJORITY = 4
    insTypeTheta = {
        0:0.5,
        1:0.75,
        2:1.0+1e-5,
        3:0.0-1e-5,
    }

class MLSOL():
    def __init__(self, numOfNeighbors = 5, ratio = 0.1, randomState = 1):
        self._weights = []
        self._C = [] # C[][] the C_ij for majority class is null
        self._insTypes = [] # insTypes[][]
        self._knnIndices = [] # type [][]
        self._minLabels = []
        self._labelIndices = []
        self._featureIndices = []
        self._sumW = None
        self._numOfNeighbors = numOfNeighbors
        self._percentageGeneratedInstance = ratio
        self._randomState = randomState

    def setRatio(self, p):
        self._percentageGeneratedInstance = p
    
    def getRatio(self):
        return self._percentageGeneratedInstance

    def setRandomState(self, randomState):
        self._randomState = randomState
    
    def getRandomState(self):
        return self._randomState

    def countC1C0(self, y, numLabels):
        c1 = [0]*numLabels
        c0 = [0]*numLabels

        for e in y:
            for j in range(numLabels):
                if e[j] == 0:
                    c0[j]+=1
                elif e[j] == 1:
                    c1[j]+=1

        return c0, c1

    def getMinLabels(self, y, numLabels):
        c0, c1 = self.countC1C0(y, numLabels)
        minLabels = []

        for i in range(numLabels):
            minLabels.append(1 if c1[i] > c0[i] else 0)
        return minLabels

    def fit_resample(self, X, y):
        rnd = random.Random(self._randomState)
        numLabels = y.shape[1]
        self._labelIndices = list(range(numLabels))
        self._featureIndices = list(range(X.shape[1]))
        nData = len(y)
        generatedNumberIns = int(nData * self._percentageGeneratedInstance)
        self.knnClassifier = KNeighborsClassifier(n_neighbors=self._numOfNeighbors)

		# weights=new double[oriNumIns];
        self._minLabels = self.getMinLabels(y, numLabels)
		
		

        self._calculate_weight(X, y)
        self._initilizeIns_types(X, y)
        xNew = X.copy()
        yNew = y.copy()
        xNewAdd = []
        yNewAdd = []
        for  i in range(generatedNumberIns):
            d = rnd.uniform(0, 1)
            centralIndex = -1
            s = 0
            for j in range(nData):
                s+= self._weights[j]
                if d<=s:
                    centralIndex = j
                    break
            referenceIndex = self._knnIndices[centralIndex][rnd.randint(0, self._numOfNeighbors - 1)]
            # Instance newData=generateSyntheticInstance(ins.get(centralIndex), ins.get(referenceIndex), centralIndex, referenceIndex, rnd);
            xNewAddTemp, yNewAddTemp = self._generate_synthetic_instance(X[centralIndex],y[centralIndex], X[referenceIndex], y[referenceIndex], centralIndex, referenceIndex, rnd)
            xNewAdd.append(xNewAddTemp)
            yNewAdd.append(yNewAddTemp)

        return np.concatenate((xNew,xNewAdd)), np.concatenate((yNew,yNewAdd)) # return new MultiLabelInstances(insNew, mlDataset.getLabelsMetaData()); X,y
    
    def _calculate_weight(self, X, y): 
        numInstances = len(y)
        numLabels = len(self._labelIndices)
        self._knnClassifer = KNeighborsClassifier(n_neighbors=self._numOfNeighbors).fit(X, y)
        self._knnIndices = [] # knnIndices=new int[nData][numOfNeighbors];
        self._C = [] # C=new Double[oriNumIns][numLabels];
        for i in range(numInstances):
            self._C.append([])
            xData = X[i]
            yData = y[i]
            result = self._knnClassifer.kneighbors([xData], return_distance=False)[0]
            for j in range(self._numOfNeighbors):
                self._knnIndices.append(result)

            for j in range(numLabels):
                numMaj = 0
                if yData[self._labelIndices[j]] == self._minLabels[j]:
                    for k in range(self._numOfNeighbors):
                        if yData[self._labelIndices[j]] != y[result[k]][self._labelIndices[j]]:
                            numMaj+=1
                    self._C[i].append(numMaj*1.0/self._numOfNeighbors)
                else:
                    self._C[i].append(None)


        # //Transform the C to scores
        scores = [ [0.0]*numLabels for e in range(numInstances)] # Double scores[][]=new Double[numIns][numLabels];

        for j in range(numLabels):
            sum=0.0
            c=0
            for i in range(numInstances):
                if self._C[i][j] != None and self._C[i][j] < 1 :
                    sum+=self._C[i][j]
                    c+=1
            if c!=0 and sum != 0.0:
                for i in range(numInstances):
                    if self._C[i][j] !=None and self._C[i][j] < 1:
                        scores[i][j] =  self._C[i][j]/sum
        
        self._sumW=0.0
        self._weights = []
        for i in range(numInstances):
            self._weights.append(0.0)
            for j in range(numLabels):
                if scores[i][j] != None:
                    self._weights[i] += scores[i][j]
            self._sumW+=self._weights[i]

    def _initilizeIns_types(self, X, y):
        numInstances = len(y)
        numLabels = len(self._labelIndices)
        self._insTypes = [] # new InstanceType[numIns][labelIndices.length];
        for i in range(numInstances):
            self._insTypes.append([])
            yData = y[i]
            for j in range(numLabels):
                if yData[self._labelIndices[j]] == self._minLabels[j]:
                    if self._C[i][j] < 0.3 :
                        self._insTypes[i].append(InstanceType.SAFE)
                    elif self._C[i][j] < 0.7:
                        self._insTypes[i].append(InstanceType.BORDERLINE)
                    elif self._C[i][j] < 1:
                        self._insTypes[i].append(InstanceType.RARE)
                    else:
                        self._insTypes[i].append(InstanceType.OUTLIER)
                else:
                    self._insTypes[i].append(InstanceType.MAJORITY)

		
		# //re-analyse the RARE type
        flag = True
        while flag:
            flag = False
            for i in range(numInstances):
                for j in range(numLabels):
                    if self._insTypes[i][j] == InstanceType.RARE:
                        for k in self._knnIndices[i]:
                            if self._insTypes[k][j]==InstanceType.SAFE or self._insTypes[i][j]==InstanceType.BORDERLINE:
                                self._insTypes[i][j]=InstanceType.BORDERLINE
                                flag = True
                                break

    def _generate_synthetic_instance(self,XCentralInstance, YCentralInstance, XReferenceInstance, YReferenceInstance, centralIndex, referenceIndex, rnd):
        numFeatures = len(self._featureIndices)
        numLabels = len(self._labelIndices)
        xNew = XCentralInstance.copy()
        yNew = YCentralInstance.copy()
        for i in range(numFeatures):
            xNew[i] += rnd.uniform(0, 1)*(XReferenceInstance[i]- XCentralInstance[i])
        d1 = np.linalg.norm(XCentralInstance - xNew)
        d2 = np.linalg.norm(XReferenceInstance - xNew)
        cd = 0.5 if d1 == 0 and d2 == 0  else (d1/(d1+d2))
        theta = 0.5

        for i in range(numLabels):
            j = self._labelIndices[i]
            if YCentralInstance[j] == YReferenceInstance[j]:
                yNew[j] = YCentralInstance[j]
            else:
                if self._insTypes[centralIndex][i] == InstanceType.MAJORITY:
                    temp = (XCentralInstance, YCentralInstance)
                    XCentralInstance, YCentralInstance =  (XReferenceInstance, YReferenceInstance)
                    XReferenceInstance, YReferenceInstance = temp
                    temp = centralIndex
                    centralIndex = referenceIndex
                    referenceIndex = temp
                    cd = 1.0 - cd
                theta = InstanceType.insTypeTheta[self._insTypes[centralIndex][i]]
                if cd <= theta:
                    yNew[j] = YCentralInstance[j]
                else:
                    yNew[j] = YReferenceInstance[j]
        return xNew, yNew

In [193]:
resampler = MLSOL(numOfNeighbors = 20, ratio = 0.3)

In [191]:
x5,y5 = resampler.fit_resample(X_train.toarray(), y_train.toarray())

In [192]:
print(x5.shape, y5.shape)

(508, 72) (508, 6)


In [170]:
y_train.shape

(391, 6)

In [204]:
print(y5)

[[0 1 1 0 0 0]
 [1 0 0 0 0 1]
 [0 1 0 0 0 1]
 ...
 [0 0 1 0 0 0]
 [0 0 1 0 0 0]
 [0 0 0 0 0 1]]


In [171]:
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.svm import SVC

# initialize Binary Relevance multi-label classifier
# with an SVM classifier
# SVM in scikit only supports the X matrix in sparse representation

classifier = BinaryRelevance(
    classifier = XGBClassifier(**paramXGBoost),
    require_dense = [False, True]
)

# train
classifier.fit(x5, y5)

# predict
predictions = classifier.predict(X_test)

In [172]:
from sklearn.metrics import classification_report, hamming_loss, f1_score
hamloss = hamming_loss(y_test,predictions)
hamloss

0.2079207920792079

In [160]:
print(classification_report(y_test,predictions, digits=4))

              precision    recall  f1-score   support

           0     0.6591    0.5370    0.5918        54
           1     0.6061    0.3390    0.4348        59
           2     0.6593    0.6250    0.6417        96
           3     0.7667    0.7797    0.7731        59
           4     0.8864    0.5342    0.6667        73
           5     0.7955    0.6034    0.6863        58

   micro avg     0.7247    0.5739    0.6406       399
   macro avg     0.7288    0.5697    0.6324       399
weighted avg     0.7286    0.5739    0.6348       399
 samples avg     0.6147    0.5668    0.5629       399



  _warn_prf(average, modifier, msg_start, len(result))


In [241]:
from sklearn.metrics import f1_score

import copy
import numpy as np

class EMLSOL:
    def __init__(self, baseMultiLabelLearner = None, mlSampling = MLSOL(), numModels = 5, samplingRatio = 0.3, randomState = 1):
        self.baseMultiLabelLearner = baseMultiLabelLearner
        self.mlSampling = mlSampling
        self.numModels = numModels
        self.samplingRatio = samplingRatio
        self.randomState = randomState
        self.thresholds = None

    def fit(self, X, y):
        self.mlls = []
        self.numLabels = y.shape[1]
        for i in range(self.numModels):
            print("Model-", i+1, "Sampling")
            mlSamplingCopy = copy.deepcopy(self.mlSampling)
            mlSamplingCopy.setRandomState(i+self.randomState)
            Xnew, ynew = mlSamplingCopy.fit_resample(X, y)
            model = copy.deepcopy(self.baseMultiLabelLearner)
            model.fit(Xnew, ynew)
            self.mlls.append(model)
        print("Calculating thresholds")
        self.calculateThresholds(X, y)

    def predict(self, X):
        nData = len(X)
        result = []
        for i in range(nData):
            bipartition, confidences = self.makePredictionSingleData(X[i])
            result.append(bipartition)
        return np.asarray(result)

    def predict_proba(self, X):
        nData = len(X)
        result = []
        for i in range(nData):
            bipartition, confidences = self.makePredictionSingleData(X[i])
            result.append(confidences)
        return np.asarray(result)

    def makePredictionSingleData(self, x1):
        conf = [0]*self.numLabels
        for i in range(self.numModels):
            a = self.mlls[i].predict_proba(np.asarray([x1]))
            confidences = a
            print(self.numLabels)
            print(confidences)
            for j in range(self.numLabels):
                conf[j]+=confidences[j]

        for j in range(self.numLabels):
            conf[j] /= self.numModels

        bipartition = []
        for j in range(self.numLabels):
            bipartition.append(conf[j] >= self.thresholds[j])

        return bipartition, conf

    def calculateThresholds(self, X, y):
        self.thresholds = []
        numInstances = y.shape[0]
        numLabels = y.shape[1]
    	# thresholdOptimizationMeasures m=measure;
    	# measure=thresholdOptimizationMeasures.None;
        
        predictConfidences = []#new double [trainingSet.getNumInstances()][trainingSet.getNumLabels()];
        for i in range(numInstances):
            Xdata, Ydata = X[i], y[i]
            bipartition, conf = self.makePredictionSingleData(Xdata)
            predictConfidences.append(conf)

        for j in range(numLabels):
            maxVal = -1000000000000.0
            trueLabels = [ data[j]==1 for data in y]
            
            d = 0.05
            while d<1:
                predictLabels = [predictConfidences[i][j]>=d for i in list(range(numInstances))]
                #Using Fmeasure
                value = f1_score(trueLabels, predictLabels, average='macro')
                if value > maxVal:
                    maxVal = value
                    self.thresholds[j] = d
                d+=0.05

    


In [242]:
resampler2 = MLSOL()
classifier2 = BinaryRelevance(
    classifier = XGBClassifier(**paramXGBoost),
    require_dense = [False, True]
)
emlsol = EMLSOL(mlSampling = resampler2, baseMultiLabelLearner = classifier2, numModels=3)

In [243]:
emlsol.fit(X_train.toarray(), y_train.toarray())

# predict
predictions = classifier.predict(X_test)

Model- 1 Sampling
Model- 2 Sampling
Model- 3 Sampling
Calculating thresholds
6
[[0.00392033 0.61693972 0.95425463 0.06847425 0.23165829 0.00269161]]


IndexError: index 1 is out of bounds for axis 0 with size 1