
# Multiclass classification with under-sampling

Some balancing methods can rebalance datasets that contain more than two classes.


In [1]:
from collections import Counter

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from imblearn.datasets import make_imbalance
from imblearn.under_sampling import NearMiss
from imblearn.pipeline import make_pipeline
from imblearn.metrics import classification_report_imbalanced

# Single seed reused for the imbalance sampling and the train/test split
# (the classifiers fitted further down reuse it as well).
RANDOM_STATE = 30

# Derive an imbalanced three-class problem from iris by requesting
# 25 samples of class 0 and 50 samples each of classes 1 and 2.
iris = load_iris()
X, y = make_imbalance(
    X=iris.data,
    y=iris.target,
    sampling_strategy={0: 25, 1: 50, 2: 50},
    random_state=RANDOM_STATE,
)

# Hold out a test split. No `stratify` argument is passed, so the class
# proportions of the two splits are not forced to match.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=RANDOM_STATE,
)

# Show the per-class sample counts of each split.
print(f"Training target statistics: {Counter(y_train)}")
print(f"Testing target statistics: {Counter(y_test)}")

# Baseline without any resampling step: standardize the features, then fit
# a logistic regression on the training split.
pipeline = make_pipeline(
    StandardScaler(),
    LogisticRegression(random_state=RANDOM_STATE),
).fit(X_train, y_train)

# Imbalance-aware report on the held-out split: precision, recall,
# specificity, F1, geometric mean and index balanced accuracy of the
# geometric mean, per class and averaged.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pipeline.predict(X_test),
    )
)

# Same model as the baseline, with a NearMiss (version 1) under-sampler
# placed first in the pipeline, ahead of scaling.
pipeline = make_pipeline(
    NearMiss(version=1),
    StandardScaler(),
    LogisticRegression(random_state=RANDOM_STATE),
).fit(X_train, y_train)

# Imbalance-aware report on the held-out split: precision, recall,
# specificity, F1, geometric mean and index balanced accuracy of the
# geometric mean, per class and averaged.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pipeline.predict(X_test),
    )
)

Training target statistics: Counter({1: 42, 2: 34, 0: 17})
Testing target statistics: Counter({2: 16, 0: 8, 1: 8})
                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00         8
          1       0.78      0.88      0.92      0.82      0.90      0.80         8
          2       0.93      0.88      0.94      0.90      0.91      0.82        16

avg / total       0.91      0.91      0.95      0.91      0.93      0.86        32

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.88      1.00      0.93      0.94      0.86         8
          1       0.75      0.75      0.92      0.75      0.83      0.68         8
          2       0.88      0.94      0.88      0.91      0.91      0.83        16

avg / total       0.88      0.88      0.92      0.88      0.89      0.80        32



In [3]:
# NearMiss (version 2) under-sampler placed first in the pipeline, followed
# by standardization and a logistic regression.
pipeline = make_pipeline(
    NearMiss(version=2),
    StandardScaler(),
    LogisticRegression(random_state=RANDOM_STATE),
).fit(X_train, y_train)

# Imbalance-aware report on the held-out split: precision, recall,
# specificity, F1, geometric mean and index balanced accuracy of the
# geometric mean, per class and averaged.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pipeline.predict(X_test),
    )
)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      0.88      1.00      0.93      0.94      0.86         8
          1       0.75      0.75      0.92      0.75      0.83      0.68         8
          2       0.88      0.94      0.88      0.91      0.91      0.83        16

avg / total       0.88      0.88      0.92      0.88      0.89      0.80        32



In [4]:
# NearMiss (version 3) under-sampler placed first in the pipeline, followed
# by standardization and a logistic regression.
pipeline = make_pipeline(
    NearMiss(version=3),
    StandardScaler(),
    LogisticRegression(random_state=RANDOM_STATE),
).fit(X_train, y_train)

# Imbalance-aware report on the held-out split: precision, recall,
# specificity, F1, geometric mean and index balanced accuracy of the
# geometric mean, per class and averaged.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pipeline.predict(X_test),
    )
)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00         8
          1       0.75      0.38      0.96      0.50      0.60      0.34         8
          2       0.75      0.94      0.69      0.83      0.80      0.66        16

avg / total       0.81      0.81      0.83      0.79      0.80      0.66        32





In [5]:
from imblearn.under_sampling import CondensedNearestNeighbour

# CondensedNearestNeighbour under-sampler (configured with n_neighbors=1)
# placed first in the pipeline, followed by standardization and a logistic
# regression. No random_state is passed to the sampler itself.
pipeline = make_pipeline(
    CondensedNearestNeighbour(n_neighbors=1),
    StandardScaler(),
    LogisticRegression(random_state=RANDOM_STATE),
).fit(X_train, y_train)

# Imbalance-aware report on the held-out split: precision, recall,
# specificity, F1, geometric mean and index balanced accuracy of the
# geometric mean, per class and averaged.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pipeline.predict(X_test),
    )
)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00         8
          1       0.67      0.25      0.96      0.36      0.49      0.22         8
          2       0.71      0.94      0.62      0.81      0.77      0.60        16

avg / total       0.77      0.78      0.80      0.75      0.76      0.61        32



In [6]:
from imblearn.under_sampling import TomekLinks

# TomekLinks under-sampler (default settings) placed first in the pipeline,
# followed by standardization and a logistic regression.
pipeline = make_pipeline(
    TomekLinks(),
    StandardScaler(),
    LogisticRegression(random_state=RANDOM_STATE),
).fit(X_train, y_train)

# Imbalance-aware report on the held-out split: precision, recall,
# specificity, F1, geometric mean and index balanced accuracy of the
# geometric mean, per class and averaged.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pipeline.predict(X_test),
    )
)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00         8
          1       0.78      0.88      0.92      0.82      0.90      0.80         8
          2       0.93      0.88      0.94      0.90      0.91      0.82        16

avg / total       0.91      0.91      0.95      0.91      0.93      0.86        32



In [12]:
from imblearn.under_sampling import OneSidedSelection

# OneSidedSelection under-sampler (configured with n_neighbors=2) placed
# first in the pipeline, followed by standardization and a logistic
# regression. No random_state is passed to the sampler itself.
pipeline = make_pipeline(
    OneSidedSelection(n_neighbors=2),
    StandardScaler(),
    LogisticRegression(random_state=RANDOM_STATE),
).fit(X_train, y_train)

# Imbalance-aware report on the held-out split: precision, recall,
# specificity, F1, geometric mean and index balanced accuracy of the
# geometric mean, per class and averaged.
print(
    classification_report_imbalanced(
        y_true=y_test,
        y_pred=pipeline.predict(X_test),
    )
)

                   pre       rec       spe        f1       geo       iba       sup

          0       1.00      1.00      1.00      1.00      1.00      1.00         8
          1       0.78      0.88      0.92      0.82      0.90      0.80         8
          2       0.93      0.88      0.94      0.90      0.91      0.82        16

avg / total       0.91      0.91      0.95      0.91      0.93      0.86        32

