# Imbalance class

# Exercise 1

In [3]:
# Importing necessary libraries
import numpy as np
import pandas as pd


In [4]:
# https://www.kaggle.com/datasets/uciml/breast-cancer-wisconsin-data?resource=download

df = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data', header=None)

In [5]:
# selects all rows and columns from the third column (indicating the features) and converting to a numpy array.
# selects all rows of the second column (indicating the target variable) and converting to a numpy array

X = df.loc[:, 2:].values
y = df.loc[:, 1].values

# X matrix has dimensions (n_samples, n_features) and y is a 1-dimensional vector of length n_samples.

In [6]:
# 1) Data Loading and Initial Exploration
# Load the breast cancer dataset from the UCI repository and extract the features (X) and target variable (y). Select the appropriate columns from the dataset for features and target, and explain why those columns are selected.

# select the first five rows of the 'X' array
# pandas manages the data frame

X[:5]

array([[1.799e+01, 1.038e+01, 1.228e+02, 1.001e+03, 1.184e-01, 2.776e-01,
        3.001e-01, 1.471e-01, 2.419e-01, 7.871e-02, 1.095e+00, 9.053e-01,
        8.589e+00, 1.534e+02, 6.399e-03, 4.904e-02, 5.373e-02, 1.587e-02,
        3.003e-02, 6.193e-03, 2.538e+01, 1.733e+01, 1.846e+02, 2.019e+03,
        1.622e-01, 6.656e-01, 7.119e-01, 2.654e-01, 4.601e-01, 1.189e-01],
       [2.057e+01, 1.777e+01, 1.329e+02, 1.326e+03, 8.474e-02, 7.864e-02,
        8.690e-02, 7.017e-02, 1.812e-01, 5.667e-02, 5.435e-01, 7.339e-01,
        3.398e+00, 7.408e+01, 5.225e-03, 1.308e-02, 1.860e-02, 1.340e-02,
        1.389e-02, 3.532e-03, 2.499e+01, 2.341e+01, 1.588e+02, 1.956e+03,
        1.238e-01, 1.866e-01, 2.416e-01, 1.860e-01, 2.750e-01, 8.902e-02],
       [1.969e+01, 2.125e+01, 1.300e+02, 1.203e+03, 1.096e-01, 1.599e-01,
        1.974e-01, 1.279e-01, 2.069e-01, 5.999e-02, 7.456e-01, 7.869e-01,
        4.585e+00, 9.403e+01, 6.150e-03, 4.006e-02, 3.832e-02, 2.058e-02,
        2.250e-02, 4.571e-03, 2.357e

In [7]:
# slice the first five elements from the y array
# coloumn / target ==> malignant(M, canserous)/benign(B, non-cancerous)

y[:5]

array(['M', 'M', 'M', 'M', 'M'], dtype=object)

In [8]:
'''
Importing the LabelEncoder class from the sklearn.preprocessing module
Creating an instance of the LabelEncoder class and fitting it to the target variable y
Printing the unique classes in the target variable
'''
# 2) Label Encoding
# Encode the target variable y using LabelEncoder from sklearn.preprocessing. Print the unique classes after encoding and explain why label encoding is necessary for this dataset.
# do labeling ==> use LableEncoder
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()

# to fit our target, change to two classes benign(B) & malignant(M) ==> B = 0 ; M = 1
y = le.fit_transform(y)

le.classes_


array(['B', 'M'], dtype=object)

- Creating a new array X_imb by stacking vertically X elements where y equals 0 and X elements where y equals 1 up to the 40th element.
- This results in a new X_imb array that is imbalanced, with many more examples where y is 0 than where y is 1.
- Creating a new array y_imb by concatenating horizontally y elements where y equals 0 and y elements where y equals 1 up to the 40th element.
- This results in a new y_imb array that is imbalanced, with many more examples where y is 0 than where y is 1.

In [19]:
# 3) Class Imbalance
# Create an imbalanced dataset where there are more samples of class 0 (benign) than class 1 (malignant). Use the first 357 samples from class 0 and the first 40 samples from class 1 to create the new imbalanced dataset. Provide an explanation of how the class imbalance is introduced.

# imb ==> imbalance
# by stacking the elements with numpy ==> np.stack
# X elements where y equals 0 and X elements where y equals 1 up to the 40th element.
X_imb = np.vstack((X[y == 0], X[y == 1][:40]))

y_imb = np.hstack((y[y == 0], y[y == 1][:40]))

- originally consisted of 357 benign tumors (class 0) and 212 malignant tumors (class 1)
- took all 357 benign tumor samples and stacked them with the first 40 malignant samples to create a stark class imbalance
-  to compute the accuracy of a model that always predicts the majority class (benign, class 0), we would achieve a prediction accuracy of approximately 90 percent:

In [20]:
# 4) Accuracy of Majority Class Model
# Create a model that always predicts the majority class (class 0). Calculate and print the accuracy of this model on the imbalanced dataset you created in the previous step. Explain what the accuracy indicates about the model’s performance.

# creates a numpy array with zeros of shape y_imb.shape[0] and assigns it to y_pred.
# predict the class
y_pred = np.zeros(y_imb.shape[0])

# computes the mean accuracy of y_pred compared to y_imb. It multiplies the boolean array resulting from the comparison
# by 100 and returns the result as a percentage.

np.mean(y_pred == y_imb) * 100


np.float64(89.92443324937027)

In [21]:
# resampling data involves randomly drawing samples from a dataset with replacement to create a new dataset
# with a different distribution of values.

from sklearn.utils import resample



In [22]:
print('Number of class 1 samples before:', X_imb[y_imb == 1].shape[0])

Number of class 1 samples before: 40


- dealing with highly unbalanced datasets is called resampling
- consists of
    - removing samples from the majority class (under-sampling)
        - randomly removing observations from the majority class to prevent its signal from dominating the learning algorithm. The most common heuristic for doing so is resampling without replacement
        - First, separate observations from each class into different datasets.
        - Next, resample the majority class without replacement, setting the number of samples to match that of the minority class.
        - Finally, combine the down-sampled majority class dataset with the original minority class dataset.
    - adding more examples from the minority class (over-sampling or up-sampling) - can cause overfitting
        - First separate observations from each class into different datasets.
        - Next, resample the minority class with replacement, setting the number of samples to match that of the majority class.
        - Finally, combine the up-sampled minority class dataset with the original majority class dataset.

![image.png](attachment:image.png)

## Oversampling

In [24]:
# 5) Oversampling
# Perform oversampling on the minority class (class 1) so that the number of samples for both classes becomes equal. Use resample from sklearn.utils to perform this operation. Print the number of samples of class 1 before and after oversampling, and explain the effect of oversampling on the dataset.

# resamples the minority class (y_imb == 1) with replacement to have the same number
# of samples as the majority class (y_imb == 0).
# random_state is a parameter (resample parameter)

X_upsampled, y_upsampled = resample(X_imb[y_imb == 1], y_imb[y_imb == 1],

                                    replace=True, #sample with replacement, by default it is FALSE

                                    n_samples=X_imb[y_imb == 0].shape[0], # to match majority class

                                    random_state=123)

In [25]:
# 6) Model Accuracy After Upsampling
# After performing oversampling, create a model that always predicts the majority class (class 0). Calculate and print the accuracy of this model on the upsampled dataset. Explain what you observe about the accuracy after balancing the dataset with oversampling.

# prints the number of samples after upsampling the minority class.

print('Number of class 1 samples after:', X_upsampled.shape[0])

Number of class 1 samples after: 357


In [26]:
# concatenate(combine) the upsampled minority class with the original majority class.
# to observe the unbalance class

x_bal = np.vstack((X[y == 0], X_upsampled))

y_bal = np.hstack((y[y == 0], y_upsampled))

In [27]:
'''
initializes y_pred to an array of zeros with the length equal to the number of samples
in the concatenated dataset (y_bal). Then it calculates the percentage of correct predictions
by comparing y_pred and y_bal.
'''
y_pred = np.zeros(y_bal.shape[0])

np.mean(y_pred == y_bal) * 100


np.float64(50.0)

## Undersampling

In [28]:
'''
This code initializes an array of zeros named 'y_pred' with the same shape as the input array 'y_imb'.
The mean of the comparison between 'y_pred' and 'y_imb' is calculated and multiplied by 100.
This gives the percentage of times 'y_pred' is equal to 'y_imb'.
'''
# 7) Undersampling
# Perform undersampling of the majority class (class 0) so that the number of samples in both classes is equal. Use resample to downsample class 0 and combine it with class 1 to create the balanced dataset. Print the number of class 0 samples before and after undersampling and explain the process.


y_pred = np.zeros(y_imb.shape[0])

np.mean(y_pred == y_imb) * 100


np.float64(89.92443324937027)

In [29]:
X_downsampled, y_downsampled = resample(X_imb[y_imb == 0], y_imb[y_imb == 0],

                                    replace=False, #sample without replacement

                                    n_samples=X_imb[y_imb == 1].shape[0], # to match minority class

                                    random_state=123)

In [30]:
print('Number of class 0 sample after:', X_downsampled.shape)

Number of class 0 sample after: (40, 30)


In [31]:
X_bal2 = np.vstack((X[y == 1][:40], X_downsampled))

y_bal2 = np.hstack((y[y == 1][:40], y_downsampled))

In [32]:
y_bal2

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [33]:
# 8) Model Accuracy After Downsampling
# After performing undersampling, create a model that always predicts the majority class (class 0). Calculate and print the accuracy of this model on the downsampled dataset. Explain what you observe about the accuracy after balancing the dataset with undersampling.

y_pred = np.zeros(y_bal2.shape[0])

np.mean(y_pred == y_bal2) * 100

np.float64(50.0)

# Exercise 2

In [35]:
import numpy as np

from sklearn.datasets import load_breast_cancer
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import StandardScaler

In [37]:
# Load the breast cancer Wisconsin dataset
data = load_breast_cancer()

In [38]:
# Split the data into features and labels

X, y = data.data, data.target

In [39]:
'''
Create an imbalanced dataset by randomly sampling 100 instances from the majority class (label 0) and
keeping all instances from the minority class (label 1)
'''
# Create an imbalanced dataset by randomly sampling 100 instances from the majority class (label 0) and
# keeping all instances from the minority class (label 1)

X_imb = np.vstack((X[y == 0][:100], X[y == 1]))

y_imb = np.hstack((y[y == 0][:100], y[y == 1]))


In [41]:
# Now, let's ensure that we create a balanced dataset using resampling

# Upsample the minority(less) class (label 1) to match the majority(more) class (label 0)

X_0 = X_imb[y_imb == 0]

y_0 = y_imb[y_imb == 0]

X_1 = X_imb[y_imb == 1]

y_1 = y_imb[y_imb == 1]


In [43]:
# Upsample the minority class (class 1) to have the same number of samples as the majority class (class 0)
X_1_upsampled, y_1_upsampled = resample(X_1, y_1,
                                        replace=True,
                                        n_samples=X_0.shape[0],
                                        random_state=123)

In [44]:
# Combine the majority and upsampled minority class

X_bal = np.vstack((X_0, X_1_upsampled))

y_bal = np.hstack((y_0, y_1_upsampled))

In [46]:
# Split the balanced data into training and testing sets using a 70/30 split

X_train, X_test, y_train, y_test = train_test_split(X_bal,
                                                    y_bal,
                                                    test_size=0.3,
                                                    random_state=123)


In [48]:
# Scale the features (standardize them to have mean 0 and variance 1)

scaler = StandardScaler()

X_train_scaled = scaler.fit_transform(X_train)

X_test_scaled = scaler.transform(X_test)


In [50]:
# Train a logistic regression model on the balanced data and test it on the testing set

clf_bal = LogisticRegression(random_state=123, max_iter=10000)  # Increase max_iter if needed, max_iter == maximum iteration also a parameter

clf_bal.fit(X_train_scaled, y_train)

y_pred_bal = clf_bal.predict(X_test_scaled)


In [51]:
# Compare with the imbalanced data model, also scaling the data

X_imb_scaled = scaler.fit_transform(X_imb)

X_test_scaled = scaler.transform(X_test)


In [52]:
clf_imb = LogisticRegression(random_state=123, max_iter=10000)  # Increase max_iter if needed

clf_imb.fit(X_imb_scaled, y_imb)

y_pred_imb = clf_imb.predict(X_test_scaled)


In [53]:
# Print the accuracy scores

print("Accuracy score (balanced data):", accuracy_score(y_test, y_pred_bal))

print("Accuracy score (imbalanced data):", accuracy_score(y_test, y_pred_imb))


Accuracy score (balanced data): 0.95
Accuracy score (imbalanced data): 0.9666666666666667
