<a href="https://colab.research.google.com/github/vineetjoshi253/Using-Deep-Reinforcement-Learning-For-Imbalanced-Data-Classification/blob/main/Generating_Undersampled_Data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [47]:
import random
import numpy as np
from keras.datasets import mnist,fashion_mnist,cifar10
from sklearn.model_selection import train_test_split

### MNIST Undersampling

In [13]:
#Load the standard MNIST training split; load_data() returns (train, test) and the test split is not used here.
#Index the result instead of unpacking into `_`, because a user-assigned `_` stops IPython from tracking the last output.
trainX, trainy = mnist.load_data()[0]

In [14]:
#Show every class label together with its number of training samples.
class_distribution = np.unique(trainy, return_counts=True)
print(class_distribution)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8), array([5923, 6742, 5958, 6131, 5842, 5421, 5918, 6265, 5851, 5949]))


In [15]:
#The two digit classes used for binary classification:
#class1 becomes the positive label, class2 the negative label.
class1 = 7
class2 = 9

In [16]:
#Row positions of the two selected classes in the full label array.
Index_0 = np.where(trainy == class1)[0]
Index_2 = np.where(trainy == class2)[0]

#union1d returns the sorted, de-duplicated union, so the sample order is deterministic;
#iterating a Python set gives an implementation-defined order instead.
Index_to_keep = np.union1d(Index_0, Index_2).tolist()
print(len(Index_to_keep))

12214


In [17]:
#Keep only the samples of the two selected classes (images and labels stay aligned).
#Index_to_keep refers to positions in the freshly loaded arrays, so reload the data before re-running this cell.
trainX, trainy = trainX[Index_to_keep], trainy[Index_to_keep]

In [18]:
#Rename the labels: class1 is the positive class (1), everything else the negative class (0).
#Vectorized comparison instead of a Python loop; ravel() keeps the result 1-D even if the labels arrive as a column.
Y_train = (np.ravel(trainy) == class1).astype(int)

In [20]:
#Class distribution of the binary labels, computed once and reused for printing and counting.
#The result is not unpacked into `_`, which IPython reserves for the last cell output.
Class_Distribution = np.unique(Y_train,return_counts = True)
print('Training Data: ',Class_Distribution)

#np.unique sorts the labels, so position 0 is label 0 (negative) and position 1 is label 1 (positive).
Count = Class_Distribution[1]
Negative_Count = Count[0]
Positive_Count = Count[1]

Training Data:  (array([0, 1]), array([5949, 6265]))


In [24]:
#Number of positive (class 1) samples to drop so that the remaining positives
#amount to imbalance_ratio times the number of negatives.
imbalance_ratio = 0.4
positives_to_keep = imbalance_ratio * Negative_Count
undersample = int(Positive_Count - positives_to_keep)
print(undersample)

3885


In [27]:
#Undersampling positive class.

#Get index of positive class
Pos_ind = list(np.where(Y_train==1)[0])

#Get random indices to remove.
#A dedicated seeded generator drops the same rows on every run, in line with the fixed random_state of the later split.
rem_ind = random.Random(42).sample(Pos_ind,undersample)

#Get indices to keep
#Membership is tested against a set (O(1) per lookup); scanning the list for every row is quadratic.
removed_set = set(rem_ind)
keep_ind = [i for i in range(len(Y_train)) if i not in removed_set]

In [28]:
#Build the undersampled data set: pixel values are divided by 255.0, labels are filtered with the same indices.
#Y_train is overwritten here, so keep_ind is only valid for the labels as they were before this cell ran.
X_train, Y_train = trainX[keep_ind] / 255.0, Y_train[keep_ind]

In [29]:
#Class counts after undersampling.
undersampled_distribution = np.unique(Y_train, return_counts=True)
print('Training Data: ', undersampled_distribution)

Training Data:  (array([0, 1]), array([5949, 2380]))


In [32]:
#Hold out 20% of the undersampled data for validation; the split is stratified on the labels and uses a fixed seed.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.20,
    random_state=42,
    stratify=Y_train,
)

### Fashion-MNIST Undersampling

In [34]:
#Load the standard Fashion-MNIST training split; load_data() returns (train, test) and the test split is not used here.
#Index the result instead of unpacking into `_`, because a user-assigned `_` stops IPython from tracking the last output.
trainX, trainy = fashion_mnist.load_data()[0]

Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/train-images-idx3-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-labels-idx1-ubyte.gz
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/t10k-images-idx3-ubyte.gz


In [35]:
#Show every class label together with its number of training samples.
class_distribution = np.unique(trainy, return_counts=True)
print(class_distribution)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8), array([6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000, 6000]))


In [36]:
#The two classes used for binary classification:
#class1 becomes the positive label, class2 the negative label.
class1 = 1
class2 = 6

In [37]:
#Row positions of the two selected classes in the full label array.
Index_0 = np.where(trainy == class1)[0]
Index_2 = np.where(trainy == class2)[0]

#union1d returns the sorted, de-duplicated union, so the sample order is deterministic;
#iterating a Python set gives an implementation-defined order instead.
Index_to_keep = np.union1d(Index_0, Index_2).tolist()
print(len(Index_to_keep))

12000


In [38]:
#Keep only the samples of the two selected classes (images and labels stay aligned).
#Index_to_keep refers to positions in the freshly loaded arrays, so reload the data before re-running this cell.
trainX, trainy = trainX[Index_to_keep], trainy[Index_to_keep]

In [39]:
#Rename the labels: class1 is the positive class (1), everything else the negative class (0).
#Vectorized comparison instead of a Python loop; ravel() keeps the result 1-D even if the labels arrive as a column.
Y_train = (np.ravel(trainy) == class1).astype(int)

In [40]:
#Class distribution of the binary labels, computed once and reused for printing and counting.
#The result is not unpacked into `_`, which IPython reserves for the last cell output.
Class_Distribution = np.unique(Y_train,return_counts = True)
print('Training Data: ',Class_Distribution)

#np.unique sorts the labels, so position 0 is label 0 (negative) and position 1 is label 1 (positive).
Count = Class_Distribution[1]
Negative_Count = Count[0]
Positive_Count = Count[1]

Training Data:  (array([0, 1]), array([6000, 6000]))


In [41]:
#Number of positive (class 1) samples to drop so that the remaining positives
#amount to imbalance_ratio times the number of negatives.
imbalance_ratio = 0.4
positives_to_keep = imbalance_ratio * Negative_Count
undersample = int(Positive_Count - positives_to_keep)
print(undersample)

3600


In [42]:
#Undersampling positive class.

#Get index of positive class
Pos_ind = list(np.where(Y_train==1)[0])

#Get random indices to remove.
#A dedicated seeded generator drops the same rows on every run, in line with the fixed random_state of the later split.
rem_ind = random.Random(42).sample(Pos_ind,undersample)

#Get indices to keep
#Membership is tested against a set (O(1) per lookup); scanning the list for every row is quadratic.
removed_set = set(rem_ind)
keep_ind = [i for i in range(len(Y_train)) if i not in removed_set]

In [43]:
#Build the undersampled data set: pixel values are divided by 255.0, labels are filtered with the same indices.
#Y_train is overwritten here, so keep_ind is only valid for the labels as they were before this cell ran.
X_train, Y_train = trainX[keep_ind] / 255.0, Y_train[keep_ind]

In [44]:
#Class counts after undersampling.
undersampled_distribution = np.unique(Y_train, return_counts=True)
print('Training Data: ', undersampled_distribution)

Training Data:  (array([0, 1]), array([6000, 2400]))


In [45]:
#Hold out 20% of the undersampled data for validation; the split is stratified on the labels and uses a fixed seed.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.20,
    random_state=42,
    stratify=Y_train,
)

### CIFAR-10 Undersampling

In [48]:
#Load the standard CIFAR-10 training split; load_data() returns (train, test) and the test split is not used here.
#Index the result instead of unpacking into `_`, because a user-assigned `_` stops IPython from tracking the last output.
trainX, trainy = cifar10.load_data()[0]

Downloading data from https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz


In [49]:
#Show every class label together with its number of training samples.
class_distribution = np.unique(trainy, return_counts=True)
print(class_distribution)

(array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype=uint8), array([5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000, 5000]))


In [50]:
#The two classes used for binary classification:
#class1 becomes the positive label, class2 the negative label.
class1 = 1
class2 = 6

In [51]:
#Row positions of the two selected classes in the full label array
#([0] takes the first-axis indices returned by np.where).
Index_0 = np.where(trainy == class1)[0]
Index_2 = np.where(trainy == class2)[0]

#union1d returns the sorted, de-duplicated union, so the sample order is deterministic;
#iterating a Python set gives an implementation-defined order instead.
Index_to_keep = np.union1d(Index_0, Index_2).tolist()
print(len(Index_to_keep))

10000


In [52]:
#Keep only the samples of the two selected classes (images and labels stay aligned).
#Index_to_keep refers to positions in the freshly loaded arrays, so reload the data before re-running this cell.
trainX, trainy = trainX[Index_to_keep], trainy[Index_to_keep]

In [53]:
#Rename the labels: class1 is the positive class (1), everything else the negative class (0).
#Vectorized comparison instead of a Python loop; ravel() keeps the result 1-D even if the labels arrive as a column.
Y_train = (np.ravel(trainy) == class1).astype(int)

In [54]:
#Class distribution of the binary labels, computed once and reused for printing and counting.
#The result is not unpacked into `_`, which IPython reserves for the last cell output.
Class_Distribution = np.unique(Y_train,return_counts = True)
print('Training Data: ',Class_Distribution)

#np.unique sorts the labels, so position 0 is label 0 (negative) and position 1 is label 1 (positive).
Count = Class_Distribution[1]
Negative_Count = Count[0]
Positive_Count = Count[1]

Training Data:  (array([0, 1]), array([5000, 5000]))


In [55]:
#Number of positive (class 1) samples to drop so that the remaining positives
#amount to imbalance_ratio times the number of negatives.
imbalance_ratio = 0.4
positives_to_keep = imbalance_ratio * Negative_Count
undersample = int(Positive_Count - positives_to_keep)
print(undersample)

3000


In [56]:
#Undersampling positive class.

#Get index of positive class
Pos_ind = list(np.where(Y_train==1)[0])

#Get random indices to remove.
#A dedicated seeded generator drops the same rows on every run, in line with the fixed random_state of the later split.
rem_ind = random.Random(42).sample(Pos_ind,undersample)

#Get indices to keep
#Membership is tested against a set (O(1) per lookup); scanning the list for every row is quadratic.
removed_set = set(rem_ind)
keep_ind = [i for i in range(len(Y_train)) if i not in removed_set]

In [57]:
#Build the undersampled data set: pixel values are divided by 255.0, labels are filtered with the same indices.
#Y_train is overwritten here, so keep_ind is only valid for the labels as they were before this cell ran.
X_train, Y_train = trainX[keep_ind] / 255.0, Y_train[keep_ind]

In [58]:
#Class counts after undersampling.
undersampled_distribution = np.unique(Y_train, return_counts=True)
print('Training Data: ', undersampled_distribution)

Training Data:  (array([0, 1]), array([5000, 2000]))


In [59]:
#Hold out 20% of the undersampled data for validation; the split is stratified on the labels and uses a fixed seed.
X_train, X_val, Y_train, Y_val = train_test_split(
    X_train,
    Y_train,
    test_size=0.20,
    random_state=42,
    stratify=Y_train,
)