<a href="https://colab.research.google.com/github/vineetjoshi253/Using-Deep-Reinforcement-Learning-For-Imbalanced-Data-Classification/blob/main/DQNImb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import random
from collections import deque

import numpy as np
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

In [None]:
#Load the undersampled training split (images, then labels)
X_train, Y_train = (
    np.load('/content/79_40percent_undersampled_train.npy'),
    np.load('/content/79_40percent_undersampled_train_labels.npy'),
)
#Load the matching validation split (images, then labels)
X_Val, Y_Val = (
    np.load('/content/79_40percent_undersampled_validation.npy'),
    np.load('/content/79_40percent_undersampled_validation_labels.npy'),
)

#### Setting Up The Environment

In [None]:
#Display the dimensions of the training array (notebook cell output follows)
X_train.shape

(7496, 28, 28)

In [None]:
class Environment:
    """Sequential classification environment for imbalanced binary data.

    Every sample is one state and the agent's action is its predicted label.
    Label 1 is treated as the positive (minority) class and earns a reward
    of +1 / -1; label 0 earns +imbratio / -imbratio. An episode ends when the
    data is exhausted or as soon as a label-1 sample is misclassified.
    """

    def __init__(self,X_train,Y_train,imbratio):
        """Store the data and reward weighting.

        Args:
            X_train: array reshapeable to (n_samples, 1, 28, 28).
            Y_train: array-like of 0/1 labels, one per sample.
            imbratio: reward magnitude used for label-0 samples.
        """
        self.input_shape = (28,28)
        #Add the single channel axis expected by the convolutional network
        self.X = X_train.reshape((X_train.shape[0],1) + self.input_shape)
        #asarray so that fancy indexing in reset() also works for plain lists
        self.Y = np.asarray(Y_train)
        self.start_state = 0
        self.action_space = [0,1]
        self.imbalance_ratio = imbratio
        self.ind = 0

    def reset(self):
        """Shuffle samples and labels together and return the first state."""
        order = np.random.permutation(self.X.shape[0])
        self.X = self.X[order]
        self.Y = self.Y[order]
        self.ind = 0
        return self.X[self.ind]

    def step(self,state,action):
        """Score `action` against the current sample and advance by one.

        Args:
            state: unused; kept for interface compatibility.
            action: predicted label (0 or 1) for the current sample.

        Returns:
            (next_state, reward, done). When the data is exhausted the first
            sample is returned as a placeholder next state.

        Raises:
            ValueError: if the current label is neither 0 nor 1.
        """
        label = self.Y[self.ind]
        if label == 1:
            magnitude = 1
        elif label == 0:
            magnitude = self.imbalance_ratio
        else:
            raise ValueError('Environment expects binary labels (0 or 1), got %r' % (label,))

        self.ind += 1
        correct = bool(action == label)
        reward = magnitude if correct else -magnitude

        exhausted = self.ind == len(self.Y)
        #Misclassifying a minority (label 1) sample terminates the episode early
        done = exhausted or (bool(label == 1) and not correct)
        next_state = self.X[0] if exhausted else self.X[self.ind]
        return next_state,reward,done

#Reward magnitude the environment uses for label-0 samples
imbalance_ratio = 0.4
env = Environment(X_train=X_train, Y_train=Y_train, imbratio=imbalance_ratio)

### Setting Up The Agent

In [None]:
#--- Q-learning update ---
#Discount applied to the bootstrapped next-state value
gamma = 0.1
#Number of transitions sampled per optimisation step
batch_size = 64
#Capacity of the replay memory
replay_size = 50000
#Number of environment steps between target-network syncs
target_update_freq = 100

#--- Episode budget ---
#Upper bounds on the number of episodes and on the steps inside one episode
episodes, episode_steps = 500, 1000

#--- Epsilon-greedy exploration ---
#Epsilon starts at start_epsilon and shrinks by epsilon_delta per step
#while it is still above end_epsilon
start_epsilon, end_epsilon = 1, 0.001
epsilon_delta = 0.0001
#Passed to the agent; not read by the training loop in this notebook
decay_stop = 50
exploration_rate = start_epsilon

In [None]:
#Training history, appended to by Deep_QNetwork while it trains:
#validation F1 scores, per-episode rewards and cumulative rewards.
F1 = []
Rewards = []
C_Rewards = []

#Deep Q Network Class
class Deep_QNetwork:
    """DQN agent that treats binary classification as an episodic task.

    Holds an online Q-network, a periodically synced target network and a
    replay memory. All tensors are moved to `self.device`, which is CUDA
    when available and CPU otherwise.
    """

    def __init__(self, env, gamma, batch_size, memory_max_size, episodes, exploration_rate, epsilon_delta, decay_stop, replay_size, target_update_freq, episode_steps, end_epsilon = 0.001):
        """Store the hyperparameters and build both networks.

        Args:
            env: environment exposing `reset()`, `step(state, action)` and
                `action_space`.
            gamma: discount factor for the bootstrapped target.
            batch_size: transitions sampled per optimisation step.
            memory_max_size: stored only; the replay memory is sized by
                `replay_size`.
            episodes: number of training episodes.
            exploration_rate: initial epsilon for the epsilon-greedy policy.
            epsilon_delta: amount subtracted from epsilon after every step.
            decay_stop: stored only; not read by the training loop.
            replay_size: capacity of the replay memory.
            target_update_freq: steps between target-network syncs.
            episode_steps: maximum number of steps per episode.
            end_epsilon: lower bound for epsilon.
        """
        #Set up the hyperparameters.
        self.env = env
        self.gamma = gamma
        self.batch_size = batch_size
        self.memory_max_size = memory_max_size
        self.episodes = episodes
        self.exploration_rate = exploration_rate
        self.epsilon_delta = epsilon_delta
        self.end_epsilon = end_epsilon
        self.decay_stop = decay_stop
        self.replay_size = replay_size
        self.target_update_freq = target_update_freq
        self.episode_steps = episode_steps
        self.steps = 0
        self.mse_loss = nn.MSELoss()

        #Fall back to the CPU when no GPU is present instead of crashing.
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        #Create replay memory.
        self.replay_memory = deque(maxlen = replay_size)

        #Create the Q and the target networks.
        self.QNetwork = self.Create_QNetwork().to(self.device)
        self.target_network = self.Create_QNetwork().to(self.device)
        #Start with identical weights so early targets are not produced by
        #an unrelated random initialisation.
        self.Update_Target()

    #Function to create the sequential models.
    def Create_QNetwork(self):
        """Build the CNN that maps a (N, 1, 28, 28) batch to two outputs."""
        #NOTE(review): the final Softmax keeps every output inside (0, 1)
        #while rewards reach +/-1 -- confirm this bound is intended.
        model = nn.Sequential(
            nn.Conv2d(1,32,5),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(32,32,5),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Flatten(),
            nn.Linear(512,256),
            nn.ReLU(),
            nn.Linear(256,2),
            nn.Softmax(1),
        )
        return model

    #Function to predict the state-action values based on the given Q Model.
    def predict(self, model, state):
        """Run `model` on `state` after moving it to the agent's device."""
        return model(state.to(self.device))

    #Function to find action based on the current state and E-Greedy policy.
    def find_action(self, state):
        """Return a random action with probability epsilon, else the greedy one."""
        #Exploration
        if np.random.rand() < self.exploration_rate:
            return np.random.randint(0, len(self.env.action_space))
        #Exploitation
        return self.predict_class(state)

    #Function to predict class based on the QNetwork.
    def predict_class(self, state):
        """Return the index of the largest Q-network output for `state`."""
        #Inference only, so skip building the autograd graph.
        with torch.no_grad():
            Q_Values = self.predict(self.QNetwork, state)
        return Q_Values.argmax().item()

    #Append current observations to the replay memory.
    def append_batch(self, state, action, reward, next_state, done):
        """Store one transition in the replay memory."""
        self.replay_memory.append((state, action, reward, next_state, done))

    #Sample current batch from the replay Memory
    def generate_batch(self):
        """Sample up to `self.batch_size` transitions without replacement."""
        size = min(len(self.replay_memory), self.batch_size)
        return random.sample(self.replay_memory, k = size)

    #Update target network.
    def Update_Target(self):
        """Copy the Q-network weights into the target network."""
        self.target_network.load_state_dict(self.QNetwork.state_dict())

    def replay(self):
        """Sample a mini-batch and return the MSE loss of the TD error.

        The target is `reward + gamma * max_a Q_target(next_state, a)`, with
        the bootstrap term zeroed for terminal transitions.
        """
        mini_batch = self.generate_batch()
        n = len(mini_batch)

        #Collate the sampled transitions into batched tensors.
        states = torch.zeros(n,1,28,28)
        next_states = torch.zeros(n,1,28,28)
        rewards = torch.zeros(n)
        dones = torch.zeros(n)
        actions = torch.zeros(n, dtype = torch.long)
        for i, (state, action, reward, next_state, done) in enumerate(mini_batch):
            #Stored states carry a leading batch axis of size one; drop it.
            states[i] = state.reshape(1,28,28)
            next_states[i] = next_state.reshape(1,28,28)
            rewards[i] = float(reward)
            dones[i] = float(done)
            actions[i] = int(action)

        #Use the target model for computing the next Q values (no gradient).
        with torch.no_grad():
            next_state_qvalues = self.predict(self.target_network, next_states)
        next_state_qvalues_max = next_state_qvalues.max(dim = 1)[0]

        #Terminal transitions contribute the reward only.
        mask = (1 - dones).to(self.device)
        target = rewards.to(self.device) + self.gamma * mask * next_state_qvalues_max

        #Q value the online network assigns to the action actually taken.
        q_values = self.predict(self.QNetwork, states)
        final_qvalues = q_values.gather(dim = 1, index = actions.unsqueeze(dim = 1).to(self.device)).squeeze(dim = 1)

        #Calculate the mean squared error
        return self.mse_loss(final_qvalues, target)

    #Calculate the F1 score of the current model on held-out data
    def validation_F1(self,X_test,Y_test):
        """Predict every sample in `X_test`, record and return the F1 score.

        Each sample is reshaped to (1, 28, 28), the input layout the network
        is built for. The score is also appended to the module-level `F1`.
        """
        Y_pred = []
        for sample in X_test:
            Inpt = torch.from_numpy(sample.reshape(1,28,28)).float().unsqueeze(dim=0)
            Y_pred.append(self.predict_class(Inpt))
        f1 = f1_score(Y_test,Y_pred)
        F1.append(f1)
        return f1

    #Save Models
    def save_model(self):
        """Write both networks' state dicts to the hard-coded Drive paths."""
        torch.save(self.QNetwork.state_dict(),'/content/drive/MyDrive/RL Project/Models/CIFAR 0.4/DQN_QNetwork_model_16_40per.h5')
        torch.save(self.target_network.state_dict(),'/content/drive/MyDrive/RL Project/Models/CIFAR 0.4/DQN_Target_model_16_40per.h5')
        print('Model Saved')

    def Deep_QModel(self,X_test,Y_test):
        """Train the agent and return the per-episode reward history.

        Every tenth episode the model is scored on (`X_test`, `Y_test`) and
        checkpointed, whether the episode terminated or hit the step limit.

        Returns:
            The module-level `Rewards` list, which this method appends to
            (`C_Rewards` receives the cumulative totals).
        """
        step_count = 0
        optimizer = torch.optim.Adam(self.QNetwork.parameters(), lr = 0.00025)
        Total_reward_so_far = 0

        for episode in range(self.episodes):
            #Get current state of the environment.
            state = self.env.reset()
            state = torch.from_numpy(state).float().unsqueeze(dim=0)

            total_reward = 0

            #Iterate till maximum number of steps.
            for step in range(self.episode_steps):
                self.steps += 1
                step_count += 1

                #Get Action, Next State & Reward
                action = self.find_action(state)
                next_state, reward, done = self.env.step(state,action)
                next_state = torch.from_numpy(next_state).float().unsqueeze(dim=0)

                total_reward += reward
                Total_reward_so_far += reward

                #Append the observations to replay memory.
                self.append_batch(state, action, reward, next_state, done)

                #Update the current state.
                state = next_state

                #Calculate Loss and optimize the Q Network
                if len(self.replay_memory) > self.batch_size:
                    loss = self.replay()
                    optimizer.zero_grad()
                    loss.backward()
                    optimizer.step()

                #Decay the exploration rate, never dropping below the floor.
                if self.exploration_rate > self.end_epsilon:
                    self.exploration_rate = max(self.end_epsilon, self.exploration_rate - self.epsilon_delta)

                #Update the target network.
                if (step_count % self.target_update_freq) == 0:
                    self.Update_Target()

                if done:
                    break

            #Evaluate outside the step loop so that episodes which reach the
            #step limit without terminating are still scored and saved.
            if episode%10 == 0:
                f1 = self.validation_F1(X_test,Y_test)
                print('Episode: ',episode,'Current F1: ',f1)
                self.save_model()

            Rewards.append(total_reward)
            C_Rewards.append(Total_reward_so_far)

        return Rewards

### Running The Model

In [None]:
#Build the agent; replay_size is passed for both memory_max_size and replay_size
DQN = Deep_QNetwork(env, gamma, batch_size, replay_size, episodes, exploration_rate, epsilon_delta, decay_stop, replay_size, target_update_freq, episode_steps)
#Training appends to the module-level Rewards / C_Rewards / F1 lists in place,
#so the call's result is not assigned back onto Rewards.
DQN.Deep_QModel(X_Val,Y_Val)

### Generating Results

In [None]:
#Generate the predictions for the validation split loaded above (X_Val)
Y_pred = []
for sample in X_Val:
  #The network takes single-channel 28x28 inputs with a leading batch axis
  Inpt = torch.from_numpy(sample.reshape(1,28,28)).float().unsqueeze(dim=0)
  Y_pred.append(DQN.predict_class(Inpt))

In [None]:
#Score the predictions against the validation labels loaded above (Y_Val)
print('F1 Score: ',f1_score(Y_Val,Y_pred))
print('Accuracy: ',accuracy_score(Y_Val,Y_pred))