<a href="https://colab.research.google.com/github/vineetjoshi253/Using-Deep-Reinforcement-Learning-For-Imbalanced-Data-Classification/blob/main/REINFORCEimb.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [10]:
import numpy as np
import random
import tensorflow as tf
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Input,Flatten
from keras.layers.merge import Add, Multiply
from keras.optimizers import Adam
import keras.backend as K
import tqdm.notebook as tq
from keras.layers import Conv2D
import tqdm.notebook as tq
from keras.layers import MaxPooling2D
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
from collections import deque
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tensorflow.python.keras.models import load_model

import logging
import os

#Silence TensorFlow log output: the TF_CPP_MIN_LOG_LEVEL variable is set to '3'
#and the 'tensorflow' Python logger is raised to FATAL.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # FATAL
logging.getLogger('tensorflow').setLevel(logging.FATAL)

In [2]:
#Load Data
#Training and validation arrays, plus their label vectors, are read from .npy
#files under /content.
X_train = np.load('/content/79_40percent_undersampled_train.npy')
Y_train = np.load('/content/79_40percent_undersampled_train_labels.npy')
X_Val = np.load('/content/79_40percent_undersampled_validation.npy')
Y_Val = np.load('/content/79_40percent_undersampled_validation_labels.npy')

#### Setting Up The Environment

In [3]:
#Display the dimensions of the training array.
X_train.shape

(7496, 28, 28)

In [22]:
class Environment:
  """Episodic classification environment over a labelled image set.

  Every step presents one sample and the agent's action is its predicted
  label (0 or 1). Samples labelled 1 are rewarded with +1 / -1, samples
  labelled 0 with +imbratio / -imbratio. An episode ends once every sample
  has been visited, or as soon as a sample labelled 1 is misclassified.
  """

  def __init__(self,X_train,Y_train,imbratio):
    """Store the data set and the reward scale.

    X_train  -- image array; reshaped to (n_samples, 28, 28, 1).
    Y_train  -- per-sample labels; step() expects the values 0 or 1.
    imbratio -- reward magnitude used for samples labelled 0.
    """
    #Saving The Parameters
    self.X = X_train.reshape((X_train.shape[0],28, 28,1))
    self.Y = Y_train
    self.start_state = 0
    self.action_space = [0,1]
    self.input_shape = (28,28)
    self.imbalance_ratio = imbratio
    #Index of the sample that the next call to step() will score.
    self.ind = 0


  def reset(self):
    """Shuffle samples and labels together and return the first sample."""
    #Shuffling Data At Each Episode
    order = list(range(self.X.shape[0]))
    random.shuffle(order)
    self.Y = self.Y[order]
    self.X = self.X[order]
    self.ind = 0
    return self.X[self.ind]


  def step(self,state,action):
    """Score `action` against the current sample's label and advance.

    state  -- unused; kept so existing callers keep working.
    action -- predicted label for the current sample.

    Returns (next_state, reward, done). When the data set is exhausted the
    returned state is the first sample and done is True.

    Raises ValueError if the current label is neither 0 nor 1 (the original
    code fell through and implicitly returned None in that case).
    """
    #Taking A Step In The Episode
    self.ind += 1
    label = self.Y[self.ind-1]

    if label == 1:
      #Sample Is Of Positive (Minority) Class
      magnitude = 1
      minority = True
    elif label == 0:
      #Sample Is Of Negative (Majority) Class
      magnitude = self.imbalance_ratio
      minority = False
    else:
      raise ValueError('Environment.step expects labels 0 or 1, got %r' % (label,))

    correct = bool(action == label)
    reward = magnitude if correct else -magnitude

    exhausted = self.ind == len(self.Y)
    #A misclassified minority sample terminates the episode early.
    done = exhausted or (minority and not correct)
    next_state = self.X[0] if exhausted else self.X[self.ind]
    return next_state,reward,done

#Reward magnitude for samples labelled 0 (passed to Environment as imbratio).
imbalance_ratio = 0.4
#Module-level environment built over the training split.
env = Environment(X_train,Y_train,imbalance_ratio)

### Setting Up The Agent

In [23]:
#Discount factor applied to the running return when rewards are discounted.
gamma = 0.1
#Step-size hyperparameter handed to the agent's constructor.
alpha= 1e-4
#Learning rate handed to the Adam optimizer of the policy network.
learning_rate= 0.00025

In [27]:
#Class implementing the REINFORCE (policy-gradient) learning agent.
class REINFORCE:
  """Policy-gradient (REINFORCE-style) agent backed by a small Keras CNN.

  The network maps a (28, 28, 1) state to a probability per action. After a
  rollout the network is fitted, with categorical cross-entropy, towards

      probs + alpha * normalised_return * (one_hot(action) - probs)

  for every stored step.
  """

  #Location used by save_model() when no explicit path is given.
  DEFAULT_SAVE_PATH = '/content/drive/MyDrive/RL Project/Vineet/REINFORCE_model_79_40per.h5'

  def __init__(self,env,gamma,alpha,learning_rate,path = None):
    """Set up hyperparameters, the policy network and empty rollout memory.

    env           -- object exposing input_shape, action_space, reset(), step().
    gamma         -- discount factor for returns.
    alpha         -- scale of the reward-weighted gradient term in the targets.
    learning_rate -- learning rate of the Adam optimizer.
    path          -- optional saved model to load instead of a fresh network.
    """
    self.env=env
    self.state_shape=env.input_shape
    self.action_shape=len(env.action_space)
    self.gamma=gamma
    #Honour the constructor argument (it used to be overwritten with 1e-4).
    self.alpha=alpha
    self.learning_rate=learning_rate

    #Load the policy prediction model.
    if not path:
      self.model=self._create_model()
    else:
      self.model=self.load_model(path)

    #Saving the data.
    self.states=[]
    self.gradients=[]
    self.rewards=[]
    self.probs=[]
    self.discounted_rewards=[]
    self.total_rewards=[]
    self.f1 = []

  #Create the model.
  def _create_model(self):
    """Build and compile the convolutional policy network."""
    model=Sequential()
    model.add(Conv2D(32, (5, 5), activation='relu', kernel_initializer='he_uniform', input_shape=(28, 28,1)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Conv2D(32, (5, 5), activation='relu', kernel_initializer='he_uniform', input_shape=(28, 28,1)))
    model.add(MaxPooling2D((2, 2)))
    model.add(Flatten())
    model.add(Dense(100, activation='relu', kernel_initializer='he_uniform'))
    #NOTE(review): this softmax layer feeds another softmax layer, which is
    #unusual for a hidden layer - confirm it is intended.
    model.add(Dense(10, activation='softmax'))
    model.add(Dense(self.action_shape, activation="softmax"))
    model.compile(loss="categorical_crossentropy",optimizer=Adam(lr=self.learning_rate))
    return model

  #Encode Actions
  def hot_encode_action(self, action):
    """Return a float32 one-hot vector of length action_shape for `action`."""
    action_encoded=np.zeros(self.action_shape, np.float32)
    action_encoded[action]=1
    return action_encoded

  #Update the memory with current observations.
  def update_memory(self, state, action, action_prob, reward):
    """Append one step (state, gradient term, reward, probabilities) to memory."""
    encoded_action=self.hot_encode_action(action)
    self.gradients.append(encoded_action-action_prob)
    self.states.append(state)
    self.rewards.append(reward)
    self.probs.append(action_prob)

  #Get action
  def get_action(self, state):
    """Sample an action for one state.

    Returns (action, probabilities), where probabilities is the re-normalised
    network output the action was drawn from.
    """
    #Current State
    state=state.reshape((1,28,28,1))

    #Probability distribution
    action_probability_distribution=self.model.predict(state).flatten()
    action_probability_distribution/=np.sum(action_probability_distribution)

    #Select Action
    action=np.random.choice(self.action_shape,1,p=action_probability_distribution)[0]

    return action, action_probability_distribution

  #Function to generate the discounted reward.
  def get_discounted_rewards(self, rewards):
    """Return discounted returns, standardised to zero mean and unit variance.

    rewards -- sequence of per-step rewards, shaped (n,) or (n, 1).

    The result has the same shape as `rewards`; an empty input yields an
    empty array.
    """
    rewards=np.asarray(rewards, dtype=np.float64)
    if rewards.size == 0:
      return rewards

    #Fill from the last step backwards so that each return is built from the
    #following one in a single pass.
    discounted_rewards=np.zeros_like(rewards)
    cumulative_total_return=0.0
    for step in range(len(rewards)-1, -1, -1):
      cumulative_total_return=(cumulative_total_return*self.gamma)+rewards[step]
      discounted_rewards[step]=cumulative_total_return

    mean_rewards=np.mean(discounted_rewards)
    std_rewards=np.std(discounted_rewards)
    #The small constant keeps the division finite when all returns are equal.
    return (discounted_rewards-mean_rewards)/(std_rewards+1e-7)

  def update_policy(self):
    """Fit the network on the stored rollout and clear the memory.

    Returns whatever model.train_on_batch returns, or None when the memory
    is empty.
    """
    if not self.states:
      return None

    states=np.asarray(self.states)
    gradients=np.vstack(self.gradients)
    rewards=np.vstack(self.rewards)
    probs=np.vstack(self.probs)
    discounted_rewards=self.get_discounted_rewards(rewards)

    #Weight each step's gradient term by its standardised return (in place,
    #which keeps the dtype of the stored gradients).
    gradients*=discounted_rewards
    #Calculate the training targets.
    targets=self.alpha*gradients+probs

    history=self.model.train_on_batch(states, targets)
    self.states, self.probs, self.gradients, self.rewards=[], [], [], []
    return history

  def _sample_actions(self, states):
    """Sample one action per state from a single batched forward pass."""
    batch=np.asarray(states).reshape((-1,28,28,1))
    probabilities=self.model.predict(batch)
    probabilities=probabilities/np.sum(probabilities, axis=1, keepdims=True)
    return [np.random.choice(self.action_shape,1,p=row)[0] for row in probabilities]

  def validation_F1(self,X_test,Y_test):
    """Sample a prediction per validation sample, then record and return F1.

    Predictions are drawn from the policy (not arg-maxed), so the score
    varies between calls. The per-class prediction counts are printed.
    """
    Y_pred=self._sample_actions(X_test)
    print(np.unique(Y_pred,return_counts=True))
    f1 = f1_score(Y_test,Y_pred)
    self.f1.append(f1)
    return f1

  def train(self,episodes,X_test,Y_test,rollout_n=1):
    """Run `episodes` episodes, updating the policy on every rollout_n-th one.

    X_test, Y_test -- validation data scored (and printed) on every 15th
                      episode that also triggers a policy update.

    Per-episode rewards are stored in self.total_rewards.

    NOTE(review): with rollout_n > 1 the memory spans several episodes and
    returns are discounted across episode boundaries - confirm that this is
    intended before using rollout_n > 1.
    """
    #Create the environment
    env=self.env
    total_rewards=np.zeros(episodes)
    self.total_rewards=total_rewards
    total_reward = 0

    #Run for Maximum number of episodes.
    for episode in tq.tqdm(range(episodes)):
      #Get the current state.
      state = env.reset()
      done=False
      episode_reward=0
      length = 0

      while not done:
        #Get the action.
        action, prob = self.get_action(state)
        #Get next state & reward.
        next_state, reward, done = env.step(state,action)

        #Update the memory.
        self.update_memory(state, action, prob, reward)

        #Update the current state.
        state=next_state
        episode_reward+=reward
        length+=1

      #Once episode is over.
      total_reward+=episode_reward
      if episode%rollout_n==0:
        #Update the policy.
        self.update_policy()
        if episode%15 == 0:
          print('Episode :',episode,'Reward: ',total_reward,'Length: ',length,'F1 Score: ',self.validation_F1(X_test,Y_test))
      total_rewards[episode]=episode_reward

  def save_model(self, path=None):
    """Save the network to `path` (defaults to DEFAULT_SAVE_PATH)."""
    self.model.save(path if path is not None else self.DEFAULT_SAVE_PATH)

  def load_model(self, path):
    """Load and return a saved network via the module-level load_model."""
    #NOTE(review): load_model is imported from tensorflow.python.keras while
    #the network is built with the standalone keras package - confirm that
    #the two are compatible in the target environment.
    return load_model(path)

In [28]:
#Build the agent on a fresh Environment over the training data.
#NOTE: this rebinds the name REINFORCE from the class to the instance, so the
#class is no longer reachable under that name once this cell has run.
REINFORCE = REINFORCE(Environment(X_train,Y_train,0.4),gamma,alpha,learning_rate)

In [29]:
#Train for 500 episodes, scoring on the validation split; the last argument is rollout_n.
REINFORCE.train(500,X_Val,Y_Val,1)

## Get Results

In [None]:
#Sample one prediction per validation sample from the trained policy.
#The validation data was loaded as X_Val (the original loop referenced the
#undefined name X_val); get_action reshapes each sample itself.
Y_pred = []
for sample in X_Val:
  a,_ = REINFORCE.get_action(sample)
  Y_pred.append(a)

In [None]:
#accuracy_score is not among the notebook's top-level imports, so it is
#imported here; the validation labels were loaded as Y_Val (not Y_val).
from sklearn.metrics import accuracy_score

print('F1 Score: ',f1_score(Y_Val,Y_pred))
print('Accuracy: ',accuracy_score(Y_Val,Y_pred))