<a href="https://colab.research.google.com/github/xuexi21/RL_CO_training/blob/main/r_co_train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

<!-- ## prepare the data -->

In [1]:
# Dataset
from sklearn.datasets import make_moons as moon
from sklearn.model_selection import train_test_split


# define the classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score




In [2]:
# set the dataset.
dataset = moon(5000, noise=0.3, random_state=42)
X,y = dataset

# split the training(labeled) as 10% of dataset
X_l, X_ul, y_l, y_ul = train_test_split(X, y, test_size=0.8, random_state=0)


# split the training(labeled) as 50% of  labeled dataset
X_l_train, X_l_test, y_l_train, y_l_test = train_test_split(X_l, y_l, test_size=0.5, random_state=0)

# 2-classifier
clf_1 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=11))
        ]
)

# clf 1
clf_2 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("RF", RandomForestClassifier())
        ]
)



In [3]:
# define ENV

# for clustering the unlabeld data
from sklearn.cluster import KMeans
import numpy as np
import matplotlib.pyplot as plt

#
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim


class Env():
    def __init__(self, classifier_1, classifier_2, input_ul_data, k, X_test, y_test, X_reset, y_reset):
        # super().__init__
        self.model_1 = classifier_1
        self.model_2 = classifier_2
        # UN LABEL DATA
        self.X_ul = input_ul_data
        # define the evaluate data, later use for the reward
        self.X_eval = X_test
        self.y_eval = y_test
        self.X_reset = X_reset
        self.y_reset = y_reset
        # cluster the data
        self.action_size = k
        self.kmeans = KMeans(n_clusters=k,  n_init=10)
        self.cluster_label = self.kmeans.fit_predict(self.X_ul)
        self.u_cluster_label = np.unique(self.cluster_label)
        self.centroids = self.kmeans.cluster_centers_
        self.observation_size = self.get_state(reset=True).shape[1]

    # def cluster_plot(self):
    #     for i in self.u_cluster_label:
    #         plt.scatter(self.X_ul[self.cluster_label == i , 0] ,
    #                     self.X_ul[self.cluster_label == i , 1] ,
    #                     label = i)
    #     plt.scatter(self.centroids[:,0],
    #                 self.centroids[:,1],
    #                 s=80,
    #                 color='k')
    #     # plt.legend()
    #     plt.title(f'{self.k} cluster (centroids) of unlabeled data')
    #     plt.show()

    # update 2 clf
    def train_2_clf(self, X, y):
        self.model_1.fit(X, y)
        self.model_2.fit(X, y)

    def get_state(self,reset=False):
        np.random.seed(123)
        if reset:
            self.train_2_clf(self.X_reset, self.y_reset)
            print("reset")
        out_1 = self.model_1.predict_proba(self.centroids)
        out_2 = self.model_2.predict_proba(self.centroids)
        state_proba = np.concatenate((out_1, out_2), axis=1)
        return  torch.from_numpy(state_proba).to(torch.float32).reshape(1, -1)


    def get_acc(self):
        pred_1 = self.model_1.predict(self.X_eval)
        pred_2 = self.model_1.predict(self.X_eval)
        acc_1 = accuracy_score(pred_1, self.y_eval)
        acc_2 = accuracy_score(pred_2, self.y_eval)
        return acc_1, acc_2

    ######
    ######
    def get_subset(self, action):
        # choose subset
        subset = self.X_ul[self.cluster_label == action]
        return subset

    def co_training(self, subset):
        ## get posodu label
        clf_0_p_label = self.model_1.predict(subset)
        clf_1_p_label = self.model_2.predict(subset)

        ## get proba_
        clf_0_p_y = self.model_1.predict_proba(subset)
        clf_1_p_y = self.model_2.predict_proba(subset)

        #get the label size
        y_num = subset.shape[0]
        # set empty y  #type=ndarray
        y_ul_action = np.zeros(y_num,)

        #############
        # confidence_diff = 0
        # combine the lable from two classifier, choose the most conffidence
        for i in range(y_num):
            if max(clf_0_p_y[i, ]) > max(clf_1_p_y[i, ]):
                y_ul_action[i] = clf_0_p_label[i]
                # print('0')
            else:
                y_ul_action[i] = clf_1_p_label[i]
                # print('1')

        ########### update the label_set for traning
        X_updated = np.concatenate((X_l_train, subset), axis=0)
        y_updated = np.concatenate((y_l_train, y_ul_action), axis=0)

        # print(f'X shape is {X_updated.shape} \ny shape is {y_updated.shape}')

        ############# use the updated labeld dateset retrain those 2 classifier
        self.train_2_clf(X_updated, y_updated)

        # RETURN THE co-trained 2 CLASSIFIER'S accuracy.
        acc_1_, acc_2_ = self.get_acc()
        return acc_1_, acc_2_

    def step(self, action):
        # GET THE bigining state accuracy, later use to calculate the reward
        pre_acc_1, pre_acc_2 = self.get_acc()

        # choose subset
        choosen_subset =  self.get_subset(action)

        # cotraining the 2 classifier
        acc1_, acc2_ = self.co_training(choosen_subset )

        # get the next state_
        n_state = self.get_state()

        ##############
        # calculate the reward
        ##############
        reward_0 =  acc1_ - pre_acc_1
        reward_1 = acc2_ - pre_acc_2

        if reward_0 > 0 and reward_1 > 0:
            reward = reward_0 * reward_1
        else:
            reward = 0

        return n_state, reward


In [4]:
k = 20
env = Env(clf_1, clf_2, input_ul_data=X_ul, k=k, X_test=X_l_test, y_test=y_l_test, X_reset=X_l_train, y_reset=y_l_train)
state_0 = env.get_state(reset=True)
# env.cluster_plot()
# env.get_acc()
# env.get_subset(2)
state, reward = env.step(19)
# state.shape
# reward

# print(state_0)
# print(state)


reset
reset


<!-- #### DQN

<img src="https://yinyoupoet.github.io/images/%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0%E4%B9%8B%E6%B7%B1%E5%BA%A6Q%E7%BD%91%E7%BB%9CDQN%E8%AF%A6%E8%A7%A3/1_8coZ4g_pRtfyoHmsuzMH6g.png" alt="Description of the image" width="400" height="300">

##### loss


<img src="https://yinyoupoet.github.io/images/%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0%E4%B9%8B%E6%B7%B1%E5%BA%A6Q%E7%BD%91%E7%BB%9CDQN%E8%AF%A6%E8%A7%A3/1_YCgMUijhU4p_y3sctvu-kQ.png" alt="Description of the image" width="300" height="50">




 -->


In [5]:
## RL functions##
import random
from collections import deque




# Define the Q-network (a simple feedforward neural network)
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


class ReplayBuffer:

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, state, action, reward, next_state):
        # # bug - some times the state is not
        # if len(state) != 4:
        #     state = state[0]
        experience_tuple = (state, action, reward, next_state)
        # Append experience_tuple to the memory buffer
        self.memory.append(experience_tuple)

    def __len__(self):
        return len(self.memory)

    def sample(self, batch_size):
        # Draw a random sample of size batch_size
        batch = random.sample(self.memory, batch_size)
        # Transform batch into a tuple of lists
        states, actions, rewards, next_states = (zip(*batch))
        return states, actions, rewards, next_states



In [6]:
import math

#
EPS_START = 1
EPS_END = 0.05
EPS_DECAY = 1000
steps_done = 0

####
buffer_size = 10000
###
episodes = 500
max_step = 100
batch_size = 64
TAU = 0.005
gamma = 0.99

# hyper parameter
observation_size = env.observation_size
action_size = env.action_size
lr = 1e-4

###
# Initialize networks and optimizer
q_network = QNetwork(observation_size, action_size)
target_network = QNetwork(observation_size, action_size)
target_network.load_state_dict(q_network.state_dict())
optimizer = optim.Adam(q_network.parameters(), lr=lr)
loss_fn = nn.MSELoss()


# Replay memory
replay_buffer = ReplayBuffer(buffer_size)


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return the largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            action = torch.argmax(q_network(state)).item()
            return action
    else:
        return np.random.choice(range(k))

In [7]:
# Function to update the Q-network
def train():
    if len(replay_buffer) < batch_size:
        return
        # prepare the training data
    states, actions, rewards, next_states = replay_buffer.sample(batch_size)

    rewards = torch.FloatTensor(rewards).unsqueeze(1)
    actions = torch.tensor(np.array(actions)).unsqueeze(1)
    states = torch.tensor(np.array(states)).squeeze(1)
    next_states = torch.tensor(np.array(next_states)).squeeze(1)

    # # Compute current Q values
    q_values = q_network(states).gather(1, actions)



        # Compute target Q values
    with torch.no_grad():
        next_q_values = target_network(next_states).max(1).values.unsqueeze(1)
    targets = rewards + (gamma * next_q_values)


        # Update the network
    loss = loss_fn(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss



In [None]:
#training loop
results = []
for episode in range(episodes):
    state = env.get_state(reset=True)
    total_reward = 0

    for t in range(max_step):
        action = select_action(state)
        observation, reward = env.step(action)
# '''
#         done = terminated or truncated
#         if terminated:
#             next_state = None
#         else:
#             next_state = observation
# '''
        next_state = observation


        replay_buffer.push(state, action, reward, next_state)

        state = next_state
        total_reward += reward

        train()


        # Soft update of the target network's weights
        target_net_state_dict = q_network.state_dict()
        policy_net_state_dict = target_network.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_network.load_state_dict(target_net_state_dict)

        # if terminated or truncated:
        #     break


    results.append(total_reward)
    print(f"Episode: {episode}, Total Reward: {total_reward}")

    ## for debug - train() - just get some replay memory.
    # if episode == 3:
    #     print('break')
    #     break

reset
Episode: 0, Total Reward: 0.0005000000000000009
reset
Episode: 1, Total Reward: 0.0016000000000000033
reset
Episode: 2, Total Reward: 0.001500000000000003
reset
Episode: 3, Total Reward: 0.0017560000000000032
reset
Episode: 4, Total Reward: 0.0016040000000000032
reset
Episode: 5, Total Reward: 0.0016560000000000032
reset
Episode: 6, Total Reward: 0.0021560000000000043
reset
Episode: 7, Total Reward: 0.001932000000000004
reset
Episode: 8, Total Reward: 0.001948000000000004
reset
Episode: 9, Total Reward: 0.0025600000000000054
reset
Episode: 10, Total Reward: 0.002480000000000005
reset
Episode: 11, Total Reward: 0.0018760000000000035
reset
Episode: 12, Total Reward: 0.0023320000000000046
reset
Episode: 13, Total Reward: 0.0016480000000000034
reset
Episode: 14, Total Reward: 0.002148000000000004
reset
Episode: 15, Total Reward: 0.0016760000000000034
reset
Episode: 16, Total Reward: 0.0018000000000000034
reset
Episode: 17, Total Reward: 0.0018360000000000028
reset
Episode: 18, Total 

In [None]:
# states, actions, rewards, next_states = replay_buffer.sample(batch_size)
# print(np.array(states).shape)
# print(np.array(actions).shape)
# print(np.array(rewards).shape)
# print(np.array(next_states).shape)

In [None]:
  # states, actions, rewards, next_states = replay_buffer.sample(batch_size)

  # # # Compute a mask of non-final states and concatenate the batch elements
  # # non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
  # #                                       next_states)))
  # # non_final_next_states = torch.tensor(np.array([s for s in next_states
  # #                                             if s is not None]))
  # # #
  # rewards = torch.FloatTensor(rewards).unsqueeze(1)
  # actions = torch.tensor(np.array(actions)).unsqueeze(1)
  # states = torch.tensor(np.array(states)).squeeze(1)
  # next_states = torch.tensor(np.array(next_states)).squeeze(1)

  # # # Compute current Q values
  # q_values = q_network(states).gather(1, actions)



  #     # Compute target Q values
  # with torch.no_grad():
  #     next_q_values = target_network(next_states).max(1).values.unsqueeze(1)
  # targets = rewards + (gamma * next_q_values)

  # #     # Update the network
  # loss = loss_fn(q_values, targets)
  # optimizer.zero_grad()
  # loss.backward()
  # optimizer.step()

In [None]:
# # rewards
# # actions.shape
# # states.shape
# # next_states.shape
# q_values.shape
# targets.shape
# next_q_values.unsqueeze(1).shape