<!-- ## prepare the data -->

In [1]:
# Dataset

import numpy as np
import matplotlib.pyplot as plt

#
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from sklearn.datasets import make_moons as moon
from sklearn.model_selection import train_test_split
from sklearn.cluster import KMeans

# define the classifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

from sklearn.metrics import precision_recall_fscore_support


In [2]:
# set the dataset.
dataset = moon(5000, noise=0.3, random_state=42)
X,y = dataset

# split the training(labeled) as 10% of dataset
X_l, X_ul, y_l, y_ul = train_test_split(X, y, test_size=0.8, random_state=0)


# split the training(labeled) as 50% of  labeled dataset
X_l_train, X_l_test, y_l_train, y_l_test = train_test_split(X_l, y_l, test_size=0.5, random_state=0)

# 2-classifier
clf_1 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=11))
        ]
)

# clf 1
clf_2 = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("RF", RandomForestClassifier())
        ]
)

clfs = [clf_1,clf_2]

In [3]:
# define ENV

# for clustering the unlabeld data



class Env():
    def __init__(self, classifiers, input_ul_data, k, X_test, y_test, X_reset, y_reset):
        # super().__init__

        self.models = classifiers
        # UN LABEL DATA
        self.X_ul = input_ul_data
        # define the evaluate data, later use for the reward
        self.X_eval = X_test
        self.y_eval = y_test
        self.X_reset = X_reset
        self.y_reset = y_reset
        # cluster the data
        self.action_size = k * 2  # because always cotraining has 2 classifiers, so action size is (K * 2) shape 
        self.kmeans = KMeans(n_clusters=k,  n_init=10)
        self.cluster_label = self.kmeans.fit_predict(self.X_ul)
        self.u_cluster_label = np.unique(self.cluster_label)
        self.centroids = self.kmeans.cluster_centers_
        self.observation_size = self.get_state(reset=True).shape[1]
        self.prev_macro_f1 = 0.0

    ##############################
    # ###### FIX ###########    
    ###############################
    # update 2 clf
    def train_2_clf(self, X, y):
        for clf in self.models:
            clf.fit(X, y)

    def get_state(self,reset=False):

        if reset:
            self.train_2_clf(self.X_reset, self.y_reset)
            print("reset")

        #########################

        ########fix#################
        # only keep the most confidence of each classifier as the state views
        ''' 
        # The state vector does not need to contain the complete 
        # class probability distribution for each cluster and classifier
        # since the classifier’s largest confidence is enough to help the agents decision.”
        '''
        #########################
        out_1 = self.models[0].predict_proba(self.centroids).max(axis=1)
        out_2 = self.models[1].predict_proba(self.centroids).max(axis=1)
        # state_proba = np.concatenate((out_1, out_2), axis=0)
        ##############################
        #### UPDATED 5-OCT-2024
        ## When choosen the action neeed calculation: cluster( '//2'), classifer ('%2')
        ###########
        state_proba = [[out_1[i],out_2[i]] for i in range(len(out_1))]
        return  torch.from_numpy(np.array(state_proba).flatten()).to(torch.float32).reshape(1, -1)


    def get_f1(self):
        classifier_weights = [clf.score(self.X_eval, self.y_eval) for clf in self.models]  # Weights based on validation accuracy
        combined_probabilities = np.average(
            [clf.predict_proba(self.X_eval) for clf in self.models],
            axis=0,
            weights=classifier_weights
        )

        # Get final predictions from combined probabilities
        combined_predictions = np.argmax(combined_probabilities, axis=1)


        # Calculate F1 scores per class (harmonic means)
        precision, recall, f1_per_class, _ = precision_recall_fscore_support(self.y_eval, combined_predictions, average=None)

        # Compute Macro-F1 as arithmetic mean of F1 scores
        macro_f1 = np.mean(f1_per_class)

        return macro_f1


    ######
    ######
    def get_subset(self, action):
        # choose subset
        subset = self.X_ul[self.cluster_label == action]
        return subset

    def co_training(self, subset, clf_idx):

        # Average class probabilities across classifiers
        avg_probabilities = np.mean([clf.predict_proba(subset) for clf in self.models], axis=0)

        # Assign the label with the highest average probability
        y_ul_action = np.argmax(avg_probabilities, axis=1)

        ###########################################
        ########### update the label_set for traning
        ###########################################
        X_updated = np.concatenate((X_l_train, subset), axis=0)
        y_updated = np.concatenate((y_l_train, y_ul_action), axis=0)

        self.models[clf_idx].fit(X_updated, y_updated)

        '''
        # ## get posodu label
        # clf_0_p_label = self.model_1.predict(subset)
        # clf_1_p_label = self.model_2.predict(subset)

        # ## get proba_
        # clf_0_p_y = self.model_1.predict_proba(subset)
        # clf_1_p_y = self.model_2.predict_proba(subset)

        # #get the label size
        # y_num = subset.shape[0]
        # # set empty y  #type=ndarray
        # y_ul_action = np.zeros(y_num,)

        # #############
        # # confidence_diff = 0
        # # combine the lable from two classifier, choose the most conffidence
        # for i in range(y_num):
        #     if max(clf_0_p_y[i, ]) > max(clf_1_p_y[i, ]):
        #         y_ul_action[i] = clf_0_p_label[i]
        #         # print('0')
        #     else:
        #         y_ul_action[i] = clf_1_p_label[i]
        #         # print('1')

        # print(f'X shape is {X_updated.shape} \ny shape is {y_updated.shape}')

        ############# use the updated labeld dateset retrain those 2 classifier
        # self.train_2_clf(X_updated, y_updated)

        '''
        # RETURN THE co-trained CLASSIFIER'S mean marcof1.
        marco_f1 = self.get_f1()
        return marco_f1

    def step(self, action, clf_idx):
        # GET THE bigining state accuracy, later use to calculate the reward
        pre_marco_f1 = self.get_f1()

        # choose subset
        choosen_subset =  self.get_subset(action)

        # cotraining the 2 classifier
        new_marco_f1 = self.co_training(choosen_subset, clf_idx)

        # get the next state_
        n_state = self.get_state()

        ##############
        # calculate the reward
        ##############
        # if new_marco_f1 > pre_marco_f1:
        #     reward_0 = new_marco_f1 - pre_marco_f1
        # else:
        #     reward_0 = 0

        #########################

        ########fix#############

        #########################
        reward_0 = new_marco_f1 - pre_marco_f1
        return n_state, reward_0, new_marco_f1


In [4]:
k = 20
env = Env(clfs, input_ul_data=X_ul, k=k, X_test=X_l_test, y_test=y_l_test, X_reset=X_l_train, y_reset=y_l_train)
state_0 = env.get_state(reset=True)


# state_0.shape
# env.action_size


reset
reset


<!-- #### DQN

<img src="https://yinyoupoet.github.io/images/%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0%E4%B9%8B%E6%B7%B1%E5%BA%A6Q%E7%BD%91%E7%BB%9CDQN%E8%AF%A6%E8%A7%A3/1_8coZ4g_pRtfyoHmsuzMH6g.png" alt="Description of the image" width="400" height="300">

##### loss


<img src="https://yinyoupoet.github.io/images/%E6%B7%B1%E5%BA%A6%E5%BC%BA%E5%8C%96%E5%AD%A6%E4%B9%A0%E4%B9%8B%E6%B7%B1%E5%BA%A6Q%E7%BD%91%E7%BB%9CDQN%E8%AF%A6%E8%A7%A3/1_YCgMUijhU4p_y3sctvu-kQ.png" alt="Description of the image" width="300" height="50">




 -->


In [5]:
## RL functions##
import random
from collections import deque

# Define the Q-network (a simple feedforward neural network)
class QNetwork(nn.Module):
    def __init__(self, state_size, action_size):
        super(QNetwork, self).__init__()
        self.fc1 = nn.Linear(state_size, 128)
        self.fc2 = nn.Linear(128, 128)
        self.fc3 = nn.Linear(128, action_size)

    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return self.fc3(x)


class ReplayBuffer:

    def __init__(self, capacity):
        self.memory = deque([], maxlen=capacity)

    def push(self, state, action, reward, next_state):
        # # bug - some times the state is not
        # if len(state) != 4:
        #     state = state[0]
        experience_tuple = (state, action, reward, next_state)
        # Append experience_tuple to the memory buffer
        self.memory.append(experience_tuple)

    def __len__(self):
        return len(self.memory)

    def sample(self, batch_size):
        # Draw a random sample of size batch_size
        batch = random.sample(self.memory, batch_size)
        # Transform batch into a tuple of lists
        states, actions, rewards, next_states = (zip(*batch))
        return states, actions, rewards, next_states



In [6]:
import math

#
EPS_START = 1
EPS_END = 0.05
EPS_DECAY = 1000
steps_done = 0

####
buffer_size = 10000
###
episodes = 500
max_step = 100
batch_size = 64
TAU = 0.005
gamma = 0.99

# hyper parameter
observation_size = env.observation_size
action_size = env.action_size
lr = 1e-4

###
# Initialize networks and optimizer
q_network = QNetwork(observation_size, action_size)
target_network = QNetwork(observation_size, action_size)
target_network.load_state_dict(q_network.state_dict())
optimizer = optim.Adam(q_network.parameters(), lr=lr)
loss_fn = nn.MSELoss()


# Replay memory
replay_buffer = ReplayBuffer(buffer_size)


def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done / EPS_DECAY)
    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return the largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.
            action = torch.argmax(q_network(state)).item()
            return action
    else:
        return np.random.choice(range(k))

In [7]:
# Function to update the Q-network
def train():
    if len(replay_buffer) < batch_size:
        return
        # prepare the training data
    states, actions, rewards, next_states = replay_buffer.sample(batch_size)

    rewards = torch.FloatTensor(rewards).unsqueeze(1)
    actions = torch.tensor(np.array(actions)).unsqueeze(1)
    states = torch.tensor(np.array(states)).squeeze(1)
    next_states = torch.tensor(np.array(next_states)).squeeze(1)

    # # Compute current Q values
    q_values = q_network(states).gather(1, actions)



        # Compute target Q values
    with torch.no_grad():
        next_q_values = target_network(next_states).max(1).values.unsqueeze(1)
    targets = rewards + (gamma * next_q_values)


        # Update the network
    loss = loss_fn(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    return loss



In [8]:
#training loop
results = []
for episode in range(episodes):
    # Change random seed for regularization purposes (randomness in training)
    seed = random.randint(0, 10000)
    np.random.seed(seed)
    random.seed(seed)
    
    state = env.get_state()
    total_reward = 0

    for t in range(max_step):
        # Agent chooses an action (which cluster and which classifier to train)
        action = select_action(state)
        action_cluster_idx = action // 2  # co trainig normally use 2 classifier,  fdin wich cluster todo
        action_clf_idx = action % 2  # find witch classifier to retrain with the new psoduolabel

        observation, reward, marco_f1 = env.step(action_cluster_idx, action_clf_idx)
# '''
#         done = terminated or truncated
#         if terminated:
#             next_state = None
#         else:
#             next_state = observation
# '''
        next_state = observation


        replay_buffer.push(state, action, reward, next_state)

        state = next_state
        total_reward += reward

        train()

        # Soft update of the target network's weights
        target_net_state_dict = q_network.state_dict()
        policy_net_state_dict = target_network.state_dict()
        for key in policy_net_state_dict:
            target_net_state_dict[key] = policy_net_state_dict[key]*TAU + target_net_state_dict[key]*(1-TAU)
        target_network.load_state_dict(target_net_state_dict)

        # if terminated or truncated:
        #     break

    results.append((episode, total_reward, marco_f1))
    print(f"Episode: {episode}, marco_f1: {marco_f1}, Total Reward: {total_reward}")

    # # for debug - train() - just get some replay memory.
    # if episode == 1:
    #     print('break')
    #     break

Episode: 0, marco_f1: 0.9019117205484937, Total Reward: 0.010051870182419265
Episode: 1, marco_f1: 0.901886580887506, Total Reward: -2.513966098771636e-05
Episode: 2, marco_f1: 0.8998702318204392, Total Reward: -0.002016349067066714
Episode: 3, marco_f1: 0.9038754225476218, Total Reward: 0.004005190727182528
Episode: 4, marco_f1: 0.9018582833611736, Total Reward: -0.0020171391864481913
Episode: 5, marco_f1: 0.897908117305575, Total Reward: -0.003950166055598614
Episode: 6, marco_f1: 0.9038754225476218, Total Reward: 0.005967305242046805
Episode: 7, marco_f1: 0.9038754225476218, Total Reward: 0.0
Episode: 8, marco_f1: 0.9019525450317953, Total Reward: -0.0019228775158264266
Episode: 9, marco_f1: 0.8998974950349158, Total Reward: -0.0020550499968795544
Episode: 10, marco_f1: 0.8999215384861732, Total Reward: 2.4043451257393933e-05
Episode: 11, marco_f1: 0.9058640677137787, Total Reward: 0.00594252922760552
Episode: 12, marco_f1: 0.8978197540461373, Total Reward: -0.008044313667641356
Epi

In [9]:

# Print model's state_dict
print("Model's state_dict:")
for param_tensor in q_network.state_dict():
    print(param_tensor, "\t", q_network.state_dict()[param_tensor].size())

# Print optimizer's state_dict
print("Optimizer's state_dict:")
for var_name in optimizer.state_dict():
    print(var_name, "\t", optimizer.state_dict()[var_name])

Model's state_dict:
fc1.weight 	 torch.Size([128, 40])
fc1.bias 	 torch.Size([128])
fc2.weight 	 torch.Size([128, 128])
fc2.bias 	 torch.Size([128])
fc3.weight 	 torch.Size([40, 128])
fc3.bias 	 torch.Size([40])
Optimizer's state_dict:
state 	 {0: {'step': tensor(49937.), 'exp_avg': tensor([[-1.3373e-06, -1.4110e-06, -1.2477e-06,  ..., -1.7410e-06,
         -1.7315e-06, -1.6749e-06],
        [ 5.6052e-45,  5.6052e-45,  5.6052e-45,  ...,  5.6052e-45,
          5.6052e-45,  5.6052e-45],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  ...,  0.0000e+00,
          0.0000e+00,  0.0000e+00],
        ...,
        [ 5.6052e-45,  5.6052e-45,  5.6052e-45,  ...,  5.6052e-45,
          5.6052e-45,  5.6052e-45],
        [ 5.6052e-45,  5.6052e-45,  5.6052e-45,  ...,  5.6052e-45,
          5.6052e-45,  5.6052e-45],
        [-9.2238e-07, -4.5835e-07, -9.5820e-07,  ..., -1.0556e-06,
         -1.1746e-06, -1.1986e-06]]), 'exp_avg_sq': tensor([[2.5861e-10, 2.0127e-10, 2.4081e-10,  ..., 3.6000e-10, 4.0082

In [10]:
#SAVING THE NETWORK.
import os

cwd = os.getcwd()
# Assume `model` is your neural network
torch.save(q_network.state_dict(), cwd +'qnetwork.pth')
# Assume `model` is your neural network
torch.save(target_network.state_dict(), cwd+'target_network.pth')

In [11]:
import pickle


# Save the deque to a file using pickle
with open('results.pkl', 'wb') as file:
    pickle.dump(results, file)

In [12]:
results

[(0, 0.010051870182419265, 0.9019117205484937),
 (1, -2.513966098771636e-05, 0.901886580887506),
 (2, -0.002016349067066714, 0.8998702318204392),
 (3, 0.004005190727182528, 0.9038754225476218),
 (4, -0.0020171391864481913, 0.9018582833611736),
 (5, -0.003950166055598614, 0.897908117305575),
 (6, 0.005967305242046805, 0.9038754225476218),
 (7, 0.0, 0.9038754225476218),
 (8, -0.0019228775158264266, 0.9019525450317953),
 (9, -0.0020550499968795544, 0.8998974950349158),
 (10, 2.4043451257393933e-05, 0.8999215384861732),
 (11, 0.00594252922760552, 0.9058640677137787),
 (12, -0.008044313667641356, 0.8978197540461373),
 (13, -0.004085418206538294, 0.893734335839599),
 (14, -0.002027159635142217, 0.8917071762044568),
 (15, 0.014156891509321867, 0.9058640677137787),
 (16, -0.01812271162936019, 0.8877413560844185),
 (17, 0.01812271162936019, 0.9058640677137787),
 (18, -0.008044313667641356, 0.8978197540461373),
 (19, 6.219748983826712e-05, 0.8978819515359756),
 (20, 0.0059934710116461565, 0.9038