In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import datetime
import random 
import copy 
import gc
import sys
sys.path.append("rl_method")
import env
import agent
import preprocess

import load_data
import utils

from torch.utils.data import Dataset, DataLoader
from tensorboardX import SummaryWriter
from sklearn.model_selection import train_test_split

In [3]:
model_path = '/home/willer/Desktop/Development/Python/MyRepo/npu-deeplearning-bci/model/PretrainNet_T1.pkl'
enet = preprocess.EncodeNet_T()
pnet = preprocess.PretrainNet_T()
pnet.load_state_dict(torch.load(model_path))

enet_dict = enet.state_dict()
for (name, param) in enet_dict.items():
    enet_dict[name] = copy.deepcopy(pnet.state_dict()[name])
enet.load_state_dict(enet_dict)
enet.eval()

n_classes = 2
ndata, nlabel = load_data.get_grazdata()

In [4]:
nlabel = nlabel.reshape(-1, 1)
train_loader, test_loader = load_data.boost_dataloader(ndata, nlabel, batch_size=512)

In [5]:
enet.to(torch.device('cuda'))
ndata = None
nlabel = None
with torch.no_grad():
    for input, label in train_loader:
        output = enet(input).cpu().numpy()
        label = label.cpu().numpy().reshape(-1)
        vec_label = np.eye(n_classes)[label]
        
        if str(type(ndata)) == "<class 'NoneType'>":
            ndata  = output
            nlabel = vec_label
        else:
            ndata  = np.concatenate([ndata, output], 0)
            nlabel = np.concatenate([nlabel, vec_label], 0)

    for input, label in test_loader:
        output = enet(input).cpu().numpy()
        label = label.cpu().numpy().reshape(-1)
        vec_label = np.eye(n_classes)[label]

        ndata  = np.concatenate([ndata, output], 0)
        nlabel = np.concatenate([nlabel, vec_label], 0)

print(ndata.shape, nlabel.shape)

(48416, 10, 128) (48416, 2)


In [6]:
np.save('rl_method/encode_data/encode_data_tem1.npy', ndata)
np.save('rl_method/encode_data/encode_label_tem1.npy', nlabel)

In [None]:
ndata = np.load('rl_method/encode_data/encode_data_tem1.npy')
nlabel = np.load('rl_method/encode_data/encode_label_tem1.npy')

In [184]:

def get_reward_net(input_size, model_path):

    rnet = RewardNet(input_size)
    pnet = PretrainNet_T()
    pnet.load_state_dict(torch.load(model_path))

    rnet_dict = rnet.state_dict()
    for (name, param)  in rnet_dict.items():
        rnet_dict[name] = copy.deepcopy(pnet.state_dict()[name])
    rnet.load_state_dict(rnet_dict)
    rnet.eval()
    return rnet

class PretrainNet_T(nn.Module):

    def __init__(
        self,
        in_channel=3,
        sequence_lens=1000,
        time_lens=10,
        hidden_size=64,
        output_size=2,
        layer_size=1,
        bidirectional=True
    ):
        super(PretrainNet_T, self).__init__()

        if sequence_lens % time_lens != 0:
            raise ValueError("Invalid time lens")

        self.in_channel  = in_channel
        self.time_lens   = time_lens
        self.hidden_size = hidden_size
        self.layer_size  = layer_size
        self.window_size = sequence_lens // time_lens
        self.device      = torch.device('cuda')

        self.subconv    = SubConvNet(in_channel=in_channel, out_channel=4)
        self.input_size = self._adaptive_feature_size()

        self.lstm = nn.LSTM(self.input_size, hidden_size, layer_size, bidirectional=bidirectional)
        if bidirectional:
            self.layer_size *= 2

        self.fn1  = nn.Linear(hidden_size * self.layer_size, 128)
        self.fn2  = nn.Linear(128, output_size)

    def forward(self, x):

        batch_size = x.shape[0]

        x = x.chunk(self.time_lens, 2)
        x = torch.stack(x, 1)
        x = x.reshape(batch_size * self.time_lens, self.in_channel, self.window_size)

        x = self.subconv(x)
        x = x.view(batch_size, self.time_lens, self.input_size)
        x = x.permute(1, 0, 2)

        h_0 = torch.zeros(self.layer_size, batch_size, self.hidden_size).to(self.device)
        c_0 = torch.zeros(self.layer_size, batch_size, self.hidden_size).to(self.device)
        x, (h_final, c_final) = self.lstm(x, (h_0, c_0))
        # seq, batch, feature
        x = x.permute(1, 2, 0)

        x = F.avg_pool1d(x, self.time_lens)
        x = x.view(batch_size, -1)
        x = F.relu(self.fn1(x), inplace=True)
        x = F.softmax(self.fn2(x), dim=-1)
        return x

    def _adaptive_feature_size(self):
        x = torch.zeros(1, self.in_channel, self.window_size)
        return self.subconv(x).view(-1).shape[0]


In [189]:
import preprocess
def calculate_reward(vector_a, vector_b):
    return -np.sum(np.abs(vector_a - vector_b))

def cal_cosine_similarity(vector_a, vector_b):
    inner = np.dot(vector_a, vector_b.transpose())
    norm = np.linalg.norm(vector_a) * np.linalg.norm(vector_b)
    return inner / norm

class FeatureManager:
    def __init__(self, data):
        self.data = data
        self.size = data.shape[-1]
        self.index = list(range(data.shape[0]))

    def drop(self, action):
        self.index.remove(self.index[action])

    def state(self):
        new_state = self.data[self.index]
        new_size  = new_state.shape[0]
        new_index = list(range(new_size))

        avg_state = torch.mean(new_state, 0)
        ret_state = []
        for i in range(new_size):
            remaining = new_state[new_index[:i] + new_index[i+1:]]
            mean_state, var_state = self.mean_var_state(remaining)
            ret_state.append([new_state[i], mean_state, var_state])
        return avg_state, ret_state

    def mean_var_state(self, remaining_feature):
        shape = remaining_feature.shape[1]
        size  = remaining_feature.shape[0]
        mean_state = torch.mean(remaining_feature, 0)
        var_state  = torch.mean(torch.pow(remaining_feature - mean_state, 2), 0)
        return mean_state, var_state


class DropEnv:
    def __init__(self, tdata, vdata, tlabel, vlabel, drop_reward, reward_model_path):

        self.drop_reward = drop_reward
        self.random_stop = 0.8
        self.training = True
        
        self.tdata  = torch.from_numpy(tdata)
        self.vdata  = torch.from_numpy(vdata)
        self.tlabel = tlabel
        self.vlabel = vlabel
        
        self.tdata_size = self.tdata.shape[0]
        self.vdata_size = self.vdata.shape[0]
        self.channel_size = self.tdata.shape[1]
        
        self.reward_module = get_reward_net(tdata[0].shape[-1], reward_model_path)

    def step(self, action):
        self.manager.drop(action)
        avg_state, state = self.manager.state()
        cls_vector = self.reward_module(avg_state).numpy()
        self.new_sim = calculate_reward(cls_vector, self.current_label)
        if self.training:
            reward = self.new_sim - self.old_sim + self.drop_reward
        else:
            reward = self.new_sim - self.old_sim
        self.old_sim = self.new_sim

        done = False
        if reward < 0 and (self.training == True and random.random() < self.random_stop \
                           or self.training == False):
            done = True
        
        if self.training:
            return state, reward, done
        else:
            res = np.argmax(cls_vector)
            return state, float(res==self.current_num), done

    def train(self):
        self.training = True

    def eval(self):
        self.training = False

    def reset(self):
        if self.training:
            index = random.randint(0, self.tdata_size-1)
            single_data = self.tdata[index]
            self.current_label = self.tlabel[index]
        else:
            index = random.randint(0, self.vdata_size-1)
            single_data = self.vdata[index]
            self.current_label = self.vlabel[index]
            
        self.current_num = np.argmax(self.current_label)
        self.manager = FeatureManager(single_data)
            
        random_drop_num = 0
        
        if self.training:
            random_drop_num = random.randint(1, self.channel_size//4)
            random_drop_idx = random.sample(range(0, self.channel_size - random_drop_num - 1), random_drop_num)
            for idx in random_drop_idx:
                self.manager.drop(idx)

        init_avg_state, init_state = self.manager.state()
        init_cls_vector = self.reward_module(init_avg_state).numpy()
        init_cls_num = np.argmax(init_cls_vector)
        self.old_sim = calculate_reward(init_cls_vector, self.current_label)
        
        gc.collect()
        
        if self.training:
            return init_state, random_drop_num
        else:
            return init_state, float(init_cls_num==self.current_num), random_drop_num

In [194]:
def reward_shape(origin_reward, discount):
    length = len(origin_reward)
    new_reward = np.zeros_like(origin_reward, dtype=np.float32)
    for i in reversed(range(length)):
        new_reward[i] = origin_reward[i] + (discount * new_reward[i+1] if i+1 < length else 0)
    return new_reward

class Q_Net(nn.Module):

    def __init__(self, v_dim, h_dim=64):
        super(Q_Net,self).__init__()

        self.v_dim = v_dim
        self.h_dim = h_dim
        self.pre_feature = nn.Sequential(
            nn.Linear(v_dim, h_dim),
            nn.Tanh(),
        )
        self.first_order_feature= nn.Sequential(
            nn.Linear(v_dim, h_dim),
            nn.Tanh(),
        )
        self.second_order_feature = nn.Sequential(
            nn.Linear(v_dim, h_dim),
            nn.Tanh(),
        )
        self.fc1 = nn.Linear(h_dim*3, h_dim)
        self.fc2 = nn.Linear(h_dim  , 1)

    def forward(self, feature, mean_feature=None, var_feature=None):

        if mean_feature == None:
            feature, mean_feature, var_feature = feature.chunk(3, -1)
        f_pre = self.pre_feature(feature)
        f_fst = self.first_order_feature(mean_feature - feature)
        f_scd = self.second_order_feature(var_feature)

        f_merge = torch.cat([f_pre, f_fst, f_scd], -1)
        q = F.relu(self.fc1(f_merge))
        q = self.fc2(q)
        return q
    
class ReplayBuffer:
    def __init__(self, size):
    
        self._storage  = []
        self._maxsize  = size
        self._next_idx = 0

    def __len__(self):
        return len(self._storage)

    def add(self, obs_t, reward, obs_tp1, done):
        data = (obs_t, reward, obs_tp1, done)

        if self._next_idx >= len(self._storage):
            self._storage.append(data)
        else:
            self._storage[self._next_idx] = data
        self._next_idx = (self._next_idx + 1) % self._maxsize

    def _encode_sample(self, idxes):
        obses_t, rewards, obses_tp1, dones = [], [], [], []
        for i in idxes:
            data = self._storage[i]
            obs_t, reward, obs_tp1, done = data
            obses_t.append(obs_t)
            rewards.append(reward)
            obses_tp1.append(obs_tp1,)
            dones.append(done)
        return obses_t, rewards, obses_tp1, dones
        
    def sample(self, batch_size):
        idxes = [random.randint(0, len(self._storage) - 1) for _ in range(batch_size)]
        return self._encode_sample(idxes)

    
    
class ADAgent:
    def __init__(
        self,
        data,
        label,
        reward_model_path,
        gamma=0.98,
        epsilon=0.3,
        max_drop=9,
        buffer_size=2000,
        batch_size=512,
        train_epoch=1000,
        drop_reward=3e-3,
    ):

        self.gamma = gamma
        self.epsilon = epsilon
        self.max_drop = max_drop
        self.train_epoch = train_epoch
        self.batch_size = batch_size
        
        tdata, vdata, tlabel, vlabel = train_test_split(data, label, test_size=0.2)
        self.env = DropEnv(tdata, vdata, tlabel, vlabel, drop_reward, reward_model_path)
        
        self.device = torch.device('cuda')
        self.eval_net = Q_Net(data.shape[-1])
        self.eval_net_gpu = Q_Net(data.shape[-1]).to(self.device)
        
        self.val_net = get_reward_net(data[0].shape[-1], reward_model_path)
        self.memory = ReplayBuffer(buffer_size)
        self.writer = SummaryWriter("adrl-runs/ADAgent_" + str(datetime.datetime.now()))

        self.interaction_counter = 0
        self.validation_counter = 0
        self.learn_step_counter = 0
        self.step_update = 10
        self.training = True
        
        self.optimizer = torch.optim.Adam(self.eval_net.parameters(), lr=1e-3, weight_decay=1e-5)
        self.scheduler = optim.lr_scheduler.StepLR(self.optimizer, step_size=15, gamma=0.98)

    def select_action(self, state):
        action = -1
        q_max = None
        action_dim = len(state) - 1
        if random.random() < self.epsilon or not self.training:
            for index, (fea, mean, var) in enumerate(state):
                q = self.eval_net(fea, mean, var)
                if action == -1:
                    action = index
                    q_max = q
                elif q > q_max:
                    action = index
                    q_max = q
        else:
            action = random.randint(0, action_dim)
            q_max  = self.eval_net(*state[action])
        return torch.cat(state[action], 0), action, q_max
    
    def test(self, test_size=25):
        self.validation_counter += 1
        total_init_sim = 0
        total_drop_sim = 0
        total_drop_lens = 0
        
        i = 0
        self.training = False
        for _ in range(test_size):
            state, init_sim, drop_size = self.env.reset()
            for i in range(self.max_drop - drop_size):
                state_tp1, action, q_pred = self.select_action(state)
                new_state, result, done = self.env.step(action)
                if done:
                    break
                state = new_state
                
            total_init_sim  += init_sim
            total_drop_sim  += result
            total_drop_lens += i
            
        self.training = True 
        self.writer.add_scalar('accuracy/raw', total_init_sim / test_size, self.validation_counter)
        self.writer.add_scalar('accuracy/drop', total_drop_sim / test_size, self.validation_counter)
        self.writer.add_scalar('accuracy/radio(drop_to_raw)', total_drop_sim / total_init_sim, self.validation_counter)
        self.writer.add_scalar('drop_lens/val', total_drop_lens / test_size, self.validation_counter)
            
    def train(self):
        for _ in range(self.train_epoch):
            self.interaction_counter += 1
            state, drop_size = self.env.reset()

            i = 0
            for i in range(self.max_drop - drop_size):
                state_tp1, action, q_pred = self.select_action(state)
                new_state, reward, done = self.env.step(action)

                if done:
                    break
                if i != 0:
                    self.memory.add(state_t, reward, state_tp1, done)
                state = new_state
                state_t = state_tp1
                
            self.writer.add_scalar('drop_lens/train', i + drop_size, self.interaction_counter)
            
            if len(self.memory) >= self.batch_size:
                self.learn()
            if self.interaction_counter % 100 == 0:
                self.env.eval()
                self.test()
                self.env.train()

    def learn(self):

        self.learn_step_counter += 1

        self.epsilon *= 1.005
        self.epsilon = min(self.epsilon, 0.98)
        
        batch_state, batch_reward, batch_next_state, _ = self.memory.sample(self.batch_size)
        batch_state  = torch.stack(batch_state, 0).to(self.device)
        batch_reward = torch.FloatTensor(batch_reward).view(-1, 1).to(self.device)
        batch_next_state = torch.stack(batch_next_state, 0).to(self.device)
        
        q_eval = self.eval_net_gpu(batch_state)
        q_next = self.eval_net_gpu(batch_next_state)
        q_target = batch_reward + self.gamma * q_next

        loss = F.mse_loss(q_eval, q_target)
        
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()
        self.scheduler.step()

        self.eval_net.load_state_dict(self.eval_net_gpu.state_dict())
        self.writer.add_scalar('loss', loss, self.learn_step_counter)


    def save(self, filename):
        torch.save(self.eval_net.state_dict(), filename + "_Q_net" + str(datetime.datetime.now()))
        torch.save(self.optimizer.state_dict(), filename + "_optimizer_" + str(datetime.datetime.now()))

    def load(self, filename):
        self.eval_net.load_state_dict(torch.load(filename + "_Q_net"))
        self.optimizer.load_state_dict(torch.load(filename + "_optimizer"))

In [197]:
ndata = ndata[:1000]
nlabel = nlabel[:1000]

In [None]:
model_path = '/home/willer/Desktop/Development/Python/MyRepo/npu-deeplearning-bci/model/PretrainNet_T1.pkl'
agent = ADAgent(ndata, nlabel, model_path, train_epoch=8000)
agent.train()

In [77]:
def map_to_origin(action):
    res = [0 for i in range(10)]
    map_move = 0
    for act in action:
        res[act + map_move] += 1
        map_move += 1
    return res

In [79]:
map_to_origin([1,2,1,2])

[0, 1, 0, 2, 0, 1, 0, 0, 0, 0]

In [164]:
random.randint(0, 0)

0

In [173]:
a = np.array([0, 1])
b = np.array([0.1, 0.9])

In [188]:
np.argmax(b)

1