In [60]:
import gym
import math
import random
import numpy as np
from collections import namedtuple, deque
from estimator import cost_estimate
from env import env
import torch
import torch.nn as nn
import torch.optim as optim
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
def setup_seed(seed):
  random.seed(seed)
  torch.manual_seed(seed)
  np.random.seed(seed)
setup_seed(0)

In [61]:
Transition = namedtuple('Transition',
                        ('state', 'action', 'next_state', 'reward'))


class ReplayMemory(object):

    def __init__(self, capacity):
        self.memory = deque([],maxlen=capacity)

    def push(self, *args):
        """Save a transition"""
        self.memory.append(Transition(*args))
    def push_numpy(self, args):
        # tensor_args = [torch.from_numpy(args[0])]
        # tensor_args.append(args[1])
        # tensor_args+=[torch.from_numpy(arg) for arg in args[2:]]
        # print('tensor_args len:')
        # print(len(tensor_args))
        self.push(torch.from_numpy(args[0]),args[1],torch.from_numpy(args[2]),torch.from_numpy(args[3]))

    def sample(self, batch_size):
        return random.sample(self.memory, batch_size)

    def __len__(self):
        return len(self.memory)

In [62]:
n_states=67
n_actions=67
class DQN(nn.Module):
    def __init__(self):
        super(DQN, self).__init__()
        # self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            # nn.Linear(np.array(env.observation_space.shape).prod(), 128, bias=False),
            nn.Linear(n_states, 128, bias=False),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, n_actions),
        )
    def forward(self, x):
        x = x.float()
        logits = self.linear_relu_stack(x)
        return logits

In [63]:
BATCH_SIZE = 32
GAMMA = 0.99
EPS_START = 0.9
EPS_END = 0.05
EPS_DECAY = 0.997
# EPS_DECAY = 200
TARGET_UPDATE = 10
learning_rate=5e-4
# np.random.seed(0)


policy_net=DQN().float().to(device)
target_net=DQN().float().to(device)
# policy_net=DQN().to(device)
# target_net=DQN().to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.Adam(policy_net.parameters(),learning_rate)
memory = ReplayMemory(10000)

mask_fact_table=[True]*67
mask_fact_table[64]=False # not replicate lineorder
action_mask=torch.from_numpy(np.array(mask_fact_table)).to(device=device) 

steps_done = 0
def select_action(state):
    global steps_done
    sample = random.random()
    eps_threshold = EPS_END + (EPS_START - EPS_END) * \
        math.exp(-1. * steps_done * EPS_DECAY)
        # math.exp(-1. * steps_done / EPS_DECAY)

    steps_done += 1
    if sample > eps_threshold:
        with torch.no_grad():
            # t.max(1) will return largest column value of each row.
            # second column on max result is index of where max element was
            # found, so we pick action with the larger expected reward.

            output=policy_net(torch.FloatTensor(state).unsqueeze(0)) # mask fact table replication
            max_idx=torch.masked_select(output.squeeze(0), action_mask).max(0)[1]
            kept_idx=action_mask.nonzero()
            return kept_idx[max_idx].view(1,1)
            # return policy_net(torch.FloatTensor(state).unsqueeze(0)).max(1)[1].view(1, 1)
    else:
        # output=torch.tensor([[random.randrange(2)]], device=device, dtype=torch.long)
        # return  torch.masked_select(output.squeeze(0), action_mask)
        return torch.tensor([[random.randrange(2)]], device=device, dtype=torch.long)


In [64]:
(action_mask.nonzero()).shape

torch.Size([66, 1])

In [65]:
# def select_action_test(state):
    
#     mask_fact_table=[True]*67
#     mask_fact_table[64]=False # not replicate lineorder
#     action_mask=torch.from_numpy(np.array(mask_fact_table)).to(device=device) 
#     with torch.no_grad():
#         # t.max(1) will return largest column value of each row.
#         # second column on max result is index of where max element was
#         # found, so we pick action with the larger expected reward.

#         output=policy_net(torch.FloatTensor(state).unsqueeze(0))
#         max_idx=torch.masked_select(output.squeeze(0), action_mask).max(0)[1]
#         mapped=action_mask.nonzero()
#         return mapped[max_idx].view(1,1)
#         # return torch.masked_select(output.squeeze(0), action_mask)
#         # return torch.masked_select(output.squeeze(0), action_mask).max(0)[1].view(1,1)
#         # output=policy_net(torch.FloatTensor(state).unsqueeze(0))
#         # return output
# db_env=env()
# action = select_action(db_env.get_current_state())

In [66]:
def optimize_model():
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    # Transpose the batch (see https://stackoverflow.com/a/19343/3343043 for
    # detailed explanation). This converts batch-array of Transitions
    # to Transition of batch-arrays.
    batch = Transition(*zip(*transitions))
    state_batch = torch.stack(batch.state).to(device=device)
    action_batch = torch.stack(batch.action).to(device=device)
    reward_batch = torch.stack(batch.reward).to(device=device)
    next_batch=torch.stack(batch.next_state).to(device=device)
    # Compute a mask of non-final states and concatenate the batch elements
    # (a final state would've been the one after which simulation ended)
    # non_final_mask = torch.tensor(tuple(map(lambda s: s is not None,
    #                                       batch.next_state)), device=device, dtype=torch.bool)
    # non_final_next_states = torch.cat([s for s in batch.next_state
    #                                             if s is not None])


    # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
    # columns of actions taken. These are the actions which would've been taken
    # for each batch state according to policy_net
    output = policy_net(state_batch)
    # try:
    #     output = policy_net(state_batch)
    # except:
    #     print(type(state_batch))
    action_values=output[torch.arange(output.size(0)), action_batch]
    # action_values = output.gather(1, action_batch)
    state_action_values=action_values
    # state_action_values = policy_net(state_batch).gather(1, action_batch)

    # try:
    #     state_action_values = policy_net(state_batch).gather(1, action_batch)
    # except:
    #     print(state_batch.shape,action_batch.shape)
    #     exit()

    # Compute V(s_{t+1}) for all next states.
    # Expected values of actions for non_final_next_states are computed based
    # on the "older" target_net; selecting their best reward with max(1)[0].
    # This is merged based on the mask, such that we'll have either the expected
    # state value or 0 in case the state was final.
    next_state_values= target_net(next_batch).max(1)[0].detach()

    # next_state_values = torch.zeros(BATCH_SIZE, device=device)
    # try:
    #     next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # except:
    #     # print(batch.next_state.shape)
    #     print(len(batch.next_state))
    #     exit()
    # next_state_values[non_final_mask] = target_net(non_final_next_states).max(1)[0].detach()
    # Compute the expected Q values
    expected_state_action_values = (next_state_values * GAMMA) + reward_batch

    # Compute Huber loss
    criterion = nn.SmoothL1Loss()
    loss = criterion(state_action_values, expected_state_action_values.unsqueeze(1))

    # Optimize the model
    optimizer.zero_grad()
    loss.backward()
    for param in policy_net.parameters():
        param.grad.data.clamp_(-1, 1)
    optimizer.step()

In [67]:
num_episodes = 100 #600
t_max=20
# TARGET_UPDATE = 10e-3
TARGET_UPDATE = 10
from env import env

db_env=env()

for i_episode in range(num_episodes):
    # Initialize the environment and state
    db_env.reset()
    current_state =db_env.get_current_state()
    for t in range(t_max):
        # Select and perform an action
        # TODO reduce actions when generating or stepping
        action = select_action(current_state)

        # try:
        #     next_state = db_env.step(action.item())
        # except IndexError as e:
        #     print(action.item())
        next_state = db_env.step(action.item())
        
        reward=cost_estimate(db_env)

        # Store the transition in memory
        memory.push_numpy((current_state, action, next_state, reward))

        # # Move to the next state
        current_state = next_state

        # Perform one step of the optimization (on the policy network)
        optimize_model()
    # # Update the target network, copying all weights and biases in DQN
    if i_episode % TARGET_UPDATE == 0:
        target_net.load_state_dict(policy_net.state_dict())
        print('episode batch {} completed'.format(i_episode//TARGET_UPDATE))

    # params=(1-TARGET_UPDATE)*np.array(list(target_net.state_dict().values()))+TARGET_UPDATE*np.array(list(policy_net.state_dict().values()))
    # tmp_dict=target_net.state_dict()
    # dict_keys=list(tmp_dict.keys())
    # for idx,key in enumerate(dict_keys):
    #     tmp_dict[key]=params[idx]
    # target_net.load_state_dict(tmp_dict)
print('Complete')

episode batch 0 completed
episode batch 1 completed
episode batch 2 completed
episode batch 3 completed
episode batch 4 completed
episode batch 5 completed
episode batch 6 completed
episode batch 7 completed
episode batch 8 completed
episode batch 9 completed
Complete


In [68]:
def inference(q_network):
    reward_cache=[]
    ps_cache=[]
    replicate_tbl_cache=[]
    db_env.reset()
    current_state =db_env.get_current_state()
    for iter in range(t_max):
        # action= q_network(torch.FloatTensor(current_state).unsqueeze(0)).max(1)[1].view(1, 1)
        output=policy_net(torch.FloatTensor(current_state).unsqueeze(0)) # mask fact table replication
        max_idx=torch.masked_select(output.squeeze(0), action_mask).max(0)[1]
        kept_idx=action_mask.nonzero()
        action= kept_idx[max_idx].view(1,1)  
        
        next_state = db_env.step(action.item())
        reward=cost_estimate(db_env)     
        current_state=next_state
        reward_cache.append(reward)
        ps_cache.append(db_env.partition_scheme)
        replicate_tbl_cache.append(db_env.replicated_table)
    max_idx=np.argmax(np.array(reward_cache))
    return ps_cache[max_idx],replicate_tbl_cache[max_idx]

In [69]:
ps,replicate=inference(policy_net)

In [70]:
def display(scheme):
    ret_dict={}
    for table,values in scheme.items():
        for attr,val in values.items():
            if val==1:
                ret_dict[table]=attr
                break
    print(ret_dict)

In [71]:
# print(ps)
print(replicate)

{'customer': 0, 'dim_date': 0, 'lineorder': 0, 'part': 1, 'supplier': 1}


In [72]:
display(ps)

{'customer': 'c_city', 'dim_date': 'd_datekey', 'lineorder': 'lo_orderdate'}


In [73]:
# db_env.db_config