In [1]:
import sys 
sys.path.append('../src')

In [6]:
"""
Main.py file for GAIL implementation on dialog datasets.

Uses command line arguments to maximize flexibility, and run many options in parallel

"""

import sys 
sys.path.append('../src')
import os
import pickle
import argparse
import numpy as np
from collections import deque

import torch
import torch.optim as optim
from tensorboardX import SummaryWriter 

from models.actor import Actor
from models.critic import Critic
from models.discriminator import Discriminator
from GAIL import *

from dialog_environment import DialogEnvironment

device='cuda' # for now


parser = argparse.ArgumentParser(description='Limitation Learning')

parser.add_argument('--load_model', 
                    type=str, default=None, 
                    help='path to load the saved model')

parser.add_argument('--gamma', 
                    type=float, default=0.99, 
                    help='discounted factor (default: 0.99)')

parser.add_argument('--lamda', 
                    type=float, default=0.98, 
                    help='GAE hyper-parameter (default: 0.98)')


parser.add_argument('--learning_rate', 
                    type=float, default=3e-4, 
                    help='learning rate of models (default: 3e-4)')

parser.add_argument('--l2_rate', 
                    type=float, default=1e-3, 
                    help='l2 regularizer coefficient (default: 1e-3)')

parser.add_argument('--clip_param', 
                    type=float, default=0.2, 
                    help='clipping parameter for PPO (default: 0.2)')

parser.add_argument('--discrim_update_num', 
                    type=int, default=2, 
                    help='update number of discriminator (default: 2)')

parser.add_argument('--actor_critic_update_num', 
                    type=int, default=10, 
                    help='update number of actor-critic (default: 10)')

parser.add_argument('--total_sample_size', 
                    type=int, default=2048, 
                    help='total sample size to collect before PPO update (default: 2048)')

parser.add_argument('--batch_size', 
                    type=int, default=128, 
                    help='batch size to update (default: 128)')

parser.add_argument('--suspend_accu_exp', 
                    type=float, default=None,
                    help='accuracy for suspending discriminator about expert data (default: None)')

parser.add_argument('--suspend_accu_gen', 
                    type=float, default=None,
                    help='accuracy for suspending discriminator about generated data (default: None)')

parser.add_argument('--max_iter_num', 
                    type=int, default=4096,
                    help='maximal number of main iterations (default: 4000)')

parser.add_argument('--seed', 
                    type=int, default=42,
                    help='random seed (default: 500)')

parser.add_argument('--logdir', 
                    type=str, default='logs/EXPERIMENTNAME',
                    help='tensorboardx logs directory (default: logs/EXPERIMENTNAME)')

parser.add_argument('--hidden_size', 
                    type=int, default=128,
                    help='New sequence length of the representation produced by the encoder/decoder RNNs. (default: 1024)')
parser.add_argument('--num_layers', 
                    type=int, default=2,
                    help='Number of layers in the respective RNNs (default: 2)')

parser.add_argument('--seq_len', 
                    type=int, default=10,
                    help='length of input and response sequences (default: 60, which is also max)')
parser.add_argument('--input_size', 
                    type=int, default=300,
                    help='DO NOT CHANGE UNLESS NEW EMBEDDINGS ARE MADE. Dimensionality of embeddings (default: 300)')




def main():
    env = DialogEnvironment()

    torch.manual_seed(args.seed)

    #TODO
    actor = Actor(hidden_size=args.hidden_size,num_layers=args.num_layers,device='cuda',input_size=args.input_size,output_size=args.input_size)
    critic = Critic(hidden_size=args.hidden_size,num_layers=args.num_layers,input_size=args.input_size,seq_len=args.seq_len)
    discrim = Discriminator(hidden_size=args.hidden_size,num_layers=args.hidden_size,input_size=args.input_size,seq_len=args.seq_len)
    
    actor.to(device), critic.to(device), discrim.to(device)
    
    actor_optim = optim.Adam(actor.parameters(), lr=args.learning_rate)
    critic_optim = optim.Adam(critic.parameters(), lr=args.learning_rate, 
                              weight_decay=args.l2_rate) 
    discrim_optim = optim.Adam(discrim.parameters(), lr=args.learning_rate)

    # load demonstrations

    writer = SummaryWriter(args.logdir)

    if args.load_model is not None: #TODO
        saved_ckpt_path = os.path.join(os.getcwd(), 'save_model', str(args.load_model))
        ckpt = torch.load(saved_ckpt_path)

        actor.load_state_dict(ckpt['actor'])
        critic.load_state_dict(ckpt['critic'])
        discrim.load_state_dict(ckpt['discrim'])


    
    episodes = 0
    train_discrim_flag = True

    for iter in range(args.max_iter_num):
        actor.eval(), critic.eval()
        memory = deque()

        steps = 0
        scores = []
        similarity_scores = []
        while steps < args.total_sample_size: 
            state, expert_action, raw_state, raw_expert_action = env.reset()
            score = 0
            similarity_score = 0
            state = state[:args.seq_len,:]
            expert_action = expert_action[:args.seq_len,:]
            state = state.to(device)
            expert_action = expert_action.to(device)
            for _ in range(10000): 

                steps += 1

                mu, std = actor(state.resize(1,args.seq_len,args.input_size)) #TODO: gotta be a better way to resize. 
                action = get_action(mu.cpu(), std.cpu())[0]
                done= env.step(action)
                irl_reward = get_reward(discrim, state, action, args)
                if done:
                    mask = 0
                else:
                    mask = 1


                memory.append([state, torch.from_numpy(action).to(device), irl_reward, mask,expert_action])
                score += irl_reward
           #     similarity_score += get_cosine_sim(action,expert_action)
                if done:
                    break

            episodes += 1
            scores.append(score)
           # similarity_scores.append(similarity_score)

        score_avg = np.mean(scores)
      #  similarity_score_avg = np.mean(similarity_scores)
        print('{}:: {} episode score is {:.2f}'.format(iter, episodes, score_avg))
       # print('{}:: {} episode similarity score is {:.2f}'.format(iter, episodes, similarity_score_avg))

        actor.train(), critic.train(), discrim.train()
        if train_discrim_flag:
            expert_acc, learner_acc = train_discrim(discrim, memory, discrim_optim, args) 
            print("Expert: %.2f%% | Learner: %.2f%%" % (expert_acc * 100, learner_acc * 100))
            writer.add_scalar('log/expert_acc', float(expert_acc), iter) #logg
            writer.add_scalar('log/learner_acc', float(learner_acc), iter) #logg
            writer.add_scalar('log/avg_acc', float(learner_acc + expert_acc)/2, iter) #logg
            if args.suspend_accu_exp is not None: #only if not None do we check.
                if expert_acc > args.suspend_accu_exp and learner_acc > args.suspend_accu_gen:
                    train_discrim_flag = False
                    
        train_actor_critic(actor, critic, memory, actor_optim, critic_optim, args)
        writer.add_scalar('log/score', float(score_avg), iter)
     #   writer.add_scalar('log/similarity_score', float(similarity_score_avg), iter)
        writer.add_text('log/raw_state', raw_state[0],iter)
        raw_action = get_raw_action(action) #TODO
        writer.add_text('log/raw_action', raw_action,iter)
        writer.add_text('log/raw_expert_action', raw_expert_action,iter)

        if iter % 100:
            score_avg = int(score_avg)

            print(raw_action)
            model_path = os.path.join(os.getcwd(),'save_model')
            if not os.path.isdir(model_path):
                os.makedirs(model_path)

            ckpt_path = os.path.join(model_path, 'ckpt_'+ str(score_avg)+'.pth.tar')

            save_checkpoint({
                'actor': actor.state_dict(),
                'critic': critic.state_dict(),
                'discrim': discrim.state_dict(),
                'args': args,
                'score': score_avg,
            }, filename=ckpt_path)




In [9]:

from argparse import Namespace
args = Namespace(load_model=None,
                render=False,
                gamma=.99,
                lamda=.98,
                learning_rate=1e-4,
                l2_rate=1e-3,
                clip_param=.2,
                discrim_update_num=2,
                actor_critic_update_num=10,
                total_sample_size=100,
                batch_size=100,
                suspend_accu_exp=None,# won't stop
                suspend_accu_gen=None,
                max_iter_num=4000,
                seed=500,
                logdir='logs/noah321',
                 hidden_size=1,
                 num_layers=1,
                 seq_len=5,
                 input_size=50
                )

In [10]:
main()

0:: 100 episode score is 0.79
Expert: 100.00% | Learner: 0.00%
1:: 200 episode score is 0.79
Expert: 100.00% | Learner: 0.00%
- - - - -
2:: 300 episode score is 0.78
Expert: 100.00% | Learner: 0.00%
- - - - -
3:: 400 episode score is 0.78
Expert: 100.00% | Learner: 0.00%
- - - - -
4:: 500 episode score is 0.78
Expert: 100.00% | Learner: 0.00%
- - - - -
5:: 600 episode score is 0.78
Expert: 100.00% | Learner: 0.00%
- - - - -
6:: 700 episode score is 0.77
Expert: 100.00% | Learner: 0.00%
- - - - -
7:: 800 episode score is 0.77
Expert: 100.00% | Learner: 0.00%
- - - - -
8:: 900 episode score is 0.77
Expert: 100.00% | Learner: 0.00%
- - - - -
9:: 1000 episode score is 0.77
Expert: 100.00% | Learner: 0.00%
- - - - -
10:: 1100 episode score is 0.77
Expert: 100.00% | Learner: 0.00%
- - - - -
11:: 1200 episode score is 0.77
Expert: 100.00% | Learner: 0.00%
- - - - -
12:: 1300 episode score is 0.76
Expert: 100.00% | Learner: 0.00%
- - - - -
13:: 1400 episode score is 0.76
Expert: 100.00% | Lear

109:: 11000 episode score is 0.63
Expert: 57.00% | Learner: 100.00%
- - - - -
110:: 11100 episode score is 0.63
Expert: 62.00% | Learner: 100.00%
- - - - -
111:: 11200 episode score is 0.63
Expert: 58.00% | Learner: 100.00%
- - - - -
112:: 11300 episode score is 0.63
Expert: 64.00% | Learner: 100.00%
- - - - -
113:: 11400 episode score is 0.63
Expert: 66.00% | Learner: 100.00%
- - - - -
114:: 11500 episode score is 0.63
Expert: 58.00% | Learner: 100.00%
- - - - -
115:: 11600 episode score is 0.63
Expert: 57.00% | Learner: 100.00%
- - - - -
116:: 11700 episode score is 0.63
Expert: 68.00% | Learner: 100.00%
- - - - -
117:: 11800 episode score is 0.63
Expert: 64.00% | Learner: 100.00%
- - - - -
118:: 11900 episode score is 0.62
Expert: 64.00% | Learner: 100.00%
- - - - -
119:: 12000 episode score is 0.63
Expert: 65.00% | Learner: 100.00%
- - - - -
120:: 12100 episode score is 0.62
Expert: 66.00% | Learner: 100.00%
- - - - -
121:: 12200 episode score is 0.62
Expert: 64.00% | Learner: 100.

KeyboardInterrupt: 

In [11]:
%debug

> [0;32m/scratch/nsk367/deepRL/limitation-learning/src/dialog_environment.py[0m(73)[0;36mreset[0;34m()[0m
[0;32m     71 [0;31m        [0mraw_state[0m [0;34m=[0m [0mlist[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mraw_conversations[0m[0;34m.[0m[0mkeys[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[[0m[0mself[0m[0;34m.[0m[0mi[0m[0;34m][0m[0;34m,[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     72 [0;31m[0;34m[0m[0m
[0m[0;32m---> 73 [0;31m        [0mraw_expert_action[0m [0;34m=[0m [0mself[0m[0;34m.[0m[0mraw_conversations[0m[0;34m[[0m[0mlist[0m[0;34m([0m[0mself[0m[0;34m.[0m[0mraw_conversations[0m[0;34m.[0m[0mkeys[0m[0;34m([0m[0;34m)[0m[0;34m)[0m[0;34m[[0m[0mself[0m[0;34m.[0m[0mi[0m[0;34m][0m[0;34m][0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     74 [0;31m        [0;31m#TODO: truncate sequences?[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[0m[0;32m     75 [0;31m        [0;32mreturn[0m [0mstate[0m[0;34m,[0m [0mexp

ipdb> similarity_score
0
ipdb> similarity_scores
[]
ipdb> exit


The action space is too obvious. Need to constrain to same, and retry. Z

!python ../src/main.py

In [None]:
%debug

In [None]:
ratio = torch.ones(100,5,1)
advants = torch.randn(100,1)

In [None]:
new = advants.unsqueeze(dim=1) * ratio

In [None]:
new