# 作业 

In [None]:
import numpy as np 
from scipy.special import softmax 
from scipy.stats import bernoulli
import time 
from scipy.optimize import minimize
from ddm import pdf # DDM probability density function

from IPython.display import clear_output
%matplotlib inline
%config InlineBackend.figure_format='retina'

import matplotlib.pyplot as plt 
import seaborn as sns 

import sys 
# add the parent directory to the import path before importing the `utils` package
sys.path.append("..") 
from utils.env import frozen_lake
from utils.viz import viz 
viz.get_style()

本次作业主要涉及两种参数估计方法(MLE+MCMC)和强化学习(dynamic programming+Q-learning)

## Part 1 参数估计之MLE
* 采用`exampledata.txt`的数据

In [None]:
# load the example data set into a numpy array
data = np.loadtxt('exampledata.txt')
# print the array dimensions as a quick sanity check
print(data.shape)

###

**Q1.1: 请写出ddm的负对数似然函数`negloglikeli`(negative log-likelihood function)**


In [None]:
# define a negative log-likelihood objective function
def negloglikeli(params):
    '''
    Negative log-likelihood of the DDM, meant as a fitting objective (exercise template).

    <params>:(4,) array, drift coefficient, decision boundary, initial bias, non-decision time

    The sections marked "your answer" are left for the student to fill in.
    Until then `pp` is never assigned and the bare `return` at the end
    yields None.
    '''
    # specify your loaded params
    ##------------------------------##
    ##           your answer        ##
    ##------------------------------##
    # do initialization    
    ##------------------------------##
    ##           your answer        ##
    ##------------------------------##
    # loop trial
    ##------------------------------##
    ##           your answer        ##
    ##------------------------------##
    # NOTE(review): pp is presumably the per-trial likelihood built in the loop above - confirm
    pp=0.999*pp + np.finfo(np.float32).eps # shrink slightly and add a small epsilon to avoid p=0
    # take the log, sum, and negate
    return ##      your answer      ##


###

**Q1.2: 利用最大似然估计(maximum likelihood estimation)的方法求解ddm模型参数k, b, a, ndt**

参数k, b, a, ndt的bounds分别为((0, 20), (0, 5), (0, 1), (0, 1))


In [None]:
## now start your MLE
# NOTE: the fitting code written here must bind `res`; the prints below read `res.x`,
# so it is presumably the result object of `minimize` (imported at the top) - confirm.

# print your result
print('\nfitted drift coefficient is ', res.x[0])
print('fitted decision boundary is ', res.x[1])
print('fitted initial bias is ', res.x[2])
print('fitted nondecision time is ', res.x[3])

## Part 2 参数估计之MCMC

###
**Q2: 利用Metropolis-Hastings的方法求解DDM模型参数。不能借助工具包，必须手写M-H**

hint：
* 提议分布建议使用正态分布
* 注意每个输入参数的取值范围(especially:ndt)
* 接受概率建议在对数域中计算：$\log \min\left[\frac{f(x)}{f(y)}, 1\right] = \min\left[\log f(x)-\log f(y),\ 0\right]$

In [None]:
from scipy.stats import norm, uniform

def metropolis_hastings(data, n_sample=10000, n_burnin=1500):
    '''
    Hand-written Metropolis-Hastings sampler (exercise template).

    Runs n_sample + n_burnin iterations and keeps `current_param` from every
    iteration whose index is at least n_burnin, so n_sample draws are kept.

    <data>: observations handed to the sampler; the visible skeleton does not
            read it, so its use is up to the student's code
    <n_sample>: number of draws kept after burn-in
    <n_burnin>: number of initial iterations that are discarded

    Returns the kept draws as a numpy array. `current_param` must be defined
    by the student's code in the sections marked "your answer".
    '''
    sample_chain = []
    # initialize the parameters
    ##---------------------------------------##
    ##               your answer             ##
    ##---------------------------------------##
    for i in range(n_sample + n_burnin):
        ##---------------------------------------##
        ##               your answer             ##
        ##---------------------------------------##  

        # only start collecting parameters once burn-in is over
        # NOTE(review): this stores a reference to current_param; if the student's
        # code updates that object in place, every stored draw would alias it -
        # confirm a fresh object is bound on each iteration
        if i >= n_burnin:
            sample_chain.append(current_param)

    return np.array(sample_chain)


# start M-H sampling
samples = metropolis_hastings(data)

## Part 3 强化学习之dynamic programming

### 
我们接下来的游戏需要借助一个冰湖任务完成。

这个游戏的规则很简单：掉进蓝色冰窟窿内即失败(游戏结束)；顺利到达goal(G)即成功。

先来看看我们已经拥有的冰湖任务的环境。

In [None]:
# create the frozen-lake environment with a fixed seed, reset it, and draw it
seed = 1234
env = frozen_lake(seed=seed)
env.reset()
fig, ax = plt.subplots(1, 1, figsize=(4, 4))
env.render(ax)

在这个冰湖环境中，agent的动作为：
* 0: 上
* 1: 下
* 2: 左
* 3: 右

这个冰湖的形状是8*8的。每一个小方块就是一个state，即，state形状为8 * 8

###
**Q3.1：现在有nA个action，nS个state。请先随机生成一个policy。每一行代表一个state，每一列代表一个动作.并且令inverse temperature = 5**

* inverse temperature(beta)→softmax中加入 beta：$ \frac{e^{\beta z_i}}{\sum_j e^{\beta z_j}} $

* 你可以直接调用softmax这个函数，且加入inv temp的形式为：$p = softmax(A*\beta,axis=1)$, A为矩阵


In [None]:
## a random policy
seed = 1234
rng = np.random.RandomState(seed)
###-------------###
### your answer ###
###-------------###
# NOTE: the answer above must bind `pi_rand`, which is printed here
print(pi_rand)

###
**Q3.2：定义一个评估policy的函数。你需要： 1.在已知当前步的state、action和transition function的情况下，估计下一个可能state的reward和value；2.根据v_new 和 v_old，判断收敛**

hint：

* reward和value的计算可以参考env.r()

* delta的计算公式是：$ delta = max(delta,|v_{old}-v_{new}|)$

* check convergence就是为了令：

* $delta<theta(theta为一个极小值)$

In [None]:

def policy_eval(pi, V, env, theta=1e-4, gamma=.99):
    for s in env.s_termination:
        V[s] = 0 
    while True:
        delta = 0
        for s in env.S:
            if s not in env.s_termination:
                v_old = V[s].copy()
                v_new = 0
                for a in env.A:
                    p = env.p_s_next(s,a)
                    for s_next in env.S:
                    ###-------------------------------###
                    ###         your answer           ###
                    ###-------------------------------###
              V[s] = v_new
                ## check convergence
                    ###-------------------------------###
                    ###         your answer           ###
                    ###-------------------------------###            
        fig, ax = plt.subplots(1, 1, figsize=(4, 4))
        clear_output(True)
        env.show_v(ax, V)
        time.sleep(1)
        plt.show()   
        if delta < theta:
            break
    return V
        

In [None]:
## visualization: evaluate the random policy starting from an all-zero value array
V = np.zeros([env.nS])
V = policy_eval(pi_rand, V, env)

###  随机生成的policy显然是不够好的，现在我们就来试着改进policy
**Q3.3：改进policy。你需要： 1.在已知当前步的state、action和transition function的情况下，估计下一个可能state的reward和value；2.计算pi[s]，并且循环直至平稳**

hint：

* $q(a) = \sum_{s'} P(s'|s,a)\,[r_t+\gamma V(s')]$
    
* 在对应的state下，pi[s]就是意味着能返回q最大值的action

* 如果新的pi和原先的pi之间的差值$ delta<theta(极小值) $，我们认为这个policy稳定。也就是stable

In [None]:
## policy_improv
def policy_improv(pi, V, env, theta=1e-4, gamma=.99):
    '''
    Policy-improvement step (exercise template).

    Copies the incoming policy to `pi_old`, then for every state builds a
    fresh `q` with one entry per action and fetches p = env.p_s_next(s, a)
    for each action. Filling `q`, updating `pi` and setting `stable` are
    left to the sections marked "your answer".

    <pi>: current policy; must support .copy()
    <V>: value array, for use in the student's update
    <env>: environment providing S, A, nA and p_s_next(s, a)
    <theta>, <gamma>: not referenced by the visible skeleton; available to
                      the student's code

    Returns (pi, stable).
    '''
    pi_old = pi.copy()
    for s in env.S:
        # re-initialize q, which holds one entry per action, for every state
        q = np.zeros([env.nA])
        for a in env.A:
            p = env.p_s_next(s,a)
            for s_next in env.S:
                ###-------------------------------###
                ###         your answer           ###
                ###-------------------------------###
    # loop until stable
    ###-------------------------------###
    ###         your answer           ###
    ###-------------------------------###
    return pi, stable   
            

###
**Q3.4：完成policy iteration。你需要：1.首先评估policy得到V(value),2.根据得到的V，pi，env来完成迭代**

In [None]:
def policy_iter(env, seed=1234):
    '''
    Policy iteration (exercise template).

    Initializes V with small random values (0 at the terminal states), then
    loops: each pass draws V and pi side by side and the loop ends when
    `stable` is truthy. `pi` and `stable` must be produced by the student's
    code in the sections marked "your answer".

    <env>: environment providing nS, s_termination, show_v(ax, V) and show_pi(ax, pi)
    <seed>: seed of the random generator used for the initial V

    Returns (V, pi).
    '''
    rng = np.random.RandomState(seed)
    V = rng.randn(env.nS)*0.0001
    # initialize V: terminal states are set to 0
    for s in env.s_termination:
        V[s] = 0
    ## first generate an arbitrary pi (policy); this belongs after the loop
    ## above, not inside it
    ###-------------------------------###
    ###         your answer           ###
    ###-------------------------------###

    while True:
        ###-------------------------------###
        ###         your answer           ###
        ###-------------------------------###
        # visualize 
        fig, axs = plt.subplots(1, 2, figsize=(8, 4))
        clear_output(True)
        ax = axs[0]
        env.show_v(ax, V)
        ax = axs[1]
        env.show_pi(ax, pi)
        time.sleep(.1)
        plt.show()   
        
        if stable:break
    return V, pi  
    

In [None]:
## run policy iteration and look at the result
V1, pi1 = policy_iter(env)

###
**Q3.5：完成value iteration。提示步骤已在代码中**

In [None]:
def value_iter(env, seed=1234, theta=1e-4, gamma=.99):
    
    rng = np.random.RandomState(seed)
    # initialize V(s), arbitrarily except V(terminal)=0
    V = np.zeros(env.nS)
    if s in env.s_termination:
        V[s] = 0
    # init policy 
    pi = softmax(rng.randn(env.nS,env.nA)*5, axis=1)    
    # loop until converge
    while True:
        delta = 0
        for s in env.S:
            v_old = V[s].copy()
            v_new = 0
            for a in env.A:
                p = env.p_s_next(s, a)
                for s_next in env.S:
                    # calculate v_new
                    ###-------------------------------###
                    ###         your answer           ###
                    ###-------------------------------###
            # calculate V
            ###-------------------------------###
            ###         your answer           ###
            ###-------------------------------###
            # get new policy 
            ###-------------------------------###
            ###         your answer           ###
            ###-------------------------------###
            # calculate delta
            ###-------------------------------###
            ###         your answer           ###
            ###-------------------------------###
        if delta < theta:
            break 
    for s in env.s_termination:
        V[s] = 0
    return V, pi 

## Part 4 强化学习中之 Q-learning (sarsa)
"transition function and reward function cannot always be known"

### 
**Q4.1：写一段函数，随机生成action**


In [None]:
def e_greedy(q, rng, env, eps):
    '''
    Epsilon-greedy action selection (exercise template).

    <q>: array of action values; presumably one entry per action (length env.nA) - confirm
    <rng>: random generator exposing rand() and choice()
    <env>: environment; only env.nA is read here
    <eps>: exploration rate; the random branch is taken when rng.rand() >= 1-eps

    Returns the chosen action a. The greedy branch is left to the section
    marked "your answer".
    '''
    # indices of every action whose value ties for the maximum
    a_max = np.argwhere(q==np.max(q)).flatten()
    # greedy policy: probability mass split equally among the tied best actions
    policy = np.sum([np.eye(env.nA)[i] for i in a_max], axis=0) / len(a_max)
    if rng.rand() < 1-eps:
        # you know the policy,set ps according to your pi
        ## -----------------------##
        ##      your answer       ##
        ## -----------------------##   
    else:
        # just random
        a = rng.choice(env.nA)
    return a 

### 
**Q4.2：补充sarsa 代码**

hint：
* Sarsa建立在TD-learning 基础上，它的核心是：

    $Q_{(s,a)} = Q_{(s,a)} + \alpha * [r + \gamma*Q_{(s',a')}-Q_{(s,a)}]$
* 可以借助env.step()这个函数

In [None]:
def Sarsa(env, alpha=.1, eps=.1, gamma=.99, max_epi=2000, seed=1234, theta=1e-4):
    '''
    Sarsa control on the frozen-lake environment (exercise template).

    Runs up to max_epi episodes. Each episode starts from env.reset() and
    loops until `done` is truthy; the action sampling, environment step and
    Q update are left to the sections marked "your answer". Training stops
    early when no entry of Q changed by theta or more during an episode.

    <env>: environment providing nS, nA and reset(), which is unpacked as (s, r, done)
    <alpha>, <eps>, <gamma>: not referenced by the visible skeleton; per the
                             exercise hint they are the step size, exploration
                             rate and discount factor of the student's update
    <max_epi>: maximum number of episodes
    <seed>: seed of the random generator
    <theta>: threshold of the per-episode early-stop test

    Returns (Q, pi), where pi is the one-hot greedy policy taken from the
    row-wise argmax of Q.
    '''
    # rng
    rng = np.random.RandomState(seed)
    Q = np.zeros([env.nS, env.nA])
    for epi in range(max_epi):
        s, r, done = env.reset()
        # snapshot of Q used by the early-stop test at the end of the episode
        q_old = Q.copy()
        # running sum of rewards in this episode; not read anywhere in the visible code
        G = 0
        while True:
            # sample At, observe Rt, St+1
            ###-------------------------------###
            ###         your answer           ###
            ###-------------------------------### 
            # calc s_next
            ###-------------------------------###
            ###         your answer           ###
            ###-------------------------------### 
            # given s_next, calc a_next
            ###-------------------------------###
            ###         your answer           ###
            ###-------------------------------###   
            # update Q,s,a
            ###-------------------------------###
            ###         your answer           ###
            ###-------------------------------###       
            G += r                          
            if done:
                break     
        # NOTE(review): this compares Q before and after a single episode, so an
        # episode whose updates are all below theta ends training - possibly as
        # early as the first episode; confirm this is intended
        if (np.abs(q_old - Q)<theta).all():
            break
    # one-hot greedy policy from the learned action values
    pi = np.eye(env.nA)[np.argmax(Q, axis=1)]
    return Q, pi

In [None]:
## visualization: run Sarsa, then draw the greedy state values and the policy side by side
Q_sarsa, pi_sarsa = Sarsa(env)
# state value = largest action value in each row of Q
V3 = Q_sarsa.max(1)
fig, axs = plt.subplots(1, 2, figsize=(8, 4))
ax = axs[0]
env.show_v(ax, V3)
ax = axs[1]
env.show_pi(ax, pi_sarsa)