In [1]:
import random
from numpy import arange
from environment import Agent, Environment
from planner import RoutePlanner
from simulator import Simulator

In [59]:
class LearningAgent(Agent):
    """An agent that learns to drive in the smartcab world."""

    def __init__(self, env, alpha=.5, gamma=.2,epsilon=.1):
        super(LearningAgent, self).__init__(env)  # sets self.env = env, state = None, next_waypoint = None, and a default color
        self.color = 'red'  # override color
        self.planner = RoutePlanner(self.env, self)  # simple route planner to get next_waypoint
        # TODO: Initialize any additional variables here
        self.next_waypoint = None
        self.state = None
        self.state_for_train = None
        self.count_reward=0
        self.qlearn = Qlearner( alpha, gamma ,epsilon)

        self.turn = 0
        self.num_of_learn = 0
        self.learns = {}


    def reset(self, destination=None):
        self.planner.route_to(destination)
        self.turn = 0
        self.count_reward=0
        self.num_of_learn += 1
        # TODO: Prepare for a new trip; reset any variables here, if required

#         print '*****',self.learns



    def update(self, t):
        # Gather inputs
        self.next_waypoint = self.planner.next_waypoint()  # from route planner, also displayed by simulator
        inputs = self.env.sense(self)
        deadline = self.env.get_deadline(self)

        # TODO: Update state
        self.state = (inputs, self.next_waypoint)
        should_move = True

        if self.next_waypoint == 'left' and (inputs['oncoming'] == 'forward' or inputs['oncoming'] == 'right'):
            should_move = False

        if inputs['light'] == 'red':
            if self.next_waypoint == 'straight' or self.next_waypoint == 'left':
                should_move = False
            elif self.next_waypoint == 'right' and inputs['left'] == 'forward':
                should_move = False

        # TODO: Select action according to your policy
        action = None
        # if should_move:
        #     action = random.choice(Environment.valid_actions)

        # Execute action and get reward
        # reward = self.env.act(self, action)
        # self.count_reward += reward


        # TODO: Learn policy based on state, action, reward
        go_learn = True #set if should use qlearn
        if should_move:
            if go_learn:

                #print '**************1',inputs
                self.state_for_train = (inputs['light'], inputs['oncoming'], inputs['left'], self.next_waypoint)

                #learn a action use Qlearn
                action = self.qlearn.select_action(self.state_for_train)

                reward = self.env.act(self, action)

                next_inputs = self.env.sense(self)
                next_state =  (next_inputs['light'], next_inputs['oncoming'], next_inputs['left'], self.next_waypoint)
                # print 'state now:',self.state_for_train ,'next state:',next_state


                self.qlearn.learn(self.state_for_train, next_state, action, reward)

            else:
                action = random.choice(Environment.valid_actions)

        reward = self.env.act(self, action)
        self.turn += 1
        self.count_reward += reward
        self.learns[self.num_of_learn] =( self.turn,self.count_reward )

#         print "LearningAgent.update(): deadline = {}, inputs = {}, action = {}, reward = {}".format(deadline, inputs, action, reward)  # [debug]
#         print "use turn is:",self.turn, ",count reward is now",self.count_reward

class Qlearner():
    """docstring for ."""
    def __init__(self, alpha=.5, gamma=.2 ,epsilon=.1):
        self.q = {}
        self.actions = [ 'forward', 'left', 'right' ]
        self.alpha = alpha # learning rate
        self.gamma = gamma # memory / discount factor of max Q(s',a')
        self.epsilon = epsilon # probability of doing random move

    def select_action(self,state):
        q={}
        if random.random() < self.epsilon:
            action = random.choice(self.actions)
        else:
            for action in self.actions:
                q[action] = self.get_q(state, action)

            max_q = max(q.items(), key=lambda x: x[1])
            action = max_q[0]
        # print '**********2',q
        return action


    def learn_q(self, state, action, reward, value):
        q = self.q.get((state, action), None)

        if q == None:
            q = reward
        else:
            q = q + self.alpha * (value - q)

        self.set_q(state, action, q) #update table

    def learn(self, state, new_state, action, reward):
        q = [self.get_q(new_state, a) for a in self.actions]
        next_reward = max(q)

        self.learn_q(state, action, reward, reward - self.gamma * next_reward)

    def get_q(self, state, action):
        return self.q.get((state, action), .0)

    def set_q(self, state, action, q):
        self.q[(state, action)] = q


def run():
    """Run the agent for a finite number of trials."""

    # Set up environment and agent
    e = Environment()  # create environment (also adds some dummy traffic)
    a = e.create_agent(LearningAgent)  # create agent
    e.set_primary_agent(a, enforce_deadline=True)  # specify agent to track
    # NOTE: You can set enforce_deadline=False while debugging to allow longer trials

    # Now simulate it
    sim = Simulator(e, update_delay=0., display=False)  # create simulator (uses pygame when display=True, if available)
    # NOTE: To speed up simulation, reduce update_delay and/or set display=False

    sim.run(n_trials=100)  # run for a specified number of trials
    # NOTE: To quit midway, press Esc or close pygame window, or hit Ctrl+C on the command-line
    print '****  this score is ****',a_learns_score(a.learns)

In [69]:
def run_for_choose(alpha=.5, gamma=.2,epsilon=.1):
    """Run the agent for a finite number of trials."""

    # Set up environment and agent
    e = Environment()  # create environment (also adds some dummy traffic)
    a = e.create_agent(LearningAgent,alpha=.5, gamma=.2,epsilon=.1)  # create agent
    e.set_primary_agent(a, enforce_deadline=True)  # specify agent to track
    # NOTE: You can set enforce_deadline=False while debugging to allow longer trials

    # Now simulate it
    sim = Simulator(e, update_delay=0., display=False)  # create simulator (uses pygame when display=True, if available)
    # NOTE: To speed up simulation, reduce update_delay and/or set display=False

    sim.run(n_trials=100)  # run for a specified number of trials
    # NOTE: To quit midway, press Esc or close pygame window, or hit Ctrl+C on the command-line
    return a_learns_score(a.learns)

In [17]:
def a_learns_score(a_dict):
    score = 0
    for i in a_dict:
        score += a_dict[i][1]/a_dict[i][0]
        
    return score

In [18]:
a_learns_score(a.learns)

95.7656640784428

In [41]:
def show_me_the_best():
    step = .01
    range_list = arange(step,1+step,step)

    alpha_list = range_list
    gamma_list = range_list
    epsilon_list = range_list
    
    best_score = 0
    best_alpha = 0
    for a in alpha_list:
        score_now = run_for_choose(alpha=a)
        if score_now > best_score:
            best_score = score_now
            best_param = a
            
    best_score = 0
    best_alpha = 0
    for b in gamma_list:
        score_now = run_for_choose( gamma = b )
        if score_now > best_score:
            best_score = score_now
            best_param = b


In [77]:
def find_para(alpha_list,gamma_list,epsilon_list):
    best_score = 0
    best_param_a = 0
    best_param_b = 0
    best_param_c = 0

    for a in alpha_list:
        for b in gamma_list:
            for c in epsilon_list:
                score_now = run_for_choose(alpha=a, gamma=b,epsilon=c)
        if score_now > best_score:
            best_score = score_now
            best_param_a = a
            best_param_b = b
            best_param_c = c
            
    print 'best alpha is:',best_param_a,';it score is:',best_score
    print 'best gamma is:',best_param_b,';it score is:',best_score
    print 'best epsilon is:',best_param_c,';it score is:',best_score

In [None]:
step = .1
alpha_list = arange(step,1+step,step)

step = .05
gamma_list = arange(step,.5+step,step)

step = .05
epsilon_list = arange(step,.5+step,step)

find_para(alpha_list,gamma_list,epsilon_list)

Simulator.run(): Trial 0
Environment.reset(): Trial set up with start = (4, 6), destination = (3, 3), deadline = 20
RoutePlanner.route_to(): destination = (3, 3)
Environment.act(): Primary agent has reached destination!
Simulator.run(): Trial 1
Environment.reset(): Trial set up with start = (7, 1), destination = (2, 1), deadline = 25
RoutePlanner.route_to(): destination = (2, 1)
Environment.act(): Primary agent has reached destination!
Simulator.run(): Trial 2
Environment.reset(): Trial set up with start = (6, 4), destination = (2, 5), deadline = 25
RoutePlanner.route_to(): destination = (2, 5)
Environment.step(): Primary agent ran out of time! Trial aborted.
Simulator.run(): Trial 3
Environment.reset(): Trial set up with start = (3, 3), destination = (6, 1), deadline = 25
RoutePlanner.route_to(): destination = (6, 1)
Environment.step(): Primary agent ran out of time! Trial aborted.
Simulator.run(): Trial 4
Environment.reset(): Trial set up with start = (3, 2), destination = (7, 5), de

In [66]:
e = Environment()  # create environment (also adds some dummy traffic)
a = e.create_agent(LearningAgent,alpha=.6)  # create agent
e.set_primary_agent(a, enforce_deadline=True)  # specify agent to track
# NOTE: You can set enforce_deadline=False while debugging to allow longer trials

# Now simulate it
sim = Simulator(e, update_delay=0., display=False)  # create simulator (uses pygame when display=True, if available)
# NOTE: To speed up simulation, reduce update_delay and/or set display=False
sim.run(n_trials=100)  # run for a specified number of trials
# NOTE: To quit midway, press Esc or close pygame window, or hit Ctrl+C on the command-line
print '*****',a_learns_score(a.learns)

Simulator.run(): Trial 0
Environment.reset(): Trial set up with start = (8, 3), destination = (3, 4), deadline = 30
RoutePlanner.route_to(): destination = (3, 4)
Environment.step(): Primary agent ran out of time! Trial aborted.
Simulator.run(): Trial 1
Environment.reset(): Trial set up with start = (7, 6), destination = (4, 3), deadline = 30
RoutePlanner.route_to(): destination = (4, 3)
Environment.act(): Primary agent has reached destination!
Simulator.run(): Trial 2
Environment.reset(): Trial set up with start = (8, 2), destination = (3, 6), deadline = 45
RoutePlanner.route_to(): destination = (3, 6)
Environment.step(): Primary agent ran out of time! Trial aborted.
Simulator.run(): Trial 3
Environment.reset(): Trial set up with start = (5, 1), destination = (4, 5), deadline = 25
RoutePlanner.route_to(): destination = (4, 5)
Environment.act(): Primary agent has reached destination!
Environment.act(): Primary agent has reached destination!
Simulator.run(): Trial 4
Environment.reset(): 

In [35]:
score = 0
for i in a.learns:
#     print a.learns[i][1]/a.learns[i][0]
    score += a.learns[i][1]/a.learns[i][0]
print  score

95.6549243824


In [None]:
if __name__ == '__main__':
    run()