# Super-deep reinforcement learning with scikit-learn

Which environment to choose for deep reinforcement learning experiments?
Theano or Tensorflow? GPU or MPI? 

Who the heck needs any of these? Now you have Scikit-Learn!

Jokes aside, this demo shows the awesome scikit-learn deep reinforcement learning agent trained with the __crossentropy method__ _(because TD is for sissies)_.

Requires: gym, scikit-learn (with MLPClassifier), numpy

Read more about the crossentropy method [in general](https://people.smp.uq.edu.au/DirkKroese/ps/aortut.pdf), [for rl](https://people.smp.uq.edu.au/DirkKroese/ps/eormsCE.pdf), [for rl again](https://esc.fnwi.uva.nl/thesis/centraal/files/f2110275396.pdf) 

In [1]:
import gym, gym.wrappers
gym.logger.level=0 #gym.youre("drunk").shut_up()
import numpy as np
from sklearn.neural_network import MLPClassifier

#Build the CartPole environment and wrap it in a Monitor that writes into
#the "videos" directory
env = gym.wrappers.Monitor(
    gym.make("CartPole-v0"),
    directory="videos",
    force=True,
)

#number of discrete actions the agent has to choose between
n_actions = env.action_space.n


#The policy is a small two-hidden-layer classifier mapping state -> action.
#warm_start=True together with max_iter=1 is meant to make every later
#fit() call continue from the current weights for one more iteration
#instead of retraining from scratch (see the scikit-learn MLPClassifier docs).
agent = MLPClassifier(
    hidden_layer_sizes=(20,20),
    activation='tanh',
    solver='adam',
    warm_start=True,
    max_iter=1,
)

#Initialize the agent on placeholder data: the same initial state repeated
#once per action, labelled 0..n_actions-1, so the classifier has seen every
#action label before it is asked for probabilities.
dummy_state = env.reset()
agent.fit([dummy_state for _ in range(n_actions)],range(n_actions));




In [2]:
def generate_session(greedy=False):
    """
    Play one full episode with the current agent and record what happened.

    At every step the agent's predicted action probabilities for the current
    state are turned into an action, the action is sent to the environment,
    and this repeats until the environment reports that the episode is done.

    :param greedy: if True, picks the most likely action at every step;
        otherwise (default) samples the action from the predicted probabilities.
    :returns: (states, actions, total_reward) - the states the agent acted in,
        the actions it took in them (same length as states) and the sum of
        all rewards received during the episode.
    """
    states,actions,total_reward = [],[],0

    s = env.reset()
    done = False
    while not done:
        #probabilities over actions for the current state
        #(assumes column i belongs to action i, i.e. the agent was fit on labels 0..n_actions-1)
        probs = agent.predict_proba([s])[0]
        if greedy:
            a = np.argmax(probs)
        else:
            a = np.random.choice(n_actions,p=probs)

        #store the state the action was chosen in, not the state it leads to
        states.append(s)
        actions.append(a)

        s,r,done,_ = env.step(a)
        total_reward += r

    return states,actions,total_reward
        

In [3]:
#training loop (crossentropy method): sample sessions, keep the best ones,
#fit the agent to imitate the actions taken in them, repeat
n_samples = 100 #take 100 samples
percentile = 70 #fit on sessions at or above the 70th reward percentile (top ~30%; ties at the threshold are kept too)
n_epochs = 50 #number of sample -> select -> fit iterations

for i in range(n_epochs):
    #sample sessions with the current agent
    sessions = [generate_session() for _ in range(n_samples)]
    batch_states,batch_actions,batch_rewards = zip(*sessions)
    #only rewards become an array: there is exactly one number per session
    batch_rewards = np.array(batch_rewards)

    #choose threshold on rewards
    threshold = np.percentile(batch_rewards,percentile)
    is_elite = batch_rewards >= threshold

    #Sessions generally differ in length, so states/actions are kept as plain
    #sequences and only the elite sessions are flattened. Wrapping the ragged
    #per-session lists in np.array is an error on recent NumPy versions.
    #The best session always reaches the threshold, so the lists are never empty.
    elite_states = np.concatenate([s for s,keep in zip(batch_states,is_elite) if keep])
    elite_actions = np.concatenate([a for a,keep in zip(batch_actions,is_elite) if keep])

    #fit the agent to reproduce elite actions in elite states
    agent.fit(elite_states,elite_actions)

    #report progress
    print("epoch %i \tmean reward=%.2f\tthreshold=%.2f"%(i,batch_rewards.mean(),threshold))


epoch 0 	mean reward=33.50	threshold=39.00
epoch 1 	mean reward=38.31	threshold=44.00
epoch 2 	mean reward=42.69	threshold=50.60
epoch 3 	mean reward=48.38	threshold=58.30
epoch 4 	mean reward=47.56	threshold=58.00
epoch 5 	mean reward=53.88	threshold=61.30
epoch 6 	mean reward=56.37	threshold=61.30
epoch 7 	mean reward=60.88	threshold=72.60
epoch 8 	mean reward=70.49	threshold=81.00
epoch 9 	mean reward=91.38	threshold=109.80
epoch 10 	mean reward=96.06	threshold=110.30
epoch 11 	mean reward=114.96	threshold=151.30
epoch 12 	mean reward=142.47	threshold=187.60
epoch 13 	mean reward=134.30	threshold=176.30
epoch 14 	mean reward=154.61	threshold=200.00
epoch 15 	mean reward=174.48	threshold=200.00
epoch 16 	mean reward=185.59	threshold=200.00
epoch 17 	mean reward=188.76	threshold=200.00
epoch 18 	mean reward=193.16	threshold=200.00
epoch 19 	mean reward=189.80	threshold=200.00
epoch 20 	mean reward=195.49	threshold=200.00
epoch 21 	mean reward=193.79	threshold=200.00
epoch 22 	mean rew

In [4]:
#finish recording: close the environment before touching the recorded files
#(presumably this lets the Monitor wrapper finalize what it wrote into "videos" - confirm)
env.close()
#upload the contents of ./videos/; "<...>" is a placeholder, put your own API key there
#NOTE(review): gym.upload appears to be gone from later gym releases - confirm against the installed version
gym.upload("./videos/",api_key="<...>")

In [5]:
from IPython.display import HTML
import os

#os.listdir returns names in arbitrary order, so sort them to make "last"
#well-defined and reproducible between runs
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))
if not video_names:
    raise IndexError("no .mp4 files found in ./videos/ - run the training cells first")

#Embed the video whose file name sorts last (presumably the latest recorded
#episode, assuming the recorder numbers its files in order). Try other
#indices to watch earlier recordings.
HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1]))