# Super-deep reinforcement learning with scikit-learn

Which environment to choose for deep reinforcement learning experiments?
Theano or TensorFlow? GPU or MPI?

Who the heck needs any of these? Now you have scikit-learn!

Jokes aside, this demo shows an awesome scikit-learn deep reinforcement learning agent trained with the __cross-entropy method__ _(because TD is for sissies)_.

Requires: gym, scikit-learn (with MLPClassifier), numpy, joblib

Note that this demo uses all the CPU cores you have. To change that, adjust the `n_jobs` parameter.

Read more about the cross-entropy method [in general](https://people.smp.uq.edu.au/DirkKroese/ps/aortut.pdf), [for RL](https://people.smp.uq.edu.au/DirkKroese/ps/eormsCE.pdf), [for RL again](https://esc.fnwi.uva.nl/thesis/centraal/files/f2110275396.pdf)

In [1]:
import gym, gym.wrappers
gym.logger.level=0 #gym.youre("drunk").shut_up()
import numpy as np
from sklearn.neural_network import MLPClassifier

# Build the environment and read the size of its discrete action space
env = gym.make("LunarLander-v2")
n_actions = env.action_space.n


# Build the policy network. warm_start=True together with max_iter=1 is
# presumably meant to make every later fit() call continue from the current
# weights for a single pass -- see the MLPClassifier docs to confirm.
agent = MLPClassifier(
    hidden_layer_sizes=(256, 512),
    activation='tanh',
    solver='adam',
    warm_start=True,
    max_iter=1,
)
# Fit once on a dummy batch (one reset observation repeated, one label per
# action) so that predict_proba can be called before any real training.
# The trailing semicolon suppresses the notebook's echo of the fitted model.
agent.fit([env.reset()] * n_actions, np.arange(n_actions));


env: DISPLAY=:1




In [2]:
from itertools import count
def generate_session(t_max=10**3):
    """
    Play a single session in the global ``env`` using the global ``agent``.

    On every step the action is sampled from the probabilities returned by
    ``agent.predict_proba`` for the current state and passed to ``env.step``.

    :param t_max: maximum number of environment steps; the session is forcibly
        stopped once this many steps were taken. MAKE SURE IT'S ENOUGH!
    :returns: tuple ``(states, actions, total_reward)`` - the states in which
        actions were chosen, the chosen actions (same length as states) and
        the sum of all rewards received.
    """
    states, actions, total_reward = [], [], 0

    s = env.reset()
    # range(t_max) enforces the cap exactly: at most t_max calls to env.step
    for _t in range(t_max):
        action_probs = agent.predict_proba([s])[0]
        a = np.random.choice(n_actions, p=action_probs)
        states.append(s)
        actions.append(a)

        s, r, done, _ = env.step(a)
        total_reward += r

        if done:
            break
    return states, actions, total_reward

from joblib import Parallel,delayed
def generate_sessions(n, n_jobs=-1):
    """
    Generate ``n`` sessions by running generate_session() through joblib.

    :param n: number of sessions to generate.
    :param n_jobs: forwarded to joblib.Parallel as its n_jobs setting.
    :returns: whatever Parallel returns for the n generate_session() calls
        (one result per session).
    """
    tasks = [delayed(generate_session)() for _ in range(n)]
    runner = Parallel(n_jobs)
    return runner(tasks)

In [3]:
#training loop
#if you want faster stochastic iterations, try n_samples=100,percentile=50~70. Also maybe tune learning rate.
n_samples = 500   #sessions sampled on each epoch
percentile = 80   #fits to 20% best (about 100 sessions) on each epoch
n_jobs = -1       #uses all cores
n_epochs = 150    #number of sample-then-fit iterations


for i in range(n_epochs):
    #sample sessions; each one is a (states, actions, total_reward) tuple
    sessions = generate_sessions(n_samples, n_jobs)
    batch_states, batch_actions, batch_rewards = zip(*sessions)
    #only rewards become an array: sessions differ in length, and building
    #an array from such ragged lists is an error in recent NumPy versions
    batch_rewards = np.asarray(batch_rewards)

    #choose threshold on rewards and keep the sessions at or above it
    threshold = np.percentile(batch_rewards, percentile)
    is_elite = batch_rewards >= threshold
    elite_states = np.concatenate(
        [states for states, keep in zip(batch_states, is_elite) if keep])
    elite_actions = np.concatenate(
        [actions for actions, keep in zip(batch_actions, is_elite) if keep])

    #fit the agent to reproduce the actions taken in the elite sessions
    agent.fit(elite_states, elite_actions)

    #report progress
    print("epoch %i \tmean reward=%.2f\tthreshold=%.2f"%(i,batch_rewards.mean(),threshold))


epoch 0 	mean reward=-249.90	threshold=-151.86
epoch 1 	mean reward=-235.98	threshold=-132.11
epoch 2 	mean reward=-218.98	threshold=-129.94
epoch 3 	mean reward=-191.62	threshold=-140.58
epoch 4 	mean reward=-159.24	threshold=-118.25
epoch 5 	mean reward=-135.37	threshold=-101.08
epoch 6 	mean reward=-164.81	threshold=-103.56
epoch 7 	mean reward=-144.51	threshold=-84.46
epoch 8 	mean reward=-137.36	threshold=-101.81
epoch 9 	mean reward=-102.69	threshold=-55.57
epoch 10 	mean reward=-94.40	threshold=-59.96
epoch 11 	mean reward=-98.63	threshold=-65.33
epoch 12 	mean reward=-89.13	threshold=-62.03
epoch 13 	mean reward=-68.67	threshold=-38.45
epoch 14 	mean reward=-71.03	threshold=-20.68
epoch 15 	mean reward=-61.06	threshold=-34.60
epoch 16 	mean reward=-46.84	threshold=-25.76
epoch 17 	mean reward=-15.51	threshold=-10.40
epoch 18 	mean reward=-21.44	threshold=-1.58
epoch 19 	mean reward=-15.27	threshold=-1.39
epoch 20 	mean reward=0.90	threshold=99.29
epoch 21 	mean reward=-9.35	thr

In [4]:
#record sessions: wrap env in a Monitor that uses the "videos" directory
env = gym.wrappers.Monitor(env,directory="videos",force=True)
try:
    sessions = [generate_session() for _ in range(500)]
finally:
    #always close the monitored env, even if one of the sessions raises
    env.close()
gym.upload("./videos/",api_key="<...>")



In [5]:
from IPython.display import HTML
import os

# Sort the names: os.listdir returns entries in arbitrary order, so without
# sorting the [-1] element below would not be a well-defined "last" video.
video_names = sorted(name for name in os.listdir("./videos/") if name.endswith(".mp4"))
if not video_names:
    # Fail with a clear message instead of an IndexError on video_names[-1]
    raise FileNotFoundError("no .mp4 files found in ./videos/")

HTML("""
<video width="640" height="480" controls>
  <source src="{}" type="video/mp4">
</video>
""".format("./videos/"+video_names[-1])) # lexicographically last file name; try other indices for other videos