In [1]:
import ipynb.fs.full.utils as utils

In [2]:
import ipynb.fs.full.app as app

In [3]:
%matplotlib inline

### OUR PROJECT OUTLINE

- We designed and built a simple Snake game and applied reinforcement learning to it.
- The exploration strategies we used were **Epsilon-Greedy** and **Decaying Epsilon-Greedy**.
- Multiple agents were trained and tested with different hyperparameters.


- Environment:
    - 0s = empty cell
    - 1s = elements of snake
    - 2  = apple (food)
- Agent:
    - The snake (1s)
    - The snake has a head which is used for calculations within the game.
- Actions:
    - Up
    - Right
    - Down
    - Left
- Reward:
    - Eat apple: 30
    - Move towards apple: 1
    - Move away from apple: 0
    - Out of bounds: -100
    - Eat itself: -100
- State:
    - The state consists of an array containing 8 boolean elements.
    - The first 4 elements represent the obstacle state relative to the snake's head.
    - The last 4 elements represent the directional state of the snake's head in relation to the apple's current position.

### OUR IMPLEMENTATION

<img src="implementation.svg" alt="Drawing" style="width: 700px;"/>

### WHAT TRAINING LOOKS LIKE

# Run one short training session with the settings listed below and keep the
# three values that app.training returns.
# NOTE(review): `delay` and `display` presumably control the live rendering of
# the game while training; confirm against app.training.
policy_score, policy, epoch_scores = app.training(
    gamma=0.3,
    # Starting epsilon of 0.2 with decay switched on.
    epsilon=dict(value=0.2, decay=True),
    # Same reward magnitudes as listed in the project outline.
    rewards=dict(death=-100, direction=1, grow=30),
    epochs=101,
    evaluation=dict(breakpoint=50, epochs=25),
    maze_size=[10, 10],
    max_steps=100,
    delay=0.001,
    display=True,
)

### OUR HYPOTHESIS

- A low **EPSILON** value is probably better than a high one.
- A high **GAMMA** value is probably better than a low one.

- We want to try to reduce the **EPSILON** value over time.

### MULTIPLICATIVELY DECAYING EPSILON

In [79]:
# EPSILON -= EPSILON / MAXIMUM NUMBER EPOCHS

In [70]:
# 0.2 STARTING EPSILON & 1MIL EPOCHS

In [4]:
# Load the saved data stored under the name 'epsilon'.
# NOTE(review): presumably the epsilon value recorded per epoch for the decay
# run described in the cells above; confirm against utils.load.
epsilon_decay = utils.load('epsilon')

In [53]:
# Hand the loaded series to utils.visualize under the label 'DECAYING EPSILON'.
# NOTE(review): `cluster=10000` presumably groups epochs into bins of 10,000
# before plotting; confirm in utils.visualize.
utils.visualize([
    ['DECAYING EPSILON', epsilon_decay]
], cluster=10000)

### 100K EPOCHS, 0.8 GAMMA, [DYNAMIC / STATIC] 0.2 EPSILON - RUNTIME ~5 SECONDS

In [64]:
# HIGH GAMMA = PREFERS A LONG-TERM REWARD

In [63]:
# LOW EPSILON = LOW CHANCE OF EXPLORATION

In [65]:
dacaying = utils.load('100k_high_gamma_dynamic_epsilon')

In [66]:
static = utils.load('100k_high_gamma_static_epsilon')

In [68]:
utils.visualize([
    ['DECAYING EPSILON', dacaying],
    ['STATIC EPSILON', static]
], cluster=500)

### 1M EPOCHS, DECAYING 0.2 EPSILON & [0.8 / 0.3] GAMMA - RUNTIME ~10 MINUTES

In [60]:
# HIGH GAMMA = PREFERS LONG-TERM REWARD

In [61]:
# LOW GAMMA = PREFERS SHORT-TERM REWARD

In [7]:
# Saved results of the 1M-epoch run whose file name marks it as
# high gamma / dynamic epsilon.
high_gamma = utils.load('1m_high_gamma_dynamic_epsilon')

In [8]:
# Saved results of the 1M-epoch run whose file name marks it as
# low gamma / dynamic epsilon.
low_gamma = utils.load('1m_low_gamma_dynamic_epsilon')

In [9]:
# Pass both runs to utils.visualize, labelled by their gamma setting.
# NOTE(review): `cluster=10000` presumably groups epochs into bins of 10,000
# before plotting; confirm in utils.visualize.
utils.visualize([
    ['HIGH GAMMA', high_gamma],
    ['LOW GAMMA', low_gamma],
], cluster=10000)

### 100K EPOCHS, 0.3 GAMMA & [0.2, 0.5, 0.8] DECAYING EPSILON - RUNTIME ~5 SECONDS

In [13]:
# Three saved 100K-epoch runs; per their file names they share low gamma and a
# dynamic epsilon and differ in the epsilon value (0.2, 0.5, 0.8).
low_eps = utils.load('100k_low_gamma_dynamic_0.2_epsilon')

In [14]:
med_eps = utils.load('100k_low_gamma_dynamic_0.5_epsilon')

In [15]:
hig_eps = utils.load('100k_low_gamma_dynamic_0.8_epsilon')

In [16]:
# Pass all three runs to utils.visualize, labelled by their epsilon value.
# NOTE(review): `cluster=1000` presumably groups epochs into bins of 1,000
# before plotting; confirm in utils.visualize.
utils.visualize([
    ['0.2 EPSILON', low_eps],
    ['0.5 EPSILON', med_eps],
    ['0.8 EPSILON', hig_eps],
], cluster=1000)

### THREE IDENTICAL RUNS - 7M EPOCHS, 0.3 GAMMA & 0.2 DECAYING EPSILON - RUNTIME ~80 MIN EACH

In [21]:
# Three saved 7M-epoch runs; the file names differ only in the trailing
# index (_0, _1, _2).
first = utils.load('7mil_low_gamma_low_dynamic_epsilon_0')

In [22]:
second = utils.load('7mil_low_gamma_low_dynamic_epsilon_1')

In [23]:
third = utils.load('7mil_low_gamma_low_dynamic_epsilon_2')

In [29]:
# Pass the three runs to utils.visualize, labelled in load order.
# NOTE(review): `cluster=50000` presumably groups epochs into bins of 50,000
# before plotting; confirm in utils.visualize.
utils.visualize([
    ['FIRST', first],
    ['SECOND', second],
    ['THIRD', third],
], cluster=50000)

### BEST SCORE PER RUN

In [46]:
# Highest value recorded in the first run.
first_best_score = max(first)
print('FIRST RUN:', first_best_score)

FIRST RUN: 36


In [48]:
# Highest value recorded in the second run.
second_best_score = max(second)
print('SECOND RUN:', second_best_score)

SECOND RUN: 35


In [47]:
# Highest value recorded in the third run.
third_best_score = max(third)
print('THIRD RUN:', third_best_score)

THIRD RUN: 34


### PLAYING WITH THE BEST POLICY

In [75]:
# MATRIX SIZE = 2^8 * 4

In [72]:
# ALL POSSIBLE STATES * NUMBER OF ACTIONS

In [41]:
# Load the saved policy stored under the name '7mil_policy_0'.
# NOTE(review): presumably the policy learned by the first 7M-epoch run
# (matching the _0 suffix); confirm against how the file was written.
best_policy = utils.load('7mil_policy_0')

In [43]:
# Print the policy one row at a time; each printed row holds 4 values.
# This dumps every row, so the output below is long.
for p in best_policy:
    print(p)

[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[0. 0. 0. 0.]
[-142.85714286 -142.85714286 -142.85714286 -142.85714286]
[ 30.50557035 -90.61482267 -98.78213643 -99.55204065]
[-99.87085638 -99.80073592 -99.89094315   0.49960772]
[-41.85714286 -99.63704002 -99.63704002   1.20981878]
[-99.98841274   0.10904285 -99.96671663 -99.98647312]
[ 30.44970238   0.36364271 -98.71738776 -99.6612572 ]
[-9.96295996e+01  6.02989163e-02 -9.98497168e+01  6.61228123e-01]
[ 3.06653559e+01  4.49137408e-02 -9.69695126e+01  1.21157606e+00]
[-99.86506943 -99.89037223   1.2180864  -99.86952446]
[  2.23271709 -98.78208168   0.35989921 -99.33018487]
[-99.89635422 -99.80037054   0.50111579   0.13047413]
[ 30.6612173  -99.26415543   1.28395998   1.21157185]
[-99.84837983   0.15028787   1.21157609 -99.87085314]
[  1.66981513   0.19921309   0.36594635 -96.98