In [1]:
from __future__ import print_function

import argparse
import math
import pickle
import random
import datetime
import numpy as np

import gym
import gym_2048

from q_learning import QLearning
import matplotlib.pyplot as plt

import warnings
warnings.filterwarnings("ignore")

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# --- Reproducibility ---------------------------------------------------------
# One seed for every RNG reachable from this notebook. The environment keeps its
# own generator, so it is seeded separately below.
# NOTE(review): the Keras/TensorFlow backend has its own RNG that is not seeded
# here -- seed it inside q_learning.py if fully repeatable runs are required.
SEED = 1
random.seed(SEED)
np.random.seed(SEED)

# --- Environment -------------------------------------------------------------
# Seed *before* the first reset so that even the initial board is deterministic.
env = gym.make("2048-v0")
env.seed(SEED)
env.reset()

# --- Run configuration -------------------------------------------------------
RENDER_ENV = False      # render the board on every step of the training loop
TRAINING_ON = True      # forwarded to QLearning(is_training_on=...)
EPISODES = 1000         # number of games to play
Q_MODEL_PATH = "outputs/keras-models/2048_q_model.h5"
Q_MODEL_WEIGHTS_PATH = "outputs/keras-models/2048_q_model_weights.h5"
T_MODEL_PATH = "outputs/keras-models/2048_t_model.h5"

# --- Problem dimensions ------------------------------------------------------
# Side length derived from the observation length; assumes the observation is a
# flattened square board -- TODO confirm against gym_2048.
board_size = int(math.sqrt(env.observation_space.shape[0]))
# Number of discrete actions the environment accepts.
n_output = env.action_space.n

[33mWARN: gym.spaces.Box autodetected dtype as <class 'numpy.float32'>. Please provide explicit dtype.[0m
[33mWARN: Environment '<class 'gym_2048.envs.game2048_env.Game2048Env'>' has deprecated methods. Compatibility code invoked.[0m


In [3]:
# Instantiate the Q-learning agent from the values set up in the cells above.
QL = QLearning(
    # Dimensions: board side length and number of available actions.
    n_x=board_size,
    n_y=n_output,
    # Run settings.
    total_episodes=EPISODES,
    is_training_on=TRAINING_ON,
    restore_model=True,
    T=10,  # NOTE(review): meaning of T is defined in q_learning.py -- not visible here
    # Model file locations (see the configuration cell).
    q_save_path=Q_MODEL_PATH,
    q_weights_save_path=Q_MODEL_WEIGHTS_PATH,
    t_save_path=T_MODEL_PATH,
)

In [None]:
# Training loop: play EPISODES games, storing a transition for every attempted
# move and running one training step after every turn.
for episode in range(EPISODES):
    observation = env.reset()
    QL.curr_episode = episode

    while True:
        if RENDER_ENV:
            env.render()

        # The agent chooses once per turn, based on the *current* board.
        action = QL.choose_action(observation)

        # Try the chosen action first; if the environment rejects it, walk
        # through the remaining action indices. Probing is capped at QL.n_y
        # attempts so a board on which every action is rejected cannot spin
        # this loop forever.
        for _ in range(QL.n_y):
            observation_, reward, done, info = env.step(action)
            valid_move = info['valid']

            reward = QL.calculate_reward(valid_move, done, reward, observation_)

            # Rejected attempts are stored too (flagged via is_move_valid).
            QL.save_experience(observation=observation, action=action,
                               reward=reward, observation_=observation_,
                               is_game_over=done, is_move_valid=valid_move)

            if valid_move:
                break

            # Rejected: fall back to the next action index, wrapping around.
            action = (action + 1) % QL.n_y
        else:
            # Every action was rejected, so no move is possible: end the episode.
            done = True

        features, labels = QL.sample_from_experience()
        QL.train_model(features=features, labels=labels)

        if done:
            highest_tile_value = QL.get_highest_tile_value(observation_)
            QL.episodic_highest_tiles_track.append(highest_tile_value)
            print("Episode #", (episode + 1), " : Highest Tile: ", highest_tile_value)
            # The final board is always rendered, independent of RENDER_ENV.
            env.render()
            QL.plot_progress(y_data=QL.episodic_highest_tiles_track, y_label="Highest_Tile_Value", n_episode=episode)
            break

        # Advance the state. Without this, every action choice and every
        # stored transition would keep using the board from env.reset().
        observation = observation_

    # End-of-episode bookkeeping.
    # NOTE(review): presumably syncs the target model with the Q model and
    # checkpoints it -- confirm in q_learning.py.
    QL.transfer_model()
    QL.save_q_model()
        