# Learning to play Connect4 using Reinforcement Learning
Improving on the original algorithm presented at: https://www.kaggle.com/alexisbcook/deep-reinforcement-learning

This will have 2 sections:
1. Building and training the original agent described on Kaggle
2. Building and training an improved agent who plays against itself

# Part 1 - Building and training the original agent
1. Init
2. Build Gym environment 
3. Build the NN
4. Train agent

In [None]:
# Initializations
#from learntools.core import binder
#binder.bind(globals())
#from learntools.game_ai.ex4 import *

import os
import random
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

!pip install 'tensorflow==1.15.0'

import tensorflow as tf
from kaggle_environments import make, evaluate
from gym import spaces

!apt-get update
!apt-get install -y cmake libopenmpi-dev python3-dev zlib1g-dev
!pip install "stable-baselines[mpi]==2.9.0"

from stable_baselines.bench import Monitor 
from stable_baselines.common.vec_env import DummyVecEnv
from stable_baselines import PPO1, PPO2, A2C, ACER, ACKTR, TRPO
from stable_baselines.a2c.utils import conv, linear, conv_to_fc
from stable_baselines.common.policies import CnnPolicy

# Create the directory that receives the training logs (the Monitor wrapper
# and the results file written later both use this path).
log_dir = "log/"
os.makedirs(log_dir, exist_ok=True)

# Default training hyper-parameters:
#   iterations - number of learn() rounds (the Part 1 training cell re-assigns it)
#   steps      - value passed as total_timesteps to each learn() call
iterations = 10
steps = 10000

In [None]:
# Building the Connect4 Gym Environment
# The reward function is changed from the original implementation (see note below)

class ConnectFourGym:
    """Gym-style wrapper around the Kaggle ``connectx`` environment.

    The wrapped agent always plays in the first seat; ``agent2`` is the
    opponent handed to ``ks_env.train``. Observations are the flat board
    reshaped to ``(rows, columns, 1)``.

    Rewards handed to the learner:
        *  1                    -- the environment reported a reward of 1 (win)
        * -1                    -- the episode ended with any other reward
        *  1 / (rows * columns) -- the game is still running
        * -10                   -- the agent chose a full column (episode ends)
    """

    def __init__(self, agent2="random"):
        """Create the underlying environment with ``agent2`` as the opponent."""
        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns
        self.action_space = spaces.Discrete(self.columns)
        # ``np.int`` was only an alias of the builtin ``int`` and no longer
        # exists in recent NumPy releases, so the builtin is used directly.
        self.observation_space = spaces.Box(low=0, high=2,
                                            shape=(self.rows, self.columns, 1), dtype=int)
        # Tuple corresponding to the min and max possible rewards
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None

    def _board_observation(self):
        """Return the current board as an array of shape (rows, columns, 1)."""
        return np.array(self.obs['board']).reshape(self.rows, self.columns, 1)

    def reset(self):
        """Start a new game and return the initial board observation."""
        self.obs = self.env.reset()
        return self._board_observation()

    def change_reward(self, old_reward, done):
        """Map the environment's reward onto the shaped reward used for training."""
        if old_reward == 1:  # The agent won the game
            return 1
        elif done:  # The opponent won the game
            return -1
        else:  # Small bonus for every move that keeps the game going
            return 1/(self.rows*self.columns)

    def step(self, action):
        """Play ``action`` (a column index) and return ``(obs, reward, done, info)``.

        A move into a column whose top cell is already occupied is not sent to
        the environment; the episode ends immediately with a reward of -10 and
        the observation is the unchanged board.
        """
        column = int(action)
        # The first ``columns`` entries of the flat board are the top row, so a
        # zero there means the column still has room.
        if self.obs['board'][column] == 0:
            self.obs, old_reward, done, info = self.env.step(column)
            reward = self.change_reward(old_reward, done)
        else:  # End the game and penalize agent
            reward, done, info = -10, True, {}
        return self._board_observation(), reward, done, info
    


In [None]:
# Neural network for predicting action values
def modified_cnn(scaled_images, **kwargs):
    """Feature extractor for the policy network.

    Two 3x3 convolutions (32 then 64 filters, stride 1) followed by a
    512-unit fully connected layer; ReLU is applied after every layer.
    ``kwargs`` are forwarded to both ``conv`` calls.
    """
    relu = tf.nn.relu
    gain = np.sqrt(2)
    features = relu(conv(scaled_images, 'c1', n_filters=32, filter_size=3,
                         stride=1, init_scale=gain, **kwargs))
    features = relu(conv(features, 'c2', n_filters=64, filter_size=3,
                         stride=1, init_scale=gain, **kwargs))
    flattened = conv_to_fc(features)
    return relu(linear(flattened, 'fc1', n_hidden=512, init_scale=gain))
         
class CustomCnnPolicy(CnnPolicy):
    """``CnnPolicy`` that uses ``modified_cnn`` as its feature extractor."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, cnn_extractor=modified_cnn, **kwargs)

In [None]:
# Train the Part 1 agent against the built-in random opponent

# Environment, wrapped for episode logging and then vectorized
env = ConnectFourGym(agent2="random")
monitor_env = Monitor(env, log_dir, allow_early_resets=True)
vec_env = DummyVecEnv([lambda: monitor_env])

# Initialize agent
simple_model = PPO1(CustomCnnPolicy, vec_env, verbose=0)
print("Training started...")

iterations = 25
monitor_path = os.path.join(log_dir, "monitor.csv")
for x in range(iterations):
    # One training round
    simple_model.learn(total_timesteps=steps)

    # Report the mean of the 'r' column over the most recent `steps` rows of
    # the monitor log (presumably one row per episode -- confirm against the
    # Monitor file format).
    with open(monitor_path, 'rt') as fh:
        firstline = fh.readline()
        # The log starts with a '#'-prefixed header line that must be skipped
        # before the CSV part is parsed.
        assert firstline[0] == '#'
        df1 = pd.read_csv(fh, index_col=None)['r'].tail(steps)
    print("Iteration=", x, " Score=", df1.mean())

print("Training Finished!", " Score=", df1.mean())

In [None]:
#from kaggle_environments import make, evaluate

def learnt_from_random_agent(obs, config):
    """Agent function backed by ``simple_model`` (trained against the random opponent).

    Args:
        obs: observation whose ``'board'`` entry is the flat board.
        config: game configuration exposing ``rows`` and ``columns``.

    Returns:
        The column predicted by the model when its top cell is empty,
        otherwise a randomly chosen column that still has room.
    """
    board = obs['board']
    # Shape the board from the game configuration rather than assuming 6x7.
    grid = np.array(board).reshape(config.rows, config.columns, 1)
    col, _ = simple_model.predict(grid)
    col = int(col)
    if board[col] == 0:
        return col
    # The model picked a full column: fall back to a random legal move.
    return random.choice([c for c in range(config.columns) if board[c] == 0])
    
# Utility function used to compare agents performance
def get_win_percentages(agent1, agent2, n_rounds=100):
    """Play ``n_rounds`` games and return ``(agent1_win_rate, agent2_win_rate)``.

    Roughly half of the games are played with ``agent1`` moving first and the
    rest with ``agent2`` moving first; both rates are rounded to 2 decimals.
    """
    # Use default Connect Four setup
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    first_half = n_rounds // 2
    # Agent 1 goes first in this batch
    outcomes = evaluate("connectx", [agent1, agent2], config, [], first_half)
    # Agent 2 goes first in this batch; swap each pair back to [agent1, agent2] order
    swapped = evaluate("connectx", [agent2, agent1], config, [], n_rounds - first_half)
    outcomes += [[b, a] for [a, b] in swapped]
    total = len(outcomes)
    agent1_rate = np.round(outcomes.count([1, -1]) / total, 2)
    agent2_rate = np.round(outcomes.count([-1, 1]) / total, 2)
    return agent1_rate, agent2_rate
    
  #  print("Agent 1 Win Percentage:", np.round(outcomes.count([1,-1])/len(outcomes), 2))
  #  print("Agent 2 Win Percentage:", np.round(outcomes.count([-1,1])/len(outcomes), 2))
  #  print("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0.5]))
  #  print("Number of Invalid Plays by Agent 2:", outcomes.count([0.5, None]))

In [None]:
print("Training using random player finished!")
# Evaluate the agent trained in this part and keep the returned win rates:
# the original call referenced an undefined agent name and discarded the
# result, leaving win1/win2 unset for the print below.
win1, win2 = get_win_percentages(agent1="random", agent2=learnt_from_random_agent)
print("Random agent win =",win1, "My agent win = ",win2) 


# Part 2 - Building and training the self-play agent
1. Build the NN (we use the same NN as before)
2. Train agent

In [None]:
#Building the NN

def extra_layer_cnn(scaled_images, **kwargs):
    """Feature extractor used by the self-play policy.

    Despite its name this builds the same stack as the Part 1 extractor:
    conv 'c1' (32 filters) -> conv 'c2' (64 filters) -> dense 'fc1' (512 units),
    each followed by ReLU. ``kwargs`` are forwarded to both ``conv`` calls.
    """
    init_scale = np.sqrt(2)
    conv_settings = dict(filter_size=3, stride=1, init_scale=init_scale)
    hidden = tf.nn.relu(conv(scaled_images, 'c1', n_filters=32, **conv_settings, **kwargs))
    hidden = tf.nn.relu(conv(hidden, 'c2', n_filters=64, **conv_settings, **kwargs))
    hidden = conv_to_fc(hidden)
    return tf.nn.relu(linear(hidden, 'fc1', n_hidden=512, init_scale=init_scale))
        
class CustomCnnPolicy2(CnnPolicy):
    """``CnnPolicy`` that uses ``extra_layer_cnn`` as its feature extractor."""

    def __init__(self, *args, **kwargs):
        super().__init__(*args, cnn_extractor=extra_layer_cnn, **kwargs)

In [None]:
# Self play / learn
from random import choice
        
# Create the initial ConnectFour environment; at this point the opponent is
# still the built-in "random" agent.
env = ConnectFourGym(agent2="random")
vec_env = DummyVecEnv([lambda: env]) # Create a vectorized environment

# Initialize agents: selfplay1_model is the learner queried by agent1_play,
# selfplay2_model is the model queried by agent2_play. Both start as separate,
# freshly initialized PPO1 models on the same vectorized environment.
selfplay1_model = PPO1(CustomCnnPolicy2, vec_env, verbose=0) 
selfplay2_model = PPO1(CustomCnnPolicy2, vec_env, verbose=0) 

def agent1_play(obs, config):
    """Agent function backed by ``selfplay1_model`` (the self-play learner).

    Args:
        obs: observation whose ``'board'`` entry is the flat board.
        config: game configuration exposing ``rows`` and ``columns``.

    Returns:
        The column predicted by the model when its top cell is empty,
        otherwise a randomly chosen column that still has room.
    """
    board = obs['board']
    # Shape the board from the game configuration rather than assuming 6x7.
    grid = np.array(board).reshape(config.rows, config.columns, 1)
    col, _ = selfplay1_model.predict(grid)
    col = int(col)
    if board[col] == 0:
        return col
    # The model picked a full column: fall back to a random legal move.
    return random.choice([c for c in range(config.columns) if board[c] == 0])
        
def agent2_play(obs, config):
    """Opponent for self-play: return the column predicted by ``selfplay2_model``.

    The prediction is returned as-is, without checking that the column still
    has room and without a random fallback.
    NOTE(review): the validity check was commented out in the original code;
    confirm that letting the opponent play into a full column is intended.
    """
    grid = np.array(obs['board']).reshape(6, 7, 1)
    column, _ = selfplay2_model.predict(grid)
    return int(column)
        

win2=0
steps=10000
counter=1

# Baseline: how well does the untrained learner do against the random agent?
win1 , win2 = get_win_percentages(agent1=agent1_play, agent2="random") 
print("Before training stated. Agent win =",win1, "Random win = ",win2) 
    
print("Training started...")    
# Keep training until the learner beats the random agent in at least 85% of
# the evaluation games.
while (win1 < 0.85):
    # Snapshot the current learner; agent2_play reads the global
    # selfplay2_model at call time, so rebinding it here makes the opponent
    # a frozen copy of the learner.
    selfplay1_model.save("my_model") 
    selfplay2_model = PPO1.load("my_model")

    # Build a new environment whose opponent is that frozen copy ...
    env = ConnectFourGym(agent2=agent2_play)
    vec_env = DummyVecEnv([lambda: env]) # Create a vectorized environment
    # ... and attach it to the learner. Without this call the model keeps the
    # environment it was constructed with (the random opponent), so no
    # self-play would take place.
    selfplay1_model.set_env(vec_env)
    
    # Keep training the model
    selfplay1_model.learn(total_timesteps=steps)
      
    # Measure success against the random agent
    win1 , win2 = get_win_percentages(agent1=agent1_play, agent2="random") 
    print("iteration=", counter, " Agent win =",win1, "Random win = ",win2) 
    
    counter += 1

print("Training finished!")



In [None]:
# Final results: evaluate the self-play agent against the random agent.
# The original call referenced an undefined agent name and discarded the
# returned win rates, so the print below had nothing fresh to report.
win1, win2 = get_win_percentages(agent1="random", agent2=agent1_play)
print("Random agent win =",win1, "My agent win = ",win2) 

In [None]:
#from kaggle_environments import make, evaluate

# Utility function used to compare agents performance
def get_win_percentages(agent1, agent2, n_rounds=1000):
    """Play ``n_rounds`` games, report the statistics and return the win rates.

    Roughly half of the games are played with ``agent1`` moving first and the
    rest with ``agent2`` moving first. The statistics are printed and also
    written to ``results.txt`` inside ``log_dir`` (the file is overwritten).

    Returns:
        ``(agent1_win_rate, agent2_win_rate)``, each rounded to 2 decimals, so
        that this definition can be used wherever the earlier, non-printing
        definition of the same name is expected.
    """
    # Use default Connect Four setup
    config = {'rows': 6, 'columns': 7, 'inarow': 4}
    # Agent 1 goes first (roughly) half the time
    outcomes = evaluate("connectx", [agent1, agent2], config, [], n_rounds//2)
    # Agent 2 goes first (roughly) half the time; swap back to [agent1, agent2] order
    outcomes += [[b,a] for [a,b] in evaluate("connectx", [agent2, agent1], config, [], n_rounds-n_rounds//2)]

    # Compute every statistic once and reuse it for both outputs.
    win1 = np.round(outcomes.count([1,-1])/len(outcomes), 2)
    win2 = np.round(outcomes.count([-1,1])/len(outcomes), 2)
    report = [
        ("Agent 1 Win Percentage:", win1),
        ("Agent 2 Win Percentage:", win2),
        ("Number of Invalid Plays by Agent 1:", outcomes.count([None, 0.5])),
        ("Number of Invalid Plays by Agent 2:", outcomes.count([0.5, None])),
        ("Number of Draws (in {} game rounds):".format(n_rounds), outcomes.count([0.5, 0.5])),
    ]

    for label, value in report:
        print(label, value)

    with open(os.path.join(log_dir, "results.txt"), 'w') as f:
        for label, value in report:
            print(label, value, file=f)

    return win1, win2

# Optional: Saving the agent

In [None]:
#import inspect
#import os

#def write_agent_to_file(function, file):
#    with open(file, "a" if os.path.exists(file) else "w") as f:
#        f.write(inspect.getsource(function))
#        print(function, "written to", file)

#write_agent_to_file(learn_from_self_play_agent, "submission.py")


Then, follow these steps:

Begin by clicking on the blue Save Version button in the top right corner of this window. This will generate a pop-up window.
Ensure that the Save and Run All option is selected, and then click on the blue Save button.
This generates a window in the bottom left corner of the notebook. After it has finished running, click on the number to the right of the Save Version button. This pulls up a list of versions on the right of the screen. Click on the ellipsis (...) to the right of the most recent version, and select Open in Viewer. This brings you into view mode of the same page. You will need to scroll down to get back to these instructions.
Click on the Output tab on the right of the screen. Then, click on the Submit to Competition button to submit your results to the leaderboard.
Go to "My Submissions" to view your score and episodes being played.
You have now successfully submitted to the competition!