In [None]:
import random
import numpy as np
import pandas as pd
import gym
import matplotlib.pyplot as plt
%matplotlib inline

from kaggle_environments import make, evaluate
from gym import spaces

In [None]:
def test(obs, config):
    print(config.rows)
    print(config.columns)
    print(obs["board"])
    return 0


In [None]:
# Smoke test: run one "connectx" episode with the debug agent `test` against
# the "random" agent, then render the resulting environment state.
test_env = make("connectx", debug=True)
test_env.run([test,"random"])
test_env.render()

In [None]:
# Build the OpenAI Gym environment
class ConnectFourGym(gym.Env):
    """Gym wrapper around the kaggle-environments "connectx" trainer.

    The trainer is created with ``[None, agent2]``, and every observation
    handed back to the caller is the board reshaped to
    ``(1, rows, columns)``.
    """

    def __init__(self, agent2="random"):
        """Create the trainer and declare the action/observation spaces.

        Args:
            agent2: Second entry of the agent list passed to ``train``.
        """
        ks_env = make("connectx", debug=True)
        self.env = ks_env.train([None, agent2])
        self.rows = ks_env.configuration.rows
        self.columns = ks_env.configuration.columns

        # Learn about spaces here: http://gym.openai.com/docs/#spaces
        # One discrete action per board column.
        self.action_space = spaces.Discrete(self.columns)
        # Cell values lie in [0, 2]; the leading axis has size 1.
        self.observation_space = spaces.Box(
            low=0,
            high=2,
            shape=(1, self.rows, self.columns),
            dtype=int,
        )

        # (min, max) of the rewards this wrapper can emit
        self.reward_range = (-10, 1)
        # StableBaselines throws error if these are not defined
        self.spec = None
        self.metadata = None

    def _encoded_board(self):
        """Return the most recent board as a ``(1, rows, columns)`` array."""
        return np.array(self.obs['board']).reshape(1, self.rows, self.columns)

    def reset(self):
        """Reset the underlying trainer and return the encoded initial board."""
        self.obs = self.env.reset()
        return self._encoded_board()

    def change_reward(self, old_reward, done):
        """Map the trainer's reward onto the shaped reward used for learning.

        Returns ``1`` when ``old_reward`` is 1, ``-1`` when the episode is
        over without that reward, and ``1 / (rows * columns)`` for every
        other (non-terminal) step.
        """
        if old_reward == 1:
            return 1
        if done:
            return -1
        return 1 / (self.rows * self.columns)

    def step(self, action):
        """Play ``action`` (a column index) and return ``(obs, reward, done, info)``.

        A move into a column whose top cell is already occupied is not sent
        to the trainer: the episode ends immediately with reward ``-10`` and
        an empty info dict.
        """
        column = int(action)

        # Index `column` of the flat board is that column's top cell.
        if self.obs['board'][column] != 0:
            return self._encoded_board(), -10, True, {}

        self.obs, raw_reward, done, info = self.env.step(column)
        reward = self.change_reward(raw_reward, done)
        return self._encoded_board(), reward, done, info

In [97]:
from stable_baselines3 import PPO
from stable_baselines3.common.policies import BaseFeaturesExtractor
import torch
from torch import nn
from tool.check_win import check_win, get_position


In [104]:

def opponent(obs, config):
    """Rule-based agent: take a winning move, else block the rival, else play randomly.

    Only columns whose top cell (``obs.board[col]``) is empty are ever
    considered, and the scan covers ``config.columns`` columns rather than a
    fixed seven.

    Args:
        obs: Observation exposing ``board`` and, when available, ``mark``
            (the mark this agent plays with).
        config: Configuration exposing ``rows`` and ``columns``.

    Returns:
        int: Index of the chosen column.
    """
    valid_moves = [col for col in range(config.columns) if obs.board[col] == 0]

    # Play as whichever mark the environment assigned. Falling back to 2
    # reproduces the previously hard-coded behaviour (second seat).
    own_mark = getattr(obs, "mark", 2)
    rival_mark = 1 if own_mark == 2 else 2

    # Priority: complete our own line first, then deny the rival's.
    # NOTE(review): `check_win` comes from tool.check_win (not shown here);
    # presumably it reports whether `player` dropping into `choice` wins.
    for player in (own_mark, rival_mark):
        for col in valid_moves:
            if check_win(board=obs.board, row=config.rows, col=config.columns,
                         choice=col, player=player):
                return col

    return random.choice(valid_moves)

In [92]:
class connect_x_policy(BaseFeaturesExtractor): # Custom
    """Convolutional feature extractor for board-shaped observations.

    Two ``Conv2d`` + ``ReLU`` stages (64 then 128 channels, kernel sizes 3
    and 2) are flattened and projected to ``features_dim`` by a linear layer
    followed by ``ReLU``.
    """

    def __init__(self, observation_space: gym.spaces.Box, features_dim: int = 256):
        """Build the network for the given observation space.

        Args:
            observation_space: Box whose ``shape[0]`` is used as the number
                of input channels of the first convolution.
            features_dim: Size of the feature vector returned by ``forward``.
        """
        super().__init__(observation_space, features_dim)

        obs_shape = observation_space.shape
        in_channels = obs_shape[0]
        self.output_shape = obs_shape[1]

        # Convolutional trunk of the custom network
        conv_layers = [
            nn.Conv2d(in_channels=in_channels, out_channels=64, kernel_size=3, stride=1),
            nn.ReLU(),
            nn.Conv2d(in_channels=64, out_channels=128, kernel_size=2, stride=1),
            nn.ReLU(),
            nn.Flatten(),
        ]
        self.input_cnn = nn.Sequential(*conv_layers)

        # Push one sampled observation (with a batch axis added) through the
        # trunk to find out how wide the flattened output is.
        with torch.no_grad():
            probe = torch.as_tensor(observation_space.sample()[None]).float()
            flat_width = self.input_cnn(probe).shape[1]

        self.output = nn.Sequential(nn.Linear(flat_width, features_dim), nn.ReLU())

    def forward(self, observations: torch.Tensor) -> torch.Tensor:
        """Return the ``features_dim``-wide features for a batch of observations."""
        return self.output(self.input_cnn(observations))



    

In [94]:
# Plug the custom feature extractor into the policy network.
policy_kwargs = {"features_extractor_class": connect_x_policy}

# Training environment: the learner plays against the scripted `opponent`.
env = ConnectFourGym(agent2=opponent)

model = PPO(
    "CnnPolicy",
    env,
    policy_kwargs=policy_kwargs,
    learning_rate=0.0003,
    n_steps=2048,
    n_epochs=10,
)

# Last expression of the cell, so the returned model is displayed.
model.learn(total_timesteps=10000)

<stable_baselines3.ppo.ppo.PPO at 0x21f0c0b5b80>

In [102]:
# Module-level handle on the model's policy network; `use_policy` reads this global.
policy = model.policy

def use_policy(obs, config):
    """Agent that plays the column chosen by the module-level ``policy``.

    The board is fed to ``policy.forward`` on whatever device the policy
    lives on. If the sampled column is already full, the policy is resampled
    a bounded number of times; if it still does not produce a playable
    column, a random playable column is used instead, so the function always
    terminates.

    Args:
        obs: Observation with a flat ``"board"`` entry.
        config: Configuration exposing ``rows`` and ``columns``.

    Returns:
        int: Index of a column whose top cell is empty.

    Raises:
        ValueError: If every column is already full.
    """
    rows, columns = config.rows, config.columns

    # Arrange the data in the input format: the (1, rows, columns) layout
    # ConnectFourGym produces, plus a leading batch axis.
    board = np.array(obs["board"]).reshape(1, 1, rows, columns)

    # A column is playable while its top cell is still empty.
    valid_moves = [col for col in range(columns) if board[0, 0, 0, col] == 0]
    if not valid_moves:
        raise ValueError("use_policy called on a full board: no valid move exists")

    # Follow the policy's own device instead of assuming CUDA is present.
    board_tensor = torch.as_tensor(board, dtype=torch.float32, device=policy.device)

    # Avoid picking a column that cannot take another piece. Resampling is
    # capped because a policy that keeps choosing a full column would
    # otherwise loop forever.
    max_attempts = 20
    predict = None
    with torch.no_grad():  # inference only, no autograd graph needed
        for _attempt in range(max_attempts):
            candidate = int(policy.forward(board_tensor)[0])
            if candidate in valid_moves:
                predict = candidate
                break
    if predict is None:
        predict = random.choice(valid_moves)

    # Bare-bones display of the chosen move.
    # NOTE(review): `get_position` comes from tool.check_win (not shown);
    # presumably it returns the board after player 1 drops into `predict`.
    new_board, _ = get_position(board.reshape(rows, columns), predict, 1)
    print(new_board)

    return predict

def human_play(obs, config):
    """Unfinished stub, presumably meant to become a human-controlled agent.

    NOTE(review): the body only converts ``obs["board"]`` to a NumPy array
    and then falls off the end, so the function returns ``None``; ``config``
    is not used. TODO: finish or remove before handing this to an environment.
    """
    board = np.array(obs["board"])
    


In [105]:
# Evaluation game: the policy-driven agent `use_policy` plays one "connectx"
# episode against the "random" agent, then the environment is rendered.
# `use_policy` prints a board after each of its moves.
test_env = make("connectx", debug=True)
test_env.run([use_policy,"random"])
test_env.render()


[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 1 0 0 0]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [1 0 2 1 0 0 0]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0]
 [1 0 2 1 0 0 2]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 1 0 0 0 1]
 [1 2 2 1 0 0 2]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 1]
 [2 0 1 0 0 0 1]
 [1 2 2 1 0 0 2]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 1 0 0 0 1]
 [2 0 1 0 0 0 1]
 [1 2 2 1 2 0 2]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 1 0 0 0 1]
 [2 0 1 1 2 0 1]
 [1 2 2 1 2 0 2]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 1 2 1 0 1]
 [2 0 1 1 2 0 1]
 [1 2 2 1 2 0 2]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 1 0 0 0 0]
 [0 0 1 2 1 0 1]
 [2 2 1 1 2 0 1]
 [1 2 2 1 2 0 2]]
[[0 0 0 0 0 0 0]
 [0 0 0 0 0 0 0]
 [0 0 1 2 0 0 0]
 [0 0 1 2 1 0 1]
 [2 2