In [8]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Categorical
import numpy as np
import time
import random

In [9]:
#@title Игра
class Kalah:
    """Two-player Kalah (Mancala) game stored as one flat list of floats.

    Board layout for ``n = num_holes`` (the list has ``2 * n + 2`` entries):

    * ``board[0:n]``   - player 1's holes, sown in increasing index order;
    * ``board[n]``     - player 1's store (``kalah1_index``);
    * ``board[n + 1]`` - player 2's store (``kalah2_index``, also ``board[-n - 1]``);
    * ``board[n + 2:]`` - player 2's holes, addressed internally with the
      negative indices ``-1 .. -n`` and sown in decreasing index order.

    Actions are always relative to the player to move and lie in ``0 .. n - 1``.
    For player 2 the action ``a`` is mapped to the negative index ``a - n``.
    Players are numbered 0 and 1 internally and printed as 1 and 2.

    ``playBolvanVsBolvan`` expects a module-level function ``bolvan(game)``
    that returns an action for the current player.
    """

    def __init__(self, num_holes=6, num_seeds=6.):
        """Create the starting position.

        Args:
            num_holes: number of holes on each player's side.
            num_seeds: number of seeds initially placed in every hole.
        """
        self.num_holes = num_holes
        self.num_seeds = num_seeds

        # Fill every cell with seeds, then empty the two stores below.
        self.board = [float(num_seeds) for _ in range(num_holes * 2 + 2)]

        self.kalah1_index = self.num_holes  # index of player 1's store
        self.kalah2_index = self.num_holes + 1  # index of player 2's store

        self.board[self.kalah1_index] = 0.
        self.board[self.kalah2_index] = 0.

        self.current_player = 0  # player to move (0 or 1)

        # Offset from one of player 1's holes (index >= 0) to the opposite hole.
        self.diff1 = self.num_holes + 2
        # Offset from one of player 2's holes (negative index) to the opposite hole.
        self.diff2 = self.num_holes

    def step(self, action):
        """Simulate ``action`` for the current player without changing the game.

        Args:
            action: hole number ``0 .. num_holes - 1`` relative to the mover.
                Anything ``int()`` accepts is allowed (plain ints, 0-dim
                tensors, numpy scalars).

        Returns:
            ``[new_board, next_player]``. ``next_player`` equals the mover when
            the last seed lands in the mover's own store (extra turn) or when
            the chosen hole was empty. Choosing an empty hole ends the game:
            every cell is zeroed and the opponent's store receives all
            ``2 * num_holes * num_seeds`` seeds.
        """
        n = self.num_holes
        player = self.current_player
        # Normalise to a plain int: keeps the sowing loop on cheap integer
        # arithmetic and guarantees the caller's object is never modified by
        # the in-place subtraction below.
        hole = int(action)
        new_board = self.board.copy()
        if player == 1:
            hole -= n

        # Picking an empty hole forfeits the game to the opponent.
        if new_board[hole] == 0:
            for i in range(len(new_board)):
                new_board[i] = 0.
            new_board[n + 1 - player] = 2 * n * self.num_seeds
            return [new_board, player]

        seeds = new_board[hole]  # seeds to distribute
        new_board[hole] = 0.

        while seeds > 0:
            hole = hole + 1 if hole >= 0 else hole - 1

            # Past the end of one side: wrap around to the other side.
            if hole == n + 1:
                hole = -1
            if hole == -n - 2:
                hole = 0

            # Never sow into the opponent's store.
            if hole == n and player == 1:
                continue
            if hole == -n - 1 and player == 0:
                continue

            new_board[hole] += 1.
            seeds -= 1.

        # Last seed in a store (necessarily the mover's own): move again.
        if hole == n or hole == -n - 1:
            return [new_board, player]

        # Capture: the last seed landed in a previously empty hole on the
        # mover's own side and the opposite hole is not empty. Both the seed
        # and the opposite hole's contents go to the mover's store.
        on_own_side = (hole >= 0) if player == 0 else (hole < 0)
        if on_own_side and new_board[hole] == 1:
            opposite = hole + (self.diff1 if player == 0 else self.diff2)
            if new_board[opposite] > 0:
                store = self.kalah1_index if player == 0 else self.kalah2_index
                new_board[store] += new_board[opposite] + 1.
                new_board[hole] = 0.
                new_board[opposite] = 0.

        return [new_board, 1 - player]

    def do_step(self, action):
        """Apply ``action`` for the current player and update the game state."""
        new_board, next_player = self.step(action)
        self.board = new_board
        self.current_player = next_player

    def game_over(self):
        """Return True when one side has no seeds left or a store holds more than half of all seeds."""
        n = self.num_holes
        # Half of the 2 * n * num_seeds seeds in play. The previous n * n
        # threshold was only correct when num_seeds == num_holes.
        half = n * self.num_seeds
        if sum(self.board[:n]) == 0 or sum(self.board[self.diff1:]) == 0:
            return True
        return self.board[-n - 1] > half or self.board[n] > half

    def get_winner(self):
        """Return the winner (0 or 1), or None while the game is still running.

        Only the two stores are compared; a tie is reported as player 1.
        """
        if not self.game_over():
            return None
        return 0 if self.board[self.num_holes] > self.board[-1 * self.num_holes - 1] else 1

    def get_value(self, action):
        """Return the number of seeds in the hole ``action`` refers to for the current player."""
        hole = int(action)  # see step(): never mutate the caller's object
        if self.current_player == 1:
            hole -= self.num_holes
        return self.board[hole]

    def get_state(self):
        """Return ``[board, current_player]``; ``board`` is the live list, not a copy."""
        return [self.board, self.current_player]

    def get_valid_moves(self):
        """Return the actions whose hole is not empty for the current player."""
        return [i for i in range(self.num_holes) if self.get_value(i) != 0]

    def _print_board(self):
        """Print both sides of the board (player 2 first, then player 1)."""
        print("player 2:", self.board[-1 * self.num_holes - 1], self.board[self.num_holes + 2:])
        print("player 1:  ", self.board[:self.num_holes], self.board[self.num_holes])

    def _ask_hole(self):
        """Prompt the current player until a hole number within range is entered."""
        prompt = "Player {}'s turn. Enter hole number(0-{}): ".format(
            self.current_player + 1, self.num_holes - 1)
        while True:
            try:
                hole = int(input(prompt))
            except ValueError:
                # Not a number: ask again instead of crashing the game.
                continue
            if 0 <= hole < self.num_holes:
                return hole

    def play_game(self):
        """Play an interactive two-human game on the console."""
        move = 0
        while not self.game_over():
            print("=========== Move ", move, "===========")
            self._print_board()
            self.do_step(self._ask_hole())
            move += 1
        print("Game over. Player {} wins!".format(self.get_winner() + 1))
        self._print_board()

    def playBolvanVsBolvan(self):
        """Play a full game in which both sides are driven by ``bolvan`` and print every move."""
        move = 0
        while not self.game_over():
            print("=========== Move ", move, "===========")
            self._print_board()
            # bolvan() takes the game object itself, not (board, player).
            hole = bolvan(self)
            print("bolvan{} action {}; value {}\n".format(self.current_player+1, hole, self.get_value(hole)))
            self.do_step(hole)
            move += 1
        print("Game over. Player {} wins!".format(self.get_winner() + 1))
        self._print_board()

In [31]:
#@title Функция для инициализации модели
file_name = 'kallah_neyron'

def model_init():
    """Return the policy network, loading it from ``file_name`` when it exists.

    If no checkpoint is found, a fresh network is created: 14 inputs (the
    length of the default 6-hole Kalah board), one hidden sigmoid layer of 14
    units and a softmax over the 6 possible moves.

    Returns:
        An ``nn.Module`` mapping a 14-element board tensor to 6 move
        probabilities.
    """
    DL = 14   # input size: number of cells on the default board
    Prs = 14  # hidden layer width

    if os.path.exists(file_name):
        # NOTE(review): torch.load unpickles arbitrary Python objects, so only
        # load checkpoints produced by this notebook. Recent PyTorch releases
        # may also require weights_only=False to load a whole pickled module -
        # confirm against the installed version.
        model = torch.load(file_name)
        print("model loaded")
    else:
        # Previously this branch was commented out and the function returned
        # the integer 0, which made training fail on model.parameters().
        model = nn.Sequential(
            nn.Linear(DL, Prs),
            nn.Sigmoid(),
            nn.Linear(Prs, 6),
            # Explicit dim avoids PyTorch's implicit-dimension warning.
            nn.Softmax(dim=-1),
        )
        print("model init")

    return model

In [27]:
#@title Болван
def bolvan(game):
    """Baseline opponent: pick uniformly at random among the game's valid moves."""
    candidates = game.get_valid_moves()
    chosen = random.choice(candidates)
    return chosen

In [28]:
def space(board):
    """Convert a board into a new tensor built from a full-slice copy of it."""
    snapshot = board[:]
    return torch.tensor(snapshot)

In [32]:
#@title Функция тренировки
def _discounted_returns(rewards, gamma):
    """Return the reward-to-go ``sum_k gamma**k * rewards[t + k]`` for every step ``t``."""
    returns = [0.] * len(rewards)
    running = 0.
    # Accumulate from the last step backwards: one pass instead of a nested loop.
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        returns[t] = running
    return returns


def train(gamma = 0.9, alpha = 0.45, max_ep = 99999999):
    """Train the policy network with REINFORCE against the random player.

    The network plays as player 0 and ``bolvan`` as player 1. Each network
    move is rewarded with 1 when the chosen hole is not empty and with -500
    when it is empty (which also ends the game inside ``Kalah.step``). After
    every episode the loss ``-alpha * sum_t G_t * log pi(a_t | s_t)`` is
    minimised with Adam, where ``G_t`` is the discounted reward-to-go.

    Every 1000 episodes the win and empty-hole counters are printed and reset
    and the model is saved to ``file_name``. The model is saved once more when
    training finishes, and the elapsed time in seconds is printed.

    Args:
        gamma: discount factor for the reward-to-go.
        alpha: constant factor applied to the loss.
        max_ep: number of episodes (games) to play.

    Raises:
        RuntimeError: if ``model_init()`` does not return an ``nn.Module``.
    """
    time_1 = time.time()
    count_errors = 0
    wins_neural = 0
    model = model_init()
    if not isinstance(model, nn.Module):
        # Fail with a clear message instead of an AttributeError on
        # model.parameters() further down.
        raise RuntimeError(
            "model_init() returned {!r} instead of a torch module; "
            "is the checkpoint '{}' missing?".format(model, file_name))
    optimizer = torch.optim.Adam(model.parameters())
    reward_for_zero = -500

    for episode in range(max_ep):
        if episode % 1000 == 0:
            print("Neural wins per 1000 plays {}: errors per 1000 plays {}".format(wins_neural, count_errors))
            count_errors = 0
            wins_neural = 0

            torch.save(model, file_name)

        log_probs = []  # log pi(a_t | s_t) for every move made by the network
        rewards = []    # matching immediate rewards
        game = Kalah()
        while not game.game_over():
            cur_board, cur_player = game.get_state()

            if cur_player == 0:
                dist = Categorical(model(space(cur_board)))
                sampled = dist.sample()
                # The game works on plain ints; the tensor is only needed for log_prob.
                action = int(sampled)
                # The weights do not change within an episode, so keeping the
                # log-probability now avoids a second forward pass per step.
                log_probs.append(dist.log_prob(sampled))

                if game.get_value(action) != 0:
                    rewards.append(1)
                else:
                    rewards.append(reward_for_zero)
                    count_errors += 1
            else:
                action = bolvan(game)

            game.do_step(action)

        if game.get_winner() == 0:
            wins_neural += 1

        if not log_probs:
            # The network made no move, so there is nothing to optimise.
            continue

        loss = 0.
        for log_prob, R in zip(log_probs, _discounted_returns(rewards, gamma)):
            loss = loss - alpha * R * log_prob

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Persist the final weights: the periodic save above runs before the
    # update of its episode and never covers the last episodes.
    torch.save(model, file_name)

    print(time.time() - time_1)

In [None]:
# Launch training with train()'s default arguments (gamma=0.9, alpha=0.45, max_ep=99999999).
train()

model loaded
Neural wins per 1000 plays 0: errors per 1000 plays 0


  input = module(input)


Neural wins per 1000 plays 32: errors per 1000 plays 3
Neural wins per 1000 plays 33: errors per 1000 plays 1
Neural wins per 1000 plays 44: errors per 1000 plays 3
Neural wins per 1000 plays 32: errors per 1000 plays 3
Neural wins per 1000 plays 36: errors per 1000 plays 1
Neural wins per 1000 plays 35: errors per 1000 plays 3
Neural wins per 1000 plays 32: errors per 1000 plays 2
Neural wins per 1000 plays 42: errors per 1000 plays 5
Neural wins per 1000 plays 30: errors per 1000 plays 0
Neural wins per 1000 plays 50: errors per 1000 plays 7
Neural wins per 1000 plays 42: errors per 1000 plays 5
Neural wins per 1000 plays 36: errors per 1000 plays 2
Neural wins per 1000 plays 39: errors per 1000 plays 4
Neural wins per 1000 plays 44: errors per 1000 plays 4
Neural wins per 1000 plays 346: errors per 1000 plays 7
Neural wins per 1000 plays 371: errors per 1000 plays 3
Neural wins per 1000 plays 645: errors per 1000 plays 5
Neural wins per 1000 plays 595: errors per 1000 plays 9
Neural