In [11]:
import flappy_bird_env
import gymnasium
import gym
import torch
from gymnasium.utils.play import play
import numpy as np
import pygame
import math

In [4]:
# Create the Flappy Bird environment. With render_mode="rgb_array",
# env.render() hands the frame back as an array instead of drawing a window.
env = gymnasium.make("FlappyBird-v0", render_mode="rgb_array")

In [5]:
# Start a new episode, then render the current frame; the array shown as this
# cell's output is the value returned by env.render().
env.reset()
env.render()

array([[[ 78, 192, 202],
        [ 78, 192, 202],
        [ 78, 192, 202],
        ...,
        [ 78, 192, 202],
        [ 78, 192, 202],
        [ 78, 192, 202]],

       [[ 78, 192, 202],
        [ 78, 192, 202],
        [ 78, 192, 202],
        ...,
        [ 78, 192, 202],
        [ 78, 192, 202],
        [ 78, 192, 202]],

       [[ 78, 192, 202],
        [ 78, 192, 202],
        [ 78, 192, 202],
        ...,
        [ 78, 192, 202],
        [ 78, 192, 202],
        [ 78, 192, 202]],

       ...,

       [[222, 216, 149],
        [222, 216, 149],
        [222, 216, 149],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[222, 216, 149],
        [222, 216, 149],
        [222, 216, 149],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]],

       [[222, 216, 149],
        [222, 216, 149],
        [222, 216, 149],
        ...,
        [222, 216, 149],
        [222, 216, 149],
        [222, 216, 149]]

In [6]:
# play(env, keys_to_action={(pygame.K_SPACE,): np.array([1])}, noop=np.array([0]),
#      fps=24)

In [7]:
# Display the environment's action space to see how many actions are available.
env.action_space

Discrete(2)

In [8]:
def gen_epsilon_greedy_policy(estimator, epsilon, n_action):
    """Build an epsilon-greedy policy on top of a Q-value estimator.

    Args:
        estimator: object exposing ``predict(state)`` that returns the
            Q-values for ``state`` (one entry per action).
        epsilon: exploration mass spread uniformly over all actions.
        n_action: number of available actions.

    Returns:
        A function ``policy_function(state) -> int`` that samples an action:
        every action gets probability ``epsilon / n_action`` and the action
        with the highest predicted Q-value gets the remaining ``1 - epsilon``
        on top of that.
    """
    def policy_function(state):
        # Index of the action the estimator currently rates highest.
        greedy = torch.argmax(estimator.predict(state)).item()

        # Uniform exploration floor, then pile the leftover mass on the
        # greedy action so the weights sum to 1.
        weights = torch.ones(n_action) * epsilon / n_action
        weights[greedy] += 1.0 - epsilon

        # Draw a single action index according to those weights.
        return torch.multinomial(weights, 1).item()

    return policy_function

In [13]:
class LR_Estimate():
    """Per-action linear Q-value estimator on random cosine features.

    A state vector ``s`` is mapped to ``n_feat`` features of the form
    ``sqrt(2 / n_feat) * cos(s @ w + b)`` with a fixed random projection
    ``w`` and phase ``b``. Each action owns its own ``torch.nn.Linear(n_feat, 1)``
    model, trained with SGD on a mean-squared-error loss.

    Args:
        n_feat: number of random features.
        n_state: dimensionality of the state vector.
        n_action: number of actions (one linear model per action).
        lr: learning rate of every per-action SGD optimizer.
    """

    def __init__(self, n_feat, n_state, n_action, lr=0.05):
        self.w, self.b = self.get_gaussian_wb(n_feat, n_state)
        self.n_feat = n_feat
        self.models = []
        self.optimizers = []
        self.criterion = torch.nn.MSELoss()
        for _ in range(n_action):
            model = torch.nn.Linear(n_feat, 1)
            self.models.append(model)
            optimizer = torch.optim.SGD(model.parameters(), lr)
            self.optimizers.append(optimizer)

    def get_gaussian_wb(self, n_feat, n_state, sigma=.2):
        """Draw the fixed random projection used by ``get_feature``.

        Returns:
            ``(w, b)`` where ``w`` has shape ``(n_state, n_feat)`` with
            standard-normal entries divided by ``sigma`` and ``b`` has shape
            ``(n_feat,)`` with entries uniform in ``[0, 2*pi)``.
        """
        # NOTE: this reseeds torch's *global* RNG, so everything sampled after
        # constructing an estimator (including the Linear initialisations in
        # __init__) restarts from the same random stream.
        torch.manual_seed(0)
        w = torch.randn((n_state, n_feat)) / sigma
        b = torch.rand((n_feat)) * 2.0 * math.pi
        return w, b

    def get_feature(self, s):
        """Map state ``s`` (length ``n_state``) to its ``n_feat`` cosine features."""
        # as_tensor accepts lists, numpy arrays and tensors alike, without the
        # copy-construct warning torch.tensor() emits for tensor inputs.
        state = torch.as_tensor(s, dtype=torch.float32)
        features = (2.0 / self.n_feat) ** 0.5 * torch.cos(torch.matmul(state, self.w) + self.b)
        return features

    def update(self, s, a, y):
        """Take one SGD step moving the model of action ``a`` towards target ``y``.

        Args:
            s: state vector the target was computed for.
            a: index of the action whose model is updated.
            y: scalar regression target for that (state, action) pair.
        """
        # Plain tensors carry autograd information themselves; the old
        # Variable wrapper is unnecessary (and was never imported here).
        features = self.get_feature(s)
        y_pred = self.models[a](features)

        # The target is a constant: detach it so no gradient flows through it.
        target = torch.as_tensor(y, dtype=torch.float32).detach().reshape(1)
        loss = self.criterion(y_pred, target)

        self.optimizers[a].zero_grad()
        loss.backward()
        self.optimizers[a].step()

    def predict(self, s):
        """Return the predicted Q-values of state ``s`` as a 1-D tensor, one per action."""
        features = self.get_feature(s)
        with torch.no_grad():
            # Each model yields a single value; concatenate them in action order.
            return torch.cat([model(features).reshape(1) for model in self.models])

In [15]:
# Take the number of actions from the environment rather than hard-coding it:
# the action space is Discrete(2), so a policy built for 10 actions could
# sample indices the environment does not have.
n_action = int(env.action_space.n)

# NOTE(review): n_feat=10 and n_state=10 are kept from the original call;
# n_state has to match the length of the state vector that will be passed to
# the estimator — confirm against the observation actually used.
estimator = LR_Estimate(10, 10, n_action)
policy = gen_epsilon_greedy_policy(estimator, 0.1, n_action)

In [16]:
# Quick check that the factory returned the inner policy closure.
print(policy)

<function gen_epsilon_greedy_policy.<locals>.policy_function at 0x7fa02b501a80>
