In [1]:
!pip install gymnasium flappy-bird-gymnasium opencv-python
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
import cv2
from collections import deque
import gymnasium as gym
import flappy_bird_gymnasium


class FlappyDQN(nn.Module):
    """Convolutional Q-network: one grayscale frame in, one Q-value per action out.

    The network is three convolutions followed by two linear layers. The
    first linear layer is sized for a ``64 * 4 * 4`` feature vector, which is
    what the convolution stack produces for a single-channel 64x64 input.
    """

    def __init__(self, action_size=2):
        super().__init__()
        # Convolutional feature extractor (1 -> 32 -> 64 -> 64 channels).
        self.conv1 = nn.Conv2d(in_channels=1, out_channels=32, kernel_size=8, stride=4)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=4, stride=2)
        self.conv3 = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=3, stride=1)
        # Fully connected head that ends in one output per action.
        self.fc1 = nn.Linear(in_features=64 * 4 * 4, out_features=512)
        self.fc2 = nn.Linear(in_features=512, out_features=action_size)

    def forward(self, x):
        """Return Q-values with shape ``(batch, action_size)``.

        The input is cast to float and divided by 255 before the first
        convolution, so raw 0-255 pixel tensors can be passed directly.
        """
        features = x.float() / 255.0
        for conv in (self.conv1, self.conv2, self.conv3):
            features = torch.relu(conv(features))
        # Collapse channel and spatial dimensions into one vector per sample.
        flat = features.view(len(features), -1)
        hidden = torch.relu(self.fc1(flat))
        return self.fc2(hidden)


def preprocess_frame(frame):
    """Turn a rendered RGB frame into a 64x64 ``uint8`` grayscale image.

    When ``frame`` is ``None`` an all-zero 64x64 ``uint8`` array is returned
    instead, so callers always receive an array of the same shape and dtype.
    """
    side = 64
    if frame is not None:
        grayscale = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
        return cv2.resize(grayscale, (side, side)).astype(np.uint8)
    # No frame available: fall back to a blank image.
    return np.zeros((side, side), dtype=np.uint8)


class Agent:
    """Epsilon-greedy DQN agent with a replay buffer and a target network.

    ``policy_net`` is the network being optimised and used to pick actions.
    ``target_net`` is a periodically refreshed copy used only to compute the
    bootstrap term of the TD target, so the regression target does not shift
    on every gradient step.

    ``epsilon_decay`` and ``epsilon_min`` are stored here but never applied by
    any method of this class; the caller is responsible for updating
    ``epsilon`` between episodes.
    """

    def __init__(self, action_size=2, target_update_every=1000):
        """Build the networks, optimiser, replay buffer and hyperparameters.

        Args:
            action_size: Number of discrete actions (network output size).
            target_update_every: Number of successful ``train()`` steps
                between hard copies of ``policy_net`` into ``target_net``.

        Raises:
            ValueError: If ``target_update_every`` is smaller than 1.
        """
        if target_update_every < 1:
            raise ValueError("target_update_every must be >= 1")

        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.action_size = action_size
        self.policy_net = FlappyDQN(action_size).to(self.device)
        # Copy of the policy network used only for bootstrapping; it is never
        # optimised directly, only overwritten from policy_net.
        self.target_net = FlappyDQN(action_size).to(self.device)
        self.target_net.load_state_dict(self.policy_net.state_dict())
        self.target_net.eval()
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=1e-4)
        # Built once instead of on every train() call.
        self.loss_fn = nn.MSELoss()
        self.memory = deque(maxlen=20000)

        self.gamma = 0.99
        self.epsilon = 1.0
        self.epsilon_decay = 0.99
        self.epsilon_min = 0.01
        self.batch_size = 64
        self.target_update_every = target_update_every
        self.train_steps = 0  # number of gradient steps taken so far

    def select_action(self, state):
        """Pick an action for a single 2-D frame using epsilon-greedy.

        With probability ``epsilon`` a uniformly random action index is
        returned; otherwise the arg-max of ``policy_net``'s output is returned.
        """
        if random.random() < self.epsilon:
            return random.randrange(self.action_size)
        with torch.no_grad():
            # Add batch and channel dimensions: (H, W) -> (1, 1, H, W).
            state_t = torch.as_tensor(state, device=self.device).unsqueeze(0).unsqueeze(0)
            return torch.argmax(self.policy_net(state_t)).item()

    def train(self):
        """Run one gradient step on a random minibatch from the replay buffer.

        Each stored transition is a
        ``(state, action, reward, next_state, done)`` tuple.

        Returns:
            The minibatch loss as a float, or ``None`` when the buffer holds
            fewer than ``batch_size`` transitions and no step was taken.
        """
        if len(self.memory) < self.batch_size:
            return None

        batch = random.sample(self.memory, self.batch_size)
        states, actions, rewards, next_states, dones = zip(*batch)

        # unsqueeze(1) inserts the channel dimension: (B, H, W) -> (B, 1, H, W).
        states_t = torch.as_tensor(np.stack(states), device=self.device).unsqueeze(1)
        actions_t = torch.as_tensor(actions, device=self.device, dtype=torch.long).unsqueeze(1)
        rewards_t = torch.as_tensor(rewards, device=self.device, dtype=torch.float)
        next_states_t = torch.as_tensor(np.stack(next_states), device=self.device).unsqueeze(1)
        dones_t = torch.as_tensor(dones, device=self.device, dtype=torch.float)

        # Q(s, a) for the actions that were actually taken.
        current_q = self.policy_net(states_t).gather(1, actions_t).squeeze(1)

        with torch.no_grad():
            # Bootstrap from the target network, not the network being
            # updated; (1 - done) removes the bootstrap on terminal steps.
            max_next_q = self.target_net(next_states_t).max(1)[0]
            target_q = rewards_t + self.gamma * max_next_q * (1 - dones_t)

        loss = self.loss_fn(current_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Hard-sync the target network every `target_update_every` steps.
        self.train_steps += 1
        if self.train_steps % self.target_update_every == 0:
            self.target_net.load_state_dict(self.policy_net.state_dict())

        return loss.item()


def evaluate_agent(agent, env, episodes=20):
    """Play greedy episodes and report the number of pipes passed in each.

    ``agent.epsilon`` is set to 0 for the duration of the evaluation and is
    always restored afterwards, including when an episode raises or the run
    is interrupted.

    Args:
        agent: Object exposing a writable ``epsilon`` attribute and a
            ``select_action(state)`` method.
        env: Environment whose ``step`` returns the five-tuple
            ``(obs, reward, terminated, truncated, info)`` and whose
            ``render()`` returns the frame passed to ``preprocess_frame``.
        episodes: Number of evaluation episodes to play.

    Returns:
        ``np.int32`` array with one score per episode, taken from
        ``info["score"]`` (0 when the key is missing). The array is empty when
        no episode was played.
    """
    print("\n=== EVALUARE (multiple runs, epsilon = 0) ===")
    old_epsilon = agent.epsilon
    agent.epsilon = 0.0

    scores = []

    try:
        for ep in range(episodes):
            env.reset()
            state = preprocess_frame(env.render())
            done = False
            pipes_passed = 0

            while not done:
                action = agent.select_action(state)
                _, _, terminated, truncated, info = env.step(action)
                done = terminated or truncated

                pipes_passed = info.get("score", 0)
                state = preprocess_frame(env.render())

            scores.append(pipes_passed)
            print(f"Eval Episod {ep+1:02d} → Pipe-uri: {pipes_passed}")
    finally:
        # Restore exploration even on error or interrupt, so later training
        # does not silently continue with a fully greedy agent.
        agent.epsilon = old_epsilon

    scores = np.array(scores, dtype=np.int32)
    print("\n=== REZULTATE FINALE ===")
    if scores.size == 0:
        # max()/min() raise on an empty array, so skip the statistics.
        print("No evaluation episodes were run.")
        return scores

    print(f"Mean pipes   : {scores.mean():.2f}")
    print(f"Std deviation: {scores.std():.2f}")
    print(f"Best run     : {scores.max()}")
    print(f"Worst run    : {scores.min()}")

    return scores


# Seed the RNGs used below so a Restart & Run All follows the same sequence of
# random draws (GPU kernels may still introduce small nondeterminism).
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

env = gym.make("FlappyBird-v0", render_mode="rgb_array")
agent = Agent()
episodes = 5000
# A checkpoint is written only when an episode beats this score; it starts at
# the original ">10 pipes" threshold and rises with every save, so a weaker
# later episode cannot overwrite a stronger checkpoint.
best_pipes = 10


for ep in range(episodes):
    # Seed the environment once; later resets continue the same RNG stream.
    env.reset(seed=SEED if ep == 0 else None)
    state = preprocess_frame(env.render())
    total_reward = 0
    pipes_passed = 0
    done = False

    while not done:
        action = agent.select_action(state)
        _, reward, terminated, truncated, info = env.step(action)
        done = terminated or truncated

        pipes_passed = info.get("score", 0)
        next_state = preprocess_frame(env.render())

        # Only a real terminal state gets the death penalty and is stored as
        # terminal. A truncation (time limit) ends the episode loop but the
        # transition must still bootstrap from next_state.
        adj_reward = -15 if terminated else reward

        agent.memory.append((state, action, adj_reward, next_state, terminated))
        state = next_state
        # The log reports the unmodified environment reward.
        total_reward += reward

        agent.train()

    # Decay exploration once per episode without undershooting the floor.
    agent.epsilon = max(agent.epsilon_min, agent.epsilon * agent.epsilon_decay)

    if ep % 5 == 0:
        print(f"Episod: {ep:4} | Pipe-uri: {pipes_passed:2} | Scor RL: {total_reward:5.1f} | Epsilon: {agent.epsilon:.3f}")

    if pipes_passed > best_pipes:
        best_pipes = pipes_passed
        print(f"--- Performanta buna ! Model salvat cu {pipes_passed} pipe-uri ---")
        torch.save(agent.policy_net.state_dict(), "flappy_expert.pth")


scores = evaluate_agent(agent, env, episodes=20)

env.close()


Collecting flappy-bird-gymnasium
  Downloading flappy_bird_gymnasium-0.4.0-py3-none-any.whl.metadata (4.5 kB)
Downloading flappy_bird_gymnasium-0.4.0-py3-none-any.whl (37.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m37.3/37.3 MB[0m [31m41.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: flappy-bird-gymnasium
Successfully installed flappy-bird-gymnasium-0.4.0


  logger.warn(f"{pre} is not within the observation space.")
  logger.warn(f"{pre} is not within the observation space.")


Episod:    0 | Pipe-uri:  0 | Scor RL:  -8.1 | Epsilon: 0.990
Episod:    5 | Pipe-uri:  0 | Scor RL:  -7.5 | Epsilon: 0.941
Episod:   10 | Pipe-uri:  0 | Scor RL:  -8.1 | Epsilon: 0.895
Episod:   15 | Pipe-uri:  0 | Scor RL:  -8.1 | Epsilon: 0.851
Episod:   20 | Pipe-uri:  0 | Scor RL:  -6.3 | Epsilon: 0.810
Episod:   25 | Pipe-uri:  0 | Scor RL:  -6.3 | Epsilon: 0.770
Episod:   30 | Pipe-uri:  0 | Scor RL:  -8.1 | Epsilon: 0.732
Episod:   35 | Pipe-uri:  0 | Scor RL:  -7.5 | Epsilon: 0.696
Episod:   40 | Pipe-uri:  0 | Scor RL:  -6.9 | Epsilon: 0.662
Episod:   45 | Pipe-uri:  0 | Scor RL:  -6.9 | Epsilon: 0.630
Episod:   50 | Pipe-uri:  0 | Scor RL:  -8.1 | Epsilon: 0.599
Episod:   55 | Pipe-uri:  0 | Scor RL:  -8.1 | Epsilon: 0.570
Episod:   60 | Pipe-uri:  0 | Scor RL:  -6.3 | Epsilon: 0.542
Episod:   65 | Pipe-uri:  0 | Scor RL:  -8.1 | Epsilon: 0.515
Episod:   70 | Pipe-uri:  0 | Scor RL:  -3.3 | Epsilon: 0.490
Episod:   75 | Pipe-uri:  0 | Scor RL:  -5.7 | Epsilon: 0.466
Episod: 