In [None]:
# [1. 환경 정의]

import numpy as np
import random

class CardFlipGame:
    """Push-your-luck card game environment.

    On each turn the player either stops (action 0) and keeps the current
    score, or flips the next card (action 1). A point card adds its value to
    the score; the penalty card 'X' resets the score to 0 and ends the game.
    """

    def __init__(self, cards):
        """
        cards: Iterable of cards, where positive integers are point cards and
            'X' is the penalty card.
        """
        # Keep a private copy: reset() shuffles self.cards in place and must
        # not reorder the list owned by the caller.
        self.cards = list(cards)
        self.reset()

    def reset(self):
        """Shuffle the deck, clear all per-episode state and return the initial state."""
        random.shuffle(self.cards)
        self.remaining_cards = self.cards[:]
        self.flipped_cards = []  # cards flipped so far, in order
        self.score = 0
        self.done = False
        self.last_score_before_x = None  # score held right before flipping 'X'
        return self.get_state()

    def get_state(self):
        """Return (score, number of remaining cards, number of remaining 'X' cards)."""
        return (self.score, len(self.remaining_cards), self.remaining_cards.count('X'))

    def step(self, action):
        """Apply one action and return (state, score, done).

        action: 0 stops the game and keeps the score; 1 flips the next card.
            Any other value leaves the game unchanged.

        Raises:
            ValueError: if the game has already finished.
        """
        if self.done:
            raise ValueError("Game is already finished.")

        if action == 0:  # Stop playing
            self.done = True
            return self.get_state(), self.score, self.done

        if action == 1:  # Flip a card
            if not self.remaining_cards:
                # Nothing left to flip (deck exhausted or empty): end the game
                # with the current score instead of crashing on pop().
                self.done = True
                return self.get_state(), self.score, self.done

            flipped_card = self.remaining_cards.pop(0)
            self.flipped_cards.append(flipped_card)
            if flipped_card == 'X':
                self.last_score_before_x = self.score  # remember what was lost
                self.score = 0
                self.done = True
            else:
                self.score += flipped_card

        return self.get_state(), self.score, self.done

In [None]:
# [2. 강화학습 알고리즘]

import tensorflow as tf
from collections import deque
import numpy as np
import random

class DQNAgent:
    """DQN agent with an experience-replay buffer and a separate target network."""

    def __init__(self, state_size, action_size):
        """
        state_size: number of features in a state vector.
        action_size: number of discrete actions.
        """
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # replay buffer; oldest entries are dropped first
        self.gamma = 0.95  # discount factor
        self.epsilon = 1.0  # exploration rate
        self.epsilon_min = 0.01  # lower bound for the exploration rate
        self.epsilon_decay = 0.995  # multiplicative decay applied after each replay
        self.learning_rate = 0.001
        self.model = self._build_model()
        self.target_model = self._build_model()  # target network
        # The two networks are initialised with independent random weights;
        # sync them so the first bootstrapped targets are consistent with
        # the network being trained.
        self.update_target_model()

    def _build_model(self):
        """Build and compile the Q-network (two hidden layers of 24 ReLU units)."""
        model = tf.keras.Sequential([
            # Explicit Input layer instead of `input_dim` on the first Dense
            # layer, which newer Keras versions warn about.
            tf.keras.Input(shape=(self.state_size,)),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(24, activation='relu'),
            tf.keras.layers.Dense(self.action_size, activation='linear'),
        ])
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate),
                      loss='mse')
        return model

    def remember(self, state, action, reward, next_state, done):
        """Store one transition in the replay buffer."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Pick an action epsilon-greedily and return it as a plain int."""
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0: predict() would otherwise print a progress bar per call.
        act_values = self.model.predict(np.array([state]), verbose=0)
        return int(np.argmax(act_values[0]))

    def replay(self, batch_size):
        """Train the online network on a random minibatch from the replay buffer.

        Does nothing until the buffer holds at least `batch_size` transitions.
        After training, epsilon is decayed but never below `epsilon_min`.
        """
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)

        states = np.array([t[0] for t in minibatch], dtype=np.float32)
        actions = np.array([t[1] for t in minibatch], dtype=np.int64)
        rewards = np.array([t[2] for t in minibatch], dtype=np.float32)
        next_states = np.array([t[3] for t in minibatch], dtype=np.float32)
        dones = np.array([t[4] for t in minibatch], dtype=bool)

        # One batched forward pass per network instead of one per sample.
        next_q = self.target_model.predict(next_states, verbose=0)
        bootstrapped = rewards + self.gamma * np.amax(next_q, axis=1)
        # Terminal transitions have no future value to bootstrap from.
        targets = np.where(dones, rewards, bootstrapped)

        # Only the Q-value of the action actually taken is moved toward the
        # target; the other outputs keep their current predictions.
        target_f = self.model.predict(states, verbose=0)
        target_f[np.arange(batch_size), actions] = targets

        # batch_size=1 keeps one optimizer step per sampled transition while
        # issuing a single fit() call for the whole minibatch.
        self.model.fit(states, target_f, batch_size=1, epochs=1,
                       shuffle=False, verbose=0)

        # Clamp so the exploration rate never drops below its floor.
        self.epsilon = max(self.epsilon_min, self.epsilon * self.epsilon_decay)

    def update_target_model(self):
        """Copy the online network's weights into the target network."""
        self.target_model.set_weights(self.model.get_weights())

In [None]:
# [3. 훈련 루프]

import matplotlib.pyplot as plt

# Set up the environment and the agent.
# Example deck: point cards 1-9 plus one penalty card 'X'. An empty deck
# cannot be played, so a concrete deck is required here.
# NOTE: replace this with the deck you actually want to train on.
cards = [1, 2, 3, 4, 5, 6, 7, 8, 9, 'X']
env = CardFlipGame(cards)
state_size = 3  # (current score, remaining cards, remaining 'X' cards)
action_size = 2  # (0: stop, 1: flip a card)
agent = DQNAgent(state_size, action_size)

episodes = 1000  # number of training episodes
batch_size = 32

# Per-episode results
rewards = []
heights = []
colors = []  # success: blue, failure: red

# Training loop
for e in range(episodes):
    state = np.array(env.reset())
    total_reward = 0

    while True:
        action = agent.act(state)  # choose an action
        next_state, reward, done = env.step(action)  # apply it
        next_state = np.array(next_state)

        # Reward shaping
        if not done:
            # env.step() reports the running score. Rewarding it on every flip
            # would count the same points again on each later step, so only
            # the terminal transition carries the final score as reward.
            reward = 0
        elif env.last_score_before_x is not None:  # flipped 'X'
            height = env.last_score_before_x
            reward = 0
            colors.append('red')  # failure is red
        else:  # the game ended without flipping 'X'
            height = env.score
            reward = env.score
            colors.append('blue')  # success is blue

        agent.remember(state, action, reward, next_state, done)  # store the transition
        state = next_state
        total_reward += reward

        if done:
            x_feedback = f", Last score before 'X': {env.last_score_before_x}" if env.last_score_before_x is not None else ""
            print(f"Episode {e+1}/{episodes}, Reward: {reward}{x_feedback}")
            print(f"Flipped cards in this episode: {env.flipped_cards}")
            rewards.append(reward)
            heights.append(height)
            break

    # Experience replay
    agent.replay(batch_size)

    if e % 10 == 0:
        agent.update_target_model()  # refresh the target network

# Plot the results
from matplotlib.patches import Patch

plt.figure(figsize=(15, 6))
plt.bar(range(len(heights)), heights, color=colors, alpha=0.6)
plt.title("Reward per Episode")
plt.xlabel("Episode")
plt.ylabel("Reward")
# All bars belong to a single container, so a plain list of labels would only
# produce one legend entry; use proxy patches to show both colours.
plt.legend(
    handles=[
        Patch(color='blue', alpha=0.6, label="Success (Blue)"),
        Patch(color='red', alpha=0.6, label="Failure (Red)"),
    ],
    loc="upper left",
)
plt.show()

# Analysis of the last 100 episodes
last_100_rewards = rewards[-100:]
success_count = colors[-100:].count('blue')  # number of successful (blue) episodes
average_reward = np.mean(last_100_rewards)

print(f"Last 100 episodes - Success count: {success_count}, Average reward: {average_reward:.2f}")