In [7]:
import numpy as np
import matplotlib.pyplot as plt

# STEP 1: The Absolute Basics - Just Record Everything
# Section banner: a 60-character rule above and below the step title.
print("=" * 60, "STEP 1: Basic Record Keeping", "=" * 60, sep="\n")

class BasicBanditTracker:
    """
    A bandit "agent" that does nothing except keep a log.

    Every pulled arm and every observed reward is appended to a list, so
    students can look at the raw data before any learning rule is added.
    No estimates are computed and no decisions are made here.
    """

    def __init__(self, k_arms):
        """Set up empty logs for a bandit with ``k_arms`` arms and announce it."""
        self.k = k_arms

        # Parallel lists: entry i of each list belongs to time step i + 1.
        # Keeping the full log is wasteful, but it makes the data visible.
        self.action_history = []
        self.reward_history = []

        print(f"Created a {k_arms}-armed bandit tracker")
        print("I'll remember every action and reward, but I won't make smart decisions yet")

    def record_action_and_reward(self, action, reward):
        """Append one (action, reward) observation to the logs and echo it."""
        self.action_history.append(action)
        self.reward_history.append(reward)

        # The step number is simply how many observations are logged so far.
        steps_so_far = len(self.action_history)
        print(f"Step {steps_so_far}: Chose action {action}, got reward {reward:.2f}")

    def show_history(self):
        """Print the complete action log and the rewards rounded to 2 decimals."""
        print(f"\nHistory after {len(self.action_history)} steps:")
        print(f"Actions chosen: {self.action_history}")
        print(f"Rewards received: {[round(r, 2) for r in self.reward_history]}")

STEP 1: Basic Record Keeping


In [8]:
# Hands-on trial: we pick the arms ourselves and feed in fixed rewards.
tracker = BasicBanditTracker(3)  # three arms, so valid actions are 0, 1 and 2

# Students can edit this sequence to try different choices.
manual_actions = [0, 1, 2, 0, 1, 0]
# One reward per entry of manual_actions, standing in for the environment.
simulated_rewards = [1.2, 0.8, -0.3, 1.5, 0.9, 1.1]

print("\nLet's manually try some actions and see what happens:")
for action, reward in zip(manual_actions, simulated_rewards):
    tracker.record_action_and_reward(action, reward)

tracker.show_history()

# Prompts for classroom discussion, emitted one per line.
print(
    "\nThinking questions for students:",
    "- Which action seems to be giving the best rewards so far?",
    "- How confident are you in that assessment?",
    "- What would you choose next, and why?",
    sep="\n",
)


Created a 3-armed bandit tracker
I'll remember every action and reward, but I won't make smart decisions yet

Let's manually try some actions and see what happens:
Step 1: Chose action 0, got reward 1.20
Step 2: Chose action 1, got reward 0.80
Step 3: Chose action 2, got reward -0.30
Step 4: Chose action 0, got reward 1.50
Step 5: Chose action 1, got reward 0.90
Step 6: Chose action 0, got reward 1.10

History after 6 steps:
Actions chosen: [0, 1, 2, 0, 1, 0]
Rewards received: [1.2, 0.8, -0.3, 1.5, 0.9, 1.1]

Thinking questions for students:
- Which action seems to be giving the best rewards so far?
- How confident are you in that assessment?
- What would you choose next, and why?


In [9]:
# Section banner for step 2 (blank line, rule, title, rule).
print("\n" + "=" * 60, "STEP 2: Adding Simple Analysis", "=" * 60, sep="\n")

class AnalyticBanditTracker:
    """
    A record-keeping bandit that can also summarise its own log.

    It stores the full action/reward history and, on request, derives
    per-action counts, totals and averages from it. A greedy suggestion
    is built on top of those summaries.
    """

    def __init__(self, k_arms):
        """Create empty logs for ``k_arms`` arms and print a short greeting."""
        self.k = k_arms
        self.action_history = []
        self.reward_history = []

        print(f"Created an analytic {k_arms}-armed bandit")
        print("Now I can analyze my experience to make better decisions")

    def record_action_and_reward(self, action, reward):
        """Append one observation to the parallel history lists (silent)."""
        self.action_history.append(action)
        self.reward_history.append(reward)

    def get_action_statistics(self):
        """
        Summarise the history separately for every action.

        Returns a dict keyed by action index (0 .. k-1). Each value is a
        dict with 'count', 'total_reward', 'average_reward' (``None`` when
        the action was never chosen) and 'rewards' (the list of rewards
        observed for that action, in the order they arrived).
        """
        stats = {}

        for action in range(self.k):
            # Pair each logged action with its reward and keep the matches.
            observed = [
                r
                for a, r in zip(self.action_history, self.reward_history)
                if a == action
            ]

            if observed:
                total = sum(observed)
                n_pulls = len(observed)
                stats[action] = {
                    'count': n_pulls,
                    'total_reward': total,
                    'average_reward': total / n_pulls,
                    'rewards': observed
                }
            else:
                # Nothing known about this action yet.
                stats[action] = {
                    'count': 0,
                    'total_reward': 0,
                    'average_reward': None,
                    'rewards': []
                }

        return stats

    def print_analysis(self):
        """Print one summary entry per action, or 'Never tried' if unseen."""
        print(f"\nAnalysis after {len(self.action_history)} steps:")
        print("-" * 40)

        stats = self.get_action_statistics()

        for action in range(self.k):
            s = stats[action]
            if s['count'] == 0:
                print(f"Action {action}: Never tried")
                continue

            print(f"Action {action}: {s['count']} tries, "
                  f"average reward = {s['average_reward']:.3f}")
            print(f"  Individual rewards: {[round(r, 2) for r in s['rewards']]}")

    def suggest_best_action(self):
        """
        Greedy pick: the tried action with the highest average reward.

        Ties go to the lowest action index because a later action must be
        strictly better to replace the current leader. Prints the verdict
        and returns the chosen index, or ``None`` if nothing qualifies.
        """
        stats = self.get_action_statistics()

        best_action = None
        best_average = float('-inf')

        for action in range(self.k):
            entry = stats[action]
            if entry['count'] <= 0:
                # Untried actions have no average to compare.
                continue
            if entry['average_reward'] > best_average:
                best_action = action
                best_average = entry['average_reward']

        if best_action is None:
            print("\nNo actions tried yet - need to explore!")
        else:
            print(f"\nBased on current data, action {best_action} looks best "
                  f"(average reward: {best_average:.3f})")

        return best_action

# Run the analytic tracker on the same hand-picked data as the basic one.
analyzer = AnalyticBanditTracker(3)

# Replay the (action, reward) pairs defined in the earlier cell.
for action, reward in zip(manual_actions, simulated_rewards):
    analyzer.record_action_and_reward(action, reward)

analyzer.print_analysis()
analyzer.suggest_best_action()

# Prompts for classroom discussion, emitted one per line.
print(
    "\nDiscussion points for students:",
    "- Is our 'best' action really the best? How can we be more confident?",
    "- What about actions we haven't tried much?",
    "- How do we balance using what we know vs. learning more?",
    sep="\n",
)

# Closing rule; it doubles as the top rule of the next step's banner.
print("\n" + "=" * 60)



STEP 2: Adding Simple Analysis
Created an analytic 3-armed bandit
Now I can analyze my experience to make better decisions

Analysis after 6 steps:
----------------------------------------
Action 0: 3 tries, average reward = 1.267
  Individual rewards: [1.2, 1.5, 1.1]
Action 1: 2 tries, average reward = 0.850
  Individual rewards: [0.8, 0.9]
Action 2: 1 tries, average reward = -0.300
  Individual rewards: [-0.3]

Based on current data, action 0 looks best (average reward: 1.267)

Discussion points for students:
- Is our 'best' action really the best? How can we be more confident?
- What about actions we haven't tried much?
- How do we balance using what we know vs. learning more?



In [10]:
# Section banner for step 3 (title followed by a 60-character rule).
print("STEP 3: Efficient Value Tracking", "=" * 60, sep="\n")

class EfficientBanditTracker:
    """
    Bandit tracker that keeps only running averages and counts.

    Instead of storing every observation, it maintains two arrays of
    length ``k_arms``: ``Q`` (current average reward per action) and
    ``N`` (number of times each action has been tried). Averages are
    updated with the incremental-mean formula, so no history is needed.
    """
    def __init__(self, k_arms):
        """Create zeroed estimates and counts for ``k_arms`` actions."""
        self.k = k_arms

        # Instead of storing all history, we just track what we need
        self.Q = np.zeros(k_arms)             # Average rewards (our estimates)
        # Counts are whole numbers: keep them as integers so they print as
        # "2 tries" rather than "2.0 tries".
        self.N = np.zeros(k_arms, dtype=int)  # How many times we've tried each action

        print(f"Created an efficient {k_arms}-armed bandit")
        print("Now I track just the essential information: averages and counts")

    def update_estimates(self, action, reward):
        """
        Fold one new reward into the running average for ``action``.

        Uses Q_new = Q_old + (reward - Q_old) / N, where N is the count
        *after* including this observation, and prints a before/after
        summary so students can follow the arithmetic.
        """
        print(f"\nUpdating estimates for action {action} with reward {reward:.2f}")

        # Remember the old values so we can show the change afterwards
        old_average = self.Q[action]
        old_count = self.N[action]

        # Update count
        self.N[action] += 1
        new_count = self.N[action]

        # Calculate new average using incremental formula
        # Q_new = Q_old + (1/N) * (reward - Q_old)
        self.Q[action] = old_average + (reward - old_average) / new_count
        new_average = self.Q[action]

        # Show students what happened
        if old_count == 0:
            print(f"  First time trying action {action}")
            print(f"  New average: {new_average:.3f}")
        else:
            print(f"  Old average: {old_average:.3f} (based on {old_count} tries)")
            print(f"  New average: {new_average:.3f} (based on {new_count} tries)")
            print(f"  Change: {new_average - old_average:.3f}")

    def show_current_estimates(self):
        """Print the current estimate and try-count for every action."""
        print(f"\nCurrent estimates:")
        for action in range(self.k):
            if self.N[action] == 0:
                print(f"Action {action}: No data yet")
            else:
                print(f"Action {action}: {self.Q[action]:.3f} "
                      f"(based on {int(self.N[action])} tries)")

    def get_best_action(self):
        """
        Greedy selection among actions that have been tried at least once.

        Returns the index of the tried action with the highest estimate
        (lowest index wins ties), or ``None`` when nothing has been tried.
        """
        # Untried actions still hold the placeholder estimate 0.0, which
        # says nothing about them, so they are excluded from the comparison.
        tried_actions = [a for a in range(self.k) if self.N[a] > 0]

        if len(tried_actions) == 0:
            return None

        best_action = max(tried_actions, key=lambda a: self.Q[a])
        return best_action

# Replay the hand-picked data through the incremental tracker.
efficient = EfficientBanditTracker(3)

print("Let's see how the incremental averaging works:")
for action, reward in zip(manual_actions, simulated_rewards):
    # Update one estimate, then dump the whole table so each step is visible.
    efficient.update_estimates(action, reward)
    efficient.show_current_estimates()

# get_best_action() returns None when no action has been tried.
best = efficient.get_best_action()
if best is not None:
    print(f"\nGreedy choice: Action {best}")

# Take-away message for the class, emitted one sentence per line.
print(
    "\nKey insight for students:",
    "Notice that we get the same averages as before, but we don't need to store all the data!",
    "The incremental formula gives us mathematical efficiency.",
    sep="\n",
)

# Closing rule for this step.
print("\n" + "=" * 60)


STEP 3: Efficient Value Tracking
Created an efficient 3-armed bandit
Now I track just the essential information: averages and counts
Let's see how the incremental averaging works:

Updating estimates for action 0 with reward 1.20
  First time trying action 0
  New average: 1.200

Current estimates:
Action 0: 1.200 (based on 1 tries)
Action 1: No data yet
Action 2: No data yet

Updating estimates for action 1 with reward 0.80
  First time trying action 1
  New average: 0.800

Current estimates:
Action 0: 1.200 (based on 1 tries)
Action 1: 0.800 (based on 1 tries)
Action 2: No data yet

Updating estimates for action 2 with reward -0.30
  First time trying action 2
  New average: -0.300

Current estimates:
Action 0: 1.200 (based on 1 tries)
Action 1: 0.800 (based on 1 tries)
Action 2: -0.300 (based on 1 tries)

Updating estimates for action 0 with reward 1.50
  Old average: 1.200 (based on 1.0 tries)
  New average: 1.350 (based on 2.0 tries)
  Change: 0.150

Current estimates:
Action 0: 1

In [11]:
# Section banner for step 4 (title followed by a 60-character rule).
print("STEP 4: Adding Exploration Strategy", "=" * 60, sep="\n")

class SimpleEpsilonGreedy:
    """
    Epsilon-greedy bandit agent built on incremental value estimates.

    Every arm is tried once first (in random order). After that, with
    probability ``epsilon`` the agent picks a uniformly random arm and
    otherwise picks the arm with the highest current estimate.
    Randomness comes from NumPy's global ``np.random`` state.
    """

    def __init__(self, k_arms, epsilon=0.1):
        """Initialise zeroed estimates/counts for ``k_arms`` arms."""
        self.k = k_arms
        self.epsilon = epsilon

        # Running averages and pull counts, one slot per arm.
        self.Q = np.zeros(k_arms)
        self.N = np.zeros(k_arms)

        # Number of decisions made so far (used only for the log lines).
        self.total_steps = 0

        print(f"Created epsilon-greedy bandit with ε = {epsilon}")
        print("Now I can balance exploration and exploitation!")

    def choose_action(self):
        """
        Pick the next arm and print which rule produced the choice.

        Order of precedence: an untried arm (if any), then a random arm
        with probability epsilon, then the arm with the largest estimate.
        """
        self.total_steps += 1

        # Phase 1: make sure every arm has at least one sample.
        never_pulled = [arm for arm in range(self.k) if self.N[arm] == 0]
        if len(never_pulled) > 0:
            action = np.random.choice(never_pulled)
            print(f"Step {self.total_steps}: Trying untested action {action}")
            return action

        # Phase 2: flip the epsilon-weighted coin.
        explore = np.random.random() < self.epsilon
        if not explore:
            # Exploit: np.argmax returns the first index on ties.
            action = np.argmax(self.Q)
            print(f"Step {self.total_steps}: Exploiting - chose best action {action} "
                  f"(value: {self.Q[action]:.3f})")
            return action

        # Explore: uniform over all arms, including the current best.
        action = np.random.randint(self.k)
        print(f"Step {self.total_steps}: Exploring - chose random action {action}")
        return action

    def learn_from_reward(self, action, reward):
        """Fold ``reward`` into the running average for ``action``."""
        pulls = self.N[action] + 1
        self.N[action] = pulls
        # Incremental mean: move the estimate 1/pulls of the way to the reward.
        self.Q[action] += (reward - self.Q[action]) / pulls

    def show_learning_progress(self):
        """Print each arm's estimate, pull count and a rough confidence tag."""
        print("\nCurrent state of learning:")
        for action, (value, pulls) in enumerate(zip(self.Q, self.N)):
            if pulls == 0:
                print(f"Action {action}: Never tried")
                continue

            # Five or more samples are labelled "high" confidence.
            if pulls >= 5:
                confidence = "high"
            else:
                confidence = "low"
            print(f"Action {action}: value = {self.Q[action]:.3f}, "
                  f"tried {int(self.N[action])} times ({confidence} confidence)")

# Create a simple environment for demonstration
class SimpleEnvironment:
    """
    A basic bandit environment with fixed true values and Gaussian noise.

    Each call to ``get_reward`` returns the chosen action's true value plus
    a fresh sample of zero-mean normal noise drawn from NumPy's global
    ``np.random`` state.

    Parameters
    ----------
    true_values : sequence of numbers, optional
        True mean reward of each action. Defaults to ``[0.5, 1.2, 0.8]``
        (action 1 is best), which is what the classroom demo uses.
    noise_std : float, optional
        Standard deviation of the reward noise (default 0.3). Use 0 for
        noise-free rewards.
    """
    def __init__(self, true_values=None, noise_std=0.3):
        if true_values is None:
            # True values that the bandit needs to learn
            true_values = [0.5, 1.2, 0.8]  # Action 1 is actually best
        # Store a private copy as plain floats so later changes to the
        # caller's sequence cannot silently alter the environment.
        self.true_values = [float(v) for v in true_values]
        self.noise_std = noise_std
        print(f"Environment created with true action values: {self.true_values}")
        print("(The bandit doesn't know these - it has to learn them!)")

    def get_reward(self, action):
        """Return a noisy reward: the action's true value plus N(0, noise_std) noise."""
        true_value = self.true_values[action]
        # Add some randomness so the bandit can't learn immediately
        noise = np.random.normal(0, self.noise_std)
        return true_value + noise

# Final demonstration: complete learning loop
print("Complete learning demonstration:")
print("-" * 40)

# Both the environment's noise and the agent's choices draw from NumPy's
# global random state. Seed it so a fresh "Restart & Run All" reproduces the
# same trace; change the number to see a different run.
np.random.seed(42)

env = SimpleEnvironment()
agent = SimpleEpsilonGreedy(3, epsilon=0.2)

# Run several steps so students can see the learning process
for step in range(10):
    action = agent.choose_action()
    reward = env.get_reward(action)
    agent.learn_from_reward(action, reward)

    print(f"  → Received reward: {reward:.2f}")

    if step % 3 == 2:  # Show progress after every third step (steps 3, 6, 9)
        agent.show_learning_progress()
        print()

print("=" * 60)


STEP 4: Adding Exploration Strategy
Complete learning demonstration:
----------------------------------------
Environment created with true action values: [0.5, 1.2, 0.8]
(The bandit doesn't know these - it has to learn them!)
Created epsilon-greedy bandit with ε = 0.2
Now I can balance exploration and exploitation!
Step 1: Trying untested action 0
  → Received reward: 0.73
Step 2: Trying untested action 1
  → Received reward: 0.77
Step 3: Trying untested action 2
  → Received reward: 1.06

Current state of learning:
Action 0: value = 0.728, tried 1 times (low confidence)
Action 1: value = 0.773, tried 1 times (low confidence)
Action 2: value = 1.056, tried 1 times (low confidence)

Step 4: Exploiting - chose best action 2 (value: 1.056)
  → Received reward: 0.36
Step 5: Exploring - chose random action 2
  → Received reward: 0.29
Step 6: Exploiting - chose best action 1 (value: 0.773)
  → Received reward: 1.26

Current state of learning:
Action 0: value = 0.728, tried 1 times (low conf