<a href="https://colab.research.google.com/github/worldofaryavart/AI-for-India/blob/main/MathsRL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers

In [6]:
# Simplified action space (digits 0-9 and signs)
actions = ['0','1','2','3','4','5','6','7','8','9', '+', '-']
num_actions = len(actions)

# Environment with one-hot operator encoding
def generate_problem():
    a = np.random.randint(0, 10)
    b = np.random.randint(0, 10)
    op = np.random.choice(['+', '-'])
    correct = a + b if op == '+' else a - b
    # State: [a/10, b/10, is_addition, is_subtraction]
    state = np.array([a/10.0, b/10.0, 1.0 if op == '+' else 0.0, 0.0 if op == '+' else 1.0])
    return state, str(correct)

In [7]:
model = tf.keras.Sequential([
    layers.Dense(32, activation='relu', input_shape=(4,)),
    layers.Dense(32, activation='relu'),
    layers.Dense(num_actions, activation='softmax')
])

optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

In [8]:
# Batched training loop
def train(num_episodes=5000):
    batch_size = 32
    states, rewards, actions_idxs = [], [], []

    for episode in range(num_episodes):
        state, correct = generate_problem()
        logits = model(tf.expand_dims(state, axis=0))
        action_idx = tf.random.categorical(logits, 1)[0, 0].numpy()
        predicted = actions[action_idx]

        # Calculate shaped reward
        try:
            reward = 1.0 if predicted == correct else -0.1  # Less punishment
        except:
            reward = -1.0

        # Store batch
        states.append(state)
        actions_idxs.append(action_idx)
        rewards.append(reward)

        # Update model in batches
        if len(states) == batch_size:
            with tf.GradientTape() as tape:
                logits = model(np.array(states))
                probs = tf.nn.softmax(logits)
                selected_log_probs = tf.math.log(
                    tf.reduce_sum(probs * tf.one_hot(actions_idxs, num_actions), axis=1)
                )
                loss = -tf.reduce_mean(selected_log_probs * rewards)

            grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            # Reset
            states, rewards, actions_idxs = [], [], []

        if episode % 500 == 0:
            print(f"Episode {episode}: Predicted {predicted}, Correct {correct}")

train()

Episode 0: Predicted 6, Correct 16
Episode 500: Predicted 0, Correct 8
Episode 1000: Predicted 9, Correct 7
Episode 1500: Predicted 8, Correct 2
Episode 2000: Predicted 3, Correct 0
Episode 2500: Predicted 5, Correct 6
Episode 3000: Predicted 7, Correct 6
Episode 3500: Predicted 4, Correct 9
Episode 4000: Predicted 8, Correct -2
Episode 4500: Predicted 9, Correct 2


In [10]:
def test(num_problems=100):
    correct_count = 0
    for _ in range(num_problems):
        state, correct = generate_problem()
        logits = model(tf.expand_dims(state, axis=0))
        action_idx = tf.argmax(logits, axis=-1).numpy()[0]
        predicted_answer = actions[action_idx]
        if predicted_answer == correct:
            correct_count += 1
        print(f"Problem: {state[0]*10:.0f} {'+' if state[2]==0 else '-'} {state[1]*10:.0f}")
        print(f"Predicted: {predicted_answer}, Correct: {correct}\n")
    print(f"Accuracy: {correct_count/num_problems*100:.2f}%")

test()

Problem: 7 - 0
Predicted: 8, Correct: 7

Problem: 9 - 8
Predicted: 8, Correct: 17

Problem: 1 - 4
Predicted: 8, Correct: 5

Problem: 8 + 4
Predicted: 2, Correct: 4

Problem: 5 - 7
Predicted: 8, Correct: 12

Problem: 8 + 2
Predicted: 2, Correct: 6

Problem: 9 - 7
Predicted: 8, Correct: 16

Problem: 5 - 2
Predicted: 8, Correct: 7

Problem: 9 + 6
Predicted: 2, Correct: 3

Problem: 0 + 0
Predicted: 2, Correct: 0

Problem: 0 + 7
Predicted: 2, Correct: -7

Problem: 2 - 7
Predicted: 8, Correct: 9

Problem: 5 - 1
Predicted: 8, Correct: 6

Problem: 2 - 5
Predicted: 8, Correct: 7

Problem: 3 - 2
Predicted: 8, Correct: 5

Problem: 4 + 6
Predicted: 2, Correct: -2

Problem: 4 - 6
Predicted: 8, Correct: 10

Problem: 6 - 6
Predicted: 8, Correct: 12

Problem: 9 + 4
Predicted: 2, Correct: 5

Problem: 5 - 4
Predicted: 8, Correct: 9

Problem: 6 + 1
Predicted: 2, Correct: 5

Problem: 5 - 8
Predicted: 8, Correct: 13

Problem: 1 - 7
Predicted: 8, Correct: 8

Problem: 2 + 0
Predicted: 2, Correct: 2

Problem: