# **Simple ASI Core**: Here’s the code for a Reinforcement Learning (RL) agent that learns to maximize its reward through interactions with an environment. We'll use a Q-learning agent with a simple environment to simulate self-improvement and feedback loops.

In [1]:
import numpy as np
import random

# Environment Setup
class SimpleEnvironment:
    """Three-state chain world with two actions and a fixed reward table.

    Action 1 advances the agent one state along the chain (clamped at the
    last state); any other action leaves it where it is. The reward for a
    step is looked up from the state the agent occupies *before* it moves.
    """

    def __init__(self):
        """Create the state/action lists, the reward table, and start in state 0."""
        self.states = [0, 1, 2]
        self.actions = [0, 1]
        # One row per state, one column per action; flattened below into the
        # (state, action) -> reward lookup used by step().
        rewards_by_state = (
            (-1, 1),   # State 0
            (1, 0),    # State 1
            (0, 10),   # State 2 (goal state)
        )
        self.reward_matrix = {
            (state, action): value
            for state, row in enumerate(rewards_by_state)
            for action, value in enumerate(row)
        }
        self.current_state = 0

    def reset(self):
        """Put the agent back in state 0 and return that state."""
        self.current_state = 0
        return self.current_state

    def step(self, action):
        """Apply ``action`` and return ``(next_state, reward)``.

        A (state, action) pair missing from the reward table yields -1.
        """
        origin = self.current_state
        payoff = self.reward_matrix.get((origin, action), -1)
        if action == 1:
            # Move forward, but never past the final state in the chain.
            final_index = len(self.states) - 1
            self.current_state = min(origin + 1, final_index)
        # Any other action leaves current_state untouched.
        return self.current_state, payoff

# Q-Learning Agent Setup
class QLearningAgent:
    """Tabular Q-learning agent with an epsilon-greedy action policy.

    The Q-table has one row per entry in ``env.states`` and one column per
    entry in ``env.actions``. States and actions are used directly as row and
    column indices, so they are expected to be the integers ``0..n-1``.

    Args:
        env: Object exposing ``states`` and ``actions`` sequences.
        alpha: Learning rate applied to each temporal-difference update.
        gamma: Discount factor applied to the best next-state value.
        epsilon: Probability of picking a random action instead of the
            greedy one.
    """

    def __init__(self, env, alpha=0.1, gamma=0.9, epsilon=0.1):
        self.env = env
        self.alpha = alpha  # Learning rate
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        # All value estimates start at zero.
        self.q_table = np.zeros((len(env.states), len(env.actions)))

    def choose_action(self, state):
        """Return an action for ``state`` using the epsilon-greedy rule.

        With probability ``epsilon`` a uniformly random action from
        ``env.actions`` is returned; otherwise the action with the highest
        Q-value for ``state`` (ties resolve to the lowest index).
        """
        if random.uniform(0, 1) < self.epsilon:  # Exploration
            # Sample from the environment's own action list so every column
            # of the Q-table can be explored, not just actions 0 and 1.
            return random.choice(self.env.actions)
        # Exploitation: convert NumPy's integer to a plain int for callers.
        return int(np.argmax(self.q_table[state]))

    def learn(self, state, action, reward, next_state):
        """Apply one Q-learning update for the observed transition.

        Q(s, a) <- Q(s, a) + alpha * (reward + gamma * max_a' Q(s', a') - Q(s, a))
        """
        old_q_value = self.q_table[state, action]
        future_q_value = np.max(self.q_table[next_state])
        td_target = reward + self.gamma * future_q_value
        self.q_table[state, action] = old_q_value + self.alpha * (td_target - old_q_value)

# Simulation of Learning Process with Feedback Loop
def run_simulation(episodes=1000, max_steps=10_000):
    """Train a Q-learning agent on ``SimpleEnvironment`` and print progress.

    Each episode starts from ``env.reset()`` and runs until the agent enters
    the last state in ``env.states`` (the goal) or ``max_steps`` steps have
    been taken. After every episode the exploration rate is adjusted from the
    episode's total reward, and every 100th episode the total reward and the
    current Q-table are printed.

    Args:
        episodes: Number of training episodes to run.
        max_steps: Upper bound on steps per episode. The environment only
            advances on action 1, so without a bound an episode ends only
            when the agent happens to pick that action in every non-goal
            state.

    Returns:
        The trained ``QLearningAgent``, so the learned ``q_table`` can be
        inspected by the caller.

    Raises:
        ValueError: If ``max_steps`` is less than 1.
    """
    if max_steps < 1:
        raise ValueError("max_steps must be at least 1")

    env = SimpleEnvironment()
    agent = QLearningAgent(env)
    # Take the goal from the environment instead of hard-coding its index.
    goal_state = env.states[-1]

    # Simulate the learning process
    for episode in range(episodes):
        state = env.reset()
        total_reward = 0

        for _ in range(max_steps):
            action = agent.choose_action(state)
            next_state, reward = env.step(action)
            agent.learn(state, action, reward, next_state)
            total_reward += reward
            state = next_state

            # NOTE(review): the episode stops as soon as the goal state is
            # entered, so no action is ever taken from the goal state here and
            # rewards keyed on that state are never collected in this loop.
            if state == goal_state:
                break

        # Feedback Loop: Simulate Human Feedback
        if total_reward < 5:  # Poor performance feedback (human teaching scenario)
            agent.epsilon = 0.2  # More exploration to improve learning
        else:  # Positive feedback: return to the baseline exploration rate
            agent.epsilon = 0.1

        # Print status every 100 episodes
        if episode % 100 == 0:
            print(f"Episode {episode} - Total Reward: {total_reward}")
            print("Current Q-table:\n", agent.q_table)

    return agent

# Run the simulation only when executed directly (as a script or in a
# notebook cell), not when this module is imported.
if __name__ == "__main__":
    run_simulation(episodes=1000)


Episode 0 - Total Reward: 10
Current Q-table:
 [[-0.1         0.1       ]
 [ 0.95617925  0.        ]
 [ 0.          0.        ]]
Episode 100 - Total Reward: 54
Current Q-table:
 [[2.29151217 9.99925825]
 [9.99999973 0.        ]
 [0.         0.        ]]
Episode 200 - Total Reward: 8
Current Q-table:
 [[ 4.62916305  9.99999998]
 [10.          0.        ]
 [ 0.          0.        ]]
Episode 300 - Total Reward: 26
Current Q-table:
 [[ 6.38773914 10.        ]
 [10.          0.        ]
 [ 0.          0.        ]]
Episode 400 - Total Reward: 8
Current Q-table:
 [[ 7.4378394 10.       ]
 [10.         0.       ]
 [ 0.         0.       ]]
Episode 500 - Total Reward: 19
Current Q-table:
 [[ 7.73112033 10.        ]
 [10.          0.        ]
 [ 0.          0.        ]]
Episode 600 - Total Reward: 2
Current Q-table:
 [[ 7.87139569 10.        ]
 [10.          0.        ]
 [ 0.          0.        ]]
Episode 700 - Total Reward: 2
Current Q-table:
 [[ 7.91562271 10.        ]
 [10.          0.        

# **My Idea with code generated by ChatGPT, executed by Bhadale IT**