In [1]:
import numpy as np
import random

In [24]:
# State space: every whole-degree room temperature from 16°C to 30°C inclusive.
states = np.arange(16, 30 + 1)
# Action space: the two things the controller can do with the cooler.
actions = ["ON", "OFF"]

In [25]:
# Show the state space and the action space, one per line.
print(states, actions, sep="\n")

[16 17 18 19 20 21 22 23 24 25 26 27 28 29 30]
['ON', 'OFF']


In [26]:
# Q-table indexed as Q[s, a]: one row per temperature state, one column per
# action, with every entry starting at zero.
Q = np.zeros(shape=(len(states), len(actions)))

In [27]:
# Q-learning hyperparameters
# alpha:    learning rate, a value between 0 and 1
# gamma:    discount factor applied to the best next-state value (0.9 here)
# epsilon:  probability of taking a random (exploratory) action
# episodes: number of independent training runs
alpha = 0.1
gamma = 0.9
epsilon = 0.2
episodes = 300

# Seed the `random` module, which supplies every random draw in this notebook
# (start temperature, exploration, temperature drift), so that a fresh
# Restart & Run All reproduces the same training run.
SEED = 42
random.seed(SEED)

In [28]:
# Step 3: Write Reward Function

def get_reward(temp, action):
    """Score a temperature/action pair.

    The comfort term is +10 when ``temp`` lies in the closed range 20..24
    and -5 otherwise. Running the unit (``action == 'ON'``) costs 2 points.

    Args:
        temp: Room temperature to score.
        action: 'ON' or 'OFF'; only the exact string 'ON' incurs the cost.

    Returns:
        The comfort term minus the energy cost.
    """
    comfort = 10 if 20 <= temp <= 24 else -5
    energy_cost = 2 if action == 'ON' else 0
    return comfort - energy_cost


In [29]:
# Step 4: Environment Dynamic Change (e.g. Temperature Changes)
def next_temp(temp, action):
    """Sample the temperature that follows ``temp`` after taking ``action``.

    With ``action == "ON"`` the temperature drops by 1 or 2 degrees; with
    any other action it rises by 0, 1 or 2 degrees. The change is drawn
    with ``random.choice`` and the result is clipped to 16..30.

    Args:
        temp: Current temperature.
        action: "ON" to cool; anything else is treated as not cooling.

    Returns:
        The next temperature as a Python ``int`` within 16..30.
    """
    if action == "ON":
        delta = -random.choice([1, 2])
    else:
        delta = random.choice([0, 1, 2])

    # Keep temperature between 16°C and 30°C
    bounded = np.clip(temp + delta, 16, 30)
    return int(bounded)


In [30]:
# Step 5: Training Loop
# Tabular Q-learning with an epsilon-greedy behaviour policy.
# Each episode starts from a random temperature and runs a fixed number of
# transitions; after every transition Q[s, a] is moved towards
# reward + gamma * max_a' Q[s', a'].
STEPS_PER_EPISODE = 20        # cap on transitions per episode
TEMP_OFFSET = int(states[0])  # temperature stored in row 0 of Q

for _episode in range(episodes):
    # random.choice is specified for sequences, so hand it a plain list
    # rather than the NumPy array, and keep the state a plain int.
    temp = int(random.choice(list(states)))

    for _ in range(STEPS_PER_EPISODE):
        row = temp - TEMP_OFFSET

        # Choose action (epsilon-greedy): explore with probability epsilon,
        # otherwise take the action with the highest current Q-value.
        if random.random() < epsilon:
            action = random.choice(actions)
        else:
            action = actions[np.argmax(Q[row])]

        # Get next state and reward (the reward is scored on the temperature
        # reached after acting)
        next_state = next_temp(temp, action)
        reward = get_reward(next_state, action)

        # Update Q-value
        a = actions.index(action)
        best_next = np.max(Q[next_state - TEMP_OFFSET])
        Q[row, a] += alpha * (reward + gamma * best_next - Q[row, a])

        # Move to next state
        temp = next_state

print('Training Done')

Training Done


In [31]:
# Test Learned Policy
# Greedy rollout: start at 28°C and follow the highest-valued action for
# ten steps, printing each decision.
temp = 28

for step in range(10):
    q_row = Q[temp - 16]  # Q-values for the current temperature
    action = actions[np.argmax(q_row)]
    print(f"Step {step + 1}: Temp = {temp}°C → Action = {action}")
    temp = next_temp(temp, action)


Step 1: Temp = 28°C → Action = ON
Step 2: Temp = 27°C → Action = ON
Step 3: Temp = 25°C → Action = ON
Step 4: Temp = 23°C → Action = ON
Step 5: Temp = 21°C → Action = OFF
Step 6: Temp = 21°C → Action = OFF
Step 7: Temp = 23°C → Action = ON
Step 8: Temp = 21°C → Action = OFF
Step 9: Temp = 21°C → Action = OFF
Step 10: Temp = 22°C → Action = OFF


In [32]:
# Ask for a starting temperature; anything that is not an integer in
# 16..30 is reported and replaced by the default of 25°C.
try:
    temp = int(input("Enter Starting Room Temp (16-30): "))

    if not (16 <= temp <= 30):
        raise ValueError("Temperature out of range!")

except ValueError as ve:
    print(ve, "Set to default: 25°C", sep="\n")
    temp = 25

print(f"\nStarting temp: {temp}°C")

# Follow the greedy policy for ten steps from the chosen temperature.
for step in range(10):
    best = np.argmax(Q[temp - 16])
    action = actions[best]
    print(f"Step {step + 1}: Temp = {temp}°C → Action = {action}")
    temp = next_temp(temp, action)

print("\nDone ")


Enter Starting Room Temp (16-30):  20



Starting temp: 20°C
Step 1: Temp = 20°C → Action = OFF
Step 2: Temp = 22°C → Action = OFF
Step 3: Temp = 22°C → Action = OFF
Step 4: Temp = 23°C → Action = ON
Step 5: Temp = 21°C → Action = OFF
Step 6: Temp = 23°C → Action = ON
Step 7: Temp = 22°C → Action = OFF
Step 8: Temp = 24°C → Action = ON
Step 9: Temp = 22°C → Action = OFF
Step 10: Temp = 23°C → Action = ON

Done 
