# Real Code

In [14]:
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import random

# Constants
# Simulation and learning hyper-parameters shared by every class below.

# Number of Person objects the Environment creates.
NUM_PERSONS = 100
# Education levels a new Person is randomly assigned from.
EDUCATION_LEVELS = [0, 1, 2, 3, 4]
# Per-step income by education level (looked up by Person.earn).
EDUCATION_EARNINGS = {0: 10000, 1: 13000, 2: 15000, 3: 17000, 4: 20000}
# Per-step spending, identical for every Person (returned by Person.spend).
EXPENSE = 10000
# Policy actions. DQNAgent.apply_action handles 0 (cash grant) and
# 1 (education upgrade); 2 has no branch there, so it changes nothing and
# costs nothing.
ACTIONS = [0, 1, 2]
# Passed to optim.Adam as the learning rate.
# NOTE(review): 0.1 is far above Adam's usual range - confirm this is intended.
ALPHA = 0.1
# Discount factor applied to the best next-state Q-value in DQNAgent.replay.
GAMMA = 0.9
# Probability with which DQNAgent.select_action picks a random action.
EPSILON = 0.1
# Number of stored transitions sampled per DQNAgent.replay call.
BATCH_SIZE = 64
# Maximum number of transitions kept in the replay memory.
MEMORY_SIZE = 10000
# Only used by DQNAgent.apply_action to price an education upgrade.
# NOTE(review): differs from EDUCATION_EARNINGS at levels 2-4 - confirm which
# table is authoritative.
SALARIES = [10000, 13000, 16000, 19000, 22000]

class Person:
    """A simulated individual with a running net worth and an education level."""

    def __init__(self):
        """Start at zero net worth with a randomly drawn education level."""
        self.net_worth = 0
        self.education_level = np.random.choice(EDUCATION_LEVELS)

    def earn(self):
        """Return the per-step income for the current education level."""
        income = EDUCATION_EARNINGS[self.education_level]
        return income

    def spend(self):
        """Return the per-step expense, which is the same for everyone."""
        return EXPENSE

    def update_net_worth(self):
        """Add one step of savings (income minus expense) to the net worth."""
        savings = self.earn() - self.spend()
        self.net_worth = self.net_worth + savings

class Environment:
    """Holds the simulated population and advances it one step at a time."""

    def __init__(self):
        """Create NUM_PERSONS fresh Person objects."""
        population = []
        for _ in range(NUM_PERSONS):
            population.append(Person())
        self.persons = population

    def get_state(self):
        """Return a flat list: every net worth, then every education level."""
        worth_part = [member.net_worth for member in self.persons]
        education_part = [member.education_level for member in self.persons]
        return worth_part + education_part

    def step(self, action):
        """Advance every person by one earn/spend cycle and return the new state.

        ``action`` is accepted but not used here.
        """
        for member in self.persons:
            member.update_net_worth()
        return self.get_state()

# QNetwork and DQNAgent together act as the policy maker.

class QNetwork(nn.Module):
    """Three-layer fully connected network producing one Q-value per action."""

    def __init__(self, input_dim, output_dim):
        """Build input_dim -> 128 -> 128 -> output_dim linear layers."""
        super().__init__()
        hidden_units = 128
        self.fc1 = nn.Linear(input_dim, hidden_units)
        self.fc2 = nn.Linear(hidden_units, hidden_units)
        self.fc3 = nn.Linear(hidden_units, output_dim)

    def forward(self, x):
        """Apply two ReLU-activated hidden layers, then a linear output layer."""
        first_hidden = torch.relu(self.fc1(x))
        second_hidden = torch.relu(self.fc2(first_hidden))
        q_values = self.fc3(second_hidden)
        return q_values

class DQNAgent:
    """Epsilon-greedy DQN policy maker.

    The agent observes the population state (all net worths followed by all
    education levels), picks one of ``ACTIONS``, applies it to the population
    and learns from the resulting reward through experience replay.
    """

    def __init__(self):
        """Create the Q-network, an empty replay memory and the optimizer."""
        # The state holds NUM_PERSONS net worths plus NUM_PERSONS education levels.
        self.model = QNetwork(2 * NUM_PERSONS, len(ACTIONS))
        self.memory = []  # (state, action, reward, next_state) tuples
        self.optimizer = optim.Adam(self.model.parameters(), lr=ALPHA)

    @staticmethod
    def _to_float_tensor(values):
        """Convert a (possibly nested) sequence of numbers to a float32 tensor."""
        # Going through NumPy copes with lists mixing Python and NumPy scalars.
        return torch.as_tensor(np.asarray(values, dtype=np.float32))

    def select_action(self, state):
        """Pick an action epsilon-greedily.

        With probability ``EPSILON`` (read from the module at call time) a
        uniformly random action is returned; otherwise the action with the
        highest predicted Q-value for ``state``.

        Returns:
            The chosen action as a plain ``int`` on both branches.
        """
        if np.random.uniform(0, 1) < EPSILON:
            return int(np.random.choice(ACTIONS))
        with torch.no_grad():
            state_tensor = self._to_float_tensor(state).unsqueeze(0)
            q_values = self.model(state_tensor)
            return int(torch.argmax(q_values).item())

    def remember(self, state, action, reward, next_state):
        """Store one transition, discarding the oldest beyond ``MEMORY_SIZE``."""
        self.memory.append((state, action, reward, next_state))
        if len(self.memory) > MEMORY_SIZE:
            del self.memory[0]

    def replay(self):
        """Run one minibatch Q-learning update from the replay memory.

        Does nothing until at least ``BATCH_SIZE`` transitions are stored.
        Otherwise ``BATCH_SIZE`` distinct transitions are sampled and the
        network takes a single optimizer step on the mean squared error
        between ``Q(s, a)`` and ``r + GAMMA * max_a' Q(s', a')``.
        """
        if len(self.memory) < BATCH_SIZE:
            return

        picks = np.random.choice(len(self.memory), BATCH_SIZE, replace=False)
        states, actions, rewards, next_states = zip(
            *(self.memory[i] for i in picks)
        )
        state_batch = self._to_float_tensor(states)
        next_state_batch = self._to_float_tensor(next_states)
        reward_batch = self._to_float_tensor(rewards)
        action_batch = torch.as_tensor(np.asarray(actions, dtype=np.int64))

        # The bootstrapped target is a constant for this update: gradients
        # must not flow through the next-state Q-values.
        with torch.no_grad():
            best_next_q = self.model(next_state_batch).max(dim=1).values
        targets = reward_batch + GAMMA * best_next_q

        # Q-value of the action actually taken in each sampled transition.
        predicted = (
            self.model(state_batch)
            .gather(1, action_batch.unsqueeze(1))
            .squeeze(1)
        )

        loss = nn.functional.mse_loss(predicted, targets)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

    def apply_action(self, action, persons):
        """Apply a policy action to ``persons`` in place and return its cost.

        * ``0``: grant 500 to every person; cost is 500 per person.
        * ``1``: raise the education level of up to five randomly chosen
          people whose level is below 4; each upgrade costs five times the
          ``SALARIES`` gap between the new and the old level.
        * any other value (e.g. ``2``): no change, zero cost.

        Returns:
            Total cost of the action.
        """
        total_cost = 0

        if action == 0:
            grant_amount = 500
            for person in persons:
                person.net_worth += grant_amount
            total_cost = grant_amount * len(persons)

        elif action == 1:
            eligible_persons = [p for p in persons if p.education_level < 4]
            selected_persons = random.sample(
                eligible_persons, min(5, len(eligible_persons))
            )
            for person in selected_persons:
                person.education_level += 1
                # NOTE(review): priced from SALARIES, which is a different
                # table from the EDUCATION_EARNINGS used for actual income -
                # confirm this is intended.
                salary_difference = (
                    SALARIES[person.education_level]
                    - SALARIES[person.education_level - 1]
                )
                total_cost += 5 * salary_difference

        return total_cost

    def get_reward(self, total_cost, persons):
        """Return the summed net worth of ``persons`` minus ``total_cost``."""
        return sum(person.net_worth for person in persons) - total_cost

def simulate_episode(env, agent):
    """Run one observe / act / learn cycle and return the reward it produced.

    The agent's action is applied to the population first, then the
    environment advances one step, and the reward is computed from the
    population as it stands after that step.
    """
    state_before = env.get_state()
    chosen_action = agent.select_action(state_before)

    # Apply the policy, then let everyone earn and spend for one step.
    policy_cost = agent.apply_action(chosen_action, env.persons)
    state_after = env.step(chosen_action)
    step_reward = agent.get_reward(policy_cost, env.persons)

    # Store the transition and learn from the replay memory.
    agent.remember(state_before, chosen_action, step_reward, state_after)
    agent.replay()
    return step_reward

def main(num_episodes=5000):
    """Train a DQNAgent on a fresh Environment and print the total reward.

    Args:
        num_episodes: Number of simulate_episode calls to run. Neural
            network training may need more than the default.
    """
    # select_action reads the module-level EPSILON, so the decay below has to
    # rebind that global. Assigning without this declaration would only
    # create a local and exploration would never decrease.
    global EPSILON

    env = Environment()
    agent = DQNAgent()
    EPSILON = 0.1
    total_reward = 0

    for episode in range(num_episodes):
        print('episode', episode)
        total_reward += simulate_episode(env, agent)

        # Decay exploration over time, stopping once it is at or below 1%.
        if EPSILON > 0.01:
            EPSILON *= 0.995

    print(f"Total reward after {num_episodes} episodes: {total_reward}")

# Start training only when this file is run as a script, not when imported.
if __name__ == "__main__":
    main()


episode 0
episode 1
episode 2
episode 3
episode 4
episode 5
episode 6
episode 7
episode 8
episode 9
episode 10
episode 11
episode 12
episode 13
episode 14
episode 15
episode 16
episode 17
episode 18
episode 19
episode 20
episode 21
episode 22
episode 23
episode 24
episode 25
episode 26
episode 27
episode 28
episode 29
episode 30
episode 31
episode 32
episode 33
episode 34
episode 35
episode 36
episode 37
episode 38
episode 39
episode 40
episode 41
episode 42
episode 43
episode 44
episode 45
episode 46
episode 47
episode 48
episode 49
episode 50
episode 51
episode 52
episode 53
episode 54
episode 55
episode 56
episode 57
episode 58
episode 59
episode 60
episode 61
episode 62
episode 63
episode 64
episode 65
episode 66
episode 67
episode 68
episode 69
episode 70
episode 71
episode 72
episode 73
episode 74
episode 75
episode 76
episode 77
episode 78
episode 79
episode 80
episode 81
episode 82
episode 83
episode 84
episode 85
episode 86
episode 87
episode 88
episode 89
episode 90
episode 9

KeyboardInterrupt: 