# Trial

In [None]:
# Install PyTorch together with the torchvision and torchaudio packages.
!pip install torch torchvision torchaudio



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions import Categorical
import pandas as pd
import numpy as np
from google.colab import drive

In [None]:
# Mount Google Drive at /content/drive so the CSV paths under that prefix resolve.
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Locations of the input CSV files on the mounted Google Drive.
# Adjust both paths to match your own Drive layout.
task_data_path = '/content/drive/MyDrive/Skripsi/Resources/task_test.csv'
employee_data_path = '/content/drive/MyDrive/Skripsi/Resources/employee_test.csv'

# Read both tables into DataFrames.
df_tasks = pd.read_csv(task_data_path)
df_employees = pd.read_csv(employee_data_path)

# The same skill columns are taken from the task table (required level)
# and from the employee table (available level).
skill_columns = [
    'Mathematics.Linear Algebra',
    'Mathematics.Differential Equations',
    'Mathematics.Optimization Technique',
]
task_skills = df_tasks[skill_columns].values
employee_skills = df_employees[skill_columns].values

# Effort estimate of every task, one entry per task row.
story_points = df_tasks['story_points'].values

In [None]:
# Hyperparameters
# Passed by calculate_reward to weighted_euclidean_distance: controls how
# strongly a skill surplus (employee level above the task level) is
# down-weighted in the distance.
alpha = 0.5
w1 = 0.7  # Weight of the summed skill-mismatch distance in the reward
w2 = 0.3  # Weight of the workload standard deviation in the reward
learning_rate = 0.001  # Step size of the Adam optimizer created in PPOAgent
gamma = 0.99  # Discount factor used when accumulating returns
epochs = 1000  # Number of training iterations (one rollout + one policy update each)
max_story_points = 20  # Per-employee workload cap checked in TaskAssignmentEnv.step

# PPO specific hyperparameters
clip_epsilon = 0.2  # Probability ratio is clamped to [1 - clip_epsilon, 1 + clip_epsilon]
ppo_epochs = 4  # Number of optimisation passes over each collected rollout
batch_size = 32  # Number of transitions per minibatch within a pass

In [None]:
# Policy network: state vector in, action probabilities out
class PolicyNetwork(nn.Module):
    """Feed-forward policy with a single 128-unit hidden layer.

    Args:
        input_size: Number of features in the state vector.
        output_size: Number of discrete actions.
    """

    def __init__(self, input_size, output_size):
        super().__init__()
        hidden_units = 128
        # Layer names (fc1, fc2) are part of the state_dict keys; keep them.
        self.fc1 = nn.Linear(input_size, hidden_units)
        self.fc2 = nn.Linear(hidden_units, output_size)

    def forward(self, x):
        """Return a probability distribution over actions along the last axis."""
        hidden = self.fc1(x).relu()
        scores = self.fc2(hidden)
        return scores.softmax(dim=-1)

# Skill mismatch between one employee and one task
def weighted_euclidean_distance(employee_skills, task_skills, alpha=0.5):
    """Return the weighted Euclidean distance between two skill vectors.

    Each squared difference is scaled by ``1 / (1 + alpha * surplus)``,
    where ``surplus`` is how far the employee's level exceeds the task's
    level (zero when the employee is at or below it). Shortfalls therefore
    keep weight 1, while surpluses are down-weighted.

    Args:
        employee_skills: NumPy array of skill levels for one employee.
        task_skills: NumPy array of skill levels for one task.
        alpha: Strength of the surplus down-weighting.

    Returns:
        The scalar distance.
    """
    gap = employee_skills - task_skills
    surplus = np.maximum(0, gap)
    weights = 1 / (1 + alpha * surplus)
    return np.sqrt(np.sum(weights * gap**2))

# Reward for the current (possibly partial) assignment
def calculate_reward(assignments, employee_workload, employee_skills, task_skills, story_points):
    """Return the negative weighted cost of an assignment.

    The cost combines the summed skill-mismatch distance of every assigned
    task (weight ``w1``) with the standard deviation of the employee
    workloads (weight ``w2``). ``alpha``, ``w1`` and ``w2`` are read from
    module scope.

    Args:
        assignments: Sequence indexed by task; each entry is an employee
            index or ``None`` for a task that is not assigned yet.
        employee_workload: Current workload per employee.
        employee_skills: Skill rows indexed by employee.
        task_skills: Skill rows indexed by task.
        story_points: Accepted for interface compatibility; not used here.

    Returns:
        A scalar reward (higher is better, always <= 0).
    """
    mismatch = 0
    for task_idx, employee_idx in enumerate(assignments):
        # Unassigned tasks contribute nothing to the mismatch term.
        if employee_idx is None:
            continue
        mismatch += weighted_euclidean_distance(
            employee_skills[employee_idx], task_skills[task_idx], alpha
        )

    imbalance = np.std(employee_workload)
    return -(w1 * mismatch + w2 * imbalance)


# PPO Agent
class PPOAgent:
    """Policy-gradient agent trained with the PPO clipped surrogate loss.

    Reads the module-level hyperparameters ``learning_rate``, ``gamma``,
    ``clip_epsilon``, ``ppo_epochs`` and ``batch_size``.

    Args:
        input_size: Length of the state vector.
        output_size: Number of discrete actions.
    """

    def __init__(self, input_size, output_size):
        self.policy_network = PolicyNetwork(input_size, output_size)
        self.optimizer = optim.Adam(self.policy_network.parameters(), lr=learning_rate)

        # Copy of the policy weights; re-synchronised at the end of every
        # update_policy call.
        self.old_policy_network = PolicyNetwork(input_size, output_size)
        self.old_policy_network.load_state_dict(self.policy_network.state_dict())

    def select_action(self, state):
        """Sample an action for ``state``.

        Args:
            state: 1-D NumPy array describing the environment state.

        Returns:
            Tuple ``(action, log_prob)`` where ``action`` is a Python int and
            ``log_prob`` is the log-probability tensor of that action under
            the current policy.
        """
        state = torch.from_numpy(state).float()
        probs = self.policy_network(state)
        m = Categorical(probs)
        action = m.sample()
        return action.item(), m.log_prob(action)

    def update_policy(self, states, actions, log_probs, rewards, masks):
        """Run ``ppo_epochs`` passes of clipped-surrogate updates on one rollout.

        Args:
            states: List of state arrays, one per step.
            actions: List of integer actions, one per step.
            log_probs: Log-probabilities of ``actions`` recorded at sampling
                time (tensors or floats).
            rewards: List of scalar rewards, one per step.
            masks: List with 1 for non-terminal steps and 0 for terminal ones.

        Returns:
            None. The policy network is updated in place.
        """
        num_steps = len(states)
        if num_steps == 0:
            # Nothing was collected, so there is nothing to learn from.
            return

        states_t = torch.from_numpy(np.array(states)).float()
        actions_t = torch.as_tensor(actions, dtype=torch.long)
        # The recorded log-probs are constants in the PPO ratio; detach them
        # so no gradient flows into the sampling-time graph.
        old_log_probs = torch.stack(
            [torch.as_tensor(lp, dtype=torch.float32).detach().reshape(()) for lp in log_probs]
        )

        # Discounted returns are computed ONCE over the whole trajectory.
        # Computing them per minibatch would cut the discounted sum at every
        # minibatch boundary.
        returns = []
        running_return = 0.0
        for reward, mask in zip(reversed(rewards), reversed(masks)):
            running_return = float(reward) + gamma * running_return * float(mask)
            returns.append(running_return)
        returns.reverse()
        returns_t = torch.tensor(returns, dtype=torch.float32)

        # Normalise only when a standard deviation exists. The sample std of
        # a single element is NaN, which would turn the loss and then every
        # network weight into NaN.
        if num_steps > 1:
            returns_t = (returns_t - returns_t.mean()) / (returns_t.std() + 1e-8)

        for _ in range(ppo_epochs):
            for start in range(0, num_steps, batch_size):
                batch = slice(start, start + batch_size)

                # Log-probabilities of the taken actions under the current policy
                new_probs = self.policy_network(states_t[batch])
                new_log_probs = Categorical(new_probs).log_prob(actions_t[batch])

                # Probability ratio between current and sampling-time policy
                ratio = torch.exp(new_log_probs - old_log_probs[batch])

                # Clipped surrogate objective (negated: optimizer minimises)
                advantages = returns_t[batch]
                surr1 = ratio * advantages
                surr2 = torch.clamp(ratio, 1 - clip_epsilon, 1 + clip_epsilon) * advantages
                loss = -torch.min(surr1, surr2).mean()

                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()

        # Update old policy
        self.old_policy_network.load_state_dict(self.policy_network.state_dict())

# Environment
class TaskAssignmentEnv:
    """Episodic environment that assigns tasks to employees one at a time.

    An action is a flat index ``task_idx * n_employees + employee_idx``.

    Args:
        employee_skills: 2-D array, one skill row per employee.
        task_skills: 2-D array, one skill row per task.
        story_points: Per-task effort, indexed by task.
        max_story_points: Maximum total story points a single employee may hold.
    """

    def __init__(self, employee_skills, task_skills, story_points, max_story_points):
        self.employee_skills = employee_skills
        self.task_skills = task_skills
        self.story_points = story_points
        self.max_story_points = max_story_points
        self.reset()

    def reset(self):
        """Clear all assignments and workloads; return the initial state."""
        n_tasks = len(self.task_skills)
        n_employees = len(self.employee_skills)
        self.assignments = [None for _ in range(n_tasks)]
        self.employee_workload = np.array([0 for _ in range(n_employees)])
        return self.get_state()

    def get_state(self):
        """Return the flat state vector.

        Layout: one-hot task-by-employee assignment matrix, workloads divided
        by ``max_story_points``, flattened employee skills, flattened task
        skills.
        """
        n_tasks, n_employees = len(self.task_skills), len(self.employee_skills)
        one_hot = np.zeros((n_tasks, n_employees))
        for task_idx, employee_idx in enumerate(self.assignments):
            if employee_idx is None:
                continue
            one_hot[task_idx, employee_idx] = 1

        parts = (
            one_hot.flatten(),
            self.employee_workload / self.max_story_points,
            self.employee_skills.flatten(),
            self.task_skills.flatten(),
        )
        return np.concatenate(parts)

    def step(self, action):
        """Apply ``action`` and return ``(state, reward, done, info)``.

        An action is valid when its task index is in range, the task is still
        unassigned, and the employee stays within ``max_story_points`` after
        taking it. Invalid actions leave the environment unchanged and yield
        a reward of -10.
        """
        n_employees = len(self.employee_skills)
        task_idx = action // n_employees
        employee_idx = action % n_employees

        is_valid = (
            task_idx < len(self.task_skills)
            and self.assignments[task_idx] is None
            and self.employee_workload[employee_idx] + self.story_points[task_idx] <= self.max_story_points
        )
        if not is_valid:
            # Penalty for invalid action; the episode continues.
            return self.get_state(), -10, False, {}

        self.assignments[task_idx] = employee_idx
        self.employee_workload[employee_idx] += self.story_points[task_idx]
        reward = calculate_reward(
            self.assignments, self.employee_workload, self.employee_skills, self.task_skills, self.story_points
        )
        # The episode ends once no task is left unassigned.
        done = not any(a is None for a in self.assignments)
        return self.get_state(), reward, done, {}


In [None]:
# Initialize environment and agent
env = TaskAssignmentEnv(employee_skills, task_skills, story_points, max_story_points)
state_size = len(env.get_state())
action_size = len(task_skills) * len(employee_skills)  # Number of tasks * number of employees
agent = PPOAgent(state_size, action_size)


def has_feasible_action(env):
    """Return True if some unassigned task still fits under an employee's cap.

    Mirrors the validity rule in ``TaskAssignmentEnv.step``. When this is
    False every action is invalid, so an episode could never reach ``done``
    and a ``while not done`` loop would spin forever.
    """
    for task_idx, assigned_to in enumerate(env.assignments):
        if assigned_to is not None:
            continue
        fits = env.employee_workload + env.story_points[task_idx] <= env.max_story_points
        if np.any(fits):
            return True
    return False


# Training loop: one rollout followed by one PPO update per epoch
for epoch in range(epochs):
    states = []
    actions = []
    log_probs = []
    rewards = []
    masks = []

    state = env.reset()
    done = False
    # Stop when all tasks are assigned OR when no remaining task can be
    # placed anywhere (dead end), instead of looping indefinitely.
    while not done and has_feasible_action(env):
        action, log_prob = agent.select_action(state)
        next_state, reward, done, _ = env.step(action)

        states.append(state)
        actions.append(action)
        log_probs.append(log_prob)
        rewards.append(reward)
        masks.append(1 - done)  # Mask is 1 if not done, 0 if done

        state = next_state

    agent.update_policy(states, actions, log_probs, rewards, masks)

    if epoch % 100 == 0:
        print(f"Epoch: {epoch}, Reward: {np.sum(rewards)}")

# Test the trained agent
state = env.reset()
done = False
# No gradients are needed while only rolling out the policy.
with torch.no_grad():
    while not done and has_feasible_action(env):
        action, _ = agent.select_action(state)
        state, reward, done, _ = env.step(action)

if not done:
    print("Warning: some tasks could not be assigned within max_story_points.")

print("Final Assignments:", env.assignments)
print("Employee Workload:", env.employee_workload)

Epoch: 0, Reward: -51.685965219849706
Epoch: 100, Reward: -131.12972685446164
Epoch: 200, Reward: -57.5201638840953
Epoch: 300, Reward: -55.76798828723369


  returns = (returns - returns.mean()) / (returns.std() + 1e-8)


ValueError: Expected parameter probs (Tensor of shape (32, 10)) of distribution Categorical(probs: torch.Size([32, 10])) to satisfy the constraint Simplex(), but found invalid values:
tensor([[nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan],
        [nan, nan, nan, nan, nan, nan, nan, nan, nan, nan]],
       grad_fn=<DivBackward0>)