# In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from ai2thor.controller import Controller
from torchvision import transforms
from torch.distributions import Categorical

# Environment setup
# Creates the module-level AI2-THOR controller that the training loop below
# drives, loads the 'FloorPlan1' scene and sends an 'Initialize' action with a
# 0.25 grid size.
controller = Controller()
# NOTE(review): newer ai2thor releases reportedly launch the simulator inside
# Controller() itself, which would make this explicit start() redundant --
# confirm against the installed ai2thor version. The FileExistsError recorded
# at the end of this file may come from this start-up sequence (unverified).
controller.start()
controller.reset('FloorPlan1')
controller.step(action='Initialize', gridSize=0.25)

# Preprocess camera image
def preprocess(frame):
    """Turn one camera frame into a batched 64x64 tensor.

    The frame is converted to a PIL image, resized to 64x64, converted to a
    tensor with torchvision's ``ToTensor`` and given a leading batch axis of
    size 1.

    Args:
        frame: An image accepted by ``transforms.ToPILImage`` (presumably the
            HxWx3 uint8 array exposed as ``event.frame`` -- confirm).

    Returns:
        The ``ToTensor`` output with an extra leading dimension of size 1.
    """
    as_pil = transforms.ToPILImage()(frame)
    resized = transforms.Resize((64, 64))(as_pil)
    tensor = transforms.ToTensor()(resized)
    return tensor.unsqueeze(0)

# Policy network
class PolicyNet(nn.Module):
    """Actor-critic network with a shared convolutional trunk.

    The layer sizes assume 3-channel 64x64 inputs: the two convolutions shrink
    each side 64 -> 15 -> 6, so the flattened feature vector has 32 * 6 * 6
    entries. Other input resolutions will not match the first linear layer.

    Args:
        num_actions: Number of discrete actions, i.e. the width of the actor
            head. Defaults to 4.

    Raises:
        ValueError: If ``num_actions`` is smaller than 1.
    """

    def __init__(self, num_actions: int = 4):
        super().__init__()
        if num_actions < 1:
            raise ValueError(f"num_actions must be >= 1, got {num_actions}")
        self.conv = nn.Sequential(
            nn.Conv2d(3, 16, kernel_size=8, stride=4),
            nn.ReLU(),
            nn.Conv2d(16, 32, kernel_size=4, stride=2),
            nn.ReLU(),
            nn.Flatten()
        )
        self.fc = nn.Sequential(
            nn.Linear(32 * 6 * 6, 256),
            nn.ReLU()
        )
        self.actor = nn.Linear(256, num_actions)  # unnormalized action logits
        self.critic = nn.Linear(256, 1)           # state-value estimate V(s)

    def forward(self, x):
        """Return ``(logits, value)`` for a batch of observations.

        Args:
            x: Tensor of shape (batch, 3, 64, 64).

        Returns:
            A tuple ``(logits, value)``: ``logits`` has shape
            (batch, num_actions) and holds raw scores (no softmax or argmax is
            applied here); ``value`` has shape (batch, 1).
        """
        features = self.fc(self.conv(x))
        return self.actor(features), self.critic(features)

# Action mapping: a sampled action index i is sent to the simulator as
# ACTIONS[i].
ACTIONS = [
    'MoveAhead',
    'MoveBack',
    'RotateLeft',
    'RotateRight',
]

# Hyperparameters
gamma = 0.99     # discount factor applied when accumulating returns
eps_clip = 0.2   # half-width of the clamp applied to the probability ratio
policy = PolicyNet()
optimizer = optim.Adam(params=policy.parameters(), lr=1e-4)

# Training loop
def _step_reward(action_succeeded):
    """Return the scalar reward for one environment step.

    The previous expression applied ``'objectId' in ...`` to the value stored
    under ``lastActionSuccess``. That value is used as a boolean elsewhere in
    this loop, and ``in`` on a bool raises ``TypeError``, so no step could
    ever complete. The closest well-defined reading is used instead: 1.0 for
    a successful action, -0.01 otherwise.

    NOTE(review): confirm this is the intended reward signal (e.g. whether a
    specific target object was meant to be detected instead).
    """
    return 1.0 if action_succeeded else -0.01


def _discounted_returns(rewards, discount):
    """Return the discounted return G_t for every step of one episode.

    Args:
        rewards: Per-step rewards (floats) in chronological order.
        discount: Discount factor applied to future rewards.

    Returns:
        A list ``g`` of the same length with
        ``g[t] = rewards[t] + discount * g[t + 1]`` (and 0 past the end).
    """
    returns = []
    running = 0.0
    for reward in reversed(rewards):
        running = reward + discount * running
        returns.append(running)
    returns.reverse()  # built back-to-front; append + reverse keeps this O(T)
    return returns


def _ppo_update(model, opt, states, actions, old_log_probs, returns,
                advantages, clip_eps, epochs=4, value_coef=0.5):
    """Run ``epochs`` clipped-surrogate updates on one episode's data.

    Args:
        model: Module returning ``(logits, value)`` for a batch of states.
        opt: Optimizer over ``model``'s parameters.
        states: Tensor (T, C, H, W) of the observations the actions were
            sampled in.
        actions: Tensor (T,) of sampled action indices.
        old_log_probs: Tensor (T,) of log-probabilities under the policy that
            collected the data (no gradient).
        returns: Tensor (T,) of discounted returns.
        advantages: Tensor (T,) of advantage estimates (no gradient).
        clip_eps: The probability ratio is clamped to
            [1 - clip_eps, 1 + clip_eps].
        epochs: Number of gradient steps over the episode.
        value_coef: Weight of the value loss in the total loss.
    """
    for _ in range(epochs):
        # Re-evaluate every stored state so ratio[t] compares the new and old
        # probability of actions[t] in the state it was actually taken in.
        logits, values = model(states)
        dist = Categorical(logits=logits)
        new_log_probs = dist.log_prob(actions)
        ratio = torch.exp(new_log_probs - old_log_probs)

        surr1 = ratio * advantages
        surr2 = torch.clamp(ratio, 1 - clip_eps, 1 + clip_eps) * advantages
        policy_loss = -torch.min(surr1, surr2).mean()
        value_loss = (returns - values.squeeze(-1)).pow(2).mean()

        loss = policy_loss + value_coef * value_loss

        opt.zero_grad()
        loss.backward()
        opt.step()


try:
    for episode in range(1000):
        controller.reset('FloorPlan1')
        controller.step(action='Initialize', gridSize=0.25)
        event = controller.step(action='LookDown')
        image = preprocess(event.frame)

        states, actions, log_probs, values, rewards = [], [], [], [], []
        done = False

        for _ in range(100):  # at most 100 steps per episode
            # Rollout needs no autograd graph: log-probs and values collected
            # here are only used as constants in the update below.
            with torch.no_grad():
                logits, value = policy(image)  # on-policy
                dist = Categorical(logits=logits)
                action = dist.sample()
                log_prob = dist.log_prob(action)

            event = controller.step(action=ACTIONS[action.item()])
            succeeded = bool(event.metadata['lastActionSuccess'])
            done = not succeeded  # a failed action ends the episode

            states.append(image)               # (1, C, 64, 64)
            actions.append(action)             # (1,)
            log_probs.append(log_prob)         # (1,)
            values.append(value.squeeze(-1))   # (1, 1) -> (1,)
            rewards.append(_step_reward(succeeded))

            if done:
                break

            image = preprocess(event.frame)

        # Collapse the per-step lists into flat (T, ...) tensors so every
        # quantity in the loss lines up element-wise instead of broadcasting.
        states = torch.cat(states)
        actions = torch.cat(actions)
        old_log_probs = torch.cat(log_probs)
        values = torch.cat(values)

        # Compute returns and advantages: A_t = G_t - V(s_t)
        returns = torch.tensor(_discounted_returns(rewards, gamma),
                               dtype=torch.float)
        advantages = returns - values

        # PPO loss
        _ppo_update(policy, optimizer, states, actions, old_log_probs,
                    returns, advantages, eps_clip)

        print(f"Episode {episode}: Total Reward = {sum(rewards):.2f}")
finally:
    # Always shut the simulator down, even if training raises.
    controller.stop()




# Captured cell output: FileExistsError: [Errno 17] File exists