In [None]:
import socket
import json
import numpy as np
import tensorflow as tf
from collections import deque
import random

class DQLAgent:
    """Deep Q-learning agent with epsilon-greedy exploration and experience replay.

    The Q-network has two hidden Dense layers of 24 ReLU units and a linear
    output layer with one unit per action; it is trained with MSE loss and Adam.

    NOTE(review): another `DQLAgent` definition appears in a later cell of this
    notebook; whichever cell is executed last shadows the other.

    Args:
        state_size: Number of features in one observation row.
        action_size: Number of discrete actions (size of the network output).
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # Replay buffer; oldest transitions fall off
        self.gamma = 0.95  # Discount factor
        self.epsilon = 1.0  # Initial exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (state_size -> 24 -> 24 -> action_size)."""
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated (it triggers a warning) and is rejected by newer Keras.
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action index for `state` with an epsilon-greedy policy.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.

        Args:
            state: Observation passed straight to `model.predict`; the first
                row of the prediction is used.

        Returns:
            int: Action index in `range(action_size)`.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))  # Returns action index

    def replay(self, batch_size):
        """Train the Q-network on a random minibatch drawn from memory.

        The whole minibatch is evaluated with two `predict` calls and fitted
        with one `train_on_batch` call, instead of one predict/train round trip
        per transition. That means one gradient step per call on the full
        minibatch rather than one step per sampled transition.

        Args:
            batch_size: Maximum number of transitions to sample. If memory holds
                fewer, all of them are used; if memory is empty the call is a
                no-op and epsilon is left unchanged.
        """
        sample_size = min(batch_size, len(self.memory))
        if sample_size <= 0:
            return

        minibatch = random.sample(self.memory, sample_size)
        # vstack accepts both (1, n) rows and flat (n,) vectors.
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=np.intp)
        rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
        next_states = np.vstack([transition[3] for transition in minibatch])
        terminal = np.array([transition[4] for transition in minibatch], dtype=bool)

        # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise.
        best_next_q = np.amax(self.model.predict(next_states, verbose=0), axis=1)
        targets = np.where(terminal, rewards, rewards + self.gamma * best_next_q)

        # Only the taken action's Q-value is moved; the other outputs keep their
        # current prediction so they contribute zero error.
        q_values = np.array(self.model.predict(states, verbose=0))
        q_values[np.arange(sample_size), actions] = targets
        self.model.train_on_batch(states, q_values)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [None]:
# Define the DQL agent, then train it online against the environment over UDP
import socket
import json
import numpy as np
import tensorflow as tf
from collections import deque
import random

class DQLAgent:
    """Deep Q-learning agent with epsilon-greedy exploration and experience replay.

    The Q-network has two hidden Dense layers of 24 ReLU units and a linear
    output layer with one unit per action; it is trained with MSE loss and Adam.

    NOTE(review): another `DQLAgent` definition appears in an earlier cell of
    this notebook; whichever cell is executed last shadows the other.

    Args:
        state_size: Number of features in one observation row.
        action_size: Number of discrete actions (size of the network output).
    """

    def __init__(self, state_size, action_size):
        self.state_size = state_size
        self.action_size = action_size
        self.memory = deque(maxlen=2000)  # Replay buffer; oldest transitions fall off
        self.gamma = 0.95  # Discount factor
        self.epsilon = 1.0  # Initial exploration rate
        self.epsilon_min = 0.01
        self.epsilon_decay = 0.995
        self.learning_rate = 0.001
        self.model = self._build_model()

    def _build_model(self):
        """Build and compile the Q-network (state_size -> 24 -> 24 -> action_size)."""
        model = tf.keras.models.Sequential()
        model.add(tf.keras.layers.Dense(24, input_dim=self.state_size, activation='relu'))
        model.add(tf.keras.layers.Dense(24, activation='relu'))
        model.add(tf.keras.layers.Dense(self.action_size, activation='linear'))
        # `learning_rate` is the supported keyword; the old `lr` alias is
        # deprecated (it triggers a warning) and is rejected by newer Keras.
        model.compile(loss='mse', optimizer=tf.keras.optimizers.Adam(learning_rate=self.learning_rate))
        return model

    def remember(self, state, action, reward, next_state, done):
        """Append one (state, action, reward, next_state, done) transition to memory."""
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        """Choose an action index for `state` with an epsilon-greedy policy.

        With probability `epsilon` a uniformly random action is returned;
        otherwise the action with the highest predicted Q-value.

        Args:
            state: Observation passed straight to `model.predict`; the first
                row of the prediction is used.

        Returns:
            int: Action index in `range(action_size)`.
        """
        if np.random.rand() <= self.epsilon:
            return random.randrange(self.action_size)
        # verbose=0 keeps Keras from printing a progress bar on every step.
        act_values = self.model.predict(state, verbose=0)
        return int(np.argmax(act_values[0]))  # Returns action index

    def replay(self, batch_size):
        """Train the Q-network on a random minibatch drawn from memory.

        The whole minibatch is evaluated with two `predict` calls and fitted
        with one `train_on_batch` call, instead of one predict/train round trip
        per transition. That means one gradient step per call on the full
        minibatch rather than one step per sampled transition.

        Args:
            batch_size: Maximum number of transitions to sample. If memory holds
                fewer, all of them are used; if memory is empty the call is a
                no-op and epsilon is left unchanged.
        """
        sample_size = min(batch_size, len(self.memory))
        if sample_size <= 0:
            return

        minibatch = random.sample(self.memory, sample_size)
        # vstack accepts both (1, n) rows and flat (n,) vectors.
        states = np.vstack([transition[0] for transition in minibatch])
        actions = np.array([transition[1] for transition in minibatch], dtype=np.intp)
        rewards = np.array([transition[2] for transition in minibatch], dtype=np.float32)
        next_states = np.vstack([transition[3] for transition in minibatch])
        terminal = np.array([transition[4] for transition in minibatch], dtype=bool)

        # Bellman target: r for terminal transitions, r + gamma * max_a' Q(s', a') otherwise.
        best_next_q = np.amax(self.model.predict(next_states, verbose=0), axis=1)
        targets = np.where(terminal, rewards, rewards + self.gamma * best_next_q)

        # Only the taken action's Q-value is moved; the other outputs keep their
        # current prediction so they contribute zero error.
        q_values = np.array(self.model.predict(states, verbose=0))
        q_values[np.arange(sample_size), actions] = targets
        self.model.train_on_batch(states, q_values)

        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay

# --- Agent setup ---------------------------------------------------------
# Width of one observation row fed to the Q-network.
state_size = 16
# Number of discrete actions.
action_size = 4  # 0: forward, 1: backward, 2: turn left, 3: turn right
# Transitions sampled from memory on each replay() call.
batch_size = 32
# Terminal flag stored with every transition; it is never reassigned in the
# visible code, so every remembered transition is recorded as non-terminal.
done = False

agent = DQLAgent(state_size, action_size)

# --- Server setup --------------------------------------------------------
# Local UDP endpoint the training loop binds to.
HOST, PORT = '127.0.0.1', 12345

def extract_state(data):
    """Turn a decoded environment message into a single-row float32 state array.

    Args:
        data: Mapping with a 'data' entry holding an iterable of mappings,
            each providing a 'value' entry.

    Returns:
        np.ndarray: float32 array whose single leading row holds the 'value'
        entries in message order (shape (1, n) for n scalar values).

    Raises:
        KeyError: If 'data' or any item's 'value' entry is missing.
    """
    readings = []
    for entry in data['data']:
        readings.append(entry['value'])
    # Wrapping in an outer list adds the leading batch axis.
    return np.array([readings], dtype=np.float32)

def calculate_reward2(prev_state, state):
    """Return how much closer the agent moved to the destination in one step.

    The last four entries of row 0 are read as (self_x, self_z, dest_x, dest_z).
    The destination is taken from `prev_state` only; the agent position is
    taken from both states.

    Args:
        prev_state: Indexable whose row 0 ends with self_x, self_z, dest_x, dest_z.
        state: Indexable whose row 0 has the new self_x, self_z at [-4], [-3].

    Returns:
        Previous Euclidean distance minus current Euclidean distance
        (positive when the agent got closer).
    """
    before = prev_state[0]
    after = state[0]
    goal_x, goal_z = before[-2], before[-1]

    def _gap(x, z):
        # Straight-line distance from (x, z) to the destination.
        return np.sqrt((goal_x - x)**2 + (goal_z - z)**2)

    return _gap(before[-4], before[-3]) - _gap(after[-4], after[-3])

def calculate_reward(prev_state, state, *, progress_scale=3, time_penalty=-0.01,
                     goal_radius=0.1, goal_bonus=10.0, retreat_penalty=0.5):
    """Compute a shaped reward for one step towards the destination.

    The last four entries of row 0 are read as (self_x, self_z, dest_x, dest_z).
    The destination is taken from `prev_state` only; the agent position is
    taken from both states.

    The reward is the sum of:
      * progress: (previous distance - current distance) * `progress_scale`
      * a constant per-step `time_penalty`
      * `goal_bonus` when the current distance is below `goal_radius`
      * minus `retreat_penalty` when the agent moved away from the destination

    The keyword-only parameters default to the values that were previously
    hard-coded, so `calculate_reward(prev_state, state)` behaves as before.

    Args:
        prev_state: Indexable whose row 0 ends with self_x, self_z, dest_x, dest_z.
        state: Indexable whose row 0 has the new self_x, self_z at [-4], [-3].
        progress_scale: Multiplier applied to the distance improvement.
        time_penalty: Value added on every step (negative to discourage stalling).
        goal_radius: Distance below which the destination counts as reached;
            the right value depends on the environment's coordinate scale.
        goal_bonus: Extra reward granted when the destination is reached.
        retreat_penalty: Amount subtracted when the distance increased.

    Returns:
        The scalar reward for the transition.
    """
    prev_self_x, prev_self_z, dest_x, dest_z = prev_state[0][-4], prev_state[0][-3], prev_state[0][-2], prev_state[0][-1]
    self_x, self_z = state[0][-4], state[0][-3]

    prev_distance = np.sqrt((dest_x - prev_self_x)**2 + (dest_z - prev_self_z)**2)
    current_distance = np.sqrt((dest_x - self_x)**2 + (dest_z - self_z)**2)

    distance_difference = prev_distance - current_distance

    # Base reward is the (scaled) distance difference.
    reward = distance_difference * progress_scale

    # Small penalty for each time step.
    reward += time_penalty

    # Significant bonus if the agent has reached the destination.
    if current_distance < goal_radius:
        reward += goal_bonus

    # Penalty for moving away from the target.
    if distance_difference < 0:
        reward -= retreat_penalty

    return reward


# Largest payload a UDP datagram can carry. A smaller receive buffer truncates
# (or, on some platforms, rejects) bigger messages, which then fail to parse
# as JSON.
MAX_DATAGRAM_BYTES = 65535

# Action index -> two-element command list sent to the environment.
# Presumably [forward/backward amount, turn amount], going by the action
# legend next to `action_size` -- confirm against the environment.
ACTION_COMMANDS = {
    0: [30, 0],
    1: [-30, 0],
    2: [0, 30],
}
# Used for action 3 and for any other value.
FALLBACK_COMMAND = [0, -30]

# Online training loop: receive one environment message per step, learn from
# the previous transition, pick the next action and send it back to the sender.
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
    s.bind((HOST, PORT))

    state = None
    total_reward = 0
    # NOTE(review): `episode` is never incremented and `done` is never updated
    # in this loop, so everything runs as a single non-terminating episode.
    episode = 0
    counter = -1
    while True:
        data, addr = s.recvfrom(MAX_DATAGRAM_BYTES)
        env_info = json.loads(data.decode('utf-8'))

        # Stop when the environment repeats a step counter (no new step arrived).
        if counter != env_info["stepCounter"]:
            counter = env_info["stepCounter"]
        else:
            print(counter)
            print(env_info["stepCounter"])
            break

        next_state = extract_state(env_info)
        # The first message only provides the initial observation; from the
        # second message on, `state`/`action` describe the previous step.
        if state is not None:
            reward = calculate_reward(state, next_state)
            total_reward += reward
            agent.remember(state, action, reward, next_state, done)
            if len(agent.memory) > batch_size:
                agent.replay(batch_size)

        action = agent.act(next_state)
        state = next_state

        # Convert the action to a movement command (copied so the lookup
        # table itself is never shared with the outgoing message).
        move_command_obj = {"commands": list(ACTION_COMMANDS.get(int(action), FALLBACK_COMMAND))}
        print(f"Episode: {episode}, Total Reward: {total_reward}, Sending command: {move_command_obj}")
        s.sendto(json.dumps(move_command_obj).encode('utf-8'), addr)

  super().__init__(name, **kwargs)


Episode: 0, Total Reward: 0, Sending command: {'commands': [0, 30]}
Episode: 0, Total Reward: -0.01, Sending command: {'commands': [0, 30]}
Episode: 0, Total Reward: -0.02, Sending command: {'commands': [0, 30]}
Episode: 0, Total Reward: -0.03, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.04, Sending command: {'commands': [-30, 0]}
Episode: 0, Total Reward: -0.05, Sending command: {'commands': [30, 0]}
Episode: 0, Total Reward: -0.060000000000000005, Sending command: {'commands': [30, 0]}
Episode: 0, Total Reward: -0.07, Sending command: {'commands': [0, 30]}
Episode: 0, Total Reward: -0.08, Sending command: {'commands': [0, 30]}
Episode: 0, Total Reward: -0.09, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.09999999999999999, Sending command: {'commands': [30, 0]}
Episode: 0, Total Reward: -0.10999999999999999, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.11999999999999998, Sending command: {'commands': [0, -30]}
Ep

Episode: 0, Total Reward: -0.34000000000000014, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.35000000000000014, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.36000000000000015, Sending command: {'commands': [0, 30]}


Episode: 0, Total Reward: -0.37000000000000016, Sending command: {'commands': [30, 0]}
Episode: 0, Total Reward: -0.38000000000000017, Sending command: {'commands': [0, -30]}


Episode: 0, Total Reward: -0.3900000000000002, Sending command: {'commands': [-30, 0]}
Episode: 0, Total Reward: -0.4000000000000002, Sending command: {'commands': [0, 30]}


Episode: 0, Total Reward: -0.4100000000000002, Sending command: {'commands': [-30, 0]}
Episode: 0, Total Reward: -0.4200000000000002, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.4300000000000002, Sending command: {'commands': [-30, 0]}


Episode: 0, Total Reward: -0.4400000000000002, Sending command: {'commands': [30, 0]}
Episode: 0, Total Reward: -0.45000000000000023, Sending command: {'commands': [0, 30]}


Episode: 0, Total Reward: -0.46000000000000024, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.47000000000000025, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.48000000000000026, Sending command: {'commands': [0, -30]}


Episode: 0, Total Reward: -0.49000000000000027, Sending command: {'commands': [30, 0]}
Episode: 0, Total Reward: -0.5000000000000002, Sending command: {'commands': [-30, 0]}


Episode: 0, Total Reward: -0.5100000000000002, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.5200000000000002, Sending command: {'commands': [-30, 0]}


Episode: 0, Total Reward: -0.5300000000000002, Sending command: {'commands': [30, 0]}
Episode: 0, Total Reward: -0.5400000000000003, Sending command: {'commands': [0, -30]}
Episode: 0, Total Reward: -0.5500000000000003, Sending command: {'commands': [0, -30]}


Episode: 0, Total Reward: -0.5600000000000003, Sending command: {'commands': [0, 30]}
Episode: 0, Total Reward: -0.5700000000000003, Sending command: {'commands': [30, 0]}


Episode: 0, Total Reward: -0.5800000000000003, Sending command: {'commands': [-30, 0]}
Episode: 0, Total Reward: -0.5900000000000003, Sending command: {'commands': [30, 0]}


Episode: 0, Total Reward: -0.6000000000000003, Sending command: {'commands': [30, 0]}
Episode: 0, Total Reward: -0.6100000000000003, Sending command: {'commands': [-30, 0]}

In [None]:
import socket
import json

# Local UDP endpoint (server IP and port) this demo listens on.
HOST, PORT = '127.0.0.1', 12345

# Minimal request/response loop: print every environment message that arrives
# and answer the sender with one fixed movement command.
with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
    s.bind((HOST, PORT))

    while True:
        data, addr = s.recvfrom(1024)
        env_info = json.loads(data.decode('utf-8'))
        print("Received env info:", env_info)

        # Demonstration only: the reply is a constant command and does not
        # depend on env_info. Replace it with real decision logic as needed.
        move_command_obj = {"commands": [3, 1]}
        # Serialise once and reuse the text for both the log line and the reply.
        reply_text = json.dumps(move_command_obj)
        print(reply_text)
        s.sendto(reply_text.encode('utf-8'), addr)
        