# Reinforcement Learning Model

# Single Agent and Enviroment without Grid

In [None]:
import gym
from gym import spaces
import numpy as np

class TrainSchedulingEnv(gym.Env):
    def __init__(self):
        super(TrainSchedulingEnv, self).__init__()

        # Stations and their routes (in minutes)
        self.stations = ["Chennai", "Madurai", "Coimbatore", "Tiruchirappalli", "Salem"]
        self.routes = {
            "Chennai": {"Madurai": 420, "Coimbatore": 480, "Tiruchirappalli": 330},
            "Madurai": {"Chennai": 420, "Tiruchirappalli": 120, "Coimbatore": 240},
            "Coimbatore": {"Chennai": 480, "Madurai": 240, "Salem": 210},
            "Tiruchirappalli": {"Chennai": 330, "Madurai": 120, "Salem": 150},
            "Salem": {"Coimbatore": 210, "Tiruchirappalli": 150}
        }
        self.num_stations = len(self.stations)

        # Train details
        self.num_trains = 5
        self.max_tracks_per_station = 3

        # Weather and maintenance
        self.weather_conditions = {route: np.random.uniform(0.8, 1.2) for route in self.routes.keys()}
        self.maintenance_schedule = {route: np.random.choice([0, 1], p=[0.9, 0.1]) for route in self.routes.keys()}  # 10% chance of maintenance

        # Passenger demand (randomized at reset)
        self.passenger_demand = None

        # Action and state spaces
        self.action_space = spaces.MultiDiscrete([self.num_stations] * self.num_trains)  # Next station for each train
        self.observation_space = spaces.Box(
            low=0, high=1, shape=(self.num_trains, 6), dtype=np.float32
        )  # [current_station, delay, destination, progress, weather, maintenance]
        self.reset()

    def reset(self):
        self.train_states = np.zeros((self.num_trains, 6))
        self.passenger_demand = np.random.randint(50, 200, size=(self.num_stations, self.num_stations))
        for train_id in range(self.num_trains):
            self.train_states[train_id, 0] = np.random.randint(0, self.num_stations)  # Random starting station
            self.train_states[train_id, 2] = np.random.randint(0, self.num_stations)  # Random destination
        return self.train_states

    def step(self, actions):
        reward = 0
        done = False

        for train_id, next_station_index in enumerate(actions):
            current_station_index = int(self.train_states[train_id, 0])
            destination_station_index = int(self.train_states[train_id, 2])
            current_station = self.stations[current_station_index]
            next_station = self.stations[next_station_index]

            # Check if route exists
            if next_station not in self.routes[current_station]:
                reward -= 50  # Heavy penalty for invalid route choice
                continue

            # Travel time, weather, and maintenance impact
            travel_time = self.routes[current_station][next_station]
            weather_impact = self.weather_conditions[current_station]
            maintenance = self.maintenance_schedule[current_station]

            if maintenance:
                reward -= 30  # Penalty for choosing a route under maintenance
                continue

            adjusted_travel_time = travel_time * weather_impact
            self.train_states[train_id, 1] += adjusted_travel_time  # Add delay
            self.train_states[train_id, 0] = next_station_index  # Update current station

            # Reward based on passenger satisfaction
            passengers_satisfied = self.passenger_demand[current_station_index, destination_station_index]
            reward += passengers_satisfied * 2 - adjusted_travel_time  # Positive reward for satisfying demand and minimizing delay

            # Check if destination reached
            if current_station_index == destination_station_index:
                reward += 100  # Large reward for reaching destination
                done = True  # Simulation ends when a train reaches its destination

        return self.train_states, reward, done, {}

    def render(self, mode="human"):
        print(f"Train States: {self.train_states}")
        for train_id, train_state in enumerate(self.train_states):
            current_station = self.stations[int(train_state[0])]
            destination_station = self.stations[int(train_state[2])]
            print(
                f"Train {train_id}: Current Station={current_station}, Destination={destination_station}, "
                f"Delay={train_state[1]:.2f} minutes"
            )

# Create environment
env = TrainSchedulingEnv()

In [None]:
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env

# Wrap the environment
env = make_vec_env(lambda: TrainSchedulingEnv(), n_envs=1)

# Train the PPO model
model = PPO("MlpPolicy", env, verbose=1)
model.learn(total_timesteps=50000)

# Test the model
obs = env.reset()
for _ in range(100):
    action, _ = model.predict(obs)
    obs, rewards, done, info = env.step(action)
    env.render()


## Even tried to simulate realworld train paths of india to check whether this solution works but it fails 

In [None]:
import pandas as pd
import networkx as nx
import matplotlib.pyplot as plt
import gym
from gym import spaces
import numpy as np
import time  # To add delay for visualization

# Load the dataset
train_data = pd.read_csv('Train_details_22122017.csv', dtype={'Distance': str})

# Convert the 'Distance' column to numeric, coercing errors into NaN
train_data['Distance'] = pd.to_numeric(train_data['Distance'], errors='coerce')

# Select data for multiple trains (e.g., Train No 107, 108, 109)
train_numbers = [107, 108, 109]
train_subset = train_data

# Create a graph with NetworkX
G = nx.Graph()

# Add stations as nodes
for _, row in train_subset.iterrows():
    G.add_node(row["Station Name"], station_code=row["Station Code"])

# Add routes (edges) between stations for each train
for i in range(1, len(train_subset)):
    source = train_subset.iloc[i-1]
    target = train_subset.iloc[i]

    if pd.notna(source["Distance"]) and pd.notna(target["Distance"]):
        G.add_edge(
            source["Station Name"],
            target["Station Name"],
            distance=target["Distance"] - source["Distance"],
            departure_time=target["Departure Time"],
            arrival_time=source["Arrival time"]
        )

# Check the graph
print(f"Number of nodes (stations) in the graph: {len(G.nodes())}")
print(f"Number of edges (routes) in the graph: {len(G.edges())}")

# Ensure that there are nodes to create the action space
if len(G.nodes()) == 0:
    raise ValueError("Graph has no nodes. Ensure the dataset contains valid station data.")

# Define the environment
class TrainMovementEnv(gym.Env):
    def __init__(self, graph):
        super(TrainMovementEnv, self).__init__()
        self.graph = graph
        self.stations = list(graph.nodes())  # List of station names
        self.current_station = 0  # Start at the first station
        self.action_space = spaces.Discrete(len(self.stations))  # Actions: move to a station
        self.observation_space = spaces.Discrete(len(self.stations))  # Observations: current station

    def reset(self):
        self.current_station = 0  # Reset to the first station
        return self.current_station

    def step(self, action):
        if action < 0 or action >= len(self.stations):
            return self.current_station, -1, False, {}

        # Move to the selected station
        next_station = self.stations[action]
        if next_station in self.graph[self.stations[self.current_station]]:
            self.current_station = action
            return self.current_station, 0, False, {}  # No reward, still moving
        else:
            return self.current_station, -1, False, {}  # Invalid move

    def render(self):
        # Plot the graph with the current station highlighted
        plt.figure(figsize=(10, 8))
        pos = nx.spring_layout(self.graph, seed=42)  # Use a layout for better positioning
        nx.draw(self.graph, pos, with_labels=True, node_size=5000, node_color='skyblue', font_size=10)

        # Highlight the current station in a different color
        current_station = self.stations[self.current_station]
        nx.draw_networkx_nodes(self.graph, pos, nodelist=[current_station], node_size=5000, node_color='red')

        plt.title(f"Train is at {current_station}")
        plt.show()

        # Adding a small delay to visualize the movement
        time.sleep(1)

# Initialize environment with the graph of train stations
env = TrainMovementEnv(G)

# Example of simulating one episode
obs = env.reset()
done = False
while not done:
    action = env.action_space.sample()  # Choose a random action (move to a random station)
    obs, reward, done, info = env.step(action)
    env.render()  # Render the current state after each step


In [None]:
import networkx as nx
import matplotlib.pyplot as plt
import geopandas as gpd

# Example: Create a simple graph
G = nx.Graph()
G.add_edge("Mumbai", "Delhi", weight=1000)
G.add_edge("Delhi", "Kolkata", weight=1500)
G.add_edge("Mumbai", "Chennai", weight=1300)

# Plot with NetworkX
pos = nx.spring_layout(G)  # You can replace with geospatial coordinates
nx.draw(G, pos, with_labels=True, node_color='lightblue', edge_color='gray')
plt.show()


# Mutli Agent and Enviroment with Grid

## First deliverable, Reinforcement learning model for train scheduling optimization

In [1]:
import numpy as np
import random

class RailEnvironment:
    def __init__(self, grid_size=(10, 10), n_agents=3):
        self.grid_size = grid_size
        self.n_agents = n_agents
        self.grid = np.zeros(grid_size)  # 0: empty, 1: station, 2: track
        self.agents = []
        self._initialize_environment()

    def _initialize_environment(self):
        # Place stations randomly
        num_stations = 5
        self.stations = [tuple(np.random.randint(0, self.grid_size[i], size=num_stations)) for i in range(2)]
        for x, y in zip(self.stations[0], self.stations[1]):
            self.grid[x, y] = 1

        # Initialize agents
        for i in range(self.n_agents):
            start = (random.choice(self.stations[0]), random.choice(self.stations[1]))
            target = (random.choice(self.stations[0]), random.choice(self.stations[1]))
            while target == start:
                target = (random.choice(self.stations[0]), random.choice(self.stations[1]))
            self.agents.append({
                "id": i,
                "start": start,
                "target": target,
                "position": start,
                "done": False,
                "reward": 0
            })

    def step(self, actions):
        rewards = {}
        done = True
        for agent, action in zip(self.agents, actions):
            if agent["done"]:
                rewards[agent["id"]] = 0
                continue

            # Compute new position
            new_position = self._move(agent["position"], action)
            if not self._is_valid(new_position):
                rewards[agent["id"]] = -5  # Invalid move penalty
                continue

            # Update position and check if target is reached
            agent["position"] = new_position
            if new_position == agent["target"]:
                agent["done"] = True
                rewards[agent["id"]] = 10  # Reaching target reward
            else:
                rewards[agent["id"]] = -1  # Step penalty

            done = done and agent["done"]

        return self.grid, rewards, done

    def _move(self, position, action):
        moves = {
            0: (-1, 0),  # Up
            1: (1, 0),   # Down
            2: (0, -1),  # Left
            3: (0, 1),   # Right
            4: (0, 0)    # Wait
        }
        return (position[0] + moves[action][0], position[1] + moves[action][1])

    def _is_valid(self, position):
        return (0 <= position[0] < self.grid_size[0] and
                0 <= position[1] < self.grid_size[1] and
                self.grid[position] != 2)  # No collision with other tracks

    def reset(self):
        self.__init__(self.grid_size, self.n_agents)
        return self.grid


In [4]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from collections import deque
import random

class DQN(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(DQN, self).__init__()
        self.fc = nn.Sequential(
            nn.Linear(input_dim, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, output_dim)
        )

    def forward(self, x):
        return self.fc(x)


In [3]:
pip install torch


Collecting torch
  Using cached torch-2.5.1-cp311-cp311-win_amd64.whl.metadata (28 kB)
Collecting fsspec (from torch)
  Using cached fsspec-2024.10.0-py3-none-any.whl.metadata (11 kB)
Using cached torch-2.5.1-cp311-cp311-win_amd64.whl (203.1 MB)
Using cached fsspec-2024.10.0-py3-none-any.whl (179 kB)
Installing collected packages: fsspec, torch
Note: you may need to restart the kernel to use updated packages.


DEPRECATION: Loading egg at c:\python311\lib\site-packages\vboxapi-1.0-py3.11.egg is deprecated. pip 24.3 will enforce this behaviour change. A possible replacement is to use pip for package installation.. Discussion can be found at https://github.com/pypa/pip/issues/12330
ERROR: Could not install packages due to an OSError: [WinError 2] The system cannot find the file specified: 'C:\\Python311\\Scripts\\convert-caffe2-to-onnx.exe' -> 'C:\\Python311\\Scripts\\convert-caffe2-to-onnx.exe.deleteme'


[notice] A new release of pip is available: 24.0 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [5]:
class DQLAgent:
    def __init__(self, state_size, action_size, learning_rate=0.001, gamma=0.99, epsilon=1.0, epsilon_min=0.01, epsilon_decay=0.995):
        self.state_size = state_size
        self.action_size = action_size
        self.gamma = gamma  # Discount factor
        self.epsilon = epsilon  # Exploration rate
        self.epsilon_min = epsilon_min
        self.epsilon_decay = epsilon_decay
        self.memory = deque(maxlen=2000)
        self.model = DQN(state_size, action_size)
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)

    def remember(self, state, action, reward, next_state, done):
        self.memory.append((state, action, reward, next_state, done))

    def act(self, state):
        if np.random.rand() <= self.epsilon:
            return np.random.choice(self.action_size)  # Random action
        state = torch.FloatTensor(state).unsqueeze(0)
        with torch.no_grad():
            q_values = self.model(state)
        return torch.argmax(q_values).item()  # Exploitation

    def replay(self, batch_size):
        if len(self.memory) < batch_size:
            return
        minibatch = random.sample(self.memory, batch_size)
        for state, action, reward, next_state, done in minibatch:
            state = torch.FloatTensor(state).unsqueeze(0)
            next_state = torch.FloatTensor(next_state).unsqueeze(0)
            target = reward
            if not done:
                target += self.gamma * torch.max(self.model(next_state)).item()
            target_f = self.model(state).detach()
            target_f[0][action] = target
            self.model.zero_grad()
            loss = nn.functional.mse_loss(self.model(state), target_f)
            loss.backward()
            self.optimizer.step()
        if self.epsilon > self.epsilon_min:
            self.epsilon *= self.epsilon_decay


In [6]:
if __name__ == "__main__":
    env = RailEnvironment(grid_size=(10, 10), n_agents=5)  # Single agent for simplicity
    agent = DQLAgent(state_size=env.grid.size, action_size=5)
    episodes = 1000
    batch_size = 32

    for episode in range(episodes):
        state = env.reset().flatten()  # Flatten grid to vector
        done = False
        total_reward = 0

        while not done:
            action = agent.act(state)
            next_state, rewards, done = env.step([action])
            next_state = next_state.flatten()
            reward = sum(rewards.values())
            agent.remember(state, action, reward, next_state, done)
            state = next_state
            total_reward += reward

        agent.replay(batch_size)
        print(f"Episode {episode + 1}/{episodes}, Total Reward: {total_reward}, Epsilon: {agent.epsilon:.2f}")


Episode 1/1000, Total Reward: -6, Epsilon: 1.00
Episode 2/1000, Total Reward: -30, Epsilon: 1.00
Episode 3/1000, Total Reward: -6, Epsilon: 1.00
Episode 4/1000, Total Reward: -41, Epsilon: 0.99
Episode 5/1000, Total Reward: -71, Epsilon: 0.99
Episode 6/1000, Total Reward: -7, Epsilon: 0.99
Episode 7/1000, Total Reward: -12, Epsilon: 0.98
Episode 8/1000, Total Reward: -78, Epsilon: 0.98
Episode 9/1000, Total Reward: -15, Epsilon: 0.97
Episode 10/1000, Total Reward: -19, Epsilon: 0.97
Episode 11/1000, Total Reward: -6, Epsilon: 0.96
Episode 12/1000, Total Reward: -13, Epsilon: 0.96
Episode 13/1000, Total Reward: -7, Epsilon: 0.95
Episode 14/1000, Total Reward: -33, Epsilon: 0.95
Episode 15/1000, Total Reward: -5, Epsilon: 0.94
Episode 16/1000, Total Reward: -11, Epsilon: 0.94
Episode 17/1000, Total Reward: -13, Epsilon: 0.93
Episode 18/1000, Total Reward: -10, Epsilon: 0.93
Episode 19/1000, Total Reward: -21, Epsilon: 0.92
Episode 20/1000, Total Reward: -17, Epsilon: 0.92
Episode 21/1000

Episode 166/1000, Total Reward: -7, Epsilon: 0.44
Episode 167/1000, Total Reward: -18, Epsilon: 0.44
Episode 168/1000, Total Reward: -5, Epsilon: 0.44
Episode 169/1000, Total Reward: -5, Epsilon: 0.44
Episode 170/1000, Total Reward: -18, Epsilon: 0.43
Episode 171/1000, Total Reward: -10, Epsilon: 0.43
Episode 172/1000, Total Reward: -6, Epsilon: 0.43
Episode 173/1000, Total Reward: -9, Epsilon: 0.43
Episode 174/1000, Total Reward: -9, Epsilon: 0.42
Episode 175/1000, Total Reward: -6, Epsilon: 0.42
Episode 176/1000, Total Reward: -16, Epsilon: 0.42
Episode 177/1000, Total Reward: -6, Epsilon: 0.42
Episode 178/1000, Total Reward: -13, Epsilon: 0.42
Episode 179/1000, Total Reward: -6, Epsilon: 0.41
Episode 180/1000, Total Reward: -19, Epsilon: 0.41
Episode 181/1000, Total Reward: -11, Epsilon: 0.41
Episode 182/1000, Total Reward: 6, Epsilon: 0.41
Episode 183/1000, Total Reward: -12, Epsilon: 0.41
Episode 184/1000, Total Reward: -15, Epsilon: 0.40
Episode 185/1000, Total Reward: -20, Epsil

Episode 330/1000, Total Reward: -10, Epsilon: 0.19
Episode 331/1000, Total Reward: -6, Epsilon: 0.19
Episode 332/1000, Total Reward: -10, Epsilon: 0.19
Episode 333/1000, Total Reward: -15, Epsilon: 0.19
Episode 334/1000, Total Reward: -12, Epsilon: 0.19
Episode 335/1000, Total Reward: -6, Epsilon: 0.19
Episode 336/1000, Total Reward: -8, Epsilon: 0.19
Episode 337/1000, Total Reward: -5, Epsilon: 0.19
Episode 338/1000, Total Reward: -8, Epsilon: 0.19
Episode 339/1000, Total Reward: -7, Epsilon: 0.19
Episode 340/1000, Total Reward: 5, Epsilon: 0.18
Episode 341/1000, Total Reward: -17, Epsilon: 0.18
Episode 342/1000, Total Reward: -6, Epsilon: 0.18
Episode 343/1000, Total Reward: 8, Epsilon: 0.18
Episode 344/1000, Total Reward: -6, Epsilon: 0.18
Episode 345/1000, Total Reward: -11, Epsilon: 0.18
Episode 346/1000, Total Reward: -9, Epsilon: 0.18
Episode 347/1000, Total Reward: -12, Epsilon: 0.18
Episode 348/1000, Total Reward: -9, Epsilon: 0.18
Episode 349/1000, Total Reward: -7, Epsilon: 

Episode 493/1000, Total Reward: -10, Epsilon: 0.09
Episode 494/1000, Total Reward: -6, Epsilon: 0.09
Episode 495/1000, Total Reward: -6, Epsilon: 0.08
Episode 496/1000, Total Reward: -6, Epsilon: 0.08
Episode 497/1000, Total Reward: 8, Epsilon: 0.08
Episode 498/1000, Total Reward: -6, Epsilon: 0.08
Episode 499/1000, Total Reward: -14, Epsilon: 0.08
Episode 500/1000, Total Reward: -11, Epsilon: 0.08
Episode 501/1000, Total Reward: -11, Epsilon: 0.08
Episode 502/1000, Total Reward: -13, Epsilon: 0.08
Episode 503/1000, Total Reward: -14, Epsilon: 0.08
Episode 504/1000, Total Reward: -13, Epsilon: 0.08
Episode 505/1000, Total Reward: -17, Epsilon: 0.08
Episode 506/1000, Total Reward: -12, Epsilon: 0.08
Episode 507/1000, Total Reward: -7, Epsilon: 0.08
Episode 508/1000, Total Reward: -16, Epsilon: 0.08
Episode 509/1000, Total Reward: 4, Epsilon: 0.08
Episode 510/1000, Total Reward: -15, Epsilon: 0.08
Episode 511/1000, Total Reward: 10, Epsilon: 0.08
Episode 512/1000, Total Reward: -11, Epsi

Episode 658/1000, Total Reward: -10, Epsilon: 0.04
Episode 659/1000, Total Reward: -12, Epsilon: 0.04
Episode 660/1000, Total Reward: -14, Epsilon: 0.04
Episode 661/1000, Total Reward: 9, Epsilon: 0.04
Episode 662/1000, Total Reward: -13, Epsilon: 0.04
Episode 663/1000, Total Reward: -12, Epsilon: 0.04
Episode 664/1000, Total Reward: -13, Epsilon: 0.04
Episode 665/1000, Total Reward: -10, Epsilon: 0.04
Episode 666/1000, Total Reward: -14, Epsilon: 0.04
Episode 667/1000, Total Reward: -11, Epsilon: 0.04
Episode 668/1000, Total Reward: -9, Epsilon: 0.04
Episode 669/1000, Total Reward: -5, Epsilon: 0.04
Episode 670/1000, Total Reward: -14, Epsilon: 0.04
Episode 671/1000, Total Reward: -16, Epsilon: 0.04
Episode 672/1000, Total Reward: 10, Epsilon: 0.03
Episode 673/1000, Total Reward: -6, Epsilon: 0.03
Episode 674/1000, Total Reward: -12, Epsilon: 0.03
Episode 675/1000, Total Reward: -7, Epsilon: 0.03
Episode 676/1000, Total Reward: -9, Epsilon: 0.03
Episode 677/1000, Total Reward: 10, Eps

Episode 822/1000, Total Reward: -11, Epsilon: 0.02
Episode 823/1000, Total Reward: -12, Epsilon: 0.02
Episode 824/1000, Total Reward: -11, Epsilon: 0.02
Episode 825/1000, Total Reward: -11, Epsilon: 0.02
Episode 826/1000, Total Reward: 9, Epsilon: 0.02
Episode 827/1000, Total Reward: -7, Epsilon: 0.02
Episode 828/1000, Total Reward: -10, Epsilon: 0.02
Episode 829/1000, Total Reward: 8, Epsilon: 0.02
Episode 830/1000, Total Reward: -8, Epsilon: 0.02
Episode 831/1000, Total Reward: -7, Epsilon: 0.02
Episode 832/1000, Total Reward: -12, Epsilon: 0.02
Episode 833/1000, Total Reward: -6, Epsilon: 0.02
Episode 834/1000, Total Reward: -6, Epsilon: 0.02
Episode 835/1000, Total Reward: -7, Epsilon: 0.02
Episode 836/1000, Total Reward: 3, Epsilon: 0.02
Episode 837/1000, Total Reward: -11, Epsilon: 0.02
Episode 838/1000, Total Reward: -6, Epsilon: 0.02
Episode 839/1000, Total Reward: -12, Epsilon: 0.02
Episode 840/1000, Total Reward: -14, Epsilon: 0.02
Episode 841/1000, Total Reward: -5, Epsilon:

Episode 986/1000, Total Reward: -6, Epsilon: 0.01
Episode 987/1000, Total Reward: -10, Epsilon: 0.01
Episode 988/1000, Total Reward: -10, Epsilon: 0.01
Episode 989/1000, Total Reward: -14, Epsilon: 0.01
Episode 990/1000, Total Reward: -11, Epsilon: 0.01
Episode 991/1000, Total Reward: -8, Epsilon: 0.01
Episode 992/1000, Total Reward: -6, Epsilon: 0.01
Episode 993/1000, Total Reward: -9, Epsilon: 0.01
Episode 994/1000, Total Reward: -6, Epsilon: 0.01
Episode 995/1000, Total Reward: -12, Epsilon: 0.01
Episode 996/1000, Total Reward: -14, Epsilon: 0.01
Episode 997/1000, Total Reward: 10, Epsilon: 0.01
Episode 998/1000, Total Reward: -8, Epsilon: 0.01
Episode 999/1000, Total Reward: 8, Epsilon: 0.01
Episode 1000/1000, Total Reward: -5, Epsilon: 0.01
