In [1]:
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from torch_geometric.utils import to_networkx
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\ASUS\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\ASUS\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\traitlets\config\application.py", line 1075, in launch_i

## 1. Tiền xử lý dữ liệu

In [2]:
# Đọc từng file từ thư mục MXH_Dataset
train_df = pd.read_csv("../Dataset/train.csv")
segment_status_df = pd.read_csv("../Dataset/segment_status.csv")


In [3]:
# Kiểm tra số dòng, số cột của từng file
for name, df in [("segment_status", segment_status_df), 
                 ("train", train_df)]:
    print(f"{name}: {df.info}")

segment_status: <bound method DataFrame.info of          _id                updated_at  segment_id  velocity
0          0  2020-07-03T14:55:31.869Z       24845        20
1          1  2020-07-03T15:02:56.048Z       33923        10
2          2  2020-07-04T08:15:52.696Z       33824         5
3          3  2020-07-04T08:15:59.903Z       33824         5
4          4  2020-07-04T08:16:08.201Z       33824         5
...      ...                       ...         ...       ...
90933  90933  2021-04-22T06:52:39.280Z       52247         1
90934  90934  2021-04-22T06:52:52.501Z       52247         1
90935  90935  2021-04-22T06:53:02.335Z       52247         1
90936  90936  2021-04-22T06:53:14.294Z       52247         1
90937  90937  2021-04-22T06:53:27.300Z       52247         1

[90938 rows x 4 columns]>
train: <bound method DataFrame.info of          _id  segment_id        date  weekday        period LOS   s_node_id  \
0          0          26  2021-04-16        4   period_0_30   A   366428456

In [4]:
print("Missing values in train_df:\n", train_df.isnull().sum())
# Xử lý giá trị thiếu
train_df['max_velocity'] = train_df['max_velocity'].fillna(train_df['max_velocity'].mean())
train_df['street_name'] = train_df['street_name'].fillna('Unknown')

print("Missing values in train_df:\n", train_df.isnull().sum())

# Chuyển đổi thời gian
train_df['date'] = pd.to_datetime(train_df['date'])
segment_status_df['updated_at'] = pd.to_datetime(segment_status_df['updated_at'])
train_df = train_df.sort_values('date')

Missing values in train_df:
 _id                 0
segment_id          0
date                0
weekday             0
period              0
LOS                 0
s_node_id           0
e_node_id           0
length              0
street_id           0
max_velocity    28495
street_level        0
street_name         1
street_type         0
long_snode          0
lat_snode           0
long_enode          0
lat_enode           0
dtype: int64
Missing values in train_df:
 _id             0
segment_id      0
date            0
weekday         0
period          0
LOS             0
s_node_id       0
e_node_id       0
length          0
street_id       0
max_velocity    0
street_level    0
street_name     0
street_type     0
long_snode      0
lat_snode       0
long_enode      0
lat_enode       0
dtype: int64


## 2. Chuẩn bị model

### 2.1 Hàm tạo graph

In [5]:
# Chuẩn bị đồ thị từ train_df
def create_graphs(df):
    # Lấy danh sách các ngày duy nhất
    dates = df['date'].dt.date.unique()
    graphs = []
    node_encoder = LabelEncoder()
    
    # Tạo danh sách node duy nhất
    all_nodes = pd.concat([df['s_node_id'], df['e_node_id']]).unique()
    node_encoder.fit(all_nodes)
    
    for date in dates:
        # Lọc dữ liệu theo ngày
        daily_df = df[df['date'].dt.date == date]
        G = nx.Graph()
        
        # Thêm cạnh (đoạn đường)
        for _, row in daily_df.iterrows():
            s_node = node_encoder.transform([row['s_node_id']])[0]
            e_node = node_encoder.transform([row['e_node_id']])[0]
            G.add_edge(s_node, e_node, segment_id=row['segment_id'])
        
        if G.number_of_edges() > 0:  # Chỉ thêm đồ thị có cạnh
            graphs.append(G)
    
    return graphs, node_encoder

graphs, node_encoder = create_graphs(train_df)
print(f"Created {len(graphs)} graphs with max nodes: {max(g.number_of_nodes() for g in graphs)}")


Created 122 graphs with max nodes: 2576


### 2.2 Chuẩn bị ma trận kề

In [None]:
# Chuẩn bị ma trận kề
max_nodes = max(g.number_of_nodes() for g in graphs)
adj_matrices = []
for g in graphs:
    adj = nx.adjacency_matrix(g).todense()
    # Pad ma trận để có kích thước max_nodes x max_nodes
    adj_padded = np.pad(adj, ((0, max_nodes - adj.shape[0]), (0, max_nodes - adj.shape[0])), 'constant')
    adj_matrices.append(adj_padded)
adj_tensor = torch.tensor(adj_matrices, dtype=torch.float32)

## 3. Định nghĩa model

In [None]:
class GraphRNN(nn.Module):
    def __init__(self, max_nodes, hidden_dim):
        super(GraphRNN, self).__init__()
        self.max_nodes = max_nodes
        self.hidden_dim = hidden_dim
        self.node_rnn = nn.GRU(input_size=1, hidden_size=hidden_dim, batch_first=True)
        self.edge_rnn = nn.GRU(input_size=hidden_dim, hidden_size=hidden_dim, batch_first=True)
        self.output_layer = nn.Linear(hidden_dim, max_nodes)  # Dự đoán toàn bộ hàng của ma trận kề
    
    def forward(self, adj_matrix):
        batch_size = adj_matrix.size(0)
        node_hidden = torch.zeros(1, batch_size, self.hidden_dim).to(adj_matrix.device)
        
        # Sinh node (giả lập xác suất sinh node)
        node_inputs = torch.ones(batch_size, self.max_nodes, 1).to(adj_matrix.device)
        node_outputs, _ = self.node_rnn(node_inputs, node_hidden)  # [batch_size, max_nodes, hidden_dim]
        
        # Sinh ma trận kề đầy đủ
        edge_probs = []
        for i in range(self.max_nodes):
            # Lấy đặc trưng của node i
            node_i = node_outputs[:, i, :].unsqueeze(1)  # [batch_size, 1, hidden_dim]
            edge_output, node_hidden = self.edge_rnn(node_i, node_hidden)  # [batch_size, 1, hidden_dim]
            
            # Dự đoán xác suất cạnh từ node i đến tất cả node khác
            edge_prob = torch.sigmoid(self.output_layer(edge_output.squeeze(1)))  # [batch_size, max_nodes]
            edge_probs.append(edge_prob)
        
        # Stack để được [batch_size, max_nodes, max_nodes]
        edge_probs = torch.stack(edge_probs, dim=1)  # [batch_size, max_nodes, max_nodes]
        return edge_probs

# Khởi tạo mô hình
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GraphRNN(max_nodes=max_nodes, hidden_dim=128).to(device)
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.BCELoss()

## 4. Huấn luyện model

In [None]:
model.train()
adj_tensor = adj_tensor.to(device)
losses = []
for epoch in range(20):
    optimizer.zero_grad()
    output = model(adj_tensor)
    print(f"Epoch {epoch+1} - Output shape: {output.shape}, Target shape: {adj_tensor.shape}")  # Debug
    loss = criterion(output, adj_tensor)
    loss.backward()
    optimizer.step()
    losses.append(loss.item())
    if (epoch + 1) % 10 == 0:
        print(f"Epoch {epoch+1}, Loss: {loss.item():.4f}")

ValueError: Using a target size (torch.Size([122, 2576, 2576])) that is different to the input size (torch.Size([122, 2576])) is deprecated. Please ensure they have the same size.

In [None]:
# Vẽ biểu đồ loss
plt.plot(losses)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss')
plt.savefig('graphrnn_loss.png')
plt.close()

In [None]:
# Sinh đồ thị mới
model.eval()
with torch.no_grad():
    generated_adj = model(torch.zeros(1, max_nodes, max_nodes).to(device))
    generated_adj = (generated_adj > 0.5).float().squeeze(0).cpu().numpy()
    generated_graph = nx.from_numpy_array(generated_adj)
    
    # Lưu đồ thị sinh ra
    os.makedirs('generated_graphs', exist_ok=True)
    nx.write_edgelist(generated_graph, 'generated_graphs/generated_graph.edgelist')
    print(f"Generated graph with {generated_graph.number_of_nodes()} nodes and {generated_graph.number_of_edges()} edges")

# Vẽ đồ thị sinh ra (tùy chọn)
plt.figure(figsize=(8, 8))
nx.draw(generated_graph, with_labels=True, node_color='lightblue', edge_color='gray')
plt.savefig('generated_graphs/generated_graph.png')
plt.close()