In [1]:
import pandas as pd
import numpy as np
import torch
import networkx as nx
from sklearn.preprocessing import LabelEncoder, MinMaxScaler

## 1. Tiền xử lý dữ liệu

In [2]:
# Đọc từng file từ thư mục MXH_Dataset
train_df = pd.read_csv("../Dataset/train.csv")
segment_status_df = pd.read_csv("../Dataset/segment_status.csv")


In [3]:
# Kiểm tra số dòng, số cột của từng file
for name, df in [("segment_status", segment_status_df), 
                 ("train", train_df)]:
    print(f"{name}: {df.info}")

segment_status: <bound method DataFrame.info of          _id                updated_at  segment_id  velocity
0          0  2020-07-03T14:55:31.869Z       24845        20
1          1  2020-07-03T15:02:56.048Z       33923        10
2          2  2020-07-04T08:15:52.696Z       33824         5
3          3  2020-07-04T08:15:59.903Z       33824         5
4          4  2020-07-04T08:16:08.201Z       33824         5
...      ...                       ...         ...       ...
90933  90933  2021-04-22T06:52:39.280Z       52247         1
90934  90934  2021-04-22T06:52:52.501Z       52247         1
90935  90935  2021-04-22T06:53:02.335Z       52247         1
90936  90936  2021-04-22T06:53:14.294Z       52247         1
90937  90937  2021-04-22T06:53:27.300Z       52247         1

[90938 rows x 4 columns]>
train: <bound method DataFrame.info of          _id  segment_id        date  weekday        period LOS   s_node_id  \
0          0          26  2021-04-16        4   period_0_30   A   366428456

### 1.1 Xử lý dữ liệu thiếu

In [4]:
print("Missing values in train_df:\n", train_df.isnull().sum())

Missing values in train_df:
 _id                 0
segment_id          0
date                0
weekday             0
period              0
LOS                 0
s_node_id           0
e_node_id           0
length              0
street_id           0
max_velocity    28495
street_level        0
street_name         1
street_type         0
long_snode          0
lat_snode           0
long_enode          0
lat_enode           0
dtype: int64


In [5]:
# Xử lý giá trị thiếu
train_df['max_velocity'] = train_df['max_velocity'].fillna(train_df['max_velocity'].mean())
train_df['street_name'] = train_df['street_name'].fillna('Unknown')

print("Missing values in train_df:\n", train_df.isnull().sum())


Missing values in train_df:
 _id             0
segment_id      0
date            0
weekday         0
period          0
LOS             0
s_node_id       0
e_node_id       0
length          0
street_id       0
max_velocity    0
street_level    0
street_name     0
street_type     0
long_snode      0
lat_snode       0
long_enode      0
lat_enode       0
dtype: int64


In [6]:
# Chuyển đổi thời gian
train_df['date'] = pd.to_datetime(train_df['date'])
segment_status_df['updated_at'] = pd.to_datetime(segment_status_df['updated_at'])
train_df = train_df.sort_values('date')

### 1.2 Chuẩn hóa cột LOS

In [7]:
le = LabelEncoder()
train_df['LOS_encoded'] = le.fit_transform(train_df['LOS'])
scaler = MinMaxScaler()
train_df['LOS_norm'] = scaler.fit_transform(train_df[['LOS_encoded']])

### 1.3 Tạo đồ thị với NetworkX

In [8]:
edge_index = train_df[['s_node_id', 'e_node_id', 'segment_id']].drop_duplicates().dropna().astype(int)

# Tạo ánh xạ cho node IDs
node_ids = sorted(set(edge_index['s_node_id']).union(edge_index['e_node_id']))
node_id_to_index = {node_id: idx for idx, node_id in enumerate(node_ids)}
num_nodes = len(node_ids)

# Cập nhật edge_index với chỉ số mới
edge_index['s_node_idx'] = edge_index['s_node_id'].map(node_id_to_index)
edge_index['e_node_idx'] = edge_index['e_node_id'].map(node_id_to_index)
edge_index_torch = torch.tensor(edge_index[['s_node_idx', 'e_node_idx']].T.values, dtype=torch.int64)

# Thêm cạnh từ edge_index
G = nx.DiGraph()
for _, row in edge_index.iterrows():
    G.add_edge(row['s_node_id'], row['e_node_id'], segment_id=row['segment_id'])

# Tạo đồ thị với NetworkX
G = nx.DiGraph()
for _, row in edge_index.iterrows():
    G.add_edge(row['s_node_idx'], row['e_node_idx'], segment_id=row['segment_id'])

# Tạo đặc trưng nút
node_features = []
for node in range(num_nodes):  # Lặp qua chỉ số từ 0 đến num_nodes-1
    degree = G.degree(node)
    connected_edges = [(u, v) for u, v, d in G.edges(data=True) if u == node or v == node]
    segment_ids = [G[u][v]['segment_id'] for u, v in connected_edges]
    if segment_ids:
        segment_data = train_df[train_df['segment_id'].isin(segment_ids)]
        avg_length = segment_data['length'].mean() if 'length' in segment_data else 0
        avg_velocity = segment_data['velocity'].mean() if 'velocity' in segment_data else 0
    else:
        avg_length, avg_velocity = 0, 0
    node_features.append([degree, avg_length, avg_velocity])

node_features = torch.tensor(node_features, dtype=torch.float32)


A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.3 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "C:\Users\ASUS\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "C:\Users\ASUS\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\traitlets\config\application.py", line 1075, in launch_i

### 1.4 Merge 2 file segment_status và train

In [9]:
# Hàm convert period (vd: "period_14_30") thành timedelta
def period_to_time(period_str):
    try:
        _, hour_str, min_str = period_str.split("_")
        hour = int(hour_str)
        minute = int(min_str)
        return pd.to_timedelta(f"{hour}:{minute}:00")
    except:
        return pd.NaT

# Apply và tạo cột thời gian đầy đủ
train_df['time_delta'] = train_df['period'].apply(period_to_time)
train_df['date'] = pd.to_datetime(train_df['date']) + train_df['time_delta']

# Xoá cột phụ nếu muốn
train_df.drop(columns='time_delta', inplace=True)

# Chuyển 'date' và 'updated_at' về datetime
train_df['date'] = pd.to_datetime(train_df['date']).dt.tz_localize(None)
segment_status_df['updated_at'] = pd.to_datetime(segment_status_df['updated_at']).dt.tz_localize(None)

# Sort trước khi dùng merge_asof
train = train_df.sort_values(by='date')
segment_status = segment_status_df.sort_values(by='updated_at')

# Merge gần đúng theo thời gian, trong cùng segment_id
merged_df = pd.merge_asof(
    train,
    segment_status,
    by='segment_id',
    left_on='date',
    right_on='updated_at',
    direction='nearest',  # hoặc 'backward' nếu bạn chỉ muốn dùng dữ liệu trước đó
    tolerance=pd.Timedelta('30min')  # chỉ chấp nhận khớp nếu lệch thời gian <= 30 phút
)


### 1.5 Tạo pivot table cho LOS_norm và velocity

In [10]:
# Tạo los_pivot
los_pivot = merged_df.pivot_table(
    index='segment_id',
    columns='date',
    values='LOS_norm',
    aggfunc='mean'
).fillna(method='ffill', axis=1).fillna(method='bfill', axis=1).sort_index(axis=1)

# Tạo velocity_pivot từ merged_df
velocity_pivot = merged_df.pivot_table(
    index='segment_id',
    columns='date',
    values='velocity',
    aggfunc='mean'
).fillna(method='ffill', axis=1).fillna(method='bfill', axis=1).sort_index(axis=1)

  ).fillna(method='ffill', axis=1).fillna(method='bfill', axis=1).sort_index(axis=1)
  ).fillna(method='ffill', axis=1).fillna(method='bfill', axis=1).sort_index(axis=1)


### 1.6 Tạo danh sách snapshot

In [11]:
# Tạo edge_index từ NetworkX
edge_index_torch = torch.tensor(list(G.edges(data=False)), dtype=torch.int64).t()

# Tạo danh sách snapshot
snapshots = []
for date in los_pivot.columns:
    edge_features = []
    targets = []
    for u, v, d in G.edges(data=True):
        segment_id = d['segment_id']
        if segment_id in los_pivot.index:
            los = los_pivot.loc[segment_id, date]
            velocity = velocity_pivot.loc[segment_id, date]
            edge_features.append([los, velocity])
            targets.append([los])
        else:
            edge_features.append([0, 0])
            targets.append([0])
    edge_features = torch.tensor(edge_features, dtype=torch.float32)
    targets = torch.tensor(targets, dtype=torch.float32)
    snapshots.append({
        'edge_index': edge_index_torch,
        'edge_features': edge_features,
        'node_features': node_features,
        'targets': targets
    })

# Kiểm tra snapshot
print(f"Number of snapshots: {len(snapshots)}")
print(f"Edge index shape: {snapshots[0]['edge_index'].shape}")
print(f"Edge features shape: {snapshots[0]['edge_features'].shape}")
print(f"Node features shape: {snapshots[0]['node_features'].shape}")
print(f"Targets shape: {snapshots[0]['targets'].shape}")

Number of snapshots: 827
Edge index shape: torch.Size([2, 10026])
Edge features shape: torch.Size([10026, 2])
Node features shape: torch.Size([11314, 3])
Targets shape: torch.Size([10026, 1])


## 2. Định nghĩa mô hình GRNN

In [None]:
import torch.nn as nn

# Định nghĩa mô hình GRNN
class GRNN(nn.Module):
    def __init__(self, node_features_dim, edge_features_dim, hidden_dim, out_dim):
        super(GRNN, self).__init__()
        self.conv = nn.Linear(2 * node_features_dim + edge_features_dim, hidden_dim)
        self.rnn = nn.GRU(hidden_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, out_dim)

    def forward(self, snapshots):
        outputs = []
        h = None
        for snapshot in snapshots:
            edge_index = snapshot['edge_index']  # [2, num_edges]
            edge_features = snapshot['edge_features']  # [num_edges, edge_features_dim]
            node_features = snapshot['node_features']  # [num_nodes, node_features_dim]

            # Kết hợp đặc trưng nút và cạnh
            src_features = node_features[edge_index[0]]  # [num_edges, node_features_dim]
            dst_features = node_features[edge_index[1]]  # [num_edges, node_features_dim]
            edge_input = torch.cat([src_features, dst_features, edge_features], dim=1)

            # Áp dụng convolution
            h_edge = torch.relu(self.conv(edge_input))  # [num_edges, hidden_dim]

            # Tổng hợp đặc trưng cạnh cho mỗi nút
            node_h = torch.zeros(node_features.size(0), h_edge.size(1)).to(h_edge.device)
            node_h.index_add_(0, edge_index[0], h_edge)
            node_h.index_add_(0, edge_index[1], h_edge)

            # RNN
            node_h = node_h.unsqueeze(0)  # [1, num_nodes, hidden_dim]
            node_h, h = self.rnn(node_h, h)
            node_h = node_h.squeeze(0)  # [num_nodes, hidden_dim]

            # Dự đoán cho mỗi cạnh
            edge_out = self.fc(h_edge)  # [num_edges, out_dim]
            outputs.append(edge_out)

        return outputs

# Khởi tạo mô hình
model = GRNN(node_features_dim=3, edge_features_dim=2, hidden_dim=32, out_dim=1)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.MSELoss()

## 3. Huấn luyện mô hình

In [None]:
# Huấn luyện
model.train()
for epoch in range(5):
    total_loss = 0
    for snapshot in snapshots:
        optimizer.zero_grad()
        # Xử lý từng snapshot
        output = model([snapshot])[0]  # Gọi model với danh sách 1 snapshot
        loss = criterion(output, snapshot['targets'])
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss / len(snapshots)}")

Epoch 1, Loss: 0.6277169371434818


KeyboardInterrupt: 

In [None]:
# Dự đoán
model.eval()
predictions = []
with torch.no_grad():
    outputs = model(snapshots)
    for out in outputs:
        predictions.append(out.cpu().numpy())

In [None]:
# Đánh giá
targets = [snapshot['targets'].cpu().numpy() for snapshot in snapshots]
from sklearn.metrics import mean_squared_error
mse = mean_squared_error(np.concatenate(targets), np.concatenate(predictions))
print(f"Test MSE: {mse}")