In [1]:
import os
import torch
import warnings
import numpy as np
import pandas as pd
import networkx as nx
from datetime import datetime
from datetime import timedelta
from collections import Counter
from tqdm import tqdm
# data
from torch_geometric.data import Data
from data import pre_processing
from sklearn.model_selection import KFold
#from models import GNNEdgeClassifier, GCNEdgeClassifier
# plot
import matplotlib.pyplot as plt
import seaborn as sns
# val
from sklearn.metrics import confusion_matrix, matthews_corrcoef

In [2]:
# Seed the NumPy and torch global RNGs with one shared value, then silence
# every warning for the remainder of the session.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)
warnings.filterwarnings('ignore')

## Data

 - Features
 - Edges

In [3]:
# Load the per-edge feature table (src, dst, loader, vm_edge, target) and preview it.
edges_features_path = '../../data_design/df_edges.parquet'
df_edges_features = pd.read_parquet(edges_features_path)
df_edges_features.head()

Unnamed: 0,src,dst,loader,vm_edge,target
0,100009577,345936831,2.770115,29.946355,normal
1,100722777,100722778,18.540541,15.494709,normal
2,100722777,44782645,5.820513,10.86524,devagar
3,100722777,45833440,23.02381,16.153585,normal
4,100722777,66771046,21.5,14.545003,normal


In [4]:
# Column dtypes and non-null counts of the freshly loaded edge feature table.
df_edges_features.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4458 entries, 0 to 4457
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   src      4458 non-null   object 
 1   dst      4458 non-null   object 
 2   loader   4458 non-null   float64
 3   vm_edge  4458 non-null   float64
 4   target   4458 non-null   object 
dtypes: float64(2), object(3)
memory usage: 174.3+ KB


In [5]:
# List the column names available in the edge feature table.
df_edges_features.columns

Index(['src', 'dst', 'loader', 'vm_edge', 'target'], dtype='object')

In [6]:
# Load the graph edge list (src, dst, distance and endpoint coordinates) and preview it.
graph_edges_path = '../../data/graph_designer/graph_gtfs_fev_2024.parquet'
df_edges = pd.read_parquet(graph_edges_path)
df_edges.head()

Unnamed: 0,src,dst,distance,src_lat,dst_lat,src_lon,dst_lon
0,100009577,345936831,0.254,-12.901954,-12.902051,-38.419582,-38.417114
1,100722777,100722778,0.362,-12.899299,-12.896647,-38.407673,-38.408215
2,100722777,44782645,1.062,-12.899299,-12.899458,-38.407673,-38.412964
3,100722777,45833440,0.417,-12.899299,-12.896741,-38.407673,-38.408672
4,100722777,66771046,0.934,-12.899299,-12.89679,-38.407673,-38.41254


In [7]:
# Load the per-node (stop) feature table and preview it.
node_features_path = '../../data/raw/df_features.parquet'
df_node_features = pd.read_parquet(node_features_path)
df_node_features.head()

Unnamed: 0,ponto,carregamento,desce,linha,sobe,trip_id,veiculo,vm,target
0,100009577,0.946667,0.026667,1.08,0.48,1.106667,1.093333,8.275914,devagar
1,100722777,22.069444,5.222222,1.666667,4.840278,1.75,1.75,20.951829,normal
2,100722778,21.402516,1.666667,1.773585,0.081761,2.056604,2.056604,13.930104,normal
3,101214305,7.982759,4.465517,1.0,0.62069,1.0,1.0,17.759792,normal
4,101269104,77.882653,16.591837,4.540816,10.285714,5.520408,5.520408,36.548041,normal


In [8]:
# Left-join the edge features onto the graph edge list by (src, dst): every
# graph edge is kept, and edges without a feature row get NaN in the feature columns.
df_edges_join = pd.merge(
    df_edges,
    df_edges_features,
    how='left',
    on=['src', 'dst'],
)

In [9]:
# Compare row/column counts of the feature table, the graph edge list and their left join.
df_edges_features.shape, df_edges.shape, df_edges_join.shape

((4458, 5), (4526, 7), (4526, 10))

In [10]:
# Inspect non-null counts after the join; nulls in loader/vm_edge/target mark
# graph edges that had no matching feature row.
df_edges_join.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4526 entries, 0 to 4525
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   src       4526 non-null   object 
 1   dst       4526 non-null   object 
 2   distance  4526 non-null   float64
 3   src_lat   4526 non-null   float64
 4   dst_lat   4526 non-null   float64
 5   src_lon   4526 non-null   float64
 6   dst_lon   4526 non-null   float64
 7   loader    4445 non-null   float64
 8   vm_edge   4445 non-null   float64
 9   target    4445 non-null   object 
dtypes: float64(7), object(3)
memory usage: 389.0+ KB


In [11]:
# NOTE(review): exact repeat of the earlier shape check; no frame has been
# modified between the two, so this cell adds no new information.
df_edges_features.shape, df_edges.shape, df_edges_join.shape

((4458, 5), (4526, 7), (4526, 10))

In [12]:
# Keep only the edges that received a label from the join.
df_edges_join = df_edges_join.dropna(subset=['target'])

In [13]:
# From here on `df_edges_features` refers to the joined, labeled edge table
# (an independent copy), no longer to the raw parquet contents.
# NOTE(review): re-running the merge cell after this one would join against
# this rebound frame instead of the raw one.
df_edges_features = df_edges_join.copy(deep=True)
df_edges_features.head()

Unnamed: 0,src,dst,distance,src_lat,dst_lat,src_lon,dst_lon,loader,vm_edge,target
0,100009577,345936831,0.254,-12.901954,-12.902051,-38.419582,-38.417114,2.770115,29.946355,normal
1,100722777,100722778,0.362,-12.899299,-12.896647,-38.407673,-38.408215,18.540541,15.494709,normal
2,100722777,44782645,1.062,-12.899299,-12.899458,-38.407673,-38.412964,5.820513,10.86524,devagar
3,100722777,45833440,0.417,-12.899299,-12.896741,-38.407673,-38.408672,23.02381,16.153585,normal
4,100722777,66771046,0.934,-12.899299,-12.89679,-38.407673,-38.41254,21.5,14.545003,normal


In [14]:
# Confirm the joined edge table has no remaining nulls after the label filter.
df_edges_features.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4445 entries, 0 to 4525
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   src       4445 non-null   object 
 1   dst       4445 non-null   object 
 2   distance  4445 non-null   float64
 3   src_lat   4445 non-null   float64
 4   dst_lat   4445 non-null   float64
 5   src_lon   4445 non-null   float64
 6   dst_lon   4445 non-null   float64
 7   loader    4445 non-null   float64
 8   vm_edge   4445 non-null   float64
 9   target    4445 non-null   object 
dtypes: float64(7), object(3)
memory usage: 382.0+ KB


In [15]:
# Shape of the subset of edges whose distance is missing (0 rows means none).
missing_distance_mask = df_edges_features['distance'].isna()
df_edges_features.loc[missing_distance_mask].shape

(0, 10)

In [16]:
# Class distribution of the edge label; dropna=False would list missing labels
# as their own bucket if any were left.
df_edges_features.target.value_counts(dropna=False)

normal     2991
fluido      779
devagar     675
Name: target, dtype: int64

In [17]:
# Sizes of the edge table and the node table that feed the pre-processing step.
df_edges_features.shape, df_node_features.shape

((4445, 10), (2833, 9))

### Pre-processing

In [18]:
# Node table: remove `vm` and `target` (presumably to keep speed/label
# information out of the node inputs - TODO confirm), rename the stop id
# column to `leg_pos` and store it as a string.
#
# errors='ignore' makes the cell safe to re-run: it rebinds the same names, so
# on a second execution the columns are already gone and a plain drop() would
# raise KeyError. rename() and astype() are already no-ops on a second run.
df_node_features = (
    df_node_features
    .drop(columns=['vm', 'target'], errors='ignore')
    .rename(columns={'ponto': 'leg_pos'})
    .astype({'leg_pos': str})
)

# Edge table: remove `vm_edge`, again tolerating a re-run.
df_edges_features = df_edges_features.drop(columns=['vm_edge'], errors='ignore')
df_edges_features.head()

Unnamed: 0,src,dst,distance,src_lat,dst_lat,src_lon,dst_lon,loader,target
0,100009577,345936831,0.254,-12.901954,-12.902051,-38.419582,-38.417114,2.770115,normal
1,100722777,100722778,0.362,-12.899299,-12.896647,-38.407673,-38.408215,18.540541,normal
2,100722777,44782645,1.062,-12.899299,-12.899458,-38.407673,-38.412964,5.820513,devagar
3,100722777,45833440,0.417,-12.899299,-12.896741,-38.407673,-38.408672,23.02381,normal
4,100722777,66771046,0.934,-12.899299,-12.89679,-38.407673,-38.41254,21.5,normal


In [19]:
# Preview the node table after dropping `vm`/`target` and renaming `ponto` to `leg_pos`.
df_node_features.head()

Unnamed: 0,leg_pos,carregamento,desce,linha,sobe,trip_id,veiculo
0,100009577,0.946667,0.026667,1.08,0.48,1.106667,1.093333
1,100722777,22.069444,5.222222,1.666667,4.840278,1.75,1.75
2,100722778,21.402516,1.666667,1.773585,0.081761,2.056604,2.056604
3,101214305,7.982759,4.465517,1.0,0.62069,1.0,1.0
4,101269104,77.882653,16.591837,4.540816,10.285714,5.520408,5.520408


In [20]:
# Node-table columns handed to pre_processing as the feature columns.
features_cols = [
    'carregamento',
    'desce',
    'linha',
    'sobe',
    'trip_id',
    'veiculo',
]

In [21]:
# Rename the endpoint and distance columns to pos1 / pos2 / weight and store
# both endpoint ids as strings.
edge_column_names = {'src': 'pos1', 'dst': 'pos2', 'distance': 'weight'}
df_edges_features = (
    df_edges_features
    .rename(columns=edge_column_names)
    .astype({'pos1': str, 'pos2': str})
)
df_edges_features.head()

Unnamed: 0,pos1,pos2,weight,src_lat,dst_lat,src_lon,dst_lon,loader,target
0,100009577,345936831,0.254,-12.901954,-12.902051,-38.419582,-38.417114,2.770115,normal
1,100722777,100722778,0.362,-12.899299,-12.896647,-38.407673,-38.408215,18.540541,normal
2,100722777,44782645,1.062,-12.899299,-12.899458,-38.407673,-38.412964,5.820513,devagar
3,100722777,45833440,0.417,-12.899299,-12.896741,-38.407673,-38.408672,23.02381,normal
4,100722777,66771046,0.934,-12.899299,-12.89679,-38.407673,-38.41254,21.5,normal


In [22]:
# Build the graph tensors from the two tables with the project helper
# `data.pre_processing`.
# NOTE(review): judging by the shapes printed further down, it returns a node
# feature matrix, integer edge labels, an edge index of shape [num_edges, 2]
# and edge weights; `pos` is not used in the visible cells - confirm against the helper.
x, edge_labels, edge_index, edge_weights, pos = pre_processing(
    df_edges_features,
    df_node_features,
    features_cols,
    col_target='target',
    col_edges=['weight'],
)

100% 2833/2833 [00:10<00:00, 266.38it/s]


### One-hot encoding

In [23]:
# One-hot encode the integer edge labels (class count inferred from the data)
# and convert the result to float32.
edge_labels_ohe = torch.nn.functional.one_hot(edge_labels).to(torch.float32)
edge_labels_ohe.shape

torch.Size([4341, 3])

In [24]:
# Compare the one-hot label matrix with the original 1-D integer label vector.
edge_labels_ohe.shape, edge_labels.shape

(torch.Size([4341, 3]), torch.Size([4341]))

In [25]:
# From here on `edge_labels` is the float one-hot matrix, not the integer vector.
# NOTE(review): after this rebinding, re-running the one-hot cell above would
# pass float one-hot data to one_hot(); restart the kernel before re-running.
edge_labels = edge_labels_ohe.to(torch.float32)

## Create K-fold splits and save data

In [26]:
# Shapes at this point (see the output below): edge_index is [num_edges, 2],
# edge_labels is the one-hot matrix [num_edges, 3], edge_weights is [num_edges, 1].
edge_index.shape, edge_labels.shape, edge_weights.shape  

(torch.Size([4341, 2]), torch.Size([4341, 3]), torch.Size([4341, 1]))

In [27]:
# NumPy views of the edge tensors so the K-fold index arrays can slice them row-wise.
edge_index_np, edge_labels_np, edge_weights_np = (
    tensor.numpy() for tensor in (edge_index, edge_labels, edge_weights)
)
edge_index_np.shape

(4341, 2)

In [28]:
# Directory that receives one train/test pair of PyG `Data` files per fold.
FOLDS_DIR = '../../data/graph_designer/train_test_edge_classification'
# torch.save does not create missing parent directories, so make sure it exists.
os.makedirs(FOLDS_DIR, exist_ok=True)

# 10-fold split over the edges (rows of edge_index_np); shuffling is seeded so
# the folds are reproducible.
# NOTE(review): plain KFold does not balance the three label classes across
# folds; consider StratifiedKFold on the class ids - TODO confirm with the author.
kf = KFold(n_splits=10, shuffle=True, random_state=42)

for fold, (train_idx, test_idx) in enumerate(kf.split(edge_index_np)):
    fold_idx = fold + 1  # 1-based fold number, used in the file names
    print(f"Fold {fold_idx}")

    # Slice every per-edge array with the same index sets so rows stay aligned
    X_train, X_test = edge_index_np[train_idx], edge_index_np[test_idx]
    y_train, y_test = edge_labels_np[train_idx], edge_labels_np[test_idx]
    w_train, w_test = edge_weights_np[train_idx], edge_weights_np[test_idx]

    # Edge index: [num_edges, 2] -> [2, num_edges]. The transpose is only a
    # view, so .contiguous() materialises the layout expected for edge_index.
    X_train = torch.tensor(X_train, dtype=torch.long).t().contiguous()
    X_test = torch.tensor(X_test, dtype=torch.long).t().contiguous()

    # One-hot labels, shape [num_edges_in_split, num_classes]. Built as float
    # directly; the previous long -> float round trip produced the same values.
    y_train = torch.tensor(y_train, dtype=torch.float)
    y_test = torch.tensor(y_test, dtype=torch.float)

    # Edge weights, shape [num_edges_in_split, 1]
    w_train = torch.tensor(w_train, dtype=torch.float)
    w_test = torch.tensor(w_test, dtype=torch.float)

    # Both splits share the full node feature matrix `x`; only the edges differ
    train_data = Data(
        x=x,
        edge_index=X_train,
        edge_label=y_train,
        edge_attr=w_train
    )

    test_data = Data(
        x=x,
        edge_index=X_test,
        edge_label=y_test,
        edge_attr=w_test
    )

    # Persist this fold's pair
    torch.save(train_data, f'{FOLDS_DIR}/train_data_{fold_idx}.pt')
    torch.save(test_data, f'{FOLDS_DIR}/test_data_{fold_idx}.pt')

    # Summary of what was written for this fold
    print(f"Train data: {train_data}")
    print(f"Test data: {test_data}")
    print()

Fold 1
Train data: Data(x=[2833, 6], edge_index=[2, 3906], edge_attr=[3906, 1], edge_label=[3906, 3])
Test data: Data(x=[2833, 6], edge_index=[2, 435], edge_attr=[435, 1], edge_label=[435, 3])

Number os class: tensor([2, 2, 2, 2, 2, 2, 2, 1, 0, 2, 2, 2, 2, 2, 2, 2, 1, 1, 1, 1, 0, 2, 0, 2,
        2, 2, 2, 2, 2, 0, 2, 2, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2,
        2, 2, 2, 2, 2, 2, 2, 1, 2, 1, 2, 2, 2, 2, 0, 0, 2, 1, 2, 2, 0, 2, 2, 2,
        2, 2, 2, 2, 2, 2, 2, 1, 2, 0, 2, 0, 2, 1, 1, 2, 1, 0, 2, 2, 2, 2, 2, 2,
        2, 2, 2, 2, 2, 1, 2, 1, 0, 2, 1, 2, 2, 1, 2, 2, 2, 2, 2, 2, 0, 1, 2, 0,
        2, 2, 1, 1, 2, 2, 1, 0, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 1,
        2, 2, 1, 2, 1, 2, 2, 2, 1, 2, 2, 1, 1, 2, 2, 2, 2, 2, 2, 0, 2, 1, 2, 2,
        2, 2, 2, 1, 2, 1, 2, 2, 0, 2, 2, 0, 2, 1, 2, 2, 0, 2, 2, 2, 2, 2, 2, 2,
        2, 1, 0, 2, 0, 2, 2, 0, 2, 2, 2, 0, 2, 2, 2, 2, 0, 1, 0, 2, 2, 2, 2, 1,
        2, 2, 2, 2, 1, 2, 2, 2, 1, 2, 1, 2, 2, 1, 2, 0, 2, 2, 2, 1, 0

In [29]:
# `train_data` is the object left over from the last iteration of the fold
# loop; show its one-hot edge labels.
train_data.edge_label

tensor([[0., 0., 1.],
        [0., 0., 1.],
        [1., 0., 0.],
        ...,
        [0., 0., 1.],
        [0., 0., 1.],
        [0., 0., 1.]])

In [33]:
# Class-id frequencies in the last fold's test split: argmax over the one-hot
# columns recovers the integer class of each edge.
test_class_ids = test_data.edge_label.argmax(dim=1).numpy()
Counter(test_class_ids)

Counter({2: 297, 1: 78, 0: 59})