In [1]:
import os
import pickle
import warnings
from datetime import datetime
from datetime import timedelta

import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

In [2]:
# Notebook-wide presentation settings.
sns.set_theme(style="darkgrid")
# Silence every warning category for the rest of the session.
warnings.simplefilter('ignore')
# Never truncate the column list when a DataFrame is displayed.
pd.options.display.max_columns = None

## Graph

In [3]:
# `nx.read_gpickle` was removed in NetworkX 3.0. It was a thin wrapper around
# `pickle.load`, so reading the file directly works on old and new versions.
# NOTE: unpickling can execute arbitrary code -- only load graph files you trust.
with open('../data/graph_designer/graph_gtfs_fev_2024.gpickle', 'rb') as graph_file:
    G = pickle.load(graph_file)
num_nodes = G.number_of_nodes()
num_edges = G.number_of_edges()

# Quick sanity summary of the loaded graph.
print(f"Direcionado ..........: {G.is_directed()}")
print(f"Numero de nós  .......: {num_nodes}")
print(f"Numero de arestas ....: {num_edges}")

Direcionado ..........: True
Numero de nós  .......: 2871
Numero de arestas ....: 4526


In [27]:
df_edges = pd.read_parquet('../data/graph_designer/graph_gtfs_fev_2024.parquet')
df_edges.head()

Unnamed: 0,src,dst,distance,src_lat,dst_lat,src_lon,dst_lon
0,100009577,345936831,0.254,-12.901954,-12.902051,-38.419582,-38.417114
1,100722777,100722778,0.362,-12.899299,-12.896647,-38.407673,-38.408215
2,100722777,44782645,1.062,-12.899299,-12.899458,-38.407673,-38.412964
3,100722777,45833440,0.417,-12.899299,-12.896741,-38.407673,-38.408672
4,100722777,66771046,0.934,-12.899299,-12.89679,-38.407673,-38.41254


In [31]:
df_edges_sample = df_edges[:5].copy()

In [32]:
# NOTE(review): 'averange_speed' looks like a typo for 'average_speed'. The name is
# left unchanged because it becomes a column header in sample_edge_features.csv.
df_edges_sample['averange_speed'] = [25.6, 11.3, 40.2, 50.5, 26.2]

In [33]:
df_edges_sample['trip_time'] = [4, 8, 5, 10, 6]

In [35]:
df_edges_sample['loading'] = [78, 20, 45, 90, 30]

In [38]:
df_edges_sample.round(3).to_csv('sample_edge_features.csv', index=False)

In [None]:
# NOTE(review): this writes to a file literally named 'edge_features' (no extension)
# and, with the pandas default, includes the index column -- unlike the sample
# export, which uses a '.csv' name and index=False. Confirm this is intended.
df_edges.to_csv('edge_features')

In [30]:
df_edges.src.nunique(), df_edges.dst.nunique()

(2869, 2870)

In [28]:
df_edges.distance.describe()

count    4526.000000
mean        0.469940
std         0.521761
min         0.000000
25%         0.198000
50%         0.337000
75%         0.567000
max         7.076000
Name: distance, dtype: float64

## Intervals (granularity)

In [5]:
def add_intervals(
    df: pd.DataFrame,
    date: str,
    freq=5,
    min_time='05:00:00',
    max_time='00:59:59'
    ) -> tuple:
    '''
    Label every record of a boarding/alighting ("sobe e desce") frame with a time bin.

    The window starts at ``date min_time`` and ends at ``max_time`` on the day
    after ``date``. It is cut into consecutive bins of ``freq`` minutes anchored
    at the window start, and each record receives the start of the bin that its
    ``hora_ponto`` timestamp falls into.

    Parameters
    ----------
    df : pd.DataFrame
        Must contain a ``hora_ponto`` column with timestamps formatted as
        ``'%Y-%m-%d %H:%M:%S'``. The frame passed in is not modified.
    date : str
        Day of the window start, formatted ``'YYYY-MM-DD'``.
    freq : int or float
        Bin width in minutes. Must be positive.
    min_time, max_time : str
        ``'HH:MM:SS'`` window start (on ``date``) and window end (always taken
        on the following calendar day).

    Returns
    -------
    tuple
        ``(df_binned, intervals)``. ``df_binned`` is a copy of ``df`` with
        ``hora_ponto`` converted to datetime and a new ``interval`` column;
        records outside the window are dropped. ``intervals`` is the list of
        bin start times (``datetime`` objects).

    Raises
    ------
    ValueError
        If ``freq`` is not positive (the bin generation would never terminate).
    '''
    if freq <= 0:
        raise ValueError(f"freq must be a positive number of minutes, got {freq!r}")

    stamp_format = '%Y-%m-%d %H:%M:%S'
    min_datetime = datetime.strptime(f"{date} {min_time}", stamp_format)
    # The window always closes on the calendar day after `date`.
    max_datetime = datetime.strptime(f"{date} {max_time}", stamp_format) + timedelta(days=1)

    # Bin starts: every multiple of `step` from the window start that is not
    # later than the window end.
    step = timedelta(minutes=freq)
    n_steps = (max_datetime - min_datetime) // step
    intervals = [min_datetime + k * step for k in range(n_steps + 1)]
    print(f"End generation: {intervals[-1] + step}")
    print(f"Intervals: {len(intervals)}")

    hora_ponto = pd.to_datetime(df.hora_ponto, format=stamp_format)
    # The window end is inclusive, so records falling in the last bin are kept;
    # pairing consecutive bin starts would leave that bin without any record.
    in_window = (hora_ponto >= min_datetime) & (hora_ponto <= max_datetime)
    # Floor the elapsed time to a whole number of steps instead of scanning
    # every bin with a boolean mask (one vectorised pass, no chained assignment).
    elapsed = hora_ponto - pd.Timestamp(min_datetime)
    bin_start = pd.Timestamp(min_datetime) + elapsed.dt.floor(pd.Timedelta(step))

    # `assign` builds a new frame, leaving the caller's DataFrame untouched.
    df = df.assign(hora_ponto=hora_ponto, interval=bin_start.where(in_window))
    # Records outside the window have no bin (NaT) and are discarded.
    df = df[df.interval.notna()]

    return df, intervals

## Data day

In [6]:
# Service day to process; every derived string below is built from these parts.
year, month, day = '2024', '03', '08'
# Day-first date string, used in the input file names read from `path`.
date = f'{day}-{month}-{year}'
# Folder names are derived from the same variables (instead of a hard-coded
# '2024-03' / '03_2024'), so changing the month or year above is sufficient.
date_folder = f'{year}-{month}-{day}'
path = f'/mnt/data/sobe_desce/{month}_{year}/{date_folder}/output/'
path

'/mnt/data/sobe_desce/03_2024/2024-03-08/output/'

### Trip Time Series

In [7]:
# Load the trip time series of the selected day; the stop column is renamed to
# `ponto` and cast to string so it can be matched against the `sb` table.
tts = (
    pd.read_parquet(f'{path}trips_time-series_{date}_{date}.parquet')
    .rename(columns={'stop_id': 'ponto'})
    .astype({'ponto': str})
)
tts.head(3)

Unnamed: 0,linha_atend,linha,cod_ate,sentido,shape,ordem,ponto,lat,lon,distancias,acumulado,posicao,veiculo,trip,trip_id,inicio_viagem,fim_viagem,set,hora_ponto,saida,fill,tempo_parada,duracao,tempo_total,tempo_parada_acum,duracao_acum,distancia,dis_acum,vm,vm_acum,type
0,1374_30966,1374,30966,I,30966_I,1,110438500,-12.942272,-38.421867,0.0,0.0,inicial,20771,1.0,20771_1374_1,2024-03-08 06:25:17,2024-03-08 06:44:25,ottrans,2024-03-08 06:25:17,2024-03-08 06:26:05,False,48.0,0.0,48.0,48.0,0.0,0.0,0.0,0.0,0.0,bus
1,1374_30966,1374,30966,I,30966_I,2,110952763,-12.940207,-38.427055,0.922,0.922,intermediario,20771,1.0,20771_1374_1,2024-03-08 06:25:17,2024-03-08 06:44:25,ottrans,2024-03-08 06:27:01,2024-03-08 06:27:05,False,4.0,56.0,60.0,52.0,56.0,0.607295,0.0,39.040412,0.0,bus
2,1374_30966,1374,30966,I,30966_I,3,110952766,-12.937143,-38.425938,0.391,1.313,intermediario,20771,1.0,20771_1374_1,2024-03-08 06:25:17,2024-03-08 06:44:25,ottrans,2024-03-08 06:28:03,2024-03-08 06:28:10,False,7.0,58.0,65.0,59.0,114.0,0.361609,0.607295,22.444727,19.177746,bus


In [8]:
tts.ponto.nunique()

2860

In [9]:
# cols_unpack_tts = ['trip_id', 'sentido', 'ponto', 'tempo_parada']
# tts_stop_id_coord  = tts.pivot_table(index=cols_unpack_tts, 
#                                      aggfunc=set).reset_index()[cols_unpack_tts]
# tts_stop_id_coord.head()

### Sobe e Desce

In [10]:
# Load the boarding/alighting ("sobe e desce") records of the selected day,
# with stop ids (`ponto`) as strings.
sb = pd.read_csv(f'{path}sobe_desce_{date}_esp.csv').astype({'ponto': str})
sb.head(3)

Unnamed: 0,index,linha,cod_ate,sentido,ordem,ponto,veiculo,trip,trip_id,inicio_viagem,fim_viagem,set,hora_ponto,sobe,desce,lag_carregamento,saldo,carregamento,sobe_fit,lag_carregamento_fit,desce_fit,saldo_fit,carregamento_fit,percent_desce,percent_sobe_esp,sobe_especie,sobe_total_especie,lag_carregamento_especie,desce_total_especie,saldo_especie,carregamento_especie,percent_desce_fit
0,11353,1137,33228,I,1,45833577,20937,1,20937_1137_1,2024-03-08 05:10:32,2024-03-08 05:37:51,ottrans,2024-03-08 05:10:32,11.0,0.0,0,0,11,11.0,0.0,0.0,0.0,11.0,0.0,1.270208,1.765589,12.765589,0.0,0.0,0.0,12.765589,0.0
1,11354,1137,33228,I,2,45833578,20937,1,20937_1137_1,2024-03-08 05:10:32,2024-03-08 05:37:51,ottrans,2024-03-08 05:10:59,9.0,0.0,11,11,20,9.0,11.0,0.0,11.0,20.0,0.0,1.039261,1.444573,10.444573,12.765589,0.0,12.765589,23.210162,0.0
2,11355,1137,33228,I,3,45832840,20937,1,20937_1137_1,2024-03-08 05:10:32,2024-03-08 05:37:51,ottrans,2024-03-08 05:13:13,1.0,0.0,20,20,21,1.0,20.0,0.0,20.0,21.0,0.0,0.115473,0.160508,1.160508,23.210162,0.0,23.210162,24.37067,0.0


In [11]:
sb.shape

(735127, 32)

In [12]:
# Attach the `tempo_parada` column of the trip time series to each
# boarding/alighting record, matching on trip, direction, stop order and stop.
cols_on_tts = ['trip_id', 'sentido', 'ordem', 'ponto', 'tempo_parada']
merge_keys = cols_on_tts[:-1]
n_rows_before = len(sb)
# Dropping a previously merged `tempo_parada` keeps this cell idempotent:
# re-running it would otherwise create `tempo_parada_x` / `tempo_parada_y`.
sb = (
    sb.drop(columns='tempo_parada', errors='ignore')
      .merge(tts[cols_on_tts], on=merge_keys, how='left')
)
# A left join must not multiply records: duplicated keys in `tts` would
# inflate every count aggregated from `sb` afterwards.
assert len(sb) == n_rows_before, "duplicate merge keys in tts multiplied sb rows"
sb.shape

(735127, 33)

In [13]:
sb.shape

(735127, 33)

## Temporal node Features

In [14]:
df_day, inter = add_intervals(sb, 
                             date=f'{year}-{month}-{day}', 
                             freq=5)

End generation: 2024-03-09 01:00:00
Intervals: 240


100% 239/239 [00:01<00:00, 235.41it/s]


### Create intervals for each node

In [15]:
def create_nodes_intervals(gnx, intervals):
    """
    Build the (node, time) grid onto which temporal node features are joined.

    For every node of ``gnx`` one row per entry of ``intervals`` is produced,
    carrying the node's ``lat`` and ``lon`` attributes.

    Parameters
    ----------
    gnx : graph
        Object exposing ``nodes()`` and ``nodes[node]`` attribute access (as a
        NetworkX graph does); every node must have ``lat`` and ``lon``.
    intervals : sequence
        Time values to replicate for each node.

    Returns
    -------
    pd.DataFrame
        Columns ``time``, ``node``, ``lat``, ``lon``; rows are grouped by node
        in graph order and indexed 0..n-1. Empty (same columns) when the graph
        has no nodes.
    """
    node_frames = []
    for node in gnx.nodes():
        # Attributes come from the graph passed in, not from a notebook global,
        # so the function works for any graph.
        attrs = gnx.nodes[node]
        node_frames.append(
            pd.DataFrame({
                'time': intervals,
                'node': node,
                'lat': attrs['lat'],
                'lon': attrs['lon'],
            })
        )

    # pd.concat raises on an empty list; return an empty grid instead.
    if not node_frames:
        return pd.DataFrame(columns=['time', 'node', 'lat', 'lon'])

    return pd.concat(node_frames, ignore_index=True)

In [16]:
df_nodes = create_nodes_intervals(G, inter)
print(f"num nodes in A ...: {df_nodes.node.nunique()}")

100% 2871/2871 [00:05<00:00, 541.12it/s]


num nodes in A ...: 2871


In [17]:
df_nodes.shape

(689040, 4)

### Join features for each node in each time

In [18]:
def join_features(df_nds, df_sb, atts, aggs, index_cols=['time', 'node'],):
    """
    Left-join one aggregated column per attribute onto the node/time grid.

    For each attribute in ``atts``, ``df_sb`` is aggregated over ``index_cols``
    with the function at the same position in ``aggs``, and the result is
    merged onto ``df_nds``. Grid rows without observations keep NaN.

    Parameters
    ----------
    df_nds : pd.DataFrame
        Grid containing the ``index_cols`` columns.
    df_sb : pd.DataFrame
        Observations containing ``index_cols`` and every column in ``atts``.
    atts : list
        Column names to aggregate.
    aggs : list
        Aggregation for each attribute (anything ``pivot_table`` accepts).
    index_cols : list
        Key columns shared by both frames.

    Returns
    -------
    pd.DataFrame
        ``df_nds`` with one extra column per attribute.
    """
    merged = df_nds
    for position, column in enumerate(atts):
        reducer = aggs[position]
        print(position, column, reducer)
        # Aggregate the observations of this attribute per key combination.
        per_key = pd.pivot_table(
            df_sb,
            index=index_cols,
            values=column,
            aggfunc=reducer,
        )
        # Left merge keeps every grid row, matched or not.
        merged = pd.merge(merged, per_key.reset_index(), on=index_cols, how='left')

    return merged

In [19]:
# Give both frames the same key columns (`time`, `node`) and the same key dtype
# before joining: node ids are compared as strings.
dtypes = {'node': str}
df_day_new = (
    df_day
    .rename(columns={'interval': 'time', 'ponto': 'node'})
    .astype(dtypes)
)
# `astype` returns a new frame, so `df_nodes` itself is left untouched.
df_nodes_src = df_nodes.astype(dtypes)

In [20]:
atributos = ['sobe', 'desce', 'carregamento', 'linha', 'trip_id', 'tempo_parada']
funcoes   = ['sum', 'sum', 'sum', pd.Series.nunique, pd.Series.nunique, 'mean']

In [21]:
df_nodes_f = join_features(df_nodes_src, df_day_new, atributos, funcoes)

0 sobe sum
1 desce sum
2 carregamento sum
3 linha <function IndexOpsMixin.nunique at 0x7f27062ec9e0>
4 trip_id <function IndexOpsMixin.nunique at 0x7f27062ec9e0>
5 tempo_parada mean


In [22]:
df_nodes_f.head()

Unnamed: 0,time,node,lat,lon,sobe,desce,carregamento,linha,trip_id,tempo_parada
0,2024-03-08 05:00:00,100009577,-12.901954,-38.419582,,,,,,
1,2024-03-08 05:05:00,100009577,-12.901954,-38.419582,3.0,0.0,17.0,1.0,1.0,27.0
2,2024-03-08 05:10:00,100009577,-12.901954,-38.419582,1.0,0.0,1.0,1.0,1.0,0.0
3,2024-03-08 05:15:00,100009577,-12.901954,-38.419582,,,,,,
4,2024-03-08 05:20:00,100009577,-12.901954,-38.419582,2.0,0.0,2.0,1.0,1.0,0.0


In [23]:
df_nodes_f = df_nodes_f.fillna(0)
df_nodes_f.head()

Unnamed: 0,time,node,lat,lon,sobe,desce,carregamento,linha,trip_id,tempo_parada
0,2024-03-08 05:00:00,100009577,-12.901954,-38.419582,0.0,0.0,0.0,0.0,0.0,0.0
1,2024-03-08 05:05:00,100009577,-12.901954,-38.419582,3.0,0.0,17.0,1.0,1.0,27.0
2,2024-03-08 05:10:00,100009577,-12.901954,-38.419582,1.0,0.0,1.0,1.0,1.0,0.0
3,2024-03-08 05:15:00,100009577,-12.901954,-38.419582,0.0,0.0,0.0,0.0,0.0,0.0
4,2024-03-08 05:20:00,100009577,-12.901954,-38.419582,2.0,0.0,2.0,1.0,1.0,0.0


In [24]:
# Helper that removes outliers from one column using the 1.5 * IQR fences
def remove_outliers(df, column):
    """
    Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - 1.5 * IQR`` and ``Q3 + 1.5 * IQR`` (both inclusive),
    where Q1 and Q3 are the 25th and 75th percentiles of the column.
    Rows keep their original index.
    """
    values = df[column]
    q1, q3 = values.quantile([0.25, 0.75])
    spread = q3 - q1
    within_fences = values.between(q1 - 1.5 * spread, q3 + 1.5 * spread)
    return df[within_fences]

# Removing outliers
# NOTE(review): `cleaned_data` is not referenced by any later cell of this
# notebook; the tensor is built from `df_nodes_f`, so this filtering currently
# has no effect on the exported features.
cleaned_data = remove_outliers(df_nodes_f, 'tempo_parada')

## Create Tensor
 - shape: (features x intervals x nodes)

In [25]:
# Build one (time x node) matrix per attribute and stack them into one array.
dfs = []
# Records, for each attribute, the node label of every matrix column.
df_columns_order = pd.DataFrame()
for att in atributos:
    # Wide layout: one row per `time` value, one column per `node`.
    df_att = df_nodes_f.pivot_table(index='time', columns='node', values=att).reset_index()
    
    # Only the node columns go into the matrix; the time column is discarded.
    df_att = df_att.drop(['time'], axis=1)
    dfs.append(df_att.to_numpy())
    df_columns_order[att] = df_att.columns
    
# Stacking the list of matrices puts the attribute axis first:
# (len(atributos), number of time values, number of nodes).
node_features = np.array(dfs)
node_features.shape

(6, 240, 2871)

In [26]:
# NOTE(review): `stop` is not defined anywhere in this notebook, so this cell
# raises NameError. It appears to serve as a manual guard that halts "Run All"
# before the cells below write files -- confirm, and consider an explicit flag.
stop

NameError: name 'stop' is not defined

In [None]:
# Number the rows so each node label can be mapped to its position in the tensor,
# then persist the mapping.
df_columns_order['tensor_idx'] = np.arange(len(df_columns_order))
df_columns_order.to_parquet(
    '../data/graph_designer/tensor_node_idx.parquet',
    index=False,
)
df_columns_order.head()

In [None]:
# save dataframe with a temporal node features
df_nodes_f.to_parquet(f'../data/graph_designer/node_temporal_features_{date}.parquet', index=False)
# numpy array with all features
np.save(f'../data/graph_designer/tensor_node_temporal_features_{date}.npy', node_features)

In [None]:
node_features.shape

In [None]:
date