In [1]:
import sys
import os
sys.path.insert(1, '/home/jovyan/graphormer_v2/')

import torch
import numpy as np
import torch_geometric.datasets
from ogb.graphproppred import PygGraphPropPredDataset
from ogb.lsc.pcqm4m_pyg import PygPCQM4MDataset
import pyximport
from torch_geometric.data import InMemoryDataset, download_url
import pandas as pd
from sklearn import preprocessing

pyximport.install(setup_args={'include_dirs': np.get_include()})
import os.path as osp
from torch_geometric.data import Data
import time

from torch_geometric.utils import add_self_loops, negative_sampling
from graphormer.data.wrapper import preprocess_item


Using backend: pytorch


In [3]:
import sys
from os import path

path.dirname(path.dirname(path.abspath(''))) 

'/home/jovyan/graphormer_v2/examples'

In [2]:
from graphormer.models.graphormer import GraphormerModel


In [None]:
checkpoint_best.pt

In [None]:
from fairseq.models.transformer import TransformerModel
zh2en = TransformerModel.from_pretrained(
  '/path/to/checkpoints',
  checkpoint_file='checkpoint_best.pt',
  data_name_or_path='data-bin/wmt17_zh_en_full',
  bpe='subword_nmt',
  bpe_codes='data-bin/wmt17_zh_en_full/zh.code'
)
zh2en.translate('你好 世界')
# 'Hello World'

In [5]:
root = osp.join('dataset', 'omsk')
update_dir = osp.join(root, 'processed', 'data_omsk_1_upd')
split = 'train'
a = torch.load(update_dir + '/' + f'{split}.pt')

In [2]:
class geo_Omsk(InMemoryDataset):
    
    
    def __init__(self, root, transform=None, pre_transform=None, split = 'train'):
        super(geo_Omsk, self).__init__(root, transform, pre_transform)
        self.data, self.slices = torch.load(self.processed_dir + '/' + f'{split}.pt')
        # self.data = torch.load(self.processed_dir + '/' + f'{split}.pt')
#         self.raw_dir = '/home/jovyan/'
        
    @property
    def raw_dir(self) -> str:
        return '/home/jovyan/tte_data/'
    
    
    @property
    def raw_file_names(self):
        return ['omsk_full_routes_final_weather_L_NaN_filtered_FIXED.csv']

    @property
    def processed_file_names(self):
        return ['train.pt', 'test.pt', 'val.pt']
    
    @property
    def processed_dir(self):
        return osp.join(self.root)

    def download(self):
        path = download_url(self.url, self.raw_dir)
        print(self.processed_paths[0])
    
    def my_load_dataset(self):
        return [self.data, self.slices]
    
    def process(self):
        
        # Read data
        start_time = time.time()
        data = pd.read_csv(osp.join('/home/jovyan/tte_data/', 'omsk_full_routes_final_weather_L_NaN_filtered_FIXED.csv'))
        data = data[data['rebuildCount']<=1].reset_index(drop = True).copy()
        shape = int(data.shape[0])
        data = data[0:shape].copy()
        
        data = data.drop(columns = ['Unnamed: 0'])
        data['hour'] = data['start_timestamp'].apply(lambda x: int(x[-10:-8]))
        # Graph 
        graph_columns_gran = ['edges', 'time', 'speed', 'length']
        edges = ['edges']
        target = ['time']
        node_features_gran = ['speed', 'length']

        edge_features_agg = [' start_point_part', 'finish_point_part', 'day_period', 'week_period', 'clouds', 'snow', 'temperature', 'wind_dir', 'wind_speed', 'pressure','hour']

        
        all_speed = []
        all_length = []
        for i in range(0,shape):
            data_row = data[i:i+1].reset_index(drop = True).copy()
            speed_list = [int(x) for x in (data_row['speed'].values[0].replace("'",'').split(','))]
            list_length = [int(x) for x in (data_row['length'].values[0].replace("'",'').split(','))]
            all_speed.append(speed_list)
            all_length.append(list_length)
            
        all_speed = [item for sublist in all_speed for item in sublist]
        all_length = [item for sublist in all_length for item in sublist]
    
        train_size = 0.8
        test_size = 0.1
        val_size = 0.1

        data_split_dict = dict()
        data_split_dict['train'] = np.arange(0, int(data.shape[0]*train_size))
        data_split_dict['test'] = np.arange(int(data.shape[0]*train_size), int(data.shape[0]*(train_size+test_size)))
        data_split_dict['val'] = np.arange(int(data.shape[0]*(train_size + test_size)),int((data.shape[0]*(train_size+test_size + val_size))))
        
        for split in data_split_dict.keys():
            data_list = []
            for i in data_split_dict[split]:
                data_row = data.iloc[[i],].reset_index(drop = True).copy()

                edge_list = [int(x) for x in (data_row['edges'].values[0].replace("'",'').split(','))]
                speed_list = [int(x) for x in (data_row['speed'].values[0].replace("'",'').split(','))]
                list_length = [int(x) for x in (data_row['length'].values[0].replace("'",'').split(','))]

                source = edge_list.copy()
                target = edge_list[1:].copy() + [edge_list[0]].copy()

                data_row_gran = pd.DataFrame()
                data_row_gran['source'] = source
                data_row_gran['target'] = target
                data_row_gran['speed'] = speed_list
                data_row_gran['length'] = list_length

                
                target_val = data_row['RTA'].values[0]


                data_row_gran['speed'] = data_row_gran['speed']/np.mean(speed_list)
                data_row_gran['length'] = data_row_gran['length']/np.mean(list_length)

                for col in edge_features_agg:
                    data_row_gran[col] = data_row[col].values[0]

                total_nodes_list = list(set(list(data_row_gran.source.values)))
                le = preprocessing.LabelEncoder()
                le.fit(total_nodes_list)

                data_row_gran['source'] = le.transform(data_row_gran.source.values)
                data_row_gran['target'] = le.transform(data_row_gran.target.values)

                total_nodes_list = list(set(list(data_row_gran.source.values)))

                edge_index = torch.tensor(torch.from_numpy(data_row_gran[['source','target']].values.T),dtype = torch.long)


                # Define tensor of nodes features
                x = torch.tensor(torch.from_numpy(data_row_gran[['speed','length'] + edge_features_agg].values),dtype = torch.long)

                # Define tensor of edge features

                # Define tensor of edge features
                edge_num_feach = 1
                edge_attr = torch.from_numpy(np.ones(shape = ((edge_index.size()[1]), edge_num_feach)))
                edge_attr = torch.tensor(edge_attr,dtype = torch.long)

                # Define tensor of targets
                y = torch.tensor(target_val,dtype = torch.long)


                data_graph = Data(x=x, edge_index = edge_index, edge_attr = edge_attr, y=y)
                data_graph = preprocess_item(data_graph)
                data_list.append(data_graph)
            torch.save(data_list, osp.join(self.processed_dir, f'{split}.pt'))
    
    # def get(self, idx):
    #     data = torch.load(osp.join(self.processed_dir, f'{idx}.pt'))
    #     return data


In [2]:
# import multiprocessing

# output=[]
# data = range(0,200000000)

# def f(x):
#     return x**2

# def handler():
#     p = multiprocessing.Pool(64)
#     r=p.map(f, data)
#     return r

# a = handler()
# if __name__ == '__main__':
#     output.append(handler())

# print(output[0])

In [3]:
root = "dataset/" + 'omsk'
a = geo_Omsk(root)


Processing...




KeyboardInterrupt: 

In [12]:
data_train.get(0)

Data(edge_attr=[53, 1], edge_index=[2, 53], x=[53, 13], y=[1])

In [12]:
geo_Omsk(root, split="train")

ValueError: too many values to unpack (expected 2)

In [14]:
data_train.my_load_dataset

Data(edge_attr=[53, 1], edge_index=[2, 53], x=[53, 13], y=[1])

In [9]:
preprocess_item(data_train[0])

preprocess start
start floyd
start floyd_warshall
end floyd_warshall
end floyd
floyd end with time 0.0019443035125732422
start gen_edge_input
start gen_edge_input
start numpy
end numpy
('numpy end with time', 0.0009686946868896484)
start sycle
end sycle
('sycle end with time', 0.07059931755065918)
end gen_edge_input
end gen_edge_input
gen_edge_input end with time 0.07194352149963379
preprocess end with time 0.08118867874145508
shortest_path_result [[ 0  6 45 ...  5 48 49]
 [47  0 39 ... 52 42 43]
 [ 8 14  0 ... 13  3  4]
 ...
 [48  1 40 ...  0 43 44]
 [ 5 11 50 ... 10  0  1]
 [ 4 10 49 ...  9 52  0]]
path [[ 0 50 50 ... 36 50 51]
 [52  0 49 ... 52 49 51]
 [52 52  0 ... 52 48 51]
 ...
 [52  0 49 ...  0 49 51]
 [52 52 52 ... 52  0  0]
 [46 50 50 ... 46 50  0]]
max_dist 52


Data(attn_bias=[54, 54], attn_edge_type=[53, 53, 1], edge_attr=[53, 1], edge_index=[2, 53], edge_input=[53, 53, 52, 1], in_degree=[53], out_degree=[53], spatial_pos=[53, 53], x=[53, 13], y=[1])

In [15]:
data_test

geo_Omsk(53926)

In [16]:
data_valid

geo_Omsk(53926)

In [None]:
import sys
import os
# cwd = os.getcwd()
sys.path.insert(1, '/home/jovyan/graphormer_v2/')

from graphormer.data.dataset import (
    GraphormerDataset,
    EpochShuffleDataset
)

from graphormer.models.graphormer import GraphormerModel
from graphormer.tasks.graph_prediction import GraphPredictionTask

In [2]:
dataset = GraphormerDataset(
                dataset_spec = 'omsk',
                dataset_source = 'pyg',
            )

batched_data = BatchedDataDataset(
            dataset.dataset_train,
            max_node=16,
            multi_hop_max_dist=5,
            spatial_pos_max=2,
        )

In [3]:
class cfg():
    user_data_dir = ""
    dataset_name = "omsk"
    dataset_source = "pyg"
    seed = 2
    max_nodes = 16,
    multi_hop_max_dist=5,
    spatial_pos_max=2,
    train_epoch_shuffle = False
    
a = cfg()
task = GraphPredictionTask(a)
task.load_dataset('train')

{}
3


2022-02-02 16:11:26 | INFO | graphormer.tasks.graph_prediction | Loaded train with #samples: 431406


<fairseq.data.nested_dictionary_dataset.NestedDictionaryDataset at 0x7f3c1adaf690>

In [4]:
batch_iterator = task.get_batch_iterator(
        dataset=task.dataset('train'),
        max_positions=None,
        ignore_invalid_inputs=True,
        required_batch_size_multiple=1,
        seed=10,
        num_workers=4,
        epoch=0,
        data_buffer_size=64,
        disable_iterator_cache=True,
    )

itr = batch_iterator.next_epoch_itr(shuffle=False, set_dataset_epoch=False)

from fairseq.logging import progress_bar
progress = progress_bar.progress_bar(itr, log_format = "simple")

In [10]:
for i, sample in enumerate(progress):
    print(i)
    # y = model(**sample["net_input"])[:, 0, :].reshape(-1)

In [2]:
dataset = GraphormerDataset(
                dataset_spec = 'omsk',
                dataset_source = 'pyg',
            )

In [3]:
from graphormer.data.wrapper import preprocess_item


In [4]:
preprocess_item(dataset.dataset_train[3])

preprocess start
start floyd
start floyd_warshall
end floyd_warshall
end floyd
floyd end with time 0.0018382072448730469
start gen_edge_input
start gen_edge_input
start numpy
end numpy
('numpy end with time', 0.0014154911041259766)
start sycle
end sycle
('sycle end with time', 0.04857349395751953)
end gen_edge_input
end gen_edge_input
gen_edge_input end with time 0.05029606819152832
preprocess end with time 0.059174299240112305
shortest_path_result [[ 0  2  3 ...  5  7 15]
 [40  0  1 ...  3  5 13]
 [39 41  0 ...  2  4 12]
 ...
 [37 39 40 ...  0  2 10]
 [35 37 38 ... 40  0  8]
 [27 29 30 ... 32 34  0]]
path [[ 0 35 35 ... 35 39 40]
 [41  0  0 ... 21 39 40]
 [41 41  0 ... 21 39 40]
 ...
 [41 41 41 ...  0 27 40]
 [41 41 41 ... 41  0 32]
 [38 38 38 ... 38 39  0]]
max_dist 41
preprocess start
start floyd
start floyd_warshall
end floyd_warshall
end floyd
floyd end with time 0.0015265941619873047
start gen_edge_input
start gen_edge_input
start numpy
end numpy
('numpy end with time', 0.0006906

Data(attn_bias=[43, 43], attn_edge_type=[42, 42, 1], edge_attr=[42, 1], edge_index=[2, 42], edge_input=[42, 42, 41, 1], idx=3, in_degree=[42], out_degree=[42], spatial_pos=[42, 42], x=[42, 13], y=[1])

In [8]:
dataset.dataset_train[10]

preprocess start
start floyd
start floyd_warshall
end floyd_warshall
end floyd
floyd end with time 0.00013136863708496094
start gen_edge_input
start gen_edge_input
start numpy
end numpy
('numpy end with time', 3.5762786865234375e-05)
start sycle
end sycle
('sycle end with time', 0.0005044937133789062)
end gen_edge_input
end gen_edge_input
gen_edge_input end with time 0.0010328292846679688
preprocess end with time 0.0020513534545898438


Data(attn_bias=[10, 10], attn_edge_type=[9, 9, 1], edge_attr=[9, 1], edge_index=[2, 9], edge_input=[9, 9, 8, 1], idx=10, in_degree=[9], out_degree=[9], spatial_pos=[9, 9], x=[9, 13], y=[1])