In [1]:
import numpy as np
import pandas as pd
from torch_geometric.data import Data

# PyTorch Geometric custom Dataset
This notebook illustrates the processing step of the custom PyG dataset implemented in the `datasets/SP100Stocks.py` file.
The Dataset class is used to create a custom dataset and use it with PyG modules.
**Note: To use this notebook, first execute notebooks 1 and 2.**

## Processing the graph data
The next cells demonstrate the adjacency matrix and historical values processing done in the `Dataset.process(*)` method.   

### Opening the historical values and the adjacency matrix
The goal here is to create T graphs (T being the number of timestamps), one for each timestep. We use the same adjacency matrix for all the graphs, as the structure is based on fundamentals and will not change.

In [2]:
# Read the historical values and index every row by its (Symbol, Date) pair.
raw_values = pd.read_csv('../data/SP100/raw/values.csv')
values = raw_values.set_index(['Symbol', 'Date'])
values.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,NormClose,DailyLogReturn,ALR1W,ALR2W,ALR1M,ALR2M,RSI,MACD
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
AAPL,2019-07-15 00:00:00-04:00,49.511898,-1.878514,0.467715,0.633083,0.634758,0.659704,0.708706,0.650352,0.846853
AAPL,2019-07-16 00:00:00-04:00,49.340595,-1.882459,-0.173349,0.397144,0.254995,0.705227,0.579131,0.63151,0.836485
AAPL,2019-07-17 00:00:00-04:00,49.063137,-1.88885,-0.282053,0.014589,0.053592,0.567244,0.568765,0.601131,0.796695
AAPL,2019-07-18 00:00:00-04:00,49.620468,-1.876013,0.564958,0.474378,0.106982,0.424941,0.711989,0.638726,0.800902
AAPL,2019-07-19 00:00:00-04:00,48.879761,-1.893074,-0.752247,-0.086458,-0.141486,0.280705,0.852988,0.562803,0.735982


In [3]:
# Load the adjacency matrix stored as a NumPy file, then preview its
# top-left 10x10 corner.
adj_path = '../data/SP100/raw/adj.npy'
adj = np.load(adj_path)
adj[:10, :10]

array([[0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.88339223, 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.72791519, 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        , 0.7385159 ],
       [0.        , 0.88339223, 0.       

### Creating the feature matrix and edge indices
The feature matrix is a 3D array of shape (stocks_nb, features_nb, timestamps_nb). The adjacency matrix is used to create the edge_index matrix in the PyTorch Geometric format.

In [4]:
nodes_nb = len(adj)

# Node features: every column except the raw "Close" price.
features = values.drop(columns=["Close"])
# The reshape relies on the rows being grouped by symbol with the same number
# of dates for each symbol -- NOTE(review): confirm values.csv is sorted that way.
x = np.array(
	features.to_numpy().reshape((nodes_nb, -1, features.shape[1]))
)  # shape (nodes_nb, timestamps_nb, features_nb)
x = np.swapaxes(x, 1, 2)  # shape (nodes_nb, features_nb, timestamps_nb)

# One edge per non-zero entry of the adjacency matrix. np.nonzero walks the
# matrix in row-major order (same order as a nested i/j loop) and returns
# integer indices, which is what an edge index must hold.
src, dst = np.nonzero(adj)
edge_index = np.stack((src, dst))  # shape (2, edge_nb), integer dtype
edge_weight = adj[src, dst].astype(np.float64)  # shape (edge_nb,)
edge_nb = edge_index.shape[1]
x.shape, edge_index.shape, edge_weight.shape

((100, 8, 1217), (2, 524), (524,))

### Building the graphs
Finally, the graphs can be built using the Data class for PyG.

In [5]:
past_window, future_window = 25, 1

# The window slides along the time axis of x, which is axis 2
# (x is laid out as (nodes_nb, features_nb, timestamps_nb)); axis 0 is the
# node axis and must not be used to count the samples.
timestamps_nb = x.shape[2]
timestamps = [
	Data(
		# Input: all features over the past_window steps starting at idx.
		x=x[:, :, idx:idx + past_window],
		edge_index=edge_index,
		edge_weight=edge_weight,
		# Target: feature 0 over the future_window steps right after the input.
		y=x[:, 0, idx + past_window:idx + past_window + future_window],
	)
	for idx in range(timestamps_nb - past_window - future_window)
]
timestamps[:5]

[Data(x=[100, 8, 25], edge_index=[2, 524], y=[100, 1], edge_weight=[524]),
 Data(x=[100, 8, 25], edge_index=[2, 524], y=[100, 1], edge_weight=[524]),
 Data(x=[100, 8, 25], edge_index=[2, 524], y=[100, 1], edge_weight=[524]),
 Data(x=[100, 8, 25], edge_index=[2, 524], y=[100, 1], edge_weight=[524]),
 Data(x=[100, 8, 25], edge_index=[2, 524], y=[100, 1], edge_weight=[524])]

## Defining the PyTorch Geometric datasets
The previously defined method is used to process the data and create a standard PyG Dataset class.
The dataset contains graphs with 3d node features -- data for the timestamps $[\![t, t+T[\![$ -- and a target variable -- the variation at time $t+T$ --.

This dataset is used in the next notebooks. The code is in the `datasets/SP100Stocks.py` file.

In [6]:
from datasets.SP100Stocks import SP100Stocks

In [7]:
# Instantiate the packaged dataset and display it next to its first graph.
dataset = SP100Stocks()
first_graph = dataset[0]
dataset, first_graph

Processing...
Done!


(SP100Stocks(1191),
 Data(x=[100, 8, 25], edge_index=[2, 524], y=[100, 1], edge_weight=[524], close_price=[100, 25], close_price_y=[100, 1]))