In [153]:
import numpy as np
import os.path as osp
import pandas as pd
import torch
from torch_geometric.data import Dataset, Data

# PyTorch Geometric custom Dataset
The Dataset class is used to create a custom dataset and use it with PyG modules.
**Note: To use this notebook, first execute notebooks 1 and 2.**

## Processing the graph data
The next cells demonstrate the adjacency matrix and historical values processing done in the `Dataset.process(*)` method.   

### Opening the historical values and the adjacency matrix
The goal here is to create T graphs (T being the number of timestamps), one for each timestep. We use the same adjacency matrix for all the graphs, as the structure is based on fundamentals and will not change.

In [154]:
# Read the raw historical values, then index every row by its (Symbol, Date) pair.
values = pd.read_csv('../data/SP100/raw/values.csv')
values = values.set_index(['Symbol', 'Date'])
values.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Close,Volume,Variation,RSI,MACD,BhB,BlB
Symbol,Date,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
AAPL,2019-06-11 00:00:00-04:00,47.066399,0.195742,-0.000257,0.594733,-0.578354,0.0,0.0
AAPL,2019-06-12 00:00:00-04:00,46.916603,-0.472418,0.001237,0.583426,-0.379584,0.0,0.0
AAPL,2019-06-13 00:00:00-04:00,46.906933,-0.20904,-0.002825,0.582656,-0.220298,0.0,0.0
AAPL,2019-06-14 00:00:00-04:00,46.566288,-0.433289,0.006212,0.554871,-0.120164,0.0,0.0
AAPL,2019-06-17 00:00:00-04:00,46.844131,-0.748321,0.005132,0.572767,-0.018179,0.0,0.0


In [155]:
# Load the weighted adjacency matrix shared by every graph.
adj = np.load('../data/SP100/raw/adj.npy')
adj[0:5, 0:5]  # preview of the top-left 5x5 corner

array([[0.        , 0.        , 0.        , 0.25805462, 0.25805462],
       [0.        , 0.        , 0.51610923, 0.        , 0.        ],
       [0.        , 0.51610923, 0.        , 0.        , 0.        ],
       [0.25805462, 0.        , 0.        , 0.        , 0.51610923],
       [0.25805462, 0.        , 0.        , 0.51610923, 0.        ]])

### Creating the feature matrix and edge indices
The feature matrix is a 3d matrix of shape (stocks_nb, features_nb, timestamps_nb). The adjacency matrix is used to create the edge_index matrix in the PyTorch Geometric format.

In [156]:
nodes_nb = len(adj)
features = values.drop(columns=["Close"])  # "Close" is not used as a node feature
# Fold the flat table into (nodes, timestamps, features).
# NOTE(review): assumes rows are grouped by symbol, in the same order as the rows
# of adj, with the same number of dates for every symbol -- confirm against notebooks 1 and 2.
x = features.to_numpy().reshape((nodes_nb, -1, features.shape[1]))
x = np.swapaxes(x, 1, 2)  # to obtain (nodes, features, timestamps)

# The graph is undirected: each edge is stored once, read from the strict upper
# triangle of adj. np.nonzero walks the matrix in row-major order, i.e. the same
# (i, j) order as a nested `for i ... for j > i` loop.
sources, targets = np.nonzero(np.triu(adj, k=1))
edge_index = np.stack([sources, targets])  # integer node indices, shape (2, edge_nb)
edge_weight = adj[sources, targets]
edge_nb = edge_index.shape[1]
x.shape, edge_index.shape, edge_weight.shape

((100, 6, 1234), (2, 1230), (1230,))

### Building the graphs
Finally, the graphs can be built using the Data class for PyG.

In [157]:
# x is (nodes, features, timestamps): build one graph per index of the LAST axis.
# (Iterating over x.shape[1] would only cover the number of features.)
timestamps = [
	Data(x=x[:, :, idx], edge_index=edge_index, edge_weight=edge_weight) for idx in range(x.shape[2])
]
timestamps[:5]

[Data(x=[100, 6], edge_index=[2, 1230], edge_weight=[1230]),
 Data(x=[100, 6], edge_index=[2, 1230], edge_weight=[1230]),
 Data(x=[100, 6], edge_index=[2, 1230], edge_weight=[1230]),
 Data(x=[100, 6], edge_index=[2, 1230], edge_weight=[1230]),
 Data(x=[100, 6], edge_index=[2, 1230], edge_weight=[1230])]

## Defining the PyTorch Geometric datasets
The previously defined method is used to process the data and create a standard PyG Dataset class.
Two datasets are defined:
- The first dataset contains graphs with data for a single timestamp
- The second dataset contains graphs with 3d node features -- data for the timestamps $[\![t, t+T[\![$ -- and a target variable -- the variation at time $t+T$ -- used for forecasting.

In [158]:
class SP100Stocks(Dataset):
	"""
	Stock price data for the S&P 100 companies, stored as one graph per timestamp.
	The graph data built from the notebook is used.

	Every processed graph holds:
	- x: the node features at that timestamp (all the columns of the values file except "Close"),
	- edge_index / edge_weight: the edges read from the adjacency matrix (identical for every graph),
	- close_price: the "Close" value of every node at that timestamp.

	:param root: Dataset root directory; raw files are read from its raw directory
	:param values_file_name: Name of the CSV file holding the historical values
	:param adj_file_name: Name of the .npy file holding the adjacency matrix
	"""
	def __init__(self, root: str = "../data/SP100/", values_file_name: str = "values.csv", adj_file_name: str = "adj.npy"):
		self.values_file_name = values_file_name
		self.adj_file_name = adj_file_name
		self._timestamps_nb = None  # number of timestamps, computed on first use by len()
		super().__init__(root)

	@property
	def raw_file_names(self) -> list[str]:
		"""Raw files, in the order expected by process(): values first, adjacency second."""
		return [
			self.values_file_name, self.adj_file_name
		]

	@property
	def processed_file_names(self) -> list[str]:
		"""One processed file per timestamp."""
		return [
			f'timestep_{idx}.pt' for idx in range(len(self))
		]

	def download(self) -> None:
		"""Nothing to download: the raw files are produced by the previous notebooks."""
		pass

	def process(self) -> None:
		"""
		Builds one graph per timestamp from the raw files and saves each of them
		as `timestep_{t}.pt` in the processed directory.
		"""
		# Read through raw_paths so that `root` and the file name arguments are honoured.
		values = pd.read_csv(self.raw_paths[0]).set_index(['Symbol', 'Date'])
		adj = np.load(self.raw_paths[1])
		nodes_nb = len(adj)

		# NOTE(review): the reshape assumes rows grouped by symbol, in the order of the
		# rows of adj, with the same number of dates per symbol -- confirm upstream.
		features = values.drop(columns=["Close"])
		x = torch.tensor(
			features.to_numpy().reshape((nodes_nb, -1, features.shape[1]))
		).transpose(1, 2)  # (nodes, features, timestamps)
		close_prices = torch.tensor(
			values[["Close"]].to_numpy().reshape((nodes_nb, -1))
		)

		# Each undirected edge is stored once (i < j), in row-major order.
		# NOTE(review): PyG treats edge_index as directed; confirm that consumers
		# symmetrise it if messages must flow both ways.
		sources, targets = np.nonzero(np.triu(adj, k=1))
		edge_index = torch.from_numpy(np.stack([sources, targets])).long()  # PyG expects integer indices
		edge_weight = torch.tensor(adj[sources, targets], dtype=torch.float32)

		for t in range(x.shape[2]):
			# clone(): a slice is a view, and saving a view serialises the whole
			# underlying storage into every file.
			timestep = Data(
				x=x[:, :, t].clone(),
				edge_index=edge_index,
				edge_weight=edge_weight,
				close_price=close_prices[:, t].clone(),
			)
			torch.save(
				timestep, osp.join(self.processed_dir, f"timestep_{t}.pt")
			)

	def len(self) -> int:
		"""
		Number of graphs, i.e. the number of rows of the first symbol in the values file.
		The count is cached so the CSV is parsed once instead of on every call.
		"""
		if getattr(self, "_timestamps_nb", None) is None:
			symbols = pd.read_csv(self.raw_paths[0], usecols=['Symbol'])['Symbol']
			self._timestamps_nb = int((symbols == symbols.iloc[0]).sum())
		return self._timestamps_nb

	def get(self, idx: int) -> Data:
		"""Loads the processed graph of timestamp `idx`."""
		data = torch.load(osp.join(self.processed_dir, f'timestep_{idx}.pt'))
		return data

In [159]:
# Build the single-timestamp dataset and preview its first graph.
dataset = SP100Stocks()
(dataset, dataset[0])

Processing...
Done!


(SP100Stocks(1234),
 Data(x=[100, 6], edge_index=[2, 1230], edge_weight=[1230], close_price=[100]))

In [160]:
class SP100StocksForecasting(Dataset):
	"""
	Forecasting dataset for the S&P 100 companies.
	The graph data built from the notebook is used.

	Graph number t holds:
	- x: the node features over the window [t, t + time_window) (all the columns of
	  the values file except "Close"),
	- edge_index / edge_weight: the edges read from the adjacency matrix (identical for every graph),
	- close_price: the "Close" values over the same window,
	- y: the value of the target column at time t + time_window,
	- close_price_y: the "Close" value at time t + time_window.

	:param root: Dataset root directory; raw files are read from its raw directory
	:param values_file_name: Name of the CSV file holding the historical values
	:param adj_file_name: Name of the .npy file holding the adjacency matrix
	:param time_window: Number of past timestamps given as input features
	"""
	# Column forecast as `y`. It is looked up by name: after "Close" is dropped,
	# feature index 0 is the first remaining column, not the variation.
	TARGET_COLUMN = "Variation"

	def __init__(self, root: str = "../data/SP100/", values_file_name: str = "values.csv", adj_file_name: str = "adj.npy", time_window: int = 20):
		self.values_file_name = values_file_name
		self.adj_file_name = adj_file_name
		self.time_window = time_window
		self._timestamps_nb = None  # number of raw timestamps, computed on first use by len()
		super().__init__(root)

	@property
	def processed_dir(self) -> str:
		"""Dedicated processed directory under the dataset root."""
		return osp.join(self.root, 'forecasting_processed')
	
	@property
	def raw_file_names(self) -> list[str]:
		"""Raw files, in the order expected by process(): values first, adjacency second."""
		return [
			self.values_file_name, self.adj_file_name
		]

	@property
	def processed_file_names(self) -> list[str]:
		"""One processed file per forecasting sample."""
		return [
			f'forecasting_timestep_{idx}.pt' for idx in range(len(self))
		]

	def download(self) -> None:
		"""Nothing to download: the raw files are produced by the previous notebooks."""
		pass

	def process(self) -> None:
		"""
		Builds one graph per sliding window from the raw files and saves each of
		them as `forecasting_timestep_{t}.pt` in the processed directory.
		"""
		# Read through raw_paths so that `root` and the file name arguments are honoured.
		values = pd.read_csv(self.raw_paths[0]).set_index(['Symbol', 'Date'])
		adj = np.load(self.raw_paths[1])
		nodes_nb = len(adj)

		# NOTE(review): the reshape assumes rows grouped by symbol, in the order of the
		# rows of adj, with the same number of dates per symbol -- confirm upstream.
		features = values.drop(columns=["Close"])
		target_idx = features.columns.get_loc(self.TARGET_COLUMN)
		x = torch.tensor(
			features.to_numpy().reshape((nodes_nb, -1, features.shape[1]))
		).transpose(1, 2)  # (nodes, features, timestamps)
		close_prices = torch.tensor(
			values[["Close"]].to_numpy().reshape((nodes_nb, -1))
		)

		# Each undirected edge is stored once (i < j), in row-major order.
		# NOTE(review): PyG treats edge_index as directed; confirm that consumers
		# symmetrise it if messages must flow both ways.
		sources, targets = np.nonzero(np.triu(adj, k=1))
		edge_index = torch.from_numpy(np.stack([sources, targets])).long()  # PyG expects integer indices
		edge_weight = torch.tensor(adj[sources, targets], dtype=torch.float32)

		for t in range(x.shape[2] - self.time_window):
			window_end = t + self.time_window
			# clone(): a slice is a view, and saving a view serialises the whole
			# underlying storage into every file.
			timestep = Data(
				x=x[:, :, t:window_end].clone(),
				edge_index=edge_index,
				edge_weight=edge_weight,
				close_price=close_prices[:, t:window_end].clone(),
				y=x[:, target_idx, window_end].clone(),
				close_price_y=close_prices[:, window_end].clone(),
			)
			torch.save(
				timestep, osp.join(self.processed_dir, f"forecasting_timestep_{t}.pt")
			)

	def len(self) -> int:
		"""
		Number of samples: the number of rows of the first symbol in the values
		file, minus the time window. The raw count is cached so the CSV is parsed
		once instead of on every call.
		"""
		if getattr(self, "_timestamps_nb", None) is None:
			symbols = pd.read_csv(self.raw_paths[0], usecols=['Symbol'])['Symbol']
			self._timestamps_nb = int((symbols == symbols.iloc[0]).sum())
		return self._timestamps_nb - self.time_window

	def get(self, idx: int) -> Data:
		"""Loads the processed graph of sample `idx`."""
		data = torch.load(osp.join(self.processed_dir, f'forecasting_timestep_{idx}.pt'))
		return data

In [161]:
# Build the forecasting dataset (default time window) and preview its first sample.
forecasting_dataset = SP100StocksForecasting()
(forecasting_dataset, forecasting_dataset[0])

Processing...
Done!


(SP100StocksForecasting(1214),
 Data(x=[100, 6, 20], edge_index=[2, 1230], y=[100], edge_weight=[1230], close_price=[100, 20], close_price_y=[100]))