In [7]:
import pickle
from tqdm import tqdm

import torch
from torch_geometric.loader import LinkNeighborLoader

from torch.nn import functional as F

import torch.optim as optim
from torch.optim.lr_scheduler import StepLR

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

import torch_geometric.transforms as T

import csv

# Set the random seed for PyTorch, NumPy, and random
seed = 2328466898069313329
torch.manual_seed(seed)

# Print the random seed
print(f"Random seed: {torch.initial_seed()}")

with open('../data/hetero_graph_data.pkl', "rb") as f:
    loaded_data = pickle.load(f)

# Extract the data from the loaded dictionary
data = loaded_data["hetero_graph"]

Random seed: 2328466898069313329


In [8]:
from torch_geometric.utils.convert import to_networkx
networkx_data = to_networkx(data=data, to_undirected=True)

TypeError: 'HeteroData' object is not callable

In [27]:

unique_tickers = loaded_data["unique_tickers"]
unique_congresspeople = loaded_data["unique_congresspeople"]
unique_committees = loaded_data["unique_committees"]
unique_bills = loaded_data["unique_bills"]
unique_naics = loaded_data["unique_naics"]

import torch

# Check if a GPU is available and use it, otherwise use the CPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Using {} device".format(device))

# Assign consecutive indices to each node type
data['congressperson'].node_id = torch.arange(len(unique_congresspeople))
data['committee'].node_id = torch.arange(len(unique_committees))
data['ticker'].node_id = torch.arange(len(unique_tickers))
data['bill'].node_id = torch.arange(len(unique_bills))
data['naics'].node_id = torch.arange(len(unique_naics))

# Print the updated data
print("Node IDs have been assigned to each node type.")
print(data)
print(data.node_types)

# Collect edge_types 
edge_types = []
# Convert edge_index tensors to integer type (torch.long)
for edge_type, edge_index in data.edge_index_dict.items():
    data.edge_index_dict[edge_type] = edge_index.to(torch.long)
    edge_types.append(edge_type)

# in this way we can effectively remove the edges we don't want to use - like congressperson/buy-sell/ticker
model_edge_types = [edge_type for edge_type in edge_types if edge_type not in [("congressperson", "buy-sell", "ticker"), ("ticker", "rev_buy-sell", "congressperson")]]

print("Edge types:", edge_types)
print(len(edge_types))

import torch_geometric.transforms as T

# For this, we first split the set of edges into
# training (80%), validation (10%), and testing edges (10%).
# Across the training edges, we use 70% of edges for message passing,
# and 30% of edges for supervision.
# We further want to generate fixed negative edges for evaluation with a ratio of 2:1.
# Negative edges during training will be generated on-the-fly.
# We can leverage the `RandomLinkSplit()` transform for this from PyG:

transform = T.RandomLinkSplit(
    num_val=0,
    num_test=0.1,
    disjoint_train_ratio=0.3, # Across the training edges, we use 70% of edges for message passing, and 30% of edges for supervision.
    neg_sampling_ratio=1.0,
    add_negative_train_samples=True,
    edge_types=("congressperson", "buy-sell", "ticker"),
    rev_edge_types=("ticker", "rev_buy-sell", "congressperson"), 
)


train_data, val_data, test_data = transform(data)

Using cpu device
Node IDs have been assigned to each node type.
HeteroData(
  [1mcongressperson[0m={
    num_nodes=2431,
    node_id=[2431]
  },
  [1mcommittee[0m={
    num_nodes=556,
    node_id=[556]
  },
  [1mticker[0m={
    num_nodes=4202,
    node_id=[4202]
  },
  [1mbill[0m={
    num_nodes=47767,
    node_id=[47767]
  },
  [1mnaics[0m={
    num_nodes=744,
    node_id=[744]
  },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 24675],
    edge_attr=[24675, 2]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 2]
  },
  [1m(bill, assigned_to, committee)[0m={
    edge_index=[2, 75626],
    edge_attr=[75626, 2]
  },
  [1m(ticker, classified, naics)[0m={
    edge_index=[2, 4147],
    edge_attr=[4147, 2]
  },
  [1m(ticker, rev_buy-sell, congressperson)[0m={
    edge_index=[2, 24675],
    edge_attr=[2467

In [28]:
train_data

HeteroData(
  [1mcongressperson[0m={
    num_nodes=2431,
    node_id=[2431]
  },
  [1mcommittee[0m={
    num_nodes=556,
    node_id=[556]
  },
  [1mticker[0m={
    num_nodes=4202,
    node_id=[4202]
  },
  [1mbill[0m={
    num_nodes=47767,
    node_id=[47767]
  },
  [1mnaics[0m={
    num_nodes=744,
    node_id=[744]
  },
  [1m(congressperson, buy-sell, ticker)[0m={
    edge_index=[2, 15546],
    edge_attr=[15546, 2],
    edge_label=[13324],
    edge_label_index=[2, 13324]
  },
  [1m(congressperson, assignment, committee)[0m={
    edge_index=[2, 11698],
    edge_attr=[11698, 2]
  },
  [1m(ticker, lobbies_on, bill)[0m={
    edge_index=[2, 148487],
    edge_attr=[148487, 2]
  },
  [1m(bill, assigned_to, committee)[0m={
    edge_index=[2, 75626],
    edge_attr=[75626, 2]
  },
  [1m(ticker, classified, naics)[0m={
    edge_index=[2, 4147],
    edge_attr=[4147, 2]
  },
  [1m(ticker, rev_buy-sell, congressperson)[0m={
    edge_index=[2, 15546],
    edge_attr=[15546, 2]
 

In [29]:
# Check unique values after applying the transform
transformed_edge_label = train_data["congressperson", "buy-sell", "ticker"].edge_label

In [30]:
transformed_edge_label
len(transformed_edge_label)

13324

In [31]:
#   Define seed edges:
edge_label_index = train_data["congressperson", "buy-sell", "ticker"].edge_label_index
edge_label = train_data["congressperson", "buy-sell", "ticker"].edge_label
edge_attr = train_data["congressperson", "buy-sell", "ticker"].edge_attr

In [32]:
from torch_geometric.loader import LinkNeighborLoader


In [33]:
# In the first hop, we sample at most 20 neighbors.
# In the second hop, we sample at most 10 neighbors.
# In addition, during training, we want to sample negative edges on-the-fly with
# a ratio of 2:1.
# We can make use of the `loader.LinkNeighborLoader` from PyG:

num_neigbors = [20, 10, 5]
batch_size = 1
print("batch_size", batch_size)

train_loader = LinkNeighborLoader(
    data=train_data,
    num_neighbors=num_neigbors,
    edge_label_index=(("congressperson", "buy-sell", "ticker"), edge_label_index),
    edge_label=edge_label,
    batch_size=batch_size,
    shuffle=True,
)

batch_size 1


In [34]:
# Define the model
from model import BuySellLinkPrediction

In [35]:
# Given the HeteroData object named 'data'
num_nodes_dict = {node_type: data[node_type].num_nodes for node_type in data.node_types}


In [36]:
num_nodes_dict

{'congressperson': 2431,
 'committee': 556,
 'ticker': 4202,
 'bill': 47767,
 'naics': 744}

In [37]:
# Print the num_nodes_dict
print(num_nodes_dict)

# Instantiate the model
num_layers = 2
print("num_layers", num_layers)
# model = BuySellLinkPrediction(num_nodes_dict, embedding_dim=64, num_edge_features=2, out_channels=64, edge_types=edge_types, num_layers=num_layers).to(device)
model = BuySellLinkPrediction(
    num_nodes_dict,
    embedding_dim=64,
    num_edge_features=2,
    out_channels=64,
    edge_types=model_edge_types,
    num_layers=num_layers,
).to(device)

# Training loop
import torch.optim as optim
from torch.nn import functional as F

from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score

from torch.optim.lr_scheduler import StepLR

epochs = 100
optimizer = optim.Adam(
    model.parameters(), lr=0.005
)  # You can set the learning rate (lr) as needed

# Define the learning rate scheduler
scheduler = StepLR(
    optimizer, step_size=10, gamma=0.1
)  # Decay the learning rate by a factor of 0.1 every 10 epochs

# Initialize a variable to keep track of the best test AUC-ROC score
best_test_auc_roc = 0.0

for epoch in range(epochs):
    model.train()
    total_loss = 0
    total_accuracy = 0
    total_auc_roc = 0
    for batch in tqdm(train_loader):
        pass

{'congressperson': 2431, 'committee': 556, 'ticker': 4202, 'bill': 47767, 'naics': 744}
num_layers 2


  0%|          | 0/13324 [00:00<?, ?it/s]


ImportError: 'NeighborSampler' requires either 'pyg-lib' or 'torch-sparse'