In [1]:
import os
import networkx as nx
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm
from sklearn.preprocessing import StandardScaler

data_path = "/home/tpinho/IJGIS/Datasets/Brazil_Election_2018/Original"
adj_matrix = pd.read_csv(os.path.join(data_path, "queen_matrix.csv"))
adj_matrix.set_index(adj_matrix.columns[0], inplace=True)
original_data = pd.read_csv(os.path.join(data_path, "data.csv"), index_col="INDEX")
original_data.drop(columns=["[GEO]_LATITUDE", "[GEO]_LONGITUDE", "INDEX_FOLDS"], inplace=True)

cols_in_data = [c for c in adj_matrix.index if c in original_data.index]
adj_matrix = adj_matrix.loc[cols_in_data]
adj_matrix = adj_matrix[adj_matrix.index.astype("str")]

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# load graph from networkx library
G = nx.from_numpy_matrix(adj_matrix.to_numpy())

# retrieve the labels for each node
labels = original_data[["TARGET"]].to_numpy()

# create edge index from 
adj = nx.to_scipy_sparse_matrix(G).tocoo()
row = torch.from_numpy(adj.row.astype(np.int64)).to(torch.long)
col = torch.from_numpy(adj.col.astype(np.int64)).to(torch.long)
edge_index = torch.stack([row, col], dim=0)

# using degree as embedding
embeddings = original_data.drop(columns=["TARGET"]).to_numpy()
# normalizing degree values
#scale = StandardScaler()
#embeddings = scale.fit_transform(embeddings.reshape(-1,1))
print(embeddings)

[[0.9959689  0.98877052 0.00143968 ... 0.06907316 0.05178486 0.14955266]
 [0.98338658 0.95782748 0.         ... 0.0898704  0.07435198 0.19379971]
 [0.99280094 0.95613595 0.00100452 ... 0.09837843 0.07909871 0.17258823]
 ...
 [1.         1.         0.         ... 0.11731044 0.09537434 0.22208083]
 [0.99385875 0.99215285 0.         ... 0.13761635 0.10750137 0.19025179]
 [0.995625   0.99       0.001875   ... 0.10762254 0.08625501 0.14887995]]


In [3]:
import torch
import pandas as pd
from torch_geometric.data import InMemoryDataset, Data
from sklearn.model_selection import train_test_split
import torch_geometric.transforms as T

# custom dataset
class KarateDataset(InMemoryDataset):
    def __init__(self, transform=None):
        super(KarateDataset, self).__init__('.', transform, None, None)

        data = Data(edge_index=edge_index)
        
        data.num_nodes = G.number_of_nodes()
        
        # embedding 
        data.x = torch.from_numpy(embeddings).type(torch.float32)
        
        # labels
        y = torch.from_numpy(labels).type(torch.float32)
        data.y = y.clone().detach()
        print(data.y.shape)
        
        data.num_classes = 1
        # splitting the data into train, validation and test
        X_train, X_test, y_train, y_test = train_test_split(pd.Series(list(G.nodes())), 
                                                            original_data["TARGET"],
                                                            test_size=0.30, 
                                                            random_state=42)
        print(y_train.shape)
        n_nodes = G.number_of_nodes()
        
        # create train and test masks for data
        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[X_train.index] = True
        test_mask[X_test.index] = True
        data['train_mask'] = train_mask
        data['test_mask'] = test_mask

        self.data, self.slices = self.collate([data])

    def _download(self):
        return

    def _process(self):
        return

    def __repr__(self):
        return '{}()'.format(self.__class__.__name__)
    
dataset = KarateDataset()
data = dataset[0]
data

torch.Size([5565, 1])
(3895,)


Data(edge_index=[2, 31724], num_nodes=5565, x=[5565, 3998], y=[5565, 1], num_classes=1, train_mask=[5565], test_mask=[5565])

In [4]:
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# GCN model with 2 layers 
class Net(torch.nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.conv1 = GCNConv(data.num_features, 1000)
        self.conv2 = GCNConv(1000, 500)
        self.linear1 = torch.nn.Linear(500,1)

    def forward(self):
        x, edge_index = data.x, data.edge_index
        x = F.relu(self.conv1(x, edge_index))
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)
        x = self.linear1(x)
        return x

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

data =  data.to(device)

model = Net().to(device) 

In [5]:
torch.manual_seed(42)

optimizer_name = "Adam"
lr = 1e-3
optimizer = getattr(torch.optim, optimizer_name)(model.parameters(), lr=lr)
epochs = 400

def train():
  model.train()
  optimizer.zero_grad()
  F.mse_loss(model()[data.train_mask], data.y[data.train_mask]).backward()
  optimizer.step()

@torch.no_grad()
def test():
  model.eval()
  logits = model()
  mask1 = data['test_mask']
  test_pred = logits[mask1]
  test_true = data.y[mask1]
  print(test_pred)
  print(test_true)
  mse = test_pred.sub(test_true)**2
  return mse.sum()

for epoch in tqdm(range(1, epochs)):
  train()

mse = test()
print(mse)


100%|██████████| 399/399 [05:37<00:00,  1.18it/s]


tensor([[43.6241],
        [40.6020],
        [42.7261],
        ...,
        [40.0590],
        [49.1139],
        [48.2970]])
tensor([[47.9763],
        [74.7425],
        [71.1065],
        ...,
        [60.0694],
        [23.3065],
        [39.9842]])
tensor(360016.9062)
