In [1]:
import sys
sys.path.append("../../src/models")

In [2]:
import pandas as pd
import networkx as nx
import numpy as np

In [3]:
from joblib import Parallel, delayed
#from tqdm import tqdm
from tqdm.auto import tqdm
from sklearn.metrics import roc_auc_score, accuracy_score

In [4]:
from actions import city_restrictions

prhome = 0.06

p_r = {
    'home'    :  prhome,
    'neighbor':  .1*prhome,
    'work'    :  .1*prhome,
    'school'  :  .15*prhome,
}

In [89]:
model_dataset = pd.read_parquet("model_dataset.parquet")

In [6]:
res_df = pd.read_parquet("simulation_results_dataset.parquet")

In [7]:
gpickle_path = "../../data/processed/SP_multiGraph_Job_Edu_Level.gpickle"
G = nx.read_gpickle(gpickle_path)

In [8]:
edges = (
    [(source, target, data['edge_type'])
        for source, target, data in G.edges(data=True)] +
    [(target, source, data['edge_type'])
        for source, target, data in G.edges(data=True)]
)

edgelist_df = pd.DataFrame(edges, columns=['source', 'target', 'edge_type'])

# Data Handling - Pytorch Geometric

In [9]:
import torch
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

In [10]:
WEEK, SIMULATION = 12, '0_1'

In [11]:
res_sample_week = res_df[(res_df.week == WEEK) &
                         (res_df.simulation == SIMULATION)].sort_values('id')
action = res_sample_week.action.iloc[0]

### 1. data.x
data.x: Node feature matrix with shape [num_nodes, num_node_features]

In [12]:
x = torch.tensor([[state] for state in [0] + res_sample_week['state'].tolist()],
                 dtype=torch.float)
x.shape

torch.Size([55493, 1])

### 2. data.edge_index
data.edge_index: Graph connectivity in COO format with shape [2, num_edges] and type torch.long

In [13]:
edge_index = torch.tensor([[int(e[0]) for e in edges], [int(e[1]) for e in edges]],
                          dtype=torch.long)
edge_index.shape

torch.Size([2, 1924076])

### 3. data.edge_attr

data.edge_attr: Edge feature matrix with shape [num_edges, num_edge_features]

In [14]:
edge_attr = torch.tensor([[city_restrictions[action][e[2]] * p_r[e[2]]] for e in edges],
                         dtype=torch.float)
edge_attr.shape

torch.Size([1924076, 1])

### 4. data.y
data.y: Target to train against (may have arbitrary shape), e.g., node-level targets of shape [num_nodes, *] or graph-level targets of shape [1, *]

In [15]:
target_dict = (
    model_dataset[(model_dataset.week == WEEK) & (model_dataset.simulation == SIMULATION)]
    [['id', 'binary_target']]
    .set_index('id')
    .to_dict()
    ['binary_target']
)

graph_targets = [-1]+ [
    -1 if state != 0
    else target_dict[node_id]
    for node_id, state in res_sample_week[['id', 'state']].values
]

y = torch.tensor(graph_targets, dtype=torch.long)
y.shape

torch.Size([55493])

In [16]:
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

In [17]:
data

Data(x=[55493, 1], edge_index=[2, 1924076], edge_attr=[1924076, 1], y=[55493])

In [18]:
has_target = data.y != -1
display(pd.Series(y.numpy()).value_counts())

 0    44934
-1     9574
 1      985
dtype: int64

## First GNN

In [19]:
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

num_node_features = 1
num_node_classes = 2

class GCN(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.conv1 = GCNConv(num_node_features, 16)
        self.conv2 = GCNConv(16, 4)
        self.conv3 = GCNConv(4, num_node_classes)

        
    def forward(self, data):
        x, edge_index, edge_weight = data.x, data.edge_index, data.edge_attr

        x = self.conv1(x, edge_index, edge_weight)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index, edge_weight)
        x = F.relu(x)
        x = F.dropout(x, training=self.training)
        x = self.conv3(x, edge_index, edge_weight)

        return F.log_softmax(x, dim=1)

In [20]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [21]:
model = GCN().to(device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [22]:
weight = torch.tensor([1, int((y.numpy() == 0).sum() /  (y.numpy() == 1).sum())], dtype=torch.float)

In [23]:
model.train()
for epoch in tqdm(range(200)):
    if epoch % 100 == 0:
        print(epoch)
        print(roc_auc_score(data.y[has_target].numpy(), model(data)[has_target, 1].detach().numpy()))
    optimizer.zero_grad()
    out = model(data)
    #loss = F.nll_loss(out[has_target], data.y[has_target])
    loss = torch.nn.CrossEntropyLoss(weight=weight)(out[has_target], data.y[has_target])
    loss.backward()
    optimizer.step()
print(epoch)
print(roc_auc_score(data.y[has_target].numpy(), model(data)[has_target, 1].detach().numpy()))

  0%|          | 0/200 [00:00<?, ?it/s]

0
0.5785899748282817
100
0.7810011366925297
199
0.7859268495090034


In [24]:
model.eval()

GCN(
  (conv1): GCNConv(1, 16)
  (conv2): GCNConv(16, 4)
  (conv3): GCNConv(4, 2)
)

In [25]:
accuracy_score(data.y[has_target].numpy(), model(data)[has_target].argmax(dim=1).detach().numpy())

0.7616237287397374

In [26]:
roc_auc_score(data.y[has_target].numpy(), model(data)[has_target, 1].detach().numpy())

0.7926465188988971

In [27]:
roc_auc_score(data.y.numpy()[has_target], model(data)[:, 1].detach().numpy()[has_target])

0.7926465188988971

## Temporal data

In [73]:
all_one_sim = res_df[(res_df['simulation'] == '0_1') & (res_df['week'] > 0) & (res_df['week'] < 12)]

In [75]:
x = torch.tensor([[state] for state in [0] + all_one_sim['state'].tolist()],
                 dtype=torch.float)
x.shape

torch.Size([610413, 1])

In [76]:
edge_index = torch.tensor([[int(e[0]) for e in (all_one_sim.week.nunique() * edges)],
                           [int(e[1]) for e in (all_one_sim.week.nunique() * edges)]],
                          dtype=torch.long)
edge_index.shape

torch.Size([2, 21164836])

In [77]:
edge_attr = torch.tensor([[city_restrictions[action][e[2]] * p_r[e[2]]] for e in (all_one_sim.week.nunique() * edges)],
                         dtype=torch.float)
edge_attr.shape

torch.Size([21164836, 1])

In [91]:
target_dict = (
    model_dataset[(model_dataset['simulation'] == '0_1') & (model_dataset['week'] > 0) & (model_dataset['week'] < 12)]
    [['id', 'week', 'binary_target']]
    .set_index(['week', 'id'])
    .to_dict()
    ['binary_target']
)

In [92]:
graph_targets = [-1]+ [
    -1 if state != 0
    else target_dict[(week, node_id)]
    for node_id, week, state in all_one_sim[['id', 'week', 'state']].values
]

y = torch.tensor(graph_targets, dtype=torch.long)
y.shape

torch.Size([610413])

In [97]:
data = Data(x=x, edge_index=edge_index, edge_attr=edge_attr, y=y)

In [98]:
data

Data(x=[610413, 1], edge_index=[2, 21164836], edge_attr=[21164836, 1], y=[610413])

In [99]:
has_target = data.y != -1
display(pd.Series(y.numpy()).value_counts())

 0    573928
-1     26981
 1      9504
dtype: int64

In [100]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [101]:
model = GCN().to(device)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = GCN().to(device)
data = data.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

In [102]:
weight = torch.tensor([1, int((y.numpy() == 0).sum() /  (y.numpy() == 1).sum())], dtype=torch.float)

In [None]:
model.train()
for epoch in tqdm(range(200)):
    if epoch % 100 == 0:
        print(epoch)
        print(roc_auc_score(data.y[has_target].numpy(), model(data)[has_target, 1].detach().numpy()))
    optimizer.zero_grad()
    out = model(data)
    #loss = F.nll_loss(out[has_target], data.y[has_target])
    loss = torch.nn.CrossEntropyLoss(weight=weight)(out[has_target], data.y[has_target])
    loss.backward()
    optimizer.step()
print(epoch)
print(roc_auc_score(data.y[has_target].numpy(), model(data)[has_target, 1].detach().numpy()))

  0%|          | 0/200 [00:00<?, ?it/s]

0
0.4691152017054885
