In [7]:
import os

import tqdm
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F


import torch_geometric.nn as pyg_nn
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import degree

In [2]:
device = torch.device("cpu")
# device = torch.device("cuda")

In [3]:
df_train = pd.read_csv(
    os.path.join("data", "train.csv")
)
df_attrs = pd.read_csv(
    os.path.join("data", "attr.csv")
)

df_attrs.loc[:, "node_idx_global"] = df_attrs.index
df_attrs.rename(
    columns={
        "u": "node_idx"
    },
    inplace=True,
)

df_train = df_train.merge(df_attrs, left_on=["ego_id", "u"], right_on=["ego_id", "node_idx"], how="inner")
df_train = df_train.merge(df_attrs, left_on=["ego_id", "v"], right_on=["ego_id", "node_idx"], how="inner", suffixes=("_u", "_v"))
df_train.drop(columns=["node_idx_u", "node_idx_v"], inplace=True)

In [21]:
df_train_subset = df_train.loc[df_train.loc[:, "ego_id"] == 0]
df_attrs_subset = df_attrs.loc[df_attrs.loc[:, "ego_id"] == 0]

node_features = df_attrs_subset.loc[:, [
        "age",
        "city_id",
        "sex",
        "school",
        "university",
    ]
]

adjacency_list = df_train_subset.loc[:, [
        "node_idx_global_u",
        "node_idx_global_v",
    ]
]

edge_features = df_train_subset.loc[:, [
        "t",
        "x2",
        "x3",
    ]
]

target = df_train_subset.loc[:, "x1"]

In [22]:
graph = Data(
    x=torch.tensor(node_features.to_numpy(), dtype=torch.float32),
    edge_index=torch.tensor(adjacency_list.to_numpy(), dtype=torch.int64).T,
    edge_attr=torch.tensor(edge_features.to_numpy(), dtype=torch.float32),
    y=torch.tensor(target.to_numpy(), dtype=torch.float32),
).to(device)

In [23]:
class Model(torch.nn.Module):
    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = pyg_nn.SAGEConv((-1, -1), hidden_channels)
        self.conv2 = pyg_nn.SAGEConv((-1, -1), hidden_channels)
        self.lin1 = torch.nn.Linear(2 * hidden_channels, hidden_channels)
        self.lin2 = torch.nn.Linear(hidden_channels, 1)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index).relu()
        x = self.conv2(x, edge_index).relu()

        idx_u, idx_v = edge_index
        x = torch.cat([x[idx_u], x[idx_v]], dim=-1)
        x = self.lin1(x).relu()
        x = self.lin2(x)
        return x.view(-1)

In [24]:
model = Model(hidden_channels=128).to(device)

In [27]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=300)

for epoch in range(1, 10000):
    model.train()
    optimizer.zero_grad()
    pred = model(graph.x, graph.edge_index)
    target = graph.y
    loss = F.mse_loss(pred, target)
    loss.backward()
    optimizer.step()
    scheduler.step()
    print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 2.0652
Epoch: 002, Loss: 2.0014
Epoch: 003, Loss: 1.9428
Epoch: 004, Loss: 1.8895
Epoch: 005, Loss: 1.8414
Epoch: 006, Loss: 1.7986
Epoch: 007, Loss: 1.7611
Epoch: 008, Loss: 1.7287
Epoch: 009, Loss: 1.7014
Epoch: 010, Loss: 1.6789
Epoch: 011, Loss: 1.6609
Epoch: 012, Loss: 1.6472
Epoch: 013, Loss: 1.6374
Epoch: 014, Loss: 1.6310
Epoch: 015, Loss: 1.6275
Epoch: 016, Loss: 1.6263
Epoch: 017, Loss: 1.6271
Epoch: 018, Loss: 1.6291
Epoch: 019, Loss: 1.6320
Epoch: 020, Loss: 1.6352
Epoch: 021, Loss: 1.6384
Epoch: 022, Loss: 1.6413
Epoch: 023, Loss: 1.6436
Epoch: 024, Loss: 1.6452
Epoch: 025, Loss: 1.6460
Epoch: 026, Loss: 1.6461
Epoch: 027, Loss: 1.6455
Epoch: 028, Loss: 1.6443
Epoch: 029, Loss: 1.6426
Epoch: 030, Loss: 1.6406
Epoch: 031, Loss: 1.6384
Epoch: 032, Loss: 1.6362
Epoch: 033, Loss: 1.6340
Epoch: 034, Loss: 1.6321
Epoch: 035, Loss: 1.6303
Epoch: 036, Loss: 1.6289
Epoch: 037, Loss: 1.6278
Epoch: 038, Loss: 1.6270
Epoch: 039, Loss: 1.6266
Epoch: 040, Loss: 1.6264


KeyboardInterrupt: 

In [8]:
graph_dataset = []
for ego_id in tqdm.tqdm(df_train.loc[:, "ego_id"].unique()[:100]):
    df_train_subset = df_train.loc[df_train.loc[:, "ego_id"] == ego_id]
    df_attrs_subset = df_attrs.loc[df_attrs.loc[:, "ego_id"] == ego_id]
    node_features = df_attrs_subset.loc[:, [
            "age",
            "city_id",
            "sex",
            "school",
            "university",
        ]
    ]
    adjacency_list = df_train_subset.loc[:, [
            "node_idx_global_u",
            "node_idx_global_v",
        ]
    ]
    edge_features = df_train_subset.loc[:, [
            "t",
            "x2",
            "x3",
        ]
    ]
    target = df_train_subset.loc[:, "x1"]
    graph_dataset.append(
            Data(
            x=torch.tensor(node_features.to_numpy(), dtype=torch.float32),
            edge_index=torch.tensor(adjacency_list.to_numpy(), dtype=torch.int64).T,
            edge_attr=torch.tensor(edge_features.to_numpy(), dtype=torch.float32),
            y=torch.tensor(target.to_numpy(), dtype=torch.float32),
        ).to(device)
    )

100%|██████████| 100/100 [00:06<00:00, 15.10it/s]


In [19]:
loader = DataLoader(graph_dataset, batch_size=1)

In [28]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=300)

for epoch in range(1, 10000):
    for batch in loader:
        model.train()
        optimizer.zero_grad()
        pred = model(batch.x, batch.edge_index)
        target = batch.y
        loss = F.mse_loss(pred, target)
        loss.backward()
        optimizer.step()
        scheduler.step()
        print(f'Epoch: {epoch:03d}, Loss: {loss:.4f}')

Epoch: 001, Loss: 1.6263


IndexError: Encountered an index error. Please ensure that all indices in 'edge_index' point to valid indices in the interval [0, 106] (got interval [445, 551])

In [35]:
graph_dataset[0]

Data(x=[286, 5], edge_index=[2, 1591], edge_attr=[1591, 3], y=[1591])