In [1]:
import os
import multiprocessing

import tqdm
import pandas as pd
from sklearn.metrics import mean_squared_error

import torch
import torch.nn as nn
import torch.nn.functional as F


import torch_geometric.nn as pyg_nn
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader
from torch_geometric.utils import degree

In [2]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU
# so the notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [3]:
# Edge table (one row per (ego_id, u, v) edge) and node attribute table.
df_test = pd.read_csv(os.path.join("data", "test.csv"))
df_attrs = pd.read_csv(os.path.join("data", "attr.csv"))

In [4]:
# Submission table: one row per (ego_id, u, v) edge with an x1 value.
df_submission = pd.read_csv(os.path.join("data", "submission.csv"))

In [34]:
# Show the loaded submission frame (pandas truncates the display of long frames).
df_submission

Unnamed: 0,ego_id,u,v,x1
0,8,0,93,0.000000
1,8,0,143,0.000000
2,8,0,151,1.606742
3,8,1,24,0.026496
4,8,5,4,0.159857
...,...,...,...,...
810971,1709396984676,73,23,1.562170
810972,1709396984676,74,68,0.454210
810973,1709396984676,77,28,0.078504
810974,1709396984676,79,38,0.981812


In [27]:
# All edges that belong to ego network 8.
df_test[df_test["ego_id"].eq(8)]

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
0,8,20,19,185.7,3.839089e-04,0.000000,0.0
1,8,131,125,161.4,4.034464e-01,0.000000,0.0
2,8,73,56,127.0,8.554643e-05,0.000000,0.0
3,8,0,4,594.5,2.886418e-01,0.000000,0.0
4,8,63,73,127.0,4.281692e-07,0.000000,0.0
...,...,...,...,...,...,...,...
1013,8,132,17,24.1,1.826740e+00,1.791759,0.0
1014,8,29,14,346.9,,0.000000,0.0
1015,8,56,59,80.0,,0.000000,0.0
1016,8,14,11,300.1,,0.000000,0.0


In [28]:
# The single edge 0 -> 93 inside ego network 8.
df_test[
    df_test["ego_id"].eq(8)
    & df_test["u"].eq(0)
    & df_test["v"].eq(93)
]

Unnamed: 0,ego_id,u,v,t,x1,x2,x3
938,8,0,93,359.6,,0.0,0.0


In [31]:
# Attribute row of node 0 in ego network 8.
df_attrs[df_attrs["ego_id"].eq(8) & df_attrs["u"].eq(0)]

Unnamed: 0,ego_id,u,age,city_id,sex,school,university
321,8,0,36,979281502,2,213987831,845825535


In [32]:
# Attribute row of node 93 in ego network 8.
df_attrs[df_attrs["ego_id"].eq(8) & df_attrs["u"].eq(93)]

Unnamed: 0,ego_id,u,age,city_id,sex,school,university
348,8,93,36,979281502,2,734952557,566091832


In [14]:
# NOTE(review): `graph_dataset` is only built in a later cell, so this cell
# fails on a fresh top-to-bottom run; it shows the (2, num_edges) edge index
# of the first graph.
graph_dataset[0].edge_index

tensor([[ 20, 131,  73,  ...,  56,  14,   0],
        [ 19, 125,  56,  ...,  59,  11,  33]], device='cuda:0')

In [31]:
# NOTE(review): depends on `graph_dataset`, which is built in a later cell.
# Displays the summary (tensor shapes) of the first graph.
graph_dataset[0]

Data(x=[194, 5], edge_index=[2, 1018], edge_attr=[1018, 3], y=[1018])

In [29]:
# NOTE(review): `pred` is only assigned in the inference cell further down,
# so this relies on leftover kernel state from an earlier run.
len(pred)

535

In [16]:
# NOTE(review): at this point of a top-to-bottom run `model` is not defined
# yet (it is created in a later cell), hence the NameError recorded below.
model

NameError: name 'model' is not defined

In [30]:
# Feature vector of node 0 in the first graph.
# NOTE(review): depends on `graph_dataset`, which is built in a later cell.
# The values are raw, unscaled attributes (ids on the order of 1e8-1e9).
graph_dataset[0].x[0]

tensor([3.6000e+01, 9.7928e+08, 2.0000e+00, 2.1399e+08, 8.4583e+08],
       device='cuda:0')

In [5]:
# Columns of the attribute table used as node features, in tensor column order.
node_feature_cols = ["age", "city_id", "sex", "school", "university"]

# Columns of the edge table used as edge features, in tensor column order.
edge_feature_cols = ["t", "x2", "x3"]

# Source / target node columns that form the edge index.
adj_col_names = ["u", "v"]

def prepare_graph_data(ego_id: int):
    """Build the graph of one ego network as a ``Data`` object on ``device``.

    Reads the module-level ``df_test`` (edges) and ``df_attrs`` (node
    attributes) and keeps only the rows whose ``ego_id`` matches. Every node
    id in ``0..max(u, v)`` that has no attribute row gets a placeholder row
    with all features set to -1, and the rows are sorted by ``u`` so that row
    ``i`` of ``x`` is meant to describe node ``i`` (assumes at most one
    attribute row per node id within an ego network -- TODO confirm).

    Args:
        ego_id: value of the ``ego_id`` column selecting the ego network.

    Returns:
        A ``Data`` object with ``x`` (node features, float32), ``edge_index``
        (2 x num_edges, int64), ``edge_attr`` (float32, NaN replaced by -1)
        and ``y`` (the ``x1`` column as float32; NaN values are kept).

    Raises:
        ValueError: if ``df_test`` contains no edges for ``ego_id``.
    """
    df_edges = df_test.loc[df_test.loc[:, "ego_id"] == ego_id]
    df_features = df_attrs.loc[df_attrs.loc[:, "ego_id"] == ego_id]

    nodes_with_attrs = set(df_features.loc[:, "u"])
    # Largest node id referenced by any edge endpoint.
    max_node_idx = int(df_edges.loc[:, adj_col_names].to_numpy().max())
    missing_nodes = [
        idx for idx in range(max_node_idx + 1) if idx not in nodes_with_attrs
    ]

    df_nodes = df_features
    if missing_nodes:
        # Placeholder rows: every node feature is -1 for nodes without attributes.
        placeholder_rows = pd.DataFrame(
            {
                "ego_id": ego_id,
                "u": missing_nodes,
                **{col: -1 for col in node_feature_cols},
            }
        )
        df_nodes = pd.concat([df_features, placeholder_rows])
    df_nodes = df_nodes.sort_values(by="u")

    # Built unconditionally: previously the graph was only created when at
    # least one placeholder node was needed, so ego networks whose nodes all
    # had attributes returned None and were silently dropped.
    return Data(
        x=torch.tensor(df_nodes.loc[:, node_feature_cols].to_numpy(), dtype=torch.float32),
        edge_index=torch.tensor(df_edges.loc[:, adj_col_names].to_numpy(), dtype=torch.int64).T,
        edge_attr=torch.tensor(df_edges.loc[:, edge_feature_cols].fillna(-1).to_numpy(), dtype=torch.float32),
        y=torch.tensor(df_edges.loc[:, "x1"].to_numpy(), dtype=torch.float32),
    ).to(device)

In [6]:
# Build one graph per ego network and remember where each ego_id landed
# in `graph_dataset`.
ego_id_to_graph_idx = {}
graph_dataset = []
for ego_id in tqdm.tqdm(df_test.loc[:, "ego_id"].unique()):
    ego_graph = prepare_graph_data(ego_id)
    # Skip ego networks for which no graph was produced.
    if ego_graph is not None:
        # Reuse the graph built above; building it a second time doubled the
        # run time of this cell.
        ego_id_to_graph_idx[ego_id] = len(graph_dataset)
        graph_dataset.append(ego_graph)

100%|██████████| 20596/20596 [21:03<00:00, 16.30it/s]


In [7]:
# Persist the preprocessed graphs; create the target directory first so the
# save does not fail on a fresh checkout.
preprocessed_dir = os.path.join("data", "preprocessed")
os.makedirs(preprocessed_dir, exist_ok=True)
torch.save(
    graph_dataset,
    os.path.join(preprocessed_dir, "test_tensor.pt"),
)

In [23]:
class Model(torch.nn.Module):
    """Edge-level regressor: one SAGEConv layer followed by a two-layer MLP.

    Each edge is scored from the concatenation of the embeddings of its two
    end nodes and its 3 edge features. ``conv2`` is constructed (so that it
    is part of the state dict) but is not applied in ``forward``.
    """

    def __init__(self, hidden_channels):
        super().__init__()
        self.conv1 = pyg_nn.SAGEConv((-1, -1), hidden_channels)
        self.conv2 = pyg_nn.SAGEConv((-1, -1), hidden_channels)
        head_width = hidden_channels // 2
        # Input width: two node embeddings plus 3 edge features.
        self.lin1 = torch.nn.Linear(2 * hidden_channels + 3, head_width)
        self.lin2 = torch.nn.Linear(head_width, 1)

    def forward(self, x, edge_index, edge_attr):
        """Return one scalar prediction per column of ``edge_index``."""
        node_emb = F.relu(self.conv1(x, edge_index))
        node_emb = F.dropout(node_emb, p=0.05, training=self.training)

        src, dst = edge_index
        edge_repr = torch.cat([node_emb[src], node_emb[dst], edge_attr], dim=-1)
        hidden = F.relu(self.lin1(edge_repr))
        hidden = F.dropout(hidden, p=0.05, training=self.training)
        return self.lin2(hidden).view(-1)

# Width of the node embeddings; the MLP head uses half of it.
hidden_state = 64
# Instantiate the network and move its parameters to the selected device.
model = Model(hidden_channels=hidden_state).to(device)

In [24]:
# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved from a GPU also loads on a CPU-only machine.
model.load_state_dict(
    torch.load(os.path.join("models", "qwe.pt"), map_location=device)
)

<All keys matched successfully>

In [44]:
# Mapping ego_id -> position of its graph in `graph_dataset`.
# NOTE(review): displaying the whole dict prints one line per graph; consider
# showing only a small slice.
ego_id_to_graph_idx

{8: 0,
 11: 1,
 28: 2,
 31: 3,
 34: 4,
 35: 5,
 49: 6,
 58: 7,
 62: 8,
 73: 9,
 82: 10,
 105: 11,
 106: 12,
 108: 13,
 111: 14,
 115: 15,
 118: 16,
 129: 17,
 135: 18,
 137: 19,
 163: 20,
 171: 21,
 237: 22,
 243: 23,
 268: 24,
 276: 25,
 287: 26,
 321: 27,
 324: 28,
 328: 29,
 331: 30,
 333: 31,
 347: 32,
 348: 33,
 370: 34,
 375: 35,
 379: 36,
 387: 37,
 399: 38,
 414: 39,
 416: 40,
 425: 41,
 431: 42,
 432: 43,
 439: 44,
 450: 45,
 452: 46,
 454: 47,
 456: 48,
 505: 49,
 507: 50,
 515: 51,
 524: 52,
 533: 53,
 538: 54,
 541: 55,
 557: 56,
 584: 57,
 594: 58,
 602: 59,
 604: 60,
 608: 61,
 610: 62,
 616: 63,
 619: 64,
 622: 65,
 637: 66,
 660: 67,
 662: 68,
 670: 69,
 674: 70,
 696: 71,
 699: 72,
 716: 73,
 723: 74,
 731: 75,
 735: 76,
 736: 77,
 751: 78,
 774: 79,
 791: 80,
 799: 81,
 803: 82,
 804: 83,
 810: 84,
 829: 85,
 835: 86,
 8589934614: 87,
 8589934635: 88,
 8589934638: 89,
 8589934640: 90,
 8589934655: 91,
 8589934658: 92,
 8589934669: 93,
 8589934674: 94,
 8589934679: 95,

In [48]:
# Wrap a single graph (position 996 in `graph_dataset`) in a loader.
# NOTE(review): 996 is a magic index -- presumably picked by hand for a spot
# check; confirm which ego_id it corresponds to.
loader = DataLoader([graph_dataset[996]], batch_size=1)

In [49]:
# Inference only: eval mode turns dropout off, no_grad skips autograd tracking.
model.eval()
with torch.no_grad():
    for batch in loader:
        # `pred` is overwritten on every iteration, so only the output of the
        # last batch is available after the loop.
        pred = model(batch.x, batch.edge_index, batch.edge_attr)

In [50]:
# Summary of the last batch produced by the loader (tensor shapes only).
batch

DataBatch(x=[125, 5], edge_index=[2, 746], edge_attr=[746, 3], y=[746], batch=[125], ptr=[2])

In [51]:
# NOTE(review): every prediction displayed below is the same value (0.6845),
# i.e. the model output does not vary across edges. Presumably the raw,
# unscaled node features (ids on the order of 1e8-1e9) are the cause --
# TODO confirm and normalise/encode the inputs.
pred

tensor([0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845, 0.6845,
        0.6845, 0.6845, 0.6845, 0.6845, 