In [1]:
'''
@Author: Bohan Xu
@Date: 03/April/2023
'''
import os.path as osp

import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.linear_model import LogisticRegression
from torch_cluster import random_walk

import torch_geometric.transforms as T
from torch_geometric.datasets import SNAPDataset
from torch_geometric.loader import NeighborSampler as RawNeighborSampler
from torch_geometric.nn import SAGEConv
import datetime

In [2]:
# Load the SNAP ego-Facebook collection with the NormalizeFeatures transform.
# The name string lives in its own variable so that `dataset` only ever refers
# to the dataset object (it used to be a str first and a dataset afterwards).
dataset_name = 'ego-facebook'
path = osp.join('/home/xbh/tt/pyglzip/', 'data', dataset_name)
dataset = SNAPDataset(path, dataset_name, transform=T.NormalizeFeatures())
data = dataset[0]  # first graph in the collection

In [3]:
print(data)

Data(x=[347, 1406], edge_index=[2, 5718], circle=[325], circle_batch=[325])


In [4]:
# NOTE(review): explicit re-run of the dataset's processing step. The `data`
# printed before and after this call reports identical shapes, so this looks
# redundant — confirm whether it is still needed.
dataset.process()

In [5]:
data = dataset[0]

In [6]:
print(data)

Data(x=[347, 1406], edge_index=[2, 5718], circle=[325], circle_batch=[325])


In [10]:
import pandas as pd
# Toy temporal edge list: one (source, target, timestamp) row per edge.
# The pair (6, 22) occurs twice; every timestamp is 1.
dt = pd.DataFrame({
    'source': [17, 15, 6, 1, 8, 4, 6, 3, 12],
    'target': [19, 13, 22, 12, 3, 115, 22, 15, 19],
    'timestamp': [1] * 9,
})

In [13]:
import numpy as np
dt

Unnamed: 0,source,target,timestamp
0,17,19,1
1,15,13,1
2,6,22,1
3,1,12,1
4,8,3,1
5,4,115,1
6,6,22,1
7,3,15,1
8,12,19,1


In [38]:
dt.sort_values(by = 'source')

Unnamed: 0,source,target,timestamp
3,1,12,1
7,3,15,1
5,4,115,1
2,6,22,1
6,6,22,1
4,8,3,1
8,12,19,1
1,15,13,1
0,17,19,1


In [39]:
df = pd.DataFrame(columns=['source', 'target', 'timestamp'])

In [44]:
# Pull the three edge-list columns of `dt` out as NumPy arrays.
dv, dtar, dtime = (
    dt[column].values for column in ('source', 'target', 'timestamp')
)

In [46]:
# Group timestamps by (source, target) pair.
#   temp : set of every pair seen
#   sv   : pair -> list of its timestamps, in order of appearance
# zip() stops at the shortest of the three arrays, like the min(len(...)) bound.
temp = set()
sv = {}
for src, dst, stamp in zip(dv, dtar, dtime):
    tt = (src, dst)
    temp.add(tt)
    sv.setdefault(tt, []).append(stamp)


In [54]:
# Split `sv` into three parallel lists (insertion order of the dict):
#   l1 -> source ids, l2 -> target ids, l3 -> timestamp list of each pair.
# Comprehensions are used instead of a module-level `for (x, y) in sv` loop:
# that loop left `x` bound to a node id afterwards, overwriting the global
# feature tensor `x` that train() and test() read.
l1 = [src for src, _ in sv]
l2 = [dst for _, dst in sv]
l3 = list(sv.values())

In [57]:
# Display the per-pair timestamp lists. The cell previously held the
# incomplete statement `l1 =`, which is a syntax error; the recorded output
# (a list of timestamp lists) corresponds to `l3`.
l3

[[1], [1], [1, 1], [1], [1], [1], [1], [1]]

In [7]:
print(data.edge_index)

tensor([[  0,   0,   0,  ..., 346, 346, 346],
        [ 47,  52,  53,  ..., 343, 344, 345]])


In [8]:
print(data.circle_batch)

tensor([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  1,  2,  2,  2,  2,  2,  2,  2,  2,  2,  3,  3,  3,  4,  4,  4,
         4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  4,  5,  6,  6,  6,
         6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  6,  7,
         7,  8,  9,  9,  9,  9,  9,  9,  9,  9,  9,  9, 10, 10, 10, 10, 11, 11,
        11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11,
        11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 12, 13, 13, 13, 13, 13, 14, 14,
        15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
        15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
        15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
        15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
        15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15, 15,
        15, 15, 15, 15, 15, 15, 15, 15, 

In [9]:
print(data.keys)

['x', 'circle_batch', 'edge_index', 'circle']


In [10]:
data.is_undirected()

True

In [11]:
class NeighborSampler(RawNeighborSampler):
    """Neighbor sampler that extends every batch with positive/negative nodes.

    The batch handed to the parent sampler is laid out as
    ``[seed nodes | positive nodes | negative nodes]``, each third having the
    same length.
    """

    def sample(self, batch):
        """Augment ``batch`` with positives and negatives, then sample as usual.

        Args:
            batch: node indices of the current mini-batch.

        Returns:
            Whatever the parent ``sample`` returns for the concatenated batch.
        """
        seeds = torch.tensor(batch)
        src, dst, _ = self.adj_t.coo()

        # Positive example: the node reached by a one-step random walk from
        # each seed, i.e. a direct neighbor (column 1 of the walk matrix).
        walks = random_walk(src, dst, seeds, walk_length=1, coalesced=False)
        positives = walks[:, 1]

        # Negative example: one uniformly drawn node index per seed.
        negatives = torch.randint(0, self.adj_t.size(1), (seeds.numel(), ),
                                  dtype=torch.long)

        extended = torch.cat([seeds, positives, negatives], dim=0)
        return super().sample(extended)


# Mini-batch loader over all nodes. `sizes` has one entry per hop and the
# model's forward pass consumes one sampled adjacency per convolution layer.
train_loader = NeighborSampler(
    data.edge_index,
    sizes=[10, 10],
    batch_size=256,
    shuffle=True,
    num_nodes=data.num_nodes,
)

In [12]:
train_loader

NeighborSampler(sizes=[10, 10])

In [13]:
class SAGE(nn.Module):
    """Stack of ``SAGEConv`` layers with ReLU + dropout between layers.

    Args:
        in_channels: feature size fed to the first layer.
        hidden_channels: output size of every layer.
        num_layers: number of ``SAGEConv`` layers to stack.
    """

    def __init__(self, in_channels, hidden_channels, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.convs = nn.ModuleList()
        for layer in range(num_layers):
            # Only the first layer sees the raw input width.
            fan_in = hidden_channels if layer else in_channels
            self.convs.append(SAGEConv(fan_in, hidden_channels))

    def _between_layers(self, x, layer):
        """Apply ReLU and dropout (p=0.5) unless ``layer`` is the last one."""
        if layer == self.num_layers - 1:
            return x
        x = x.relu()
        return F.dropout(x, p=0.5, training=self.training)

    def forward(self, x, adjs):
        """Mini-batch pass over sampled adjacencies.

        Args:
            x: features of the sampled nodes.
            adjs: iterable of ``(edge_index, e_id, size)`` triples, one per layer.

        Returns:
            Output of the final layer.
        """
        for layer, (edge_index, _, size) in enumerate(adjs):
            # Target nodes come first in `x`, so a prefix slice selects them.
            targets = x[:size[1]]
            x = self.convs[layer]((x, targets), edge_index)
            x = self._between_layers(x, layer)
        return x

    def full_forward(self, x, edge_index):
        """Pass ``x`` through every layer using a single ``edge_index``."""
        for layer, conv in enumerate(self.convs):
            x = conv(x, edge_index)
            x = self._between_layers(x, layer)
        return x

In [14]:
# Pick the GPU when one is available, otherwise stay on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Two-layer GraphSAGE encoder with 64-dimensional outputs, optimised with Adam.
model = SAGE(data.num_node_features, hidden_channels=64, num_layers=2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

# Full-graph tensors on the chosen device; train() and test() read these globals.
x = data.x.to(device)
edge_index = data.edge_index.to(device)

In [15]:
print(model)

SAGE(
  (convs): ModuleList(
    (0): SAGEConv(1406, 64, aggr=mean)
    (1): SAGEConv(64, 64, aggr=mean)
  )
)


In [27]:
def train():
    """Run one pass over ``train_loader`` and return the loss per node.

    Each batch's output is split into three equal parts (anchor, positive,
    negative). The loss pushes anchor/positive dot products up and
    anchor/negative dot products down through a log-sigmoid.

    Returns:
        Sum of per-batch losses weighted by anchor count, divided by
        ``data.num_nodes``.
    """
    model.train()

    epoch_loss = 0
    for _batch_size, n_id, adjs in train_loader:
        # Each entry of `adjs` is an `(edge_index, e_id, size)` tuple.
        adjs = [adj.to(device) for adj in adjs]
        optimizer.zero_grad()

        embeddings = model(x[n_id], adjs)
        third = embeddings.size(0) // 3
        anchor, positive, negative = embeddings.split(third, dim=0)

        pos_score = (anchor * positive).sum(-1)
        neg_score = (anchor * negative).sum(-1)
        pos_loss = F.logsigmoid(pos_score).mean()
        neg_loss = F.logsigmoid(-neg_score).mean()

        loss = -pos_loss - neg_loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item() * anchor.size(0)

    return epoch_loss / data.num_nodes


@torch.no_grad()
def test():
    """Score the embeddings with a logistic-regression probe.

    Embeds the full graph, fits ``LogisticRegression`` on the rows selected by
    ``data.train_mask`` and scores it on the validation and test masks.

    NOTE(review): the `data` objects printed in this notebook list only
    x / edge_index / circle / circle_batch — confirm that `y` and the
    train/val/test masks exist on the graph this is called with.

    Returns:
        ``(val_acc, test_acc)``.
    """
    model.eval()
    embeddings = model.full_forward(x, edge_index).cpu()

    clf = LogisticRegression()
    clf.fit(embeddings[data.train_mask], data.y[data.train_mask])

    def _accuracy(mask):
        # Classifier accuracy on the rows selected by `mask`.
        return clf.score(embeddings[mask], data.y[mask])

    val_acc = _accuracy(data.val_mask)
    test_acc = _accuracy(data.test_mask)
    return val_acc, test_acc


In [133]:
print(data)

Data(x=[347, 1406], edge_index=[2, 5718], circle=[325], circle_batch=[325])


In [236]:
from stackoverflow_loader import geo_StackOverData

In [237]:
# Load the sx-stackoverflow graph through the project-local geo_StackOverData
# loader. `dataset` starts as the name string and is rebound to the loader's
# result two lines below.
dataset = 'sx-stackoverflow'
path = osp.join('/home/xbh/tt/pyglzip/', 'data')
print(path)
# NOTE(review): the recorded run of this cell stopped with FileNotFoundError
# for '/home/jovyan/sx-stackoverflow.txt.gz', which is not under `path` —
# check where stackoverflow_loader writes/reads the download.
dataset = geo_StackOverData(path, dataset)
data = dataset[0]

/home/xbh/tt/pyglzip/data


Downloading https://snap.stanford.edu/data/sx-stackoverflow.txt.gz


FileNotFoundError: [Errno 2] No such file or directory: '/home/jovyan/sx-stackoverflow.txt.gz'

In [17]:
import os

In [18]:
# Directory (despite the name) from which train.pt / test.pt / val.pt are
# loaded, resolved against the current working directory. Built with
# os.path.join instead of string concatenation so separators are handled by
# the standard library.
target_file = os.path.join(os.path.abspath('.'), 'dataset', 'processed')

In [19]:
train_data = torch.load(target_file+'/train.pt')

In [20]:
test_data = torch.load(target_file+'/test.pt')

In [21]:
val_data = torch.load(target_file+'/val.pt')

In [22]:
print(type(train_data))

<class 'tuple'>


In [23]:
type(train_data[0])

torch_geometric.data.data.Data

In [24]:
import pandas as pd
import os.path as osp
tt_dd = pd.read_csv(osp.join(os.path.abspath('.')+'/dataset', 'sx-stackoverflow.txt'), header=None, sep = ' ')

In [25]:
# Give the three positional columns names, then order the edges by timestamp.
tt_dd = (
    tt_dd
    .rename(columns={0: 'source', 1: 'target', 2: 'timestamp'})
    .sort_values(by='timestamp')
)

In [33]:
train_data[0]

Data(x=[77988633, 2], edge_index=[2, 170932943], edge_attr=[170932943, 1], y=[77988633], edge_label_pos=[596045], edge_label_index_pos=[2, 596045], edge_label_neg=[596045], edge_label_index_neg=[2, 596045])

In [35]:
tt_dd['source'] == tt_dd['target']

0           False
1            True
2           False
3           False
4           False
            ...  
63497045    False
63497046    False
63497047    False
63497048    False
63497049     True
Length: 63497050, dtype: bool

In [36]:
tt_dd

Unnamed: 0,source,target,timestamp
0,9,8,1217567877
1,1,1,1217573801
2,13,1,1217606247
3,17,1,1217617639
4,48,2,1217618182
...,...,...,...
63497045,4049257,3816212,1457273371
63497046,3507137,1801524,1457273391
63497047,144088,275047,1457273420
63497048,5617035,4368648,1457273425


In [204]:
# Largest value in the first column of `tt_dd`. Series.max() is vectorised,
# whereas the builtin max() walks the tens of millions of rows one Python
# object at a time. int() keeps the displayed result a plain Python integer.
int(tt_dd.iloc[:, 0].max())

6024260

In [212]:
tt1 = pd.read_csv(osp.join(os.path.abspath('.'),'..','data', 'products/ogbn_products/split/sales_ranking/test.csv.gz',), header=None, sep = ' ')

In [211]:
tt2 = torch.load('/home/xbh/tt/pyglzip/data/products/ogbn_products/processed/geometric_data_processed.pt')

In [215]:
tt2[0]

Data(num_nodes=2449029, edge_index=[2, 123718280], x=[2449029, 100], y=[2449029, 1])

In [213]:
tt1

Unnamed: 0,0
0,235938
1,235939
2,235940
3,235941
4,235942
...,...
2213086,2449024
2213087,2449025
2213088,2449026
2213089,2449027


In [216]:
# Training-split node ids of ogbn-products (sales_ranking split).
# The path previously pointed at test.csv.gz, the same file already loaded
# into tt1. The recorded contents of tt3 (ids starting at 0) and the check
# that the tt1/tt3/tt4 row counts sum to the 2449029 nodes of the graph only
# hold when this frame is the training split.
tt3 = pd.read_csv(osp.join(os.path.abspath('.'),'..','data', 'products/ogbn_products/split/sales_ranking/train.csv.gz',), header=None, sep = ' ')

In [217]:
tt4 = pd.read_csv(osp.join(os.path.abspath('.'),'..','data', 'products/ogbn_products/split/sales_ranking/valid.csv.gz',), header=None, sep = ' ')

In [218]:
tt3

Unnamed: 0,0
0,0
1,1
2,2
3,3
4,4
...,...
196610,196610
196611,196611
196612,196612
196613,196613


In [219]:
tt4

Unnamed: 0,0
0,196615
1,196616
2,196617
3,196618
4,196619
...,...
39318,235933
39319,235934
39320,235935
39321,235936


In [224]:
tt1.shape[0] + tt3.shape[0]+tt4.shape[0]

2449029

In [232]:
tt33 = pd.read_csv(osp.join(os.path.abspath('.'),'dataset', 'sx-stackoverflow.txt',), header=None, sep = ' ')

In [233]:
tt33

Unnamed: 0,0,1,2
0,9,8,1217567877
1,1,1,1217573801
2,13,1,1217606247
3,17,1,1217617639
4,48,2,1217618182
...,...,...,...
63497045,4049257,3816212,1457273371
63497046,3507137,1801524,1457273391
63497047,144088,275047,1457273420
63497048,5617035,4368648,1457273425
