In [47]:
import os

from utils import load_all_data, preprocess


import torch
import torch.nn as nn
import torch.nn.functional as F

import dgl
from dgl.data import DGLDataset
from dgl.nn import GraphConv, SAGEConv


import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score

In [4]:
# Load the raw tables and build a train/test split for interactive inspection.
# include_unlabeled=True and test_size=0.2 are forwarded to preprocess() as-is.
# NOTE(review): presumably test_size is the held-out fraction and unlabeled
# users are kept in the split — confirm against utils.preprocess.
users, features, relations, labels = load_all_data()
X_train, X_test, y_train, y_test = preprocess(features, labels, include_unlabeled=True, test_size=0.2)

In [5]:
# Display the training features ordered by their index. The sorted frame is
# only shown, not assigned, so X_train itself keeps its original row order.
X_train.sort_index()

Unnamed: 0_level_0,languages_JavaScript,languages_Python,languages_TypeScript,languages_HTML,languages_Go,languages_Java,languages_PHP,languages_Jupyter Notebook,languages_CSS,languages_Shell,...,topics_algorithm,topics_plugin,topics_vercel,topics_music,topics_vue3,topics_security,topics_cryptocurrency,topics_data,topics_rails,topics_twitter
username,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
007jedgar,0.701,0.000,0.165,0.009,0.000,0.003,0.000,0.0,0.112,0.000,...,0,0,0,0,0,0,0,0,0,0
00Kai0,0.000,0.875,0.000,0.006,0.000,0.000,0.000,0.0,0.015,0.000,...,0,0,0,0,0,0,0,0,0,0
00imvj00,0.005,0.014,0.008,0.001,0.010,0.254,0.003,0.0,0.003,0.248,...,0,0,0,0,0,0,0,0,0,0
0101011,0.014,0.588,0.000,0.080,0.000,0.000,0.000,0.0,0.038,0.002,...,0,0,0,0,0,0,0,0,0,0
0326,0.643,0.000,0.137,0.086,0.000,0.000,0.000,0.0,0.024,0.000,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
yasik,0.000,0.000,0.000,0.000,0.981,0.000,0.000,0.0,0.000,0.007,...,0,0,0,0,0,0,0,0,0,0
yysu,0.366,0.184,0.000,0.000,0.200,0.000,0.000,0.0,0.000,0.173,...,0,0,0,0,0,0,0,0,0,0
zhouzi,0.347,0.000,0.637,0.002,0.000,0.000,0.000,0.0,0.001,0.000,...,0,0,0,0,0,0,0,0,0,0
zhukovgreen,0.009,0.750,0.218,0.000,0.000,0.000,0.000,0.0,0.008,0.003,...,0,0,0,0,0,0,0,0,0,0


In [84]:
class GithubDataset(DGLDataset):
    """Single-graph DGL dataset of GitHub users linked by follow relations.

    Every user is a node; every row of the relations table becomes an edge
    from the user in the ``'following'`` column to the user in the ``'follow'``
    column. Node data stored on the graph:

    * ``feat``  - the preprocessed feature row of the user
    * ``label`` - the integer-encoded label of the user
    * ``train_mask`` / ``test_mask`` - membership in the train / test split
    * ``label_mask`` - nodes whose encoded label is below ``num_classes``
    * ``train_label_mask`` / ``test_label_mask`` - the split masks restricted
      to ``label_mask``

    Attributes set by ``process``:
        l_user: fitted ``LabelEncoder`` mapping usernames to node ids.
        l_label: fitted ``LabelEncoder`` mapping raw labels to class ids.
        num_classes: number of encoded label values minus one.
        graph: the resulting ``dgl`` graph.

    Args:
        undirected: when true, the reverse of every edge is added as well.
    """

    def __init__(self, undirected=False):
        # Must be assigned before the base constructor runs, because the base
        # constructor triggers process(), which reads this flag.
        self.undirected = undirected
        super().__init__(name='github_dataset')

    def process(self):
        """Load the raw data and build ``self.graph`` with features, labels and masks."""
        _, features, relations, labels = load_all_data()
        X_train, X_test, y_train, y_test = preprocess(
            features, labels, include_unlabeled=True, test_size=0.2)

        # Usernames -> contiguous integer node ids.
        all_users = list(set(X_train.index).union(X_test.index))
        l_user = LabelEncoder()
        l_user.fit(all_users)

        # Row i of the feature/label tables must describe node id i. Align the
        # rows with the encoder's own ordering explicitly rather than relying
        # on a separate index sort producing the same order.
        features = pd.concat([X_train, X_test], axis=0).loc[l_user.classes_]
        labels = pd.concat([y_train, y_test], axis=0).loc[l_user.classes_]

        l_label = LabelEncoder()
        labels = l_label.fit_transform(labels)

        # Edge endpoints and split membership, all expressed as node ids.
        src = l_user.transform(relations['following'])
        dest = l_user.transform(relations['follow'])
        train_index = l_user.transform(X_train.index)
        test_index = l_user.transform(X_test.index)

        edges_src = torch.from_numpy(src)
        edges_dst = torch.from_numpy(dest)

        self.l_user = l_user
        self.l_label = l_label
        # One encoded value is excluded from the class count.
        # NOTE(review): this assumes the highest encoded label is the
        # placeholder for unlabeled users — confirm against preprocess().
        self.num_classes = len(l_label.classes_) - 1

        n_nodes = features.shape[0]
        self.graph = dgl.graph((edges_src, edges_dst), num_nodes=n_nodes)
        if self.undirected:
            # Add the reverse direction of every edge.
            self.graph.add_edges(edges_dst, edges_src)

        node_features = torch.from_numpy(features.to_numpy())
        node_labels = torch.from_numpy(labels)
        self.graph.ndata['feat'] = node_features
        self.graph.ndata['label'] = node_labels

        train_mask = torch.zeros(n_nodes, dtype=torch.bool)
        test_mask = torch.zeros(n_nodes, dtype=torch.bool)
        train_mask[train_index] = True
        test_mask[test_index] = True

        # Only nodes with an encoded label below num_classes count as labeled.
        labeled_mask = (node_labels < self.num_classes)
        train_labeled_mask = (labeled_mask) & (train_mask)
        test_labeled_mask = (labeled_mask) & (test_mask)
        print(train_labeled_mask.sum())
        print(test_labeled_mask.sum())

        self.graph.ndata['train_mask'] = train_mask
        self.graph.ndata['test_mask'] = test_mask
        self.graph.ndata['label_mask'] = labeled_mask
        self.graph.ndata['train_label_mask'] = train_labeled_mask
        self.graph.ndata['test_label_mask'] = test_labeled_mask

    def __getitem__(self, i):
        """Return the only graph of the dataset.

        Raises:
            IndexError: for any index other than 0 (or -1). Without this,
                iterating over the dataset through the sequence protocol
                would never stop, because every index would succeed.
        """
        if i not in (0, -1):
            raise IndexError('GithubDataset holds exactly one graph')
        return self.graph

    def __len__(self):
        """The dataset always contains exactly one graph."""
        return 1

In [85]:
# Build the dataset with reverse edges added and take its single graph.
dataset = GithubDataset(undirected=True)
graph = dataset[0]
# add_self_loop() returns a new graph, so the result has to be re-bound.
graph = graph.add_self_loop()
print(graph)

tensor(7046)
tensor(1818)
Graph(num_nodes=35909, num_edges=306717,
      ndata_schemes={'feat': Scheme(shape=(200,), dtype=torch.float64), 'label': Scheme(shape=(), dtype=torch.int64), 'train_mask': Scheme(shape=(), dtype=torch.bool), 'test_mask': Scheme(shape=(), dtype=torch.bool), 'label_mask': Scheme(shape=(), dtype=torch.bool), 'train_label_mask': Scheme(shape=(), dtype=torch.bool), 'test_label_mask': Scheme(shape=(), dtype=torch.bool)}
      edata_schemes={})


In [79]:
class GCN(nn.Module):
    """Two-layer graph convolutional network for node classification.

    Args:
        in_feats: width of the input node features.
        h_feat: width of the hidden representation.
        num_classes: width of the output, one score per class.
    """

    def __init__(self, in_feats, h_feat, num_classes):
        super().__init__()
        self.conv1 = GraphConv(in_feats, h_feat)
        self.conv2 = GraphConv(h_feat, num_classes)

    def forward(self, g, in_feat):
        """Return the raw (un-normalized) class scores for every node of ``g``."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

class GraphSAGE(nn.Module):
    """Two-layer GraphSAGE network (``"mean"`` aggregator) for node classification.

    Args:
        in_feats: width of the input node features.
        h_feat: width of the hidden representation.
        num_classes: width of the output, one score per class.
    """

    def __init__(self, in_feats, h_feat, num_classes):
        super().__init__()
        self.conv1 = SAGEConv(in_feats, h_feat, "mean")
        self.conv2 = SAGEConv(h_feat, num_classes, "mean")

    def forward(self, g, in_feat):
        """Return the raw (un-normalized) class scores for every node of ``g``."""
        hidden = F.relu(self.conv1(g, in_feat))
        return self.conv2(g, hidden)

In [91]:
# GCN sized from the graph: input width = number of feature columns,
# 400 hidden units, one output per class.
# NOTE(review): a later cell rebinds `model` to a GraphSAGE instance, so the
# network that reaches train() depends on the order the cells are executed in.
model = GCN(graph.ndata['feat'].shape[1], 400, dataset.num_classes)
model

GCN(
  (conv1): GraphConv(in=200, out=400, normalization=both, activation=None)
  (conv2): GraphConv(in=400, out=6, normalization=both, activation=None)
)

In [88]:
# GraphSAGE with the same sizing: input width = number of feature columns,
# 400 hidden units, one output per class.
# NOTE(review): this reuses the name `model` from the GCN cell above and
# replaces that instance when the notebook is run top to bottom.
model = GraphSAGE(graph.ndata['feat'].shape[1], 400, dataset.num_classes)
model

GraphSAGE(
  (conv1): SAGEConv(
    (feat_drop): Dropout(p=0.0, inplace=False)
    (fc_self): Linear(in_features=200, out_features=400, bias=False)
    (fc_neigh): Linear(in_features=200, out_features=400, bias=False)
  )
  (conv2): SAGEConv(
    (feat_drop): Dropout(p=0.0, inplace=False)
    (fc_self): Linear(in_features=400, out_features=6, bias=False)
    (fc_neigh): Linear(in_features=400, out_features=6, bias=False)
  )
)

In [92]:
def train(g, model, num_epochs=500, lr=0.001, log_every=5):
    """Train ``model`` full-batch on graph ``g`` and print progress.

    The loss is the cross-entropy over the nodes selected by
    ``g.ndata['train_label_mask']``; accuracy and weighted F1 are reported for
    those nodes and for the nodes selected by ``g.ndata['test_label_mask']``.

    Note that the "best" accuracy is tracked on the test nodes themselves
    (there is no separate validation split here), so it is an optimistic
    number rather than an unbiased estimate.

    Args:
        g: graph whose ``ndata`` provides ``'feat'``, ``'label'``,
            ``'train_label_mask'`` and ``'test_label_mask'``.
        model: module called as ``model(g, features)`` returning per-node
            class scores; its parameters are updated in place.
        num_epochs: number of optimisation steps.
        lr: learning rate of the Adam optimiser.
        log_every: print metrics every this many epochs; a falsy value
            (``0`` / ``None``) disables printing.

    Returns:
        None.
    """
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)
    best_test_acc = 0

    # The inputs never change between epochs, so cast and slice them once.
    features = g.ndata['feat'].float()
    labels = g.ndata['label']
    train_label_mask = g.ndata['train_label_mask']
    test_label_mask = g.ndata['test_label_mask']
    train_labels = labels[train_label_mask]
    test_labels = labels[test_label_mask]

    for e in range(num_epochs):
        # Forward
        logits = model(g, features)

        # Compute prediction
        pred = logits.argmax(1)
        train_pred = pred[train_label_mask]
        test_pred = pred[test_label_mask]

        # Compute loss on the labeled training nodes only.
        loss = F.cross_entropy(logits[train_label_mask], train_labels)

        # Accuracy on the labeled training / test nodes (plain Python floats).
        train_acc = (train_pred == train_labels).float().mean().item()
        test_acc = (test_pred == test_labels).float().mean().item()

        # Track the best accuracy seen so far on the test nodes.
        if best_test_acc < test_acc:
            best_test_acc = test_acc

        # Backward
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if log_every and e % log_every == 0:
            # F1 is only needed for the log line, so compute it only here;
            # move to CPU/NumPy explicitly so non-CPU tensors also work.
            train_f1 = f1_score(train_labels.cpu().numpy(), train_pred.cpu().numpy(), average='weighted')
            test_f1 = f1_score(test_labels.cpu().numpy(), test_pred.cpu().numpy(), average='weighted')
            print('In epoch {}, loss: {:.3f}, train_acc: {:.3f}, test acc: {:.3f} (best {:.3f}), train_f1: {:.3f}, test_f1: {:.3f}'.format(
                e, loss.item(), train_acc, test_acc, best_test_acc, train_f1, test_f1))


In [93]:
# Train whichever network `model` currently refers to on the full graph.
train(graph, model)

In epoch 0, loss: 1.810, train_acc: 0.021, test acc: 0.017 (best 0.017), train_f1: 0.023, test_f1: 0.013
In epoch 5, loss: 1.731, train_acc: 0.574, test acc: 0.579 (best 0.579), train_f1: 0.513, test_f1: 0.520
In epoch 10, loss: 1.652, train_acc: 0.633, test acc: 0.645 (best 0.645), train_f1: 0.572, test_f1: 0.587
In epoch 15, loss: 1.566, train_acc: 0.643, test acc: 0.660 (best 0.660), train_f1: 0.581, test_f1: 0.602
In epoch 20, loss: 1.472, train_acc: 0.646, test acc: 0.663 (best 0.663), train_f1: 0.585, test_f1: 0.605
In epoch 25, loss: 1.374, train_acc: 0.648, test acc: 0.662 (best 0.664), train_f1: 0.587, test_f1: 0.604
In epoch 30, loss: 1.283, train_acc: 0.647, test acc: 0.666 (best 0.667), train_f1: 0.586, test_f1: 0.608
In epoch 35, loss: 1.208, train_acc: 0.648, test acc: 0.664 (best 0.667), train_f1: 0.587, test_f1: 0.607
In epoch 40, loss: 1.149, train_acc: 0.649, test acc: 0.664 (best 0.667), train_f1: 0.588, test_f1: 0.607
In epoch 45, loss: 1.104, train_acc: 0.649, test

In [16]:
# Mean in-degree and mean out-degree over all nodes of the graph.
torch.mean(graph.in_degrees().float()), torch.mean(graph.out_degrees().float())

(tensor(5.7708), tensor(5.7708))

In [29]:
# Per-node in-degree and out-degree vectors, cast to float.
graph.in_degrees().float(), graph.out_degrees().float()

(tensor([ 4.,  2.,  3.,  ..., 14.,  4.,  2.]),
 tensor([18.,  3.,  7.,  ...,  8.,  3.,  4.]))