MSBD5008 Group Project 

Node Classification
 
------------------------

In [1]:
import networkx as nx

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import json

# 1. Data Loading and Preprocessing

In [2]:
graph_train_pos_nodes = np.load('./retraindata/graph_train_pos_nodes.npy')
graph_train_pos_edges = np.load('./retraindata/graph_train_pos_edges.npy')
graph_train_pos = nx.Graph()
graph_train_pos.add_nodes_from(graph_train_pos_nodes)
graph_train_pos.add_edges_from(graph_train_pos_edges)

In [3]:
# from gensim.models import KeyedVectors
# embedding = KeyedVectors.load_word2vec_format("./retraindata/graph_train_pos.bin")
# node_embedding = []
# for i in range(graph_train_pos.number_of_nodes()):
#     emb = embedding[str(i)]
#     node_embedding.append(emb)

In [4]:
nx_G = graph_train_pos
nx.info(nx_G)

'Graph with 37700 nodes and 234093 edges'

In [5]:
target = pd.read_csv("./git_web_ml/musae_git_target.csv")
target.head()

Unnamed: 0,id,name,ml_target
0,0,Eiryyy,0
1,1,shawflying,0
2,2,JpMCarrilho,1
3,3,SuhwanCha,0
4,4,sunilangadi2,1


In [6]:
labels = target['ml_target'].to_list()

In [7]:
## features
with open("./git_web_ml/musae_git_features.json") as f:
    features = json.load(f)

# convert keys from string to int
features = {int(k):v for k,v in features.items()}   

# feature array
feature_arr = np.zeros([37700, 4005])
for i in range(len(features)):
    feature_arr[i][features[i]]=1

# dimensionality reduction
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix
import numpy as np
np.random.seed(0)



In [23]:
X = csr_matrix(feature_arr)
svd = TruncatedSVD(n_components=128, random_state=42)
reduced_features = svd.fit_transform(X)

# save 
np.save("features_preprocessed_128.npy", reduced_features)

In [27]:
reduced_features = np.load("features_preprocessed_128.npy")

# 2. DGL Graph

In [9]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import dgl
from dgl.nn import GINConv

Using backend: pytorch


In [10]:
dgl_G = dgl.from_networkx(nx_G)

# 3. Split Dataset
80% training |
10% validation |
10% test

In [11]:
num_nodes = 37700
mask = np.ones(num_nodes)
test_size = int(np.round(0.1*num_nodes))
mask[:test_size] = 2 # val
mask[test_size:2*test_size] = 3 # test
np.random.seed(5008)
np.random.shuffle(mask)

train_nodes = np.where(mask==1)
val_nodes = np.where(mask==2)
test_nodes = np.where(mask==3)

train_mask = np.zeros(num_nodes)
train_mask[train_nodes] = 1
train_mask = train_mask.astype(bool)

val_mask = np.zeros(num_nodes)
val_mask[val_nodes] = 1
val_mask = val_mask.astype(bool)

test_mask = np.zeros(num_nodes)
test_mask[test_nodes] = 1
test_mask = test_mask.astype(bool)

# 4. Train model
## 4.1 Training & Evaluation Functions

In [12]:
import warnings
warnings.filterwarnings("ignore")

def validate(model, features, labels, val_mask):
    # set evaluation mode
    model.eval()
    with torch.no_grad():
        
        # features = torch.tensor(features, dtype=torch.float).to(device)
        # labels = torch.tensor(labels, dtype=torch.long).to(device)
        
        logits = model(features)
        test_mask_logits = logits[val_mask]
        predict_y = test_mask_logits.max(1)[1]
        accuracy = torch.eq(predict_y, labels[val_mask]).float().mean()

    return accuracy

def train(model, features, labels, train_mask, val_mask, best_val_acc=0):
    model.train()
    for epoch in range(201):
            
        features = torch.tensor(features, dtype=torch.float).to(device)
        labels = torch.tensor(labels, dtype=torch.long).to(device)
    
        # forward pass
        logits = model(features) # num_nodes-by-out_dim
        # calculate the cross-entropy loss for classification tasks
        preds = F.log_softmax(logits, 1)
        loss = F.nll_loss(preds[train_mask], labels[train_mask])  # nll: Negative log-likelihood 
        
        # clear the gradients of all optimized variables
        optimizer.zero_grad()
        # backward pass: compute gradients of the loss with respect to model parameters
        loss.backward()
        # perform a single optimization step
        optimizer.step()

        
        if epoch%10==0:
            accuracy = validate(model, features, labels, val_mask)            
            print("Epoch {}: Training Loss {} | Validation Accuracy {}".format(epoch, loss.item(), accuracy.item()))
            if accuracy > best_val_acc: # update best performing model
                best_val_acc = accuracy
                save_checkpoint("best_model.pth", model)
                
                

def save_checkpoint(checkpoint_path, model):
    # state_dict: a Python dictionary object that:
    # - for a model, maps each layer to its parameter tensor;
    state = {'state_dict': model.state_dict()}
    torch.save(state, checkpoint_path)
    print('model saved to %s' % checkpoint_path)

def load_checkpoint(checkpoint_path, model):
    state = torch.load(checkpoint_path)
    model.load_state_dict(state['state_dict'])
    print('model loaded from %s' % checkpoint_path)
    
def test(model, features, labels, test_mask):
    # set evaluation mode
    model.eval()
    with torch.no_grad():
        
        features = torch.tensor(features, dtype=torch.float).to(device)
        labels = torch.tensor(labels, dtype=torch.long).to(device)
        
        logits = model(features)
        test_mask_logits = logits[val_mask]
        predict_y = test_mask_logits.max(1)[1]
        accuracy = torch.eq(predict_y, labels[val_mask]).float().mean()
    return accuracy.item(), predict_y

## 4.2 Design, Train and Evaluate Model

In [13]:
learning_rate = 0.001
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")

In [28]:
from dgl.nn.pytorch.conv import APPNPConv

class GraphNetwork_APPNP(nn.Module):
    def __init__(self, g, in_dim, out_dim):
        super(GraphNetwork_APPNP, self).__init__()
        self.g = g
        hidden_dim_1 = 256
        hidden_dim_2 = 256
        hidden_dim_3 = 256
        hidden_dim_4 = 256
        self.dropout = nn.Dropout(0.5)
        self.layer1 = nn.Linear(in_dim,hidden_dim_1)
        self.layer2 = nn.Linear(hidden_dim_1, hidden_dim_2)
        self.layer3 = nn.Linear(hidden_dim_2, hidden_dim_3)
        self.layer4 = nn.Linear(hidden_dim_3, hidden_dim_4)
        self.layer_out = nn.Linear(hidden_dim_4, out_dim)
        self.propagate = APPNPConv(k=10, alpha=0.1, edge_drop=5e-4)

    def forward(self, h):
        # input params: dglGraph, features;
        h = self.dropout(h)
        h = self.layer1(h)
        h = F.relu(h)
        h = self.layer2(h)
        h = F.relu(h)
        h = self.layer3(h)
        h = F.relu(h)
        h = self.layer4(h)
        h = F.relu(h)
        h = self.dropout(h)
        h = self.layer_out(h)
        h = self.propagate(self.g, h)
        return h



In [29]:
dgl_G = dgl_G.to(device) # move to cuda

model = GraphNetwork_APPNP(dgl_G, in_dim=128, out_dim=2)
model = model.to(device)

optimizer = optim.Adam(model.parameters(), lr=learning_rate)
train(model, reduced_features, labels, train_mask, val_mask)

Epoch 0: Training Loss 0.6820642352104187 | Validation Accuracy 0.7389920353889465
model saved to best_model.pth
Epoch 10: Training Loss 0.5830787420272827 | Validation Accuracy 0.7389920353889465
Epoch 20: Training Loss 0.5538305044174194 | Validation Accuracy 0.7389920353889465
Epoch 30: Training Loss 0.5299922823905945 | Validation Accuracy 0.7389920353889465
Epoch 40: Training Loss 0.5055468082427979 | Validation Accuracy 0.7389920353889465
Epoch 50: Training Loss 0.48430687189102173 | Validation Accuracy 0.7389920353889465
Epoch 60: Training Loss 0.4651927649974823 | Validation Accuracy 0.7389920353889465
Epoch 70: Training Loss 0.44780975580215454 | Validation Accuracy 0.7424403429031372
model saved to best_model.pth
Epoch 80: Training Loss 0.4141197204589844 | Validation Accuracy 0.7758620977401733
model saved to best_model.pth
Epoch 90: Training Loss 0.3419046998023987 | Validation Accuracy 0.8435013294219971
model saved to best_model.pth
Epoch 100: Training Loss 0.328065425157

# 5. Final Testing

In [30]:
dgl_G = dgl_G.to(device)
model = GraphNetwork_APPNP(dgl_G, in_dim=128, out_dim=2)
model = model.to(device)
load_checkpoint("best_model.pth", model)
accuracy, pred = test(model, reduced_features, labels, test_mask)
print("Testing Acc {:.4}".format(accuracy))

model loaded from best_model.pth
Testing Acc 0.8615


In [31]:
pred_arr = np.array(pred.cpu())
# np.save("pred_label.npy", pred_arr)

In [22]:
pred_arr.shape

(3770,)

In [28]:
test_nodes[0].shape

(3770,)

In [32]:
pred_df = pd.DataFrame({'node_id': test_nodes[0], 'predicted_label': pred_arr})
pred_df.head()

Unnamed: 0,node_id,predicted_label
0,2,0
1,17,0
2,18,0
3,21,0
4,32,0


In [33]:
pred_df.to_csv("predictions_test_APPNP_feature.csv")