# Learning closeness centrality using deep learning model and features extracted via the RCDF matrix rep. of nodes

This notebook is completes the discussion in Example 7.3 in the article.

In [1]:
from mygraph import MyGraph
from helpers import *
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F

## Loading the dataset from numpy files

In [2]:
# closeness centrality as the target or label data
target_closeness = np.load("fb_co_pages_closeness.npy", mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII')
target_closeness[0]

array([0.20703618])

In [3]:
# the RCDF matrix as the feature set
"""
ratio = 1.5
starting_length = 1
maximum_length = 35
radius (order) = 4
"""
matrix = np.load("fb_co_pages_RDFC_matrix_r1-5_sta1_max35_rad4.npy", mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII')

In [4]:
target_closeness.shape, matrix.shape

((14113, 1), (14113, 4, 14))

In [5]:
matrix[11]

array([[  0.,   0.,   0.,   1.,   1.,   1.,   1.,   0.,   0.,   0.,   0.,
          0.,   0.,   0.],
       [  2.,   0.,   5.,   8.,   8.,  10.,   8.,   3.,   8.,   1.,   0.,
          0.,   0.,   0.],
       [ 16.,  19.,  67.,  93., 136., 121.,  80.,  35.,  19.,   6.,   1.,
          1.,   1.,   0.],
       [141., 212., 416., 582., 605., 434., 263., 135.,  55.,  21.,   5.,
          4.,   4.,   1.]])

# The p-aggregation of the RCDF matrix

To apply this aggregation we just need to multiply the following row matrix from left to the RCDF matrix:
### $$
\left[ 1, p, p^2, \cdots, p^{(r -1 )} \right],
$$
where $r$ is the order of the RCDF matrix representation (the number of rows).

In [6]:
p = 0.3
order = matrix.shape[1]
parameter_vector = np.zeros(order)
for i in range(order):
    parameter_vector[i] += p**i
parameter_vector

array([1.   , 0.3  , 0.09 , 0.027])

In [7]:
feature_vectors = np.matmul(parameter_vector, matrix)
feature_vectors.shape

(14113, 14)

# Converting the dataset into pytorch tensors

In [8]:
features = torch.from_numpy(feature_vectors)
targets = torch.from_numpy(target_closeness)
features.shape, targets.shape

(torch.Size([14113, 14]), torch.Size([14113, 1]))

In [9]:
# Activating the automatic gradient 
features.requires_grad_(True)
targets.requires_grad_(True)

tensor([[0.2070],
        [0.2044],
        [0.2153],
        ...,
        [0.1421],
        [0.1829],
        [0.1871]], dtype=torch.float64, requires_grad=True)

# Dividing dataset into train and test data

In [10]:
# Shuffling and dividing the indecies
n_samples = features.shape[0]
# n_test = int(0.4331 * n_samples)
n_test = 4113
shuffled_ind = torch.randperm(n_samples)
train_ind = shuffled_ind[:-n_test]
test_ind = shuffled_ind[-n_test:]
# Dividing features and targets into tain and test sets
train_features = features[train_ind]
test_features = features[test_ind]
train_targets = targets[train_ind]
test_targets = targets[test_ind]
train_features.shape, test_features.shape, train_targets.shape, test_targets.shape

(torch.Size([10000, 14]),
 torch.Size([4113, 14]),
 torch.Size([10000, 1]),
 torch.Size([4113, 1]))

## A function for dividing train data into batches

In [11]:
# dividing train_features and train_targets into batches
def next_batch(train_features, train_targets, batch_size=100):
    num_features = train_features.shape[0]
    # Shuffling
    shuffled_ind = torch.randperm(num_features)
    shuffled_train_features = train_features[shuffled_ind]
    shuffled_train_targets = train_targets[shuffled_ind]
    # Dividing
    i = 0
    while i < num_features:
        i += batch_size
        yield (shuffled_train_features[i-batch_size:i], shuffled_train_targets[i-batch_size:i])
    return

## The feedforward neural network model

In [12]:
# The Feedforward Neural Network 
class FFNN_model(nn.Module):
    def __init__(self):
        super().__init__()
        num_features = features.shape[1]
        self.fc1 = nn.Linear(num_features, 64)
        self.fc2 = nn.Linear(64, 8)
        self.fc3 = nn.Linear(8, 1)
        self.dropout1 = nn.Dropout(0.3)
        
    def forward(self, X):
        X = torch.tanh(self.fc1(X))
        X = self.dropout1(X)
        X = torch.relu(self.fc2(X))
        return self.fc3(X)

In [13]:
# Instantiation of the model
torch.manual_seed(42)
model = FFNN_model()
model

FFNN_model(
  (fc1): Linear(in_features=14, out_features=64, bias=True)
  (fc2): Linear(in_features=64, out_features=8, bias=True)
  (fc3): Linear(in_features=8, out_features=1, bias=True)
  (dropout1): Dropout(p=0.3, inplace=False)
)

In [14]:
num_para = 0
for param in model.parameters():
    print(param.numel())
    num_para += param.numel()
print(f'Number of all parameters: {num_para}')

896
64
512
8
8
1
Number of all parameters: 1489


In [15]:
# Objective function and optimizer
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)
# optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)

## Defining the training loop

In [16]:
def training_loop(n_epochs=1000,
                  batch_size=100,
                  optimizer=optimizer, 
                  model=model, 
                  loss_fn=criterion, 
                  train_features=train_features, 
                  test_features=test_features, 
                  train_targets=train_targets, 
                  test_targets=test_targets):
    num_features = train_features.shape[0]
    start_time = time.time()
    all_train_loss, all_test_loss = np.zeros(n_epochs), np.zeros(n_epochs)
    for epoch in range(1, n_epochs +1):
        # Training: 
        epoch_losses = []
        # looping through batches
        for train_features, train_targets in next_batch(train_features=train_features, 
                                                        train_targets=train_targets, batch_size=batch_size): 
            train_preds = model(train_features.float())
            train_loss = loss_fn(train_targets.float(), train_preds.float())
            optimizer.zero_grad()
            train_loss.backward(retain_graph=True)
            optimizer.step()
            
            epoch_losses.append(train_loss.item())
        average_epoch_loss = sum(epoch_losses)/len(epoch_losses)
        
        # Test:
        with torch.no_grad():
            test_preds = model(test_features.float())
            test_loss = loss_fn(test_targets.float(), test_preds.float())
        
        all_train_loss[epoch - 1] = average_epoch_loss
        all_test_loss[epoch - 1] = test_loss.item()
        # Printing the result: 
        if epoch == 1 or epoch % 100 == 0:
            print(f"EPOCH: {epoch:{7}}")
            print(f"MEAN TRAIN LOSS:   {average_epoch_loss:.11f},    TEST LOSS:   {test_loss.item():.11f}")
            print("-----------------------------------------")
    print("The total time = ", np.round(time.time() - start_time, 3), " seconds!")
    return all_train_loss, all_test_loss

# Training the model

In [17]:
losses = training_loop(n_epochs=2000,
                  batch_size=400,
                  optimizer=optimizer, 
                  model=model, 
                  loss_fn=criterion, 
                  train_features=train_features, 
                  test_features=test_features, 
                  train_targets=train_targets, 
                  test_targets=test_targets)

EPOCH:       1
MEAN TRAIN LOSS:   0.03411948673,    TEST LOSS:   0.01795048639
-----------------------------------------
EPOCH:     100
MEAN TRAIN LOSS:   0.00431470387,    TEST LOSS:   0.00393803185
-----------------------------------------
EPOCH:     200
MEAN TRAIN LOSS:   0.00211415370,    TEST LOSS:   0.00224594935
-----------------------------------------
EPOCH:     300
MEAN TRAIN LOSS:   0.00098659063,    TEST LOSS:   0.00109597086
-----------------------------------------
EPOCH:     400
MEAN TRAIN LOSS:   0.00050469639,    TEST LOSS:   0.00058703177
-----------------------------------------
EPOCH:     500
MEAN TRAIN LOSS:   0.00042445681,    TEST LOSS:   0.00040228470
-----------------------------------------
EPOCH:     600
MEAN TRAIN LOSS:   0.00034332447,    TEST LOSS:   0.00032935367
-----------------------------------------
EPOCH:     700
MEAN TRAIN LOSS:   0.00024077919,    TEST LOSS:   0.00023643581
-----------------------------------------
EPOCH:     800
MEAN TRAIN LOSS: 

In [18]:
# Saving the losses as numpy files
np.save("fb_co_pages_train_losses_closeness.npy", losses[0], allow_pickle=False, fix_imports=True)
np.save("fb_co_pages_test_losses_closeness.npy", losses[1], allow_pickle=False, fix_imports=True)

In [19]:
model.eval()
errors = []
for i in range(n_test):
    targ = test_targets[i].item()
    feat = test_features[i].float().view(1,1,14)
    pred = model(feat).item()
    inaccuracy = abs(1 - pred/targ) * 100
    errors.append(inaccuracy)
    if i%100 == 0:
        print(f"target {targ},    prediction: {pred}\nindex {i}:       inaccuracy: {np.round(inaccuracy, 3)}%")
        print("-----------------------------------")

target 0.1919738811046116,    prediction: 0.19004003703594208
index 0:       inaccuracy: 1.007%
-----------------------------------
target 0.16215470882934227,    prediction: 0.1618574857711792
index 100:       inaccuracy: 0.183%
-----------------------------------
target 0.18277898663350947,    prediction: 0.18470200896263123
index 200:       inaccuracy: 1.052%
-----------------------------------
target 0.1730939063879894,    prediction: 0.17585161328315735
index 300:       inaccuracy: 1.593%
-----------------------------------
target 0.22196008115887322,    prediction: 0.2249683439731598
index 400:       inaccuracy: 1.355%
-----------------------------------
target 0.24038019316265521,    prediction: 0.24164657294750214
index 500:       inaccuracy: 0.527%
-----------------------------------
target 0.19727406164814426,    prediction: 0.19582994282245636
index 600:       inaccuracy: 0.732%
-----------------------------------
target 0.1726027397260274,    prediction: 0.17821118235588074

In [20]:
print("Average inaccuracy: ", np.round(sum(errors)/len(errors), 3))

Average inaccuracy:  1.86


In [21]:
torch.save(model, "trained_model_fb_co_closeness.pt")

# Testing the model on the modified graph

In [22]:
# Loading closeness centrality for modified graph as the target set
target_closeness_modified = np.load("fb_co_pages_modified_closeness.npy", mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII')
target_closeness_modified[0]

array([0.20703618])

In [23]:
# Loading the RCDF matrix for modified graph as the feature set
matrix_modified = np.load("fb_co_pages_modified_RDFC_matrix_r1-5_sta1_max35_rad4.npy", mmap_mode=None, allow_pickle=False, fix_imports=True, encoding='ASCII')
matrix_modified.shape

(14113, 4, 14)

In [24]:
feature_vectors_modified = np.matmul(parameter_vector, matrix_modified)
feature_vectors_modified.shape

(14113, 14)

# Converting the dataset into pytorch tensors

In [25]:
features_modified = torch.from_numpy(feature_vectors_modified)
targets_modified = torch.from_numpy(target_closeness_modified)
features_modified.shape, targets_modified.shape

(torch.Size([14113, 14]), torch.Size([14113, 1]))

In [26]:
errors_modified = []
for i in range(targets_modified.shape[0]):
    targ = targets_modified[i].item()
    feat = features_modified[i].float().view(1,1,14)
    pred = model(feat).item()
    inaccuracy = abs(1 - pred/targ) * 100
    errors_modified.append(inaccuracy)
    if i%500 == 0:
        print(f"target {targ},    prediction: {pred}\nindex {i}:       inaccuracy: {np.round(inaccuracy, 3)}%")
        print("-----------------------------------")

target 0.20703617851588862,    prediction: 0.21225489675998688
index 0:       inaccuracy: 2.521%
-----------------------------------
target 0.14359997150794215,    prediction: 0.15926894545555115
index 500:       inaccuracy: 10.912%
-----------------------------------
target 0.20568730049993442,    prediction: 0.20209208130836487
index 1000:       inaccuracy: 1.748%
-----------------------------------
target 0.18397517795218105,    prediction: 0.18510966002941132
index 1500:       inaccuracy: 0.617%
-----------------------------------
target 0.18901687650683097,    prediction: 0.1868850290775299
index 2000:       inaccuracy: 1.128%
-----------------------------------
target 0.14950102760768694,    prediction: 0.1511806696653366
index 2500:       inaccuracy: 1.123%
-----------------------------------
target 0.1795351322468608,    prediction: 0.1767638474702835
index 3000:       inaccuracy: 1.544%
-----------------------------------
target 0.21860767729342875,    prediction: 0.2220205217

In [27]:
print("Average inaccuracy on the modified graph: ", np.round(sum(errors_modified)/len(errors_modified), 3))

Average inaccuracy on the modified graph:  2.195
