In [2]:
import dgl
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from dgl.data import CoraGraphDataset
import random, math

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')



# Setting Up DGL Dataset

In [3]:
g = None
in_feats = None
h_feats = None
num_classes = None
cora_dataset = None
features = None

def reset_dataset():
    global g, in_feats, h_feats, num_classes, cora_dataset, features

    cora_dataset = CoraGraphDataset()
    g = cora_dataset[0]
    features = g.ndata['feat']

    in_feats = features.shape[1]
    h_feats = 64
    num_classes = cora_dataset.num_classes

reset_dataset()

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.


# Setting Up Saved Model + Ground Truth

In [4]:
class GCN(nn.Module):
    def __init__(self, g, in_feats, h_feats, num_classes):
        super(GCN, self).__init__()
        self.conv1 = dgl.nn.GraphConv(in_feats, h_feats)
        self.conv2 = dgl.nn.GraphConv(h_feats, num_classes)

    def forward(self, g, in_feat):
        h = self.conv1(g, in_feat)
        h = F.relu(h)
        h = self.conv2(g, h)
        return h

In [5]:
model = GCN(g, in_feats, h_feats, num_classes)
model.load_state_dict(torch.load("../model/cora_gt.pt"))
model.eval()

GCN(
  (conv1): GraphConv(in=1433, out=64, normalization=both, activation=None)
  (conv2): GraphConv(in=64, out=7, normalization=both, activation=None)
)

In [6]:
def test(data):
    model.eval()
    out = model(data, features)
    pred = out.argmax(dim=1)

    acc = (pred[data.ndata["test_mask"]] == data.ndata["label"][data.ndata["test_mask"]]).sum().item() / data.ndata["test_mask"].sum().item()
    return acc

In [7]:
def changed_acc(gt, cv):
    print("\n----")
    if gt != cv:
        print(f'The accuracy has changed by {gt - cv:.4f}')
    else:
        print("The accuracy has not changed.")

In [8]:
ground_truth = test(cora_dataset[0])
ground_truth

0.769

# Experiments
**Note:** The ground truth here is $0.769$

In [9]:
homophilic_set = {label: [] for label in set(g.ndata["label"].tolist())}

test_indices = torch.nonzero(g.ndata["test_mask"], as_tuple=False).squeeze()
for i in test_indices:
    homophilic_set[g.ndata["label"][i].item()].append(i.item())

print("This is the dictionary containing each class and its respective elements:\n\t", homophilic_set)

This is the dictionary containing each class and its respective elements:
	 {0: [1713, 1801, 1803, 1838, 1839, 1841, 1843, 1847, 1848, 1850, 1892, 1905, 1909, 1910, 1912, 1913, 1920, 1921, 1922, 1923, 1925, 1926, 1927, 1930, 1981, 1983, 2010, 2016, 2018, 2019, 2021, 2025, 2047, 2049, 2051, 2052, 2053, 2054, 2055, 2056, 2057, 2059, 2068, 2069, 2070, 2072, 2073, 2074, 2089, 2097, 2098, 2099, 2116, 2131, 2133, 2136, 2137, 2155, 2156, 2171, 2172, 2179, 2180, 2181, 2182, 2183, 2186, 2189, 2193, 2194, 2195, 2196, 2197, 2198, 2199, 2200, 2201, 2202, 2203, 2211, 2219, 2231, 2232, 2233, 2234, 2246, 2266, 2267, 2268, 2274, 2277, 2280, 2291, 2293, 2295, 2311, 2312, 2321, 2330, 2332, 2333, 2342, 2347, 2356, 2360, 2364, 2384, 2404, 2408, 2418, 2419, 2424, 2449, 2453, 2469, 2470, 2472, 2499, 2502, 2511, 2533, 2544, 2562, 2575, 2576, 2578, 2580, 2639, 2653, 2702], 1: [1735, 1767, 1768, 1769, 1770, 1771, 1772, 1773, 1775, 1776, 1777, 1778, 1779, 1780, 1781, 1782, 1783, 1784, 1786, 1787, 1788, 1789, 17

### Connect with same class nodes which do not already have edges
Take the first element in a class and add edges to all OTHER elements in that class if they do not currently exist.

In [10]:
reset_dataset()
modified_graph = cora_dataset[0]

for i in range(0, num_classes):
    for j in range(1, len(homophilic_set[i])):
        if not modified_graph.has_edges_between([homophilic_set[i][0]], j):
            modified_graph.add_edges(j, homophilic_set[i][0])
            modified_graph.add_edges(homophilic_set[i][0], j)

changed_acc(ground_truth, test(modified_graph))

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by 0.0020


### Connect same class nodes with one other random nodes (which is is not currently a neighbor to)

In [15]:
reset_dataset()
modified_graph = cora_dataset[0]

for i in range(0, num_classes):
    r_node = random.choice(homophilic_set[i])
    n_node = random.choice(homophilic_set[i])

    while r_node == n_node or modified_graph.has_edges_between(n_node, r_node):
        n_node = random.choice(homophilic_set[i])

    modified_graph.add_edges(r_node, n_node)
    modified_graph.add_edges(n_node, r_node)

changed_acc(ground_truth, test(modified_graph))

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by 0.0020


### Create a dense graph between all nodes with the same class

In [34]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()

for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        for k in range(j + 1, len(homophilic_set[i])):
            j_node = homophilic_set[i][j]
            k_node = homophilic_set[i][k]
            if j_node == k_node or modified_graph.has_edges_between(j_node, k_node):
                continue

            modified_graph.add_edges(j_node, k_node)
            modified_graph.add_edges(k_node, j_node)
            # print("added: ", j_node, " - ", k_node)

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by -0.2310
Change in edges:  91256.0


#### Increase each node's number homophilic edges by a certain threshold?
Let's start with $\lfloor 0.1 \times h_e \rfloor$, where $h_e$ is the number of homophilic edges.
In the current implementation, the nodes which are seen last likely see more nodes added.

In [46]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = modified_graph.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges[1]:
            if (e.item() in homophilic_set[i]):
                same_class_edges.add(e.item())

        for k in range(0, math.floor(len(same_class_edges) * 0.1)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            while b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                c_node = random.choice(homophilic_set[i])

            modified_graph.add_edges(b_node, c_node)
            modified_graph.add_edges(c_node, b_node)
            class_and_added[i].append(len(same_class_edges))

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)
print(class_and_added)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has not changed.
Change in edges:  4.0
{0: [10], 1: [], 2: [], 3: [16, 23, 23], 4: [], 5: [], 6: []}


In [47]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = modified_graph.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges[1]:
            if (e.item() in homophilic_set[i]):
                same_class_edges.add(e.item())

        for k in range(0, math.floor(len(same_class_edges) * 0.15)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            while b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                c_node = random.choice(homophilic_set[i])

            modified_graph.add_edges(b_node, c_node)
            modified_graph.add_edges(c_node, b_node)
            class_and_added[i].append(len(same_class_edges))

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)
print(class_and_added)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by 0.0010
Change in edges:  11.0
{0: [10], 1: [9], 2: [7, 7, 7], 3: [16, 16, 23, 23, 23], 4: [7], 5: [], 6: []}


In [48]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = modified_graph.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges[1]:
            if (e.item() in homophilic_set[i]):
                same_class_edges.add(e.item())

        for k in range(0, math.floor(len(same_class_edges) * 0.20)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            while b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                c_node = random.choice(homophilic_set[i])

            modified_graph.add_edges(b_node, c_node)
            modified_graph.add_edges(c_node, b_node)
            class_and_added[i].append(len(same_class_edges))

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)
print(class_and_added)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by -0.0040
Change in edges:  28.0
{0: [10, 10], 1: [9], 2: [7, 5, 6, 6, 7, 5, 7], 3: [16, 16, 16, 5, 23, 23, 23, 23, 5, 5, 5], 4: [7, 5, 6, 5], 5: [5, 5], 6: [6]}


In [49]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = modified_graph.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges[1]:
            if (e.item() in homophilic_set[i]):
                same_class_edges.add(e.item())

        for k in range(0, math.floor(len(same_class_edges) * 0.25)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            while b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                c_node = random.choice(homophilic_set[i])

            modified_graph.add_edges(b_node, c_node)
            modified_graph.add_edges(c_node, b_node)
            class_and_added[i].append(len(same_class_edges))

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)
print(class_and_added)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by -0.0030
Change in edges:  47.0
{0: [4, 4, 10, 10], 1: [9, 9, 4], 2: [7, 5, 6, 6, 7, 5, 7, 4], 3: [16, 16, 16, 16, 5, 4, 23, 23, 23, 23, 23, 5, 5, 5, 4], 4: [4, 4, 7, 5, 6, 4, 4, 4, 4, 5], 5: [4, 5, 5], 6: [6, 4, 4, 4]}


In [50]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = modified_graph.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges[1]:
            if (e.item() in homophilic_set[i]):
                same_class_edges.add(e.item())

        for k in range(0, math.floor(len(same_class_edges) * 0.30)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            while b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                c_node = random.choice(homophilic_set[i])

            modified_graph.add_edges(b_node, c_node)
            modified_graph.add_edges(c_node, b_node)
            class_and_added[i].append(len(same_class_edges))

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)
print(class_and_added)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by -0.0070
Change in edges:  55.0
{0: [4, 4, 10, 10, 10], 1: [9, 9, 4], 2: [7, 7, 5, 6, 6, 7, 7, 5, 7, 7, 4], 3: [16, 16, 16, 16, 5, 4, 23, 23, 23, 23, 23, 23, 5, 5, 5, 4], 4: [4, 4, 7, 7, 5, 6, 4, 4, 4, 4, 5], 5: [4, 5, 5, 4, 4], 6: [6, 4, 4, 4]}


In [54]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = modified_graph.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges[1]:
            if (e.item() in homophilic_set[i]):
                same_class_edges.add(e.item())

        for k in range(0, math.floor(len(same_class_edges) * 1/3)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            while b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                c_node = random.choice(homophilic_set[i])

            modified_graph.add_edges(b_node, c_node)
            modified_graph.add_edges(c_node, b_node)
            class_and_added[i].append(len(same_class_edges))

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)
print(class_and_added)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by -0.0100
Change in edges:  128.0
{0: [4, 3, 3, 4, 10, 10, 10, 3, 3, 3], 1: [3, 3, 9, 9, 9, 3, 3, 4, 3, 3], 2: [3, 3, 7, 7, 3, 3, 5, 6, 6, 6, 6, 3, 7, 7, 3, 3, 3, 3, 3, 3, 5, 8, 8, 4, 3, 3, 3, 3], 3: [16, 16, 16, 16, 16, 3, 3, 3, 3, 5, 3, 4, 23, 23, 23, 23, 23, 23, 23, 3, 3, 5, 3, 3, 3, 5, 3, 3, 3, 3, 5, 3, 3, 4, 3, 3, 3, 3, 3], 4: [4, 3, 4, 3, 3, 7, 7, 5, 3, 3, 3, 6, 6, 4, 3, 4, 4, 3, 3, 4, 3, 5], 5: [3, 3, 3, 3, 4, 5, 5, 3, 3, 3, 3, 3], 6: [6, 6, 4, 4, 4, 4, 4]}


In [51]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = modified_graph.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges[1]:
            if (e.item() in homophilic_set[i]):
                same_class_edges.add(e.item())

        for k in range(0, math.floor(len(same_class_edges) * 0.35)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            while b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                c_node = random.choice(homophilic_set[i])

            modified_graph.add_edges(b_node, c_node)
            modified_graph.add_edges(c_node, b_node)
            class_and_added[i].append(len(same_class_edges))

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)
print(class_and_added)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by -0.0250
Change in edges:  138.0
{0: [4, 3, 3, 4, 10, 10, 10, 3, 3, 3, 3, 3], 1: [3, 3, 9, 9, 9, 3, 3, 4, 3, 3, 3], 2: [3, 3, 7, 7, 3, 3, 3, 5, 6, 6, 6, 6, 3, 7, 7, 3, 4, 3, 3, 5, 7, 7, 3, 3, 4, 3, 3, 3], 3: [16, 16, 16, 16, 16, 3, 3, 3, 3, 5, 3, 4, 23, 23, 23, 23, 23, 23, 23, 23, 3, 4, 5, 3, 4, 5, 4, 3, 3, 3, 5, 3, 4, 3, 3, 3, 3, 3, 3], 4: [4, 3, 4, 3, 3, 3, 7, 7, 5, 3, 3, 3, 6, 6, 4, 3, 4, 3, 4, 3, 4, 3, 3, 4, 5, 3], 5: [3, 3, 4, 3, 4, 5, 5, 3, 3, 3, 3, 3, 3, 3], 6: [6, 6, 3, 3, 4, 4, 4, 3]}


In [53]:
reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = modified_graph.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges[1]:
            if (e.item() in homophilic_set[i]):
                same_class_edges.add(e.item())

        for k in range(0, math.floor(len(same_class_edges) * 0.5)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            while b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                c_node = random.choice(homophilic_set[i])

            modified_graph.add_edges(b_node, c_node)
            modified_graph.add_edges(c_node, b_node)
            class_and_added[i].append(len(same_class_edges))

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)
print(class_and_added)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by -0.0460
Change in edges:  419.0
{0: [2, 2, 2, 2, 4, 4, 2, 2, 3, 2, 3, 3, 2, 2, 4, 4, 10, 10, 10, 10, 10, 2, 2, 2, 2, 4, 4, 3, 2, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2], 1: [2, 2, 2, 3, 3, 2, 3, 2, 2, 2, 2, 2, 9, 9, 9, 9, 3, 2, 2, 3, 3, 2, 2, 4, 4, 2, 2, 2, 2, 2, 2, 3, 2, 2, 2, 2], 2: [3, 3, 2, 2, 2, 7, 7, 7, 3, 2, 3, 3, 5, 5, 6, 6, 6, 6, 6, 6, 2, 2, 2, 3, 3, 3, 2, 2, 2, 7, 7, 7, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2, 2, 2, 2, 3, 3, 3, 2, 2, 3, 2, 5, 5, 2, 8, 8, 8, 8, 2, 2, 3, 2, 2, 2, 2, 5, 5, 2, 3, 3, 2, 3], 3: [2, 2, 2, 16, 16, 16, 16, 16, 16, 16, 16, 2, 3, 2, 2, 3, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 4, 4, 3, 3, 2, 2, 5, 5, 3, 4, 4, 2, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 3, 4, 4, 2, 2, 2, 2, 2, 2, 2, 5, 5, 3, 2, 3, 3, 5, 5, 3, 3, 2, 2, 2, 2, 2, 4, 4, 2, 3, 4, 4, 2, 2, 2, 3, 

#### Accidental Degree Experiment

In [44]:
from collections import defaultdict

reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()

edge_set = defaultdict(list)
c = 0.1

for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edge_set[modified_graph.out_degrees(j)].append(j)


# note this implementation is likely inefficient
for key in edge_set:
    if key == 0:
        continue
    else:
        for elem in edge_set[key]:
            n = min(int(math.floor(key * c)), len(edge_set[key]))
                
            for i in range(0, n):
                b_node = elem
                c_node = random.choice(edge_set[key])

                if b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                    continue

                modified_graph.add_edges(b_node, c_node)
                modified_graph.add_edges(c_node, b_node)
            

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has not changed.
Change in edges:  11.0


In [45]:
from collections import defaultdict

reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

edge_set = defaultdict(list)
c = 0.15

for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edge_set[modified_graph.out_degrees(j)].append(j)


# note this implementation is likely inefficient
for key in edge_set:
    if key == 0:
        continue
    else:
        for elem in edge_set[key]:
            n = min(int(math.floor(key * c)), len(edge_set[key]))
                
            for i in range(0, n):
                b_node = elem
                c_node = random.choice(edge_set[key])

                if b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                    continue

                modified_graph.add_edges(b_node, c_node)
                modified_graph.add_edges(c_node, b_node)
            

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has not changed.
Change in edges:  41.0


In [46]:
from collections import defaultdict

reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

edge_set = defaultdict(list)
c = 0.2

for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edge_set[modified_graph.out_degrees(j)].append(j)


# note this implementation is likely inefficient
for key in edge_set:
    if key == 0:
        continue
    else:
        for elem in edge_set[key]:
            n = min(int(math.floor(key * c)), len(edge_set[key]))
                
            for i in range(0, n):
                b_node = elem
                c_node = random.choice(edge_set[key])

                if b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                    continue

                modified_graph.add_edges(b_node, c_node)
                modified_graph.add_edges(c_node, b_node)
            

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has not changed.
Change in edges:  186.0


In [47]:
from collections import defaultdict

reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

edge_set = defaultdict(list)
c = 0.25

for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edge_set[modified_graph.out_degrees(j)].append(j)


# note this implementation is likely inefficient
for key in edge_set:
    if key == 0:
        continue
    else:
        for elem in edge_set[key]:
            n = min(int(math.floor(key * c)), len(edge_set[key]))
                
            for i in range(0, n):
                b_node = elem
                c_node = random.choice(edge_set[key])

                if b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                    continue

                modified_graph.add_edges(b_node, c_node)
                modified_graph.add_edges(c_node, b_node)
            

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by 0.0020
Change in edges:  309.0


In [48]:
from collections import defaultdict

reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

edge_set = defaultdict(list)
c = 0.3

for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edge_set[modified_graph.out_degrees(j)].append(j)


# note this implementation is likely inefficient
for key in edge_set:
    if key == 0:
        continue
    else:
        for elem in edge_set[key]:
            n = min(int(math.floor(key * c)), len(edge_set[key]))
                
            for i in range(0, n):
                b_node = elem
                c_node = random.choice(edge_set[key])

                if b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                    continue

                modified_graph.add_edges(b_node, c_node)
                modified_graph.add_edges(c_node, b_node)
            

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by 0.0020
Change in edges:  315.0


In [49]:
from collections import defaultdict

reset_dataset()
modified_graph = cora_dataset[0]
init_edges = modified_graph.number_of_edges()
class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

edge_set = defaultdict(list)
c = 1

for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edge_set[modified_graph.out_degrees(j)].append(j)


# note this implementation is likely inefficient
for key in edge_set:
    if key == 0:
        continue
    else:
        for elem in edge_set[key]:
            n = min(int(math.floor(key * c)), len(edge_set[key]))
                
            for i in range(0, n):
                b_node = elem
                c_node = random.choice(edge_set[key])

                if b_node == c_node or modified_graph.has_edges_between(b_node, c_node):
                    continue

                modified_graph.add_edges(b_node, c_node)
                modified_graph.add_edges(c_node, b_node)
            

changed_acc(ground_truth, test(modified_graph))
print("Change in edges: ", (modified_graph.number_of_edges() - init_edges)/2)

  NumNodes: 2708
  NumEdges: 10556
  NumFeats: 1433
  NumClasses: 7
  NumTrainingSamples: 140
  NumValidationSamples: 500
  NumTestSamples: 1000
Done loading data from cached files.

----
The accuracy has changed by 0.0050
Change in edges:  1605.0
