In [2]:
# these warnings are fine. you can ignore them.
from util import *
import random, math

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print("Imports finished.")



Imports finished.


# Setting Up Dataset/Model/Ground Truth

In [3]:
dataset = Dataset(root='/tmp/Cora', name='Cora', device=device)
data, in_feats, h_feats, num_classes = dataset.get_data()

model = get_model(in_feats, h_feats, num_classes)

ground_truth = get_ground_truth(model, data)
print(ground_truth)

0.749


# Experiments
**Note:** The ground truth here is $0.749$

In [4]:
homophilic_set = {label: [] for label in set(data.y.tolist())}

test_indices = torch.nonzero(data.test_mask, as_tuple=False).squeeze()
for i in test_indices:
    homophilic_set[data.y[i].item()].append(i.item())

# print("This is the dictionary containing each class and its respective elements:\n\t", homophilic_set)

### Connect with same class nodes which do not already have edges
Take the first element in a class and add edges to all OTHER elements in that class if they do not currently exist.

In [5]:
data = dataset.get_data()[0]
modified_graph = data

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

for i in range(0, num_classes):
    for j in range(1, len(homophilic_set[i])):
        if not G.has_edge(i, j):
            add_edge(G, i, j, undirected=True)

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph)) 
number_added_edges(init_edges, final_edges, is_undirected=True)


----
The accuracy has changed by 0.0030
Change in edges:  974.0  | Percentage change: 18.45%


### Connect same class nodes with one other random nodes (which is is not currently a neighbor to)

In [6]:
data = dataset.get_data()[0]
modified_graph = data

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

for i in range(0, num_classes):
    r_node = random.choice(homophilic_set[i])
    n_node = random.choice(homophilic_set[i])

    while r_node == n_node or G.has_edge(r_node, n_node):
        n_node = random.choice(homophilic_set[i])

    add_edge(G, r_node, n_node, undirected=True)

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph)) 
number_added_edges(init_edges, final_edges, is_undirected=True)


----
The accuracy has changed by 0.0030
Change in edges:  7.0  | Percentage change: 0.13%


### Create a dense graph between all nodes with the same class

In [7]:
data = dataset.get_data()[0]
modified_graph = data

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        for k in range(j + 1, len(homophilic_set[i])):
            j_node = homophilic_set[i][j]
            k_node = homophilic_set[i][k]
            if j_node == k_node or G.has_edge(j_node, k_node):
                continue

            add_edge(G, j_node, k_node, undirected=True)

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)


----
The accuracy has changed by -0.2510
Change in edges:  91256.0


#### Increase each node's number homophilic edges by a certain threshold?
Let's start with $\lfloor 0.1 \times h_e \rfloor$, where $h_e$ is the number of homophilic edges.
In the current implementation, the nodes which are seen last likely see more nodes added.

In [26]:
data = dataset.get_data()[0]
modified_graph = data
c = 0.1

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = G.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges:
            if (e[1] in homophilic_set[i]):
                same_class_edges.add(e[1])

        for k in range(0, math.floor(len(same_class_edges) * c)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            ctr = 0
            while (b_node == c_node or G.has_edge(b_node, c_node)) and ctr <= math.floor(len(same_class_edges) * c):
                c_node = random.choice(homophilic_set[i])
                ctr += 1

            add_edge(G, b_node, c_node, undirected=True)
            # class_and_added[i].append(len(same_class_edges))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)

# print(class_and_added)


----
The accuracy has changed by -0.0010
Change in edges:  4.0


In [28]:
data = dataset.get_data()[0]
modified_graph = data
c = 0.15

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = G.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges:
            if (e[1] in homophilic_set[i]):
                same_class_edges.add(e[1])

        for k in range(0, math.floor(len(same_class_edges) * c)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            ctr = 0
            while (b_node == c_node or G.has_edge(b_node, c_node)) and ctr <= math.floor(len(same_class_edges) * c):
                c_node = random.choice(homophilic_set[i])
                ctr += 1

            add_edge(G, b_node, c_node, undirected=True)
            # class_and_added[i].append(len(same_class_edges))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)

# print(class_and_added)


----
The accuracy has changed by -0.0010
Change in edges:  11.0


In [29]:
data = dataset.get_data()[0]
modified_graph = data
c = 0.20

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = G.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges:
            if (e[1] in homophilic_set[i]):
                same_class_edges.add(e[1])

        for k in range(0, math.floor(len(same_class_edges) * c)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            ctr = 0
            while (b_node == c_node or G.has_edge(b_node, c_node)) and ctr <= math.floor(len(same_class_edges) * c):
                c_node = random.choice(homophilic_set[i])
                ctr += 1

            add_edge(G, b_node, c_node, undirected=True)
            # class_and_added[i].append(len(same_class_edges))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)

# print(class_and_added)


----
The accuracy has changed by -0.0030
Change in edges:  28.0


In [30]:
data = dataset.get_data()[0]
modified_graph = data
c = 0.25

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = G.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges:
            if (e[1] in homophilic_set[i]):
                same_class_edges.add(e[1])

        for k in range(0, math.floor(len(same_class_edges) * c)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            ctr = 0
            while (b_node == c_node or G.has_edge(b_node, c_node)) and ctr <= math.floor(len(same_class_edges) * c):
                c_node = random.choice(homophilic_set[i])
                ctr += 1

            add_edge(G, b_node, c_node, undirected=True)
            # class_and_added[i].append(len(same_class_edges))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)

# print(class_and_added)


----
The accuracy has changed by -0.0070
Change in edges:  47.0


In [31]:
data = dataset.get_data()[0]
modified_graph = data
c = 0.3

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = G.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges:
            if (e[1] in homophilic_set[i]):
                same_class_edges.add(e[1])

        for k in range(0, math.floor(len(same_class_edges) * c)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            ctr = 0
            while (b_node == c_node or G.has_edge(b_node, c_node)) and ctr <= math.floor(len(same_class_edges) * c):
                c_node = random.choice(homophilic_set[i])
                ctr += 1

            add_edge(G, b_node, c_node, undirected=True)
            # class_and_added[i].append(len(same_class_edges))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)

# print(class_and_added)


----
The accuracy has changed by -0.0080
Change in edges:  55.0


In [32]:
data = dataset.get_data()[0]
modified_graph = data
c = 1/3

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = G.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges:
            if (e[1] in homophilic_set[i]):
                same_class_edges.add(e[1])

        for k in range(0, math.floor(len(same_class_edges) * c)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            ctr = 0
            while (b_node == c_node or G.has_edge(b_node, c_node)) and ctr <= math.floor(len(same_class_edges) * c):
                c_node = random.choice(homophilic_set[i])
                ctr += 1

            add_edge(G, b_node, c_node, undirected=True)
            # class_and_added[i].append(len(same_class_edges))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)

# print(class_and_added)


----
The accuracy has changed by -0.0160
Change in edges:  122.0


In [33]:
data = dataset.get_data()[0]
modified_graph = data
c = 0.35

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = G.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges:
            if (e[1] in homophilic_set[i]):
                same_class_edges.add(e[1])

        for k in range(0, math.floor(len(same_class_edges) * c)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            ctr = 0
            while (b_node == c_node or G.has_edge(b_node, c_node)) and ctr <= math.floor(len(same_class_edges) * c):
                c_node = random.choice(homophilic_set[i])
                ctr += 1

            add_edge(G, b_node, c_node, undirected=True)
            # class_and_added[i].append(len(same_class_edges))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)

# print(class_and_added)


----
The accuracy has changed by -0.0240
Change in edges:  133.0


In [34]:
data = dataset.get_data()[0]
modified_graph = data
c = 0.5

init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# class_and_added = {label: [] for label in set(g.ndata["label"].tolist())}

# note this implementation is likely inefficient
for i in range(0, num_classes):
    for j in range(0, len(homophilic_set[i])):
        edges = G.out_edges(homophilic_set[i][j])

        same_class_edges = set()
        for e in edges:
            if (e[1] in homophilic_set[i]):
                same_class_edges.add(e[1])

        for k in range(0, math.floor(len(same_class_edges) * c)):
            b_node = homophilic_set[i][j]
            c_node = random.choice(homophilic_set[i])

            ctr = 0
            while (b_node == c_node or G.has_edge(b_node, c_node)) and ctr <= math.floor(len(same_class_edges) * c):
                c_node = random.choice(homophilic_set[i])
                ctr += 1

            add_edge(G, b_node, c_node, undirected=True)
            # class_and_added[i].append(len(same_class_edges))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)

# print(class_and_added)


----
The accuracy has changed by -0.0480
Change in edges:  443.0
