In [2]:
# these warnings are fine. you can ignore them.
import random, math

import sys
sys.path.append('../')
from util import *

# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print("Imports finished.")

Imports finished.


# Setting Up Dataset/Model/Ground Truth

In [3]:
# Load CiteSeer (stored under /tmp/CiteSeer) onto the selected device.
# `Dataset`, `get_model` and `get_ground_truth` presumably come from the
# `from util import *` above — verify against util.
dataset = Dataset(root='/tmp/CiteSeer', name='CiteSeer', device=device)
# get_data() yields the graph plus the three sizes handed to get_model() below.
data, in_feats, h_feats, num_classes = dataset.get_data()

# The 'citeseer' tag selects the model variant (presumably pre-trained weights — TODO confirm).
model = get_model(in_feats, h_feats, num_classes, 'citeseer')

# Baseline score on the unmodified graph; every experiment below passes it to
# output_accuracy_change() together with the score on the modified graph.
ground_truth = get_ground_truth(model, data)
print(ground_truth)

0.537


# Experiments
**Note:** The ground truth here is $0.537$ (the value printed by the setup cell above).

Experiment ideas:
- What happens if we turn a connected component (all in a certain class) into a clique?
- What happens if we turn a connected component (all in a certain class) with a certain density into a clique? 
- What happens if we increase the density of a connected component (all in a certain class) by a certain threshold?
- What happens if we turn a connected component (across classes) into a clique?

In [4]:
# Bucket the test-set node indices by class label: {label: [node index, ...]}.
# Every label occurring in data.y gets a (possibly empty) bucket.
node_labels = data.y.tolist()
homophilic_set = {label: [] for label in set(node_labels)}

# flatten() keeps the result 1-D even when exactly one node is in the test
# mask; squeeze() would collapse that case to a 0-d tensor, which cannot be
# iterated over.
test_indices = torch.nonzero(data.test_mask, as_tuple=False).flatten()
for node_index in test_indices.tolist():
    homophilic_set[node_labels[node_index]].append(node_index)

# print("This is the dictionary containing each class and its respective vertices:\n\t", homophilic_set)

### Connected component -> Clique
What happens if we turn a connected component (all in a certain class) into a clique?

In [5]:
# Experiment: every strongly connected component whose nodes all fall into a
# single class bucket of `homophilic_set` is turned into a clique.
# NOTE(review): `homophilic_set` is filled from test-mask nodes only, so a
# component qualifies only when all of its nodes are test nodes of one class.
data = dataset.get_data()[0]
modified_graph = data
init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# Materialise the components (largest first) before the graph is modified.
scc_list = sorted(nx.strongly_connected_components(G), key=len, reverse=True)
for component in scc_list:
    members = set(component)
    for class_id in range(num_classes):
        # The component lies entirely inside this class bucket.
        if members.issubset(homophilic_set[class_id]):
            make_clique(G, members)

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

# Report the accuracy delta against the baseline and the number of added edges.
output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)


----
The accuracy has not changed.
Change in edges:  1.0  | Percentage change: 0.02%


### Connected component -> Clique
What happens if we turn a connected component (all in a certain class) whose density is greater than $0.2$ into a clique?

In [6]:
# Experiment: as the single-class clique experiment, but a component is only
# turned into a clique when its density is above `min_density`.
min_density = 0.2

data = dataset.get_data()[0]
modified_graph = data
init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# Materialise the components (largest first) before the graph is modified.
scc_list = sorted(nx.strongly_connected_components(G), key=len, reverse=True)
for component in scc_list:
    members = set(component)
    for class_id in range(num_classes):
        # Class membership is checked first; density is only computed for
        # components that lie entirely inside this class bucket.
        if not members.issubset(homophilic_set[class_id]):
            continue
        if nx.density(G.subgraph(list(members))) > min_density:
            make_clique(G, members)

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

# Report the accuracy delta against the baseline and the number of added edges.
output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)


----
The accuracy has not changed.
Change in edges:  1.0  | Percentage change: 0.02%


### Connected component -> Clique
What happens if we turn a connected component (all in a certain class) into a clique?

**For this experiment, I will IGNORE other classes (meaning, I will make a subgraph of the specific class first).**

In [7]:
# Experiment: restrict the graph to one class bucket at a time, find the
# strongly connected components of that class-only subgraph, and turn every
# component with more than one node into a clique in the full graph.
data = dataset.get_data()[0]
modified_graph = data
init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

for class_id in range(len(homophilic_set)):
    class_subgraph = G.subgraph(homophilic_set[class_id])
    # Materialise the components (largest first) before G is modified.
    class_components = sorted(
        nx.strongly_connected_components(class_subgraph), key=len, reverse=True
    )
    for component in class_components:
        members = set(component)
        if len(members) <= 1:
            # Single nodes are skipped.
            continue
        make_clique(G, members)

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

# Report the accuracy delta against the baseline and the number of added edges.
output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)


----
The accuracy has changed by 0.0180
Change in edges:  823.0  | Percentage change: 18.08%


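### Connected component -> Clique
What happens if we turn a connected component (all in a certain class) whose density is at least *c* into a clique?

**For this experiment, I will IGNORE other classes (meaning, I will make a subgraph of the specific class first).**

The following values for *c* were tested: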
- 0.05
- 0.1
- 0.15
- 0.2
- 0.25
- 0.3
- 0.35
- 0.4
- 0.45
- 0.50
- 0.55
- 0.60
- 0.65
- 0.7
- 0.75
- 0.80
- 0.85

In [8]:
# Minimum densities to sweep: a class-only component becomes a clique only
# when its density is at least the current value of c.
c_values = [0.05, 0.1, 0.15, 0.2, 0.25, 0.3, 0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65, 0.70, 0.75, 0.80, 0.85]

for c in c_values:
    # Re-fetch and re-convert the graph for every c so runs start from the
    # dataset's graph rather than the previous run's modified one.
    data = dataset.get_data()[0]
    modified_graph = data
    init_edges = len(modified_graph.edge_index[1])

    G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

    for class_id in range(len(homophilic_set)):
        class_subgraph = G.subgraph(homophilic_set[class_id])
        # Materialise the components (largest first) before G is modified.
        class_components = sorted(
            nx.strongly_connected_components(class_subgraph), key=len, reverse=True
        )
        for component in class_components:
            members = set(component)
            if len(members) <= 1:
                continue
            if nx.density(G.subgraph(component)) >= c:
                make_clique(G, members)

    modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
    final_edges = len(modified_graph.edge_index[1])

    # Report the accuracy delta, the number of added edges, and the c in use.
    output_accuracy_change(ground_truth, test_model(model, modified_graph))
    number_added_edges(init_edges, final_edges, is_undirected=True)
    print("For c value:", c)


----
The accuracy has changed by 0.0180
Change in edges:  823.0  | Percentage change: 18.08%
For c value: 0.05

----
The accuracy has changed by 0.0150
Change in edges:  504.0  | Percentage change: 11.07%
For c value: 0.1

----
The accuracy has changed by 0.0040
Change in edges:  278.0  | Percentage change: 6.11%
For c value: 0.15

----
The accuracy has changed by 0.0040
Change in edges:  215.0  | Percentage change: 4.72%
For c value: 0.2

----
The accuracy has changed by 0.0030
Change in edges:  136.0  | Percentage change: 2.99%
For c value: 0.25

----
The accuracy has changed by 0.0040
Change in edges:  115.0  | Percentage change: 2.53%
For c value: 0.3

----
The accuracy has changed by 0.0030
Change in edges:  75.0  | Percentage change: 1.65%
For c value: 0.35

----
The accuracy has changed by 0.0030
Change in edges:  75.0  | Percentage change: 1.65%
For c value: 0.4

----
The accuracy has changed by 0.0030
Change in edges:  54.0  | Percentage change: 1.19%
For c value: 0.45

----


### Connected component -> Denser connected component
What happens if we increase the density of a connected component (all in a certain class) by a multiplicative factor *c* (with the target density capped at $1$)?

In [9]:
# Multipliers applied to each component's current density: a component with
# density d is densified until it reaches min(c * d, 1).
c_values = [1.05, 1.1, 1.15, 1.2, 1.25, 1.3, 1.35, 1.4, 1.45, 1.5, 1.55, 1.6, 1.65, 1.70, 1.75, 1.80, 1.85, 1.9, 1.95, 2, 3, 4, 5, 6, 7, 8, 9, 10, 100]
# strange...it starts to plateau after 5-6
# definitely something, I'll have to think on what...
# NOTE(review): increase_density() caps its target at 1, so once c * d >= 1 a
# component is already a full clique and a larger c cannot add more edges to
# it. That would explain the plateau if most components have d >= ~0.2
# (unverified — check the component densities).


def increase_density(G, s, threshold):
    """Add random edges inside the node set ``s`` until it is dense enough.

    Randomly chosen node pairs from ``s`` are connected until
    ``nx.density(G.subgraph(s))`` reaches ``min(threshold, 1)``.

    Args:
        G: graph to modify in place (assumed to be the NetworkX graph returned
            by ``convert_to_networkx`` — TODO confirm).
        s: collection of nodes whose induced subgraph is densified.
        threshold: target density; values above 1 are treated as 1.

    Returns:
        None. ``G`` is modified in place.
    """
    from itertools import combinations

    target = min(threshold, 1)
    # Work on a list: random.sample() no longer accepts sets on Python >= 3.11.
    nodes = list(s)

    # Only pairs that still lack an edge in at least one direction can raise
    # the density. Visiting each such pair once, in random order, avoids
    # re-drawing pairs that are already connected and guarantees termination
    # (also for node sets with fewer than two nodes, which have no pairs).
    candidates = [
        (u, v)
        for u, v in combinations(nodes, 2)
        if not (G.has_edge(u, v) and G.has_edge(v, u))
    ]
    random.shuffle(candidates)

    for u, v in candidates:
        if nx.density(G.subgraph(nodes)) >= target:
            break
        # presumably inserts both u->v and v->u; verify against util.add_edge
        add_edge(G, u, v, undirected=True)

# increase_density() picks edges at random; seed the RNG once so that
# re-running this cell reproduces the same modified graphs and numbers.
random.seed(0)

for c in c_values:
    # Re-fetch and re-convert the graph for every c so runs start from the
    # dataset's graph rather than the previous run's modified one.
    data = dataset.get_data()[0]
    modified_graph = data

    init_edges = len(modified_graph.edge_index[1])

    G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

    for j in range(0, len(homophilic_set)):
        new_G = G.subgraph(homophilic_set[j])
        # Materialise the components (largest first) before G is modified.
        cc = sorted(nx.strongly_connected_components(new_G), key=len, reverse=True)
        for i in cc:
            s = set(i)
            if len(s) > 1:
                # Target density = c times the component's current density
                # (increase_density caps the target at 1).
                current_density = nx.density(G.subgraph(s))
                increase_density(G, s, c * current_density)

    modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
    final_edges = len(modified_graph.edge_index[1])

    # Report the accuracy delta, the number of added edges, and the c in use.
    output_accuracy_change(ground_truth, test_model(model, modified_graph))
    number_added_edges(init_edges, final_edges, is_undirected=True)
    print("For c value:", c)


----
The accuracy has changed by 0.0030
Change in edges:  43.0  | Percentage change: 0.94%
For c value: 1.05

----
The accuracy has changed by 0.0040
Change in edges:  48.0  | Percentage change: 1.05%
For c value: 1.1

----
The accuracy has changed by 0.0030
Change in edges:  55.0  | Percentage change: 1.21%
For c value: 1.15

----
The accuracy has changed by 0.0010
Change in edges:  61.0  | Percentage change: 1.34%
For c value: 1.2

----
The accuracy has changed by 0.0040
Change in edges:  69.0  | Percentage change: 1.52%
For c value: 1.25

----
The accuracy has changed by 0.0040
Change in edges:  80.0  | Percentage change: 1.76%
For c value: 1.3

----
The accuracy has changed by 0.0060
Change in edges:  96.0  | Percentage change: 2.11%
For c value: 1.35

----
The accuracy has changed by 0.0020
Change in edges:  99.0  | Percentage change: 2.17%
For c value: 1.4

----
The accuracy has changed by 0.0070
Change in edges:  111.0  | Percentage change: 2.44%
For c value: 1.45

----
The acc

## Connected-Component -> Clique
Here, I'm cliquing the ENTIRE graph (i.e. all connected components are now becoming cliques).
This is across classes (breaking the homophily).

In [10]:
# Experiment: turn every strongly connected component of the whole graph into
# a clique, regardless of the classes of its nodes.
data = dataset.get_data()[0]
modified_graph = data
init_edges = len(modified_graph.edge_index[1])

G, x, y, train_mask, test_mask = convert_to_networkx(modified_graph)

# Materialise the components (largest first) before the graph is modified.
scc_list = sorted(nx.strongly_connected_components(G), key=len, reverse=True)
for component in scc_list:
    make_clique(G, set(component))

modified_graph = convert_to_pyg(G, x, y, train_mask, test_mask)
final_edges = len(modified_graph.edge_index[1])

# Report the accuracy delta against the baseline and the number of added edges.
output_accuracy_change(ground_truth, test_model(model, modified_graph))
number_added_edges(init_edges, final_edges, is_undirected=True)


----
The accuracy has changed by -0.2680
Change in edges:  2243611.0  | Percentage change: 49288.47%
