In [1]:
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Mounted at /content/gdrive


In [2]:
%cd gdrive/MyDrive/ECE\ 232E
!wget https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz
!tar -zxvf cora.tgz

/content/gdrive/MyDrive/ECE 232E
--2022-04-30 17:36:50--  https://linqs-data.soe.ucsc.edu/public/lbc/cora.tgz
Resolving linqs-data.soe.ucsc.edu (linqs-data.soe.ucsc.edu)... 128.114.47.74
Connecting to linqs-data.soe.ucsc.edu (linqs-data.soe.ucsc.edu)|128.114.47.74|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 168052 (164K) [application/x-gzip]
Saving to: ‘cora.tgz.1’


2022-04-30 17:36:50 (967 KB/s) - ‘cora.tgz.1’ saved [168052/168052]

cora/
cora/README
cora/cora.cites
cora/cora.content


In [3]:
# node2vec from https://github.com/aditya-grover/node2vec.git
import numpy as np
import networkx as nx
import random


class Graph():
	def __init__(self, nx_G, is_directed, p, q):
		self.G = nx_G
		self.is_directed = is_directed
		self.p = p
		self.q = q

	def node2vec_walk(self, walk_length, start_node):
		'''
		Simulate a random walk starting from start node.
		'''
		G = self.G
		alias_nodes = self.alias_nodes
		alias_edges = self.alias_edges

		walk = [start_node]

		while len(walk) < walk_length:
			cur = walk[-1]
			cur_nbrs = sorted(G.neighbors(cur))
			if len(cur_nbrs) > 0:
				if len(walk) == 1:
					walk.append(cur_nbrs[alias_draw(alias_nodes[cur][0], alias_nodes[cur][1])])
				else:
					prev = walk[-2]
					next = cur_nbrs[alias_draw(alias_edges[(prev, cur)][0], 
						alias_edges[(prev, cur)][1])]
					walk.append(next)
			else:
				break

		return walk

	def simulate_walks(self, num_walks, walk_length):
		'''
		Repeatedly simulate random walks from each node.
		'''
		G = self.G
		walks = []
		nodes = list(G.nodes())
		print ('Walk iteration:')
		for walk_iter in range(num_walks):
			print (str(walk_iter+1), '/', str(num_walks))
			random.shuffle(nodes)
			for node in nodes:
				walks.append(self.node2vec_walk(walk_length=walk_length, start_node=node))

		return walks

	def get_alias_edge(self, src, dst):
		'''
		Get the alias edge setup lists for a given edge.
		'''
		G = self.G
		p = self.p
		q = self.q

		unnormalized_probs = []
		for dst_nbr in sorted(G.neighbors(dst)):
			if dst_nbr == src:
				unnormalized_probs.append(G[dst][dst_nbr]['weight']/p)
			elif G.has_edge(dst_nbr, src):
				unnormalized_probs.append(G[dst][dst_nbr]['weight'])
			else:
				unnormalized_probs.append(G[dst][dst_nbr]['weight']/q)
		norm_const = sum(unnormalized_probs)
		normalized_probs =  [float(u_prob)/norm_const for u_prob in unnormalized_probs]

		return alias_setup(normalized_probs)

	def preprocess_transition_probs(self):
		'''
		Preprocessing of transition probabilities for guiding the random walks.
		'''
		G = self.G
		is_directed = self.is_directed

		alias_nodes = {}
		for node in G.nodes():
			unnormalized_probs = [G[node][nbr]['weight'] for nbr in sorted(G.neighbors(node))]
			norm_const = sum(unnormalized_probs)
			normalized_probs =  [float(u_prob)/norm_const for u_prob in unnormalized_probs]
			alias_nodes[node] = alias_setup(normalized_probs)

		alias_edges = {}
		triads = {}

		if is_directed:
			for edge in G.edges():
				alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
		else:
			for edge in G.edges():
				alias_edges[edge] = self.get_alias_edge(edge[0], edge[1])
				alias_edges[(edge[1], edge[0])] = self.get_alias_edge(edge[1], edge[0])

		self.alias_nodes = alias_nodes
		self.alias_edges = alias_edges

		return


def alias_setup(probs):
	'''
	Compute utility lists for non-uniform sampling from discrete distributions.
	Refer to https://hips.seas.harvard.edu/blog/2013/03/03/the-alias-method-efficient-sampling-with-many-discrete-outcomes/
	for details
	'''
	K = len(probs)
	q = np.zeros(K)
	J = np.zeros(K, dtype=np.int)

	smaller = []
	larger = []
	for kk, prob in enumerate(probs):
	    q[kk] = K*prob
	    if q[kk] < 1.0:
	        smaller.append(kk)
	    else:
	        larger.append(kk)

	while len(smaller) > 0 and len(larger) > 0:
	    small = smaller.pop()
	    large = larger.pop()

	    J[small] = large
	    q[large] = q[large] + q[small] - 1.0
	    if q[large] < 1.0:
	        smaller.append(large)
	    else:
	        larger.append(large)

	return J, q

def alias_draw(J, q):
	'''
	Draw sample from a non-uniform discrete distribution using alias sampling.
	'''
	K = len(J)

	kk = int(np.floor(np.random.rand()*K))
	if np.random.rand() < q[kk]:
	    return kk
	else:
	    return J[kk]


In [4]:
# main function https://github.com/aditya-grover/node2vec.git
'''
Reference implementation of node2vec. 

Author: Aditya Grover

For more details, refer to the paper:
node2vec: Scalable Feature Learning for Networks
Aditya Grover and Jure Leskovec 
Knowledge Discovery and Data Mining (KDD), 2016
'''
import numpy as np
import networkx as nx
from gensim.models import Word2Vec

def read_graph():
	'''
	Reads the input network in networkx.
	'''
	G = nx.read_edgelist('./cora/cora.cites', nodetype=int, create_using=nx.DiGraph())
	for edge in G.edges():
		G[edge[0]][edge[1]]['weight'] = 1
	G = G.to_undirected()
	return G

def learn_embeddings(walks, featureSize):
	'''
	Learn embeddings by optimizing the Skipgram objective using SGD.
	'''
	walks = [list(map(str, walk)) for walk in walks]
	model = Word2Vec(walks, size=featureSize, window=10, min_count=0, sg=1, workers=8, iter=1)
	return model

In [9]:
# build model
import torch
import torch.nn as nn
import torch.nn.functional as F

class Model(nn.Module):
  def __init__(self, featureSize):
    super().__init__()

    self.dense1 = nn.Linear(featureSize, 16)
    self.dense2 = nn.Linear(16, 7)
    self.activation = nn.ReLU()

  def forward(self, x):
    x = self.dense1(x)
    x = self.activation(x)
    x = self.dense2(x)
    return x

In [10]:
# dataset
class  CoraDataset(torch.utils.data.Dataset):
  def __init__(self, train_x, train_y):
    self.train_x = train_x
    self.train_y = train_y

  def __len__(self):
    return self.train_x.shape[0]

  def __getitem__(self, idx):
    return self.train_x[idx], self.train_y[idx]

In [50]:
def train(trainloader, featureSize, epochs):
  from torchsummary import summary
  model = Model(featureSize=featureSize)
  model.cuda()
  summary(model, (featureSize,))
  optimizer = torch.optim.Adam(model.parameters())
  lossfn = torch.nn.CrossEntropyLoss()
  for epoch in range(epochs):
    running_loss = 0.0
    for i, data in enumerate(trainloader):
      inputs, labels = data

      # zero the parameter gradients
      optimizer.zero_grad()

      # forward + backward + optimize
      outputs = model(inputs)
      loss = lossfn(outputs, labels)
      loss.backward()
      optimizer.step()

      # print statistics
      running_loss += loss.item()
      if i % 10 == 9:
          print(f'[{epoch + 1}, {i + 1:5d}] loss: {running_loss / 10:.3f}')
          running_loss = 0.0
  
  return model

In [12]:
def test(trainloader, testloader, model, numTrain, numTest):

  correct = 0
  total = 0
  # since we're not training, we don't need to calculate the gradients for our outputs
  with torch.no_grad():
      for data in trainloader:
          inputs, labels = data
          # calculate outputs by running images through the network
          outputs = model(inputs)
          # the class with the highest energy is what we choose as prediction
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

  print(f'Accuracy of the network on the {numTrain} train data: {100 * correct // total} %')

  correct = 0
  total = 0
  # since we're not training, we don't need to calculate the gradients for our outputs
  with torch.no_grad():
      for data in testloader:
          inputs, labels = data
          # calculate outputs by running images through the network
          outputs = model(inputs)
          # the class with the highest energy is what we choose as prediction
          _, predicted = torch.max(outputs.data, 1)
          total += labels.size(0)
          correct += (predicted == labels).sum().item()

  print(f'Accuracy of the network on the {numTest} test data: {100 * correct // total} %')

In [51]:
def getTrainTestLoader(data_x, data_y):
  id_dict = dict()
  shuffle = dict()
  for i in range(7):
    id_dict[i] = data_x[data_y==i]
    shuffle[i] = np.random.permutation(id_dict[i].shape[0])

  train_x = id_dict[0][shuffle[0][:20]]
  train_y = np.zeros(20) + 0
  test_x = id_dict[0][shuffle[0][20:]]
  test_y = np.zeros(shuffle[0][20:].shape[0]) + 0
  for i in range(1, 7):
    train_x = np.vstack((train_x, id_dict[i][shuffle[i][:20]]))
    train_y = np.hstack((train_y, np.zeros(20)+i))
    test_x = np.vstack((test_x, id_dict[i][shuffle[i][20:]]))
    test_y = np.hstack((test_y, np.zeros(shuffle[i][20:].shape[0])+i))

  train_x = torch.from_numpy(train_x).to(torch.float32).cuda()
  train_y = torch.from_numpy(train_y).to(torch.int64).cuda()
  test_x = torch.from_numpy(test_x).to(torch.float32).cuda()
  test_y = torch.from_numpy(test_y).to(torch.int64).cuda()
  
  print(train_x.shape)
  print(train_y.shape)  
  print(test_x.shape)
  print(test_y.shape)

  return torch.utils.data.DataLoader(CoraDataset(train_x, train_y), 4, shuffle=True), torch.utils.data.DataLoader(CoraDataset(test_x, test_y), 32, shuffle=False), np.zeros(train_x.shape[0]), np.zeros(test_x.shape[0])

In [24]:
def node2vecmode(p, q, featureSize):
  nx_G = read_graph()
  G = Graph(nx_G, False, p=p, q=q)
  G.preprocess_transition_probs()
  walks = G.simulate_walks(10, 80)
  model = learn_embeddings(walks, featureSize)

  # prepare training data
  # read training nodes
  import torch
  import torch.nn.functional as F
  import numpy as np
  label2id = {
      'Case_Based' : 0,
      'Genetic_Algorithms' : 1,
      'Neural_Networks' : 2,
      'Probabilistic_Methods' : 3,
      'Reinforcement_Learning' : 4,
      'Rule_Learning' : 5,
      'Theory' : 6,
  }
  data_x = np.zeros((2708, featureSize))
  data_y = np.zeros(2708)
  count = 0
  with open('./cora/cora.content', 'r') as f:
    for line in f:
      line = line.strip().split()
      data_x[count] = model.wv[line[0]]
      data_y[count] = label2id[line[-1]]
      count += 1
  
  trainloader, testloader, trainid, testid = getTrainTestLoader(data_x, data_y)

  model = train(trainloader, featureSize, 40)
  test(trainloader, testloader, model, trainid.shape[0], testid.shape[0])

In [25]:
def textfeaturemode():
  featureSize = 1433
  import torch
  import torch.nn.functional as F
  import numpy as np
  label2id = {
      'Case_Based' : 0,
      'Genetic_Algorithms' : 1,
      'Neural_Networks' : 2,
      'Probabilistic_Methods' : 3,
      'Reinforcement_Learning' : 4,
      'Rule_Learning' : 5,
      'Theory' : 6,
  }
  data_x = np.zeros((2708, featureSize))
  data_y = np.zeros(2708)
  count = 0
  with open('./cora/cora.content', 'r') as f:
    for line in f:
      line = line.strip().split()
      data_x[count] = np.asarray(list(map(int, line[1:-1])))
      data_y[count] = label2id[line[-1]]
      count += 1
  
  trainloader, testloader, trainid, testid = getTrainTestLoader(data_x, data_y)

  model = train(trainloader, featureSize, 20)
  test(trainloader, testloader, model, trainid.shape[0], testid.shape[0])

In [55]:
def combinemode(p, q, featureSize):
  nx_G = read_graph()
  G = Graph(nx_G, False, p=p, q=q)
  G.preprocess_transition_probs()
  walks = G.simulate_walks(10, 80)
  model = learn_embeddings(walks, featureSize)

  # prepare training data
  # read training nodes
  import torch
  import torch.nn.functional as F
  import numpy as np
  label2id = {
      'Case_Based' : 0,
      'Genetic_Algorithms' : 1,
      'Neural_Networks' : 2,
      'Probabilistic_Methods' : 3,
      'Reinforcement_Learning' : 4,
      'Rule_Learning' : 5,
      'Theory' : 6,
  }
  data_x = np.zeros((2708, featureSize+1433))
  data_y = np.zeros(2708)
  count = 0
  with open('./cora/cora.content', 'r') as f:
    for line in f:
      line = line.strip().split()
      data_x[count][:featureSize] = model.wv[line[0]]
      data_x[count][featureSize:] = np.asarray(list(map(int, line[1:-1])))
      data_y[count] = label2id[line[-1]]
      count += 1
  
  trainloader, testloader, trainid, testid = getTrainTestLoader(data_x, data_y)

  model = train(trainloader, featureSize+1433, 40)
  test(trainloader, testloader, model, trainid.shape[0], testid.shape[0])

In [57]:
node2vecmode(p=1., q=1., featureSize=128)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
torch.Size([140, 128])
torch.Size([140])
torch.Size([2568, 128])
torch.Size([2568])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 16]           2,064
              ReLU-2                   [-1, 16]               0
            Linear-3                    [-1, 7]             119
Total params: 2,183
Trainable params: 2,183
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.00
Forward/backward pass size (MB): 0.00
Params size (MB): 0.01
Estimated Total Size (MB): 0.01
----------------------------------------------------------------
[1,    10] loss: 1.881
[1,    20] loss: 1.977
[1,    30] loss: 1.952
[2,    10] loss: 1.828
[2,    20] loss: 1.891
[2,    30] loss: 1.847
[3,    10] loss: 1.795
[3,    20] loss: 1.726
[3,    30] 

In [52]:
textfeaturemode()

torch.Size([140, 1433])
torch.Size([140])
torch.Size([2568, 1433])
torch.Size([2568])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 16]          22,944
              ReLU-2                   [-1, 16]               0
            Linear-3                    [-1, 7]             119
Total params: 23,063
Trainable params: 23,063
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.00
Params size (MB): 0.09
Estimated Total Size (MB): 0.09
----------------------------------------------------------------
[1,    10] loss: 1.981
[1,    20] loss: 1.939
[1,    30] loss: 1.950
[2,    10] loss: 1.908
[2,    20] loss: 1.889
[2,    30] loss: 1.877
[3,    10] loss: 1.811
[3,    20] loss: 1.837
[3,    30] loss: 1.738
[4,    10] loss: 1.652
[4,    20] loss: 1.626
[4,    30] loss: 1.607
[5

In [56]:
combinemode(p=1., q=1., featureSize=128)

Deprecated in NumPy 1.20; for more details and guidance: https://numpy.org/devdocs/release/1.20.0-notes.html#deprecations


Walk iteration:
1 / 10
2 / 10
3 / 10
4 / 10
5 / 10
6 / 10
7 / 10
8 / 10
9 / 10
10 / 10
torch.Size([140, 1561])
torch.Size([140])
torch.Size([2568, 1561])
torch.Size([2568])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1                   [-1, 16]          24,992
              ReLU-2                   [-1, 16]               0
            Linear-3                    [-1, 7]             119
Total params: 25,111
Trainable params: 25,111
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.01
Forward/backward pass size (MB): 0.00
Params size (MB): 0.10
Estimated Total Size (MB): 0.10
----------------------------------------------------------------
[1,    10] loss: 1.958
[1,    20] loss: 1.937
[1,    30] loss: 1.919
[2,    10] loss: 1.765
[2,    20] loss: 1.785
[2,    30] loss: 1.760
[3,    10] loss: 1.582
[3,    20] loss: 1.541
[3,    