In [1]:
import networkx as nx
import numpy as np
import matplotlib.pyplot as plt
import random
import pandas as pd
import math 
import time

In [2]:
# Tunable parameters for the walk simulation.
# NUM_RANDOM_WALKS is the budget each start vertex is seeded with in the driver cell.
NUM_RANDOM_WALKS = 80
# EPSILON scales a neighbour's budget into the number of walks recorded as ending there
# (floor(budget * EPSILON)); it is also the threshold compared with random.random()
# when a single walk is left.
EPSILON = 0.15

In [29]:
# Input edge-list files.
filepath = "input/BlogCatalog-edgelist.txt"
filepath_Youtube = "input/youtube.txt"

# Destination handed to WriteToFile by the driver cell.
# NOTE(review): the file name mentions BlogCatalog even when the YouTube edge list is
# the one being loaded — confirm this is the intended output path.
embeddingsiterative = "output/BlogCatalog-edgelist.txt.embeddings-iterative"

In [4]:
def parseEdgeList(graph_file, delimiter=" ", weighted=False, direction="undirected"):
    """Load a graph from an edge-list file and report its size.

    Parameters
    ----------
    graph_file : str or file handle
        Anything ``nx.read_edgelist`` accepts as its input.
    delimiter : str
        Separator between the fields of each line.
    weighted : bool
        When true, node labels are converted to ``int`` and the extra column
        is stored as a ``float`` in the ``weight`` edge attribute. Otherwise
        no conversion is requested, so node labels stay strings.
    direction : str
        ``"undirected"`` builds an ``nx.Graph``; any other value builds an
        ``nx.DiGraph`` so the edge orientation found in the file is kept.

    Returns
    -------
    The populated networkx graph.
    """
    # read_edgelist builds an undirected nx.Graph unless told otherwise, so the
    # container is chosen up front. This makes `direction` effective and avoids
    # copying an already-undirected graph through to_undirected().
    if direction == "undirected":
        container = nx.Graph()
    else:
        container = nx.DiGraph()

    if weighted:
        loaded = nx.read_edgelist(
            graph_file,
            delimiter=delimiter,
            create_using=container,
            nodetype=int,
            data=(('weight', float),),
        )
    else:
        loaded = nx.read_edgelist(graph_file, delimiter=delimiter, create_using=container)

    print(loaded.number_of_nodes(), loaded.number_of_edges(), " loaded from ", graph_file)
    return loaded

In [5]:
def getAdjNPList(graph):
    """Build a dict mapping every vertex to a shuffled array of its neighbours.

    Parameters
    ----------
    graph : iterable of vertices exposing ``neighbors(vertex)``
        The graph whose adjacency is extracted (e.g. a networkx graph).

    Returns
    -------
    dict
        ``{vertex: numpy array of neighbours}``; each array is shuffled in
        place with ``np.random.shuffle``.
    """
    adjdict = {}
    for vertex in graph:
        # Read neighbours from the graph passed in, not from a notebook-level
        # global, so the function works for any graph object.
        neighbours = np.array(list(graph.neighbors(vertex)))
        np.random.shuffle(neighbours)
        adjdict[vertex] = neighbours
    return adjdict

def getNodeContextSets(graph):
    """Return a dict that maps each node of ``graph`` to its own empty list."""
    # A comprehension creates a distinct list object per node.
    return {node: [] for node in graph}

In [30]:
# Load the graph to process. The YouTube edge list is active; uncomment the next
# line (and comment out the one after it) to run on BlogCatalog instead.
#G = parseEdgeList(filepath)
G = parseEdgeList(filepath_Youtube)

1134890 2987624  loaded from  input/youtube.txt


In [7]:
# G = nx.Graph()
# #Small example
# G.add_nodes_from(["A","B","C","D","E","F","G","H","I","J","K","L"])
# # G.add_nodes_from(["A","B","C","D","E"])
# G.add_edge("A", "D"),G.add_edge("A", "E"),G.add_edge("A", "I"),G.add_edge("A", "K")
# G.add_edge("B", "D"),G.add_edge("B", "C"),G.add_edge("B", "L"),G.add_edge("B", "K")
# G.add_edge("C", "D"),G.add_edge("D", "H"),G.add_edge("D", "G"),G.add_edge("D", "E")
# G.add_edge("E", "F"),G.add_edge("F", "G"),G.add_edge("I", "E"),G.add_edge("I", "J"),G.add_edge("K", "L")
# # Draw graph
# nx.draw(G, with_labels = True)
# plt.show()

In [8]:
def getPerNodeBudget(numNodes, budget):
    """Return the floor of ``budget / numNodes`` as an integer.

    Parameters
    ----------
    numNodes : int
        Number of nodes the budget is divided among.
    budget : int
        Total budget to divide.

    Returns
    -------
    int
        The whole number of budget units each node receives.

    Raises
    ------
    ZeroDivisionError
        If ``numNodes`` is 0.
    """
    if isinstance(budget, int) and isinstance(numNodes, int):
        # Exact integer floor division: no round trip through float, so large
        # budgets do not lose precision.
        return budget // numNodes
    # Non-integer input keeps the float-based computation.
    return math.floor(budget / numNodes)

In [9]:
def updateSetCount(curent_node, context_pair, context_budget):
    """Append ``context_pair`` to the global ``all_sets[curent_node]`` ``context_budget`` times."""
    repeated = [context_pair for _ in range(context_budget)]
    # Only touch all_sets when there is something to add.
    if repeated:
        all_sets[curent_node].extend(repeated)

In [10]:
def countSum(contextPairs):
    """Print the sum of all values stored in the ``contextPairs`` dict."""
    total = sum(contextPairs.values())
    print("Total value sums up to: ", total)

In [11]:
def countListSum(all_sets):
    """Print the combined length of all values stored in the ``all_sets`` dict."""
    total = sum(len(entries) for entries in all_sets.values())
    print("Total value sums up to: ", total)

In [12]:
def WriteToFile(file):
    """Write the global ``context_pairs`` dict to ``file``, one entry per line.

    Each line has the form ``<key> <value>``. The file is opened in write
    mode, so existing content is replaced.

    Parameters
    ----------
    file : str
        Path of the file to write.
    """
    # The context manager closes the handle even if a write raises.
    with open(file, 'w') as femb_iterative:
        for key, value in context_pairs.items():
            # str(key) keeps the output unchanged for string keys and also
            # accepts non-string keys (e.g. integer node ids).
            femb_iterative.write(str(key) + " " + str(value) + "\n")

In [13]:
def chooseNodes(list_nodes, n):
    """Draw ``n`` elements from ``list_nodes`` at random, without replacement."""
    picked = random.sample(list_nodes, n)
    return picked

In [14]:
def updateContextPairs(context_pair, num_rand_walks_ending_here, context_pairs):
    """Add ``num_rand_walks_ending_here`` to the entry for ``context_pair``.

    A pair that is not yet in ``context_pairs`` is created with that amount;
    the dict is modified in place.
    """
    if context_pair in context_pairs:
        previous = context_pairs[context_pair]
        context_pairs[context_pair] = previous + num_rand_walks_ending_here
        return
    context_pairs[context_pair] = num_rand_walks_ending_here

In [31]:
def BFSRandomWalk(graph, startvertex, queue, context_pairs):
    """Spread a budget of random walks from ``startvertex`` breadth-first.

    Every queue entry is a ``(vertex, budget)`` tuple. The budget of a popped
    vertex is divided evenly among its neighbours, and leftover units go to
    randomly chosen neighbours. For each neighbour, ``floor(budget * EPSILON)``
    walks are recorded as ending there: the string
    ``"<startvertex> <neighbor>"`` is appended to the global
    ``all_sets[startvertex]`` once per walk. The rest of the budget is queued
    to continue from that neighbour. When exactly one walk is left, it is
    queued again if ``random.random() < EPSILON`` and recorded as ending
    otherwise.

    If a popped vertex other than ``startvertex`` already has entries in
    ``all_sets``, the traversal stops and the pairs still missing for
    ``startvertex`` are sampled from that vertex's list instead.

    Parameters
    ----------
    graph : mapping-like
        Indexing it with a vertex must yield an iterable of that vertex's
        neighbours (for example the dict built by ``getAdjNPList`` or a
        networkx graph).
    startvertex : hashable
        Vertex whose context pairs are being collected.
    queue : list of (vertex, budget)
        Work list seeded by the caller; it is consumed in place.
    context_pairs : dict
        Not read or modified here; kept so existing callers keep working.
    """
    # The number of pairs the start vertex should end up with is the budget it
    # was seeded with. It is taken from the queue rather than from the list of
    # one particular node, so it does not depend on specific node labels.
    target_pairs = sum(seed_budget for _, seed_budget in queue)

    while queue:
        vertex, budget = queue.pop(0)

        if all_sets[vertex] and vertex != startvertex:
            # Reuse the pairs already collected for this vertex to fill up the
            # start vertex, then abandon the rest of the queue.
            # NOTE(review): the reused strings keep the other vertex's prefix —
            # confirm that this is intended.
            pool = all_sets[vertex]
            missing = target_pairs - len(all_sets[startvertex])
            # random.sample raises ValueError outside [0, len(pool)].
            missing = max(0, min(missing, len(pool)))
            all_sets[startvertex].extend(random.sample(pool, missing))
            break

        # Neighbours come from the `graph` argument, not a notebook global.
        vertex_neighbors = list(graph[vertex])
        num_neighbors = len(vertex_neighbors)
        if num_neighbors == 0:
            # Nowhere to walk to; also avoids dividing by zero below.
            continue

        m = getPerNodeBudget(num_neighbors, budget)
        remainder = budget - (m * num_neighbors)
        chosen_nodes = chooseNodes(vertex_neighbors, remainder) if remainder > 0 else []
        bonus_nodes = set(chosen_nodes)  # constant-time membership test

        # With a base share of 0 only the chosen neighbours receive any budget,
        # so the others can be skipped without changing the outcome.
        recipients = vertex_neighbors if m > 0 else chosen_nodes

        for neighbor in recipients:
            budget_for_this_node = m + 1 if neighbor in bonus_nodes else m
            num_rand_walks_ending_here = math.floor(budget_for_this_node * EPSILON)
            context_pair = str(startvertex) + " " + str(neighbor)
            if num_rand_walks_ending_here > 0:
                updateSetCount(startvertex, context_pair, num_rand_walks_ending_here)

            remaining_budget = budget_for_this_node - num_rand_walks_ending_here
            if remaining_budget > 1:
                queue.append((neighbor, remaining_budget))
            elif remaining_budget == 1:
                # NOTE(review): a single walk continues with probability EPSILON
                # here, while above EPSILON is the fraction of walks that end —
                # confirm the comparison is not inverted.
                if random.random() < EPSILON:
                    queue.append((neighbor, remaining_budget))
                else:
                    updateSetCount(startvertex, context_pair, 1)

In [32]:
# Build the shuffled neighbour arrays and one empty context list per node.
adjdict = getAdjNPList(G)
all_sets = getNodeContextSets(G)

start = time.time()
# NOTE(review): nothing visible in this notebook adds entries to context_pairs (the
# updateContextPairs calls inside BFSRandomWalk are commented out), so the
# WriteToFile call below appears to write an empty file — confirm which structure
# is meant to be persisted.
context_pairs = {}
print("Running BFS...")
# Each start vertex gets its own queue holding a single entry with the full walk budget.
for startvertex in adjdict:
    queue = [(startvertex, NUM_RANDOM_WALKS)]
    BFSRandomWalk(adjdict, startvertex, queue, context_pairs)
countListSum(all_sets)
WriteToFile(embeddingsiterative)

# The measured time covers the walks, the summary print and the file write.
end = time.time()
result = end - start
print("Run in :",result) 

Running BFS...
Total value sums up to:  90791200
Run in : 100.54623460769653


In [None]:
# Show a small preview instead of printing the whole structure: all_sets holds one
# list of context pairs per node, which is far too much output for a large graph.
for shown, (node, pairs) in enumerate(all_sets.items()):
    if shown >= 5:
        break
    print(node, pairs[:5])