## Random Walk
Here's the idea:
1. Start from a uniformly selected random node
2. Pick a random node it links to (follow one of its outgoing edges) and walk to it
3. With probability .15 fly back to the original node
4. With probability 1-.15 pick a random node that the current node links to and walk to it
5. If "stuck" (seen 10 nodes in a row we've already seen), pick a new random node and start all over again
6. Go until we have seen enough random nodes
- Note: This takes about a minute to a minute and a half to run at a 10% sample size, with a graph size of about 300k

In [28]:
import pandas as pd
import numpy as np
import random

In [29]:
# Load the tab-separated edge list; later cells read its "FromNode" and "ToNode" columns.
data = pd.read_csv("web-Stanford.txt", sep="\t")

In [44]:
list_of_nodes = np.unique(np.array(data["FromNode"])) #sorted unique node ids that appear as an edge source (FromNode)
sample_size = .05*len(list_of_nodes) #target number of distinct nodes per walk: 5% of the source nodes
pf = .85 #NOTE(review): presumably the probability of walking forward; randomwalk() uses its own .85 and never reads pf

In [45]:
def randomwalk(forward_prob=.85, stuck_limit=10):
    """Sample a set of edges with a random walk that restarts at its start node.

    Reads the module-level ``data`` (edge table with ``FromNode`` / ``ToNode``
    columns), ``list_of_nodes`` (candidate start nodes) and ``sample_size``
    (number of distinct nodes to collect before stopping).

    Args:
        forward_prob: Probability of following an outgoing edge on each step;
            otherwise the walk jumps back to the starting node.
        stuck_limit: Number of consecutive already-seen nodes after which the
            walk is considered stuck and restarts from a new random node.

    Returns:
        A set of ``(from_node, to_node)`` tuples for every edge traversed.

    Raises:
        ValueError: If the walk hits a dead end and the starting node has no
            outgoing edges either.
    """
    def pick_successor(node):
        """Return a random node that *node* links to, or None at a dead end."""
        # NOTE: this filters the whole edge table on every step.
        successors = data.loc[data.FromNode == node, "ToNode"]
        if successors.empty:
            return None
        # .iloc[0] instead of int(Series): the latter is deprecated in pandas.
        return int(successors.sample().iloc[0])

    edges = set()
    starting_node = int(np.random.choice(list_of_nodes))
    cur_node = starting_node
    nodes_seen = {cur_node}
    seeninarow = 0  # consecutive steps that landed on an already-seen node

    while len(nodes_seen) < sample_size:
        # Flip a biased coin: fly back to the start, or keep walking.
        if random.random() >= forward_prob:
            cur_node = starting_node
            continue

        toNode = pick_successor(cur_node)
        if toNode is None:
            # Dead end: go back to the starting node and step from there.
            cur_node = starting_node
            toNode = pick_successor(cur_node)
            if toNode is None:
                raise ValueError(
                    f"starting node {starting_node} has no outgoing edges"
                )
        edges.add((cur_node, toNode))

        if toNode in nodes_seen:
            seeninarow += 1
        else:
            nodes_seen.add(toNode)
            seeninarow = 0  # we saw a new node!
        cur_node = toNode

        if seeninarow >= stuck_limit:
            # Stuck in an already-explored region: restart somewhere new.
            starting_node = int(np.random.choice(list_of_nodes))
            cur_node = starting_node
            seeninarow = 0

    return edges

In [46]:
# Run five independent walks and save each walk's edge list to its own CSV.
for i in range(1, 6):
    e = randomwalk()
    # Build the frame in one call: appending one row at a time with pd.concat
    # re-copies the whole frame per edge (quadratic in the number of edges).
    # list() is needed because the DataFrame constructor rejects a raw set.
    edgeHolder = pd.DataFrame(list(e), columns=["FromNode", "ToNode"])
    outputname = "OutputRW5%" + str(i) + ".csv"
    edgeHolder.to_csv(outputname, index=False)

In [22]:
# Run a single walk and keep its set of (FromNode, ToNode) edges.
edge_set = randomwalk()

In [26]:
# Turn the sampled edge set into a two-column table in a single call;
# growing the frame with pd.concat per edge re-copies it every time.
# list() is needed because the DataFrame constructor rejects a raw set.
edgeHolder = pd.DataFrame(list(edge_set), columns=["FromNode", "ToNode"])

In [25]:
# Written with pandas' default index=True, so the row index becomes an extra
# first column (the batch export loop passes index = False instead).
edgeHolder.to_csv("RandomWalkOutput.csv")

59760

In [27]:
# Show the edge table as this cell's output.
edgeHolder

Unnamed: 0,FromNode,ToNode
0,276349,125382
1,33487,241883
2,254461,64871
3,112594,169368
4,91042,45492
...,...,...
59755,30044,166939
59756,19546,93329
59757,261118,249843
59758,46252,247241
