## Forest Fire
Here's the idea:
1. Choose a node v at random
2. Generate a random number x that is geometrically distributed with mean pf/(1-pf)
- side note use pf = .7 because according to this paper: https://ieeexplore-ieee-org.silk.library.umass.edu/document/7752223 it gives the most representative sample
3. Node v chooses x outlinks that were not yet visited (make sure we haven't gone over our sample percentage)
4. Go to those x outlinks and generate a new random x with same mean - note nodes cannot be visited a second time (so just don't include it if we hit that node)
5. If we have more samples to take just start over with a new node and do the same thing
- side note to keep in mind this is a BFS algorithm - so the "to nodes" should be equal to the sample size percentage * unique nodes in the graph

In order to run: just change the data what you want to be (must have headers: "FromNode" and "ToNode") and change sample size

In [11]:
import numpy as np
import pandas as pd
import random

In [16]:
data = pd.read_csv("web-Stanford.txt", sep="\t")

In [25]:
list_of_nodes = np.unique(np.array(data["FromNode"])) #gets the unique list of nodes
sample_size = .05*len(list_of_nodes) #so we get to hold 10% of the nodes it looks like
pf = .7*(1-.7) #this is going to be the number of nodes we select

In [26]:
def run_forest_fire4():
    nodes_seen = set()
    edges = set()
    global queue;
    queue = []
    while(len(nodes_seen) < sample_size):
        if(len(queue) == 0):
            cur_node = np.random.choice(list_of_nodes)
            queue.append(cur_node)
            nodes_seen.add(cur_node)
        def forest_fire5(node):
            if(len(nodes_seen) >= sample_size):
                return;
            else: #get a geometric random number
                outlinks_allowed = np.random.geometric(pf)
                candidates = list(data[data["FromNode"] == node]["ToNode"])
                if(outlinks_allowed <= len(candidates)):
                    candidates = random.sample(candidates, outlinks_allowed)
                for outlink in candidates:
                    if(outlink in nodes_seen):
                        edges.add((node, outlink))
                    else:
                        if(len(nodes_seen) < sample_size):
                            nodes_seen.add(outlink)
                            edges.add((node, outlink))
                            queue.append(outlink)
            queue.pop(0);     
        forest_fire5(queue[0])
    return edges

In [5]:
e = run_forest_fire4()

In [27]:
for i in range(1,6):
    e = run_forest_fire4()
    edgeHolder = pd.DataFrame(columns = ["FromNode", "ToNode"])
    for link in e:
        line = pd.DataFrame({"FromNode" : [link[0]], "ToNode" : [link[1]]})
        edgeHolder = pd.concat([edgeHolder, line], ignore_index = True)
    outputname = "OutputFF5%" + str(i) + ".csv"
    edgeHolder.to_csv(outputname, index = False)

In [10]:
edgeHolder = pd.DataFrame(columns = ["FromNode", "ToNode"])
for link in e:
    line = pd.DataFrame({"FromNode" : [link[0]], "ToNode" : [link[1]]})
    edgeHolder = pd.concat([edgeHolder, line], ignore_index = True)

In [11]:
edgeHolder.to_csv("ForestFireOutput.csv")

Unnamed: 0,FromNode,ToNode
0,266286,185099
1,21016,94163
2,197199,176790
3,222999,205477
4,11336,223724
...,...,...
94658,107482,144519
94659,259213,175444
94660,251291,183004
94661,187314,207809


## End Here

## Previous Attempts
- Here are previous attempts at it in case you want to see my failed attempts:

In [None]:
def forest_fire(node, nodes_seen):
    display(edgeHolder)
    print(nodes_seen)
    if(len(nodes_seen) >= 5):
        return;
    else: #get a geometric random number
        nodes_seen.add(node)
        outlinks_allowed = np.random.geometric(pf)
        print(outlinks_allowed)
        candidates = list(data[data["FromNode"] == node]["ToNode"])
        if(outlinks_allowed > len(candidates)):
            dummy = [node]*len(candidates)
            df2 = pd.DataFrame({"FromNode" : dummy, "ToNode" : candidates})
            edgeHolder = pd.concat([edgeHolder, df2], ignore_index = True)
            #display(edgeHolder)
        else: #randomly select some nodes and go off of that
            candidates = random.sample(candidates, outlinks_allowed)
            dummy = [node]*len(candidates)
            df2 = pd.DataFrame({"FromNode" : dummy, "ToNode" : candidates})
            edgeHolder = pd.concat([edgeHolder, df2], ignore_index = True)
            #display(edgeHolder)
        for outlink in candidates:        
                if(outlink not in nodes_seen):         
                    forest_fire(outlink, edgeHolder, nodes_seen) #shouldn't run til everything

In [None]:
def run_forest_fire():
    nodes_seen = set()
    edgeHolder = pd.DataFrame(columns = ["FromNode", "ToNode"])
    while(len(nodes_seen) < 5):
        cur_node = np.random.choice(list_of_nodes)
        print(cur_node)
        forest_fire(cur_node, edgeHolder, nodes_seen)
        
        display(edgeHolder)
        print(nodes_seen)

In [None]:
def run_forest_fire2():
    nodes_seen = set()
    edges = set()
    queue = []
    while(len(nodes_seen) < sample_size):
        cur_node = np.random.choice(list_of_nodes)
        queue.append(cur_node)
        def forest_fire3(node):
            if(len(nodes_seen) >= sample_size):
                return;
            else: #get a geometric random number
                nodes_seen.add(node)
                outlinks_allowed = np.random.geometric(pf)
                candidates = list(data[data["FromNode"] == node]["ToNode"])
                if(outlinks_allowed <= len(candidates)):
                    candidates = random.sample(candidates, outlinks_allowed)
                for outlink in candidates:
                    edges.add((node, outlink))
                    if(outlink not in nodes_seen):         
                        forest_fire3(outlink) #shouldn't run til everything
        
        forest_fire3(queue[0])
    return edges