In [1]:
from scipy.sparse import coo_matrix, csr_matrix

import numpy as np
import matplotlib.pyplot as plt

In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
%matplotlib inline
np.set_printoptions(suppress=True)

In [30]:
from pagerank_naive import pageRank_naive
from pagerank_np_sparse import pagerank_sparse
from pagerank_numba import pageRank_naive_numba, pagerank_sparse_numba

In [31]:
def read_file(filename):
    """Read an edge-list file into an adjacency list.

    Every non-blank line must hold two integers, ``frm to``, separated by
    any amount of whitespace (spaces or tabs). Blank lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the edge-list file.

    Returns
    -------
    list of list of int
        ``links[frm]`` holds every ``to`` read for ``frm``, in file order.
        The list always contains an entry for node 0 and is grown so that
        the largest node id seen is a valid index.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two integers.
    """
    links = [[]]
    with open(filename, 'r') as f:
        for line in f:
            # split() without an argument tolerates tabs, repeated spaces
            # and the trailing newline; split(" ") did not.
            fields = line.split()
            if not fields:
                continue
            frm, to = map(int, fields)
            # Grow the list in one step so both endpoints are addressable.
            missing = max(frm, to) + 1 - len(links)
            if missing > 0:
                links.extend([] for _ in range(missing))
            links[frm].append(to)
    return links

In [39]:
def read_file_to_csr(filename):
    """Read an edge-list file into a square sparse matrix in CSR format.

    Every non-blank line must hold two integers, ``frm to``, separated by
    any amount of whitespace (spaces or tabs). Blank lines are skipped.

    Parameters
    ----------
    filename : str
        Path of the edge-list file.

    Returns
    -------
    scipy.sparse.csr_matrix
        Float64 matrix of shape ``(n + 1, n + 1)`` where ``n`` is the largest
        node id read (0 if none). Each edge ``frm -> to`` contributes 1 to the
        entry at row ``to``, column ``frm``.

    Raises
    ------
    ValueError
        If a non-blank line does not consist of exactly two integers.
    """
    src, dest = [], []
    n = 0  # largest node id seen so far
    with open(filename, 'r') as f:
        for line in f:
            # split() without an argument tolerates tabs, repeated spaces
            # and the trailing newline; split(" ") did not.
            fields = line.split()
            if not fields:
                continue
            frm, to = map(int, fields)
            n = max(n, frm, to)
            src.append(frm)
            dest.append(to)
    data = np.ones(len(src), dtype=np.float64)
    # Rows are destinations and columns are sources.
    return coo_matrix((data, (dest, src)), shape=(n + 1, n + 1), dtype=np.float64).tocsr()

In [40]:
# Directory holding the edge-list datasets; adjust this for your own checkout.
DATA_DIR = "/Users/timwee/projects/page_rank/mypagerank/data"
# Datasets referenced by this notebook: bull.txt, erdos-80000.txt, java-org.txt.
# Only one is used per run; change the file name below to switch.
fname = DATA_DIR + "/java-org.txt"

In [41]:
def print_debug(pr_vector):
    """Print every entry of ``pr_vector`` as ``index = value``, then its sum.

    The final line has the form ``s = <sum of all entries>``.
    """
    for idx, score in enumerate(pr_vector):
        print(idx, "=", score)
    total = sum(pr_vector)
    print("s = " + str(total))

In [42]:
def compute_pagerank_naive(links):
    """Run ``pageRank_naive`` on ``links`` and report the iteration count.

    Calls the helper with alpha=0.85, convergence=0.00001 and checkSteps=10,
    prints how many iterations it reported, and returns the first item of
    the pair the helper returned.
    """
    outcome = pageRank_naive(links, alpha=0.85, convergence=0.00001, checkSteps=10)
    ranks, iterations = outcome
    print("took %d iterations to converge" % iterations)
    return ranks

In [43]:
def compute_pagerank_naive_numba(links):
    """Run ``pageRank_naive_numba`` on ``links`` and report the iteration count.

    Calls the helper with alpha=0.85, convergence=0.00001 and checkSteps=10,
    prints how many iterations it reported, and returns the first item of
    the pair the helper returned.
    """
    outcome = pageRank_naive_numba(links, alpha=0.85, convergence=0.00001, checkSteps=10)
    ranks, iterations = outcome
    print("took %d iterations to converge" % iterations)
    return ranks

In [44]:
# The same file is parsed twice, once per input representation.
# Adjacency list passed to the naive variants: links[frm] lists the targets of node frm.
links = read_file(fname)
# CSR matrix passed to the sparse variants: entry [to, frm] is set for each edge frm -> to.
H = read_file_to_csr(fname)

In [49]:
def compute_pagerank_sparse(H):
    """Run ``pagerank_sparse`` on ``H`` and report the iteration count.

    Calls the helper with its default settings, prints how many iterations
    it reported, and returns the first item of the pair the helper returned.
    """
    outcome = pagerank_sparse(H)
    ranks, iterations = outcome
    print("took %d iterations to converge" % iterations)
    return ranks

In [50]:
def compute_pagerank_sparse_numba(H):
    """Run ``pagerank_sparse_numba`` on ``H`` and report the iteration count.

    Calls the helper with its default settings, prints how many iterations
    it reported, and returns the first item of the pair the helper returned.
    """
    outcome = pagerank_sparse_numba(H)
    ranks, iterations = outcome
    print("took %d iterations to converge" % iterations)
    return ranks

In [46]:
# Time the naive implementation on the adjacency list; every timed run prints its iteration count.
# A name assigned inside a %timeit statement is not kept in the notebook namespace,
# so pr_naive has to be computed again in a regular cell before it can be used.
%timeit pr_naive = compute_pagerank_naive(links)

took 20 iterations to converge
took 20 iterations to converge
took 20 iterations to converge
took 20 iterations to converge
1 loop, best of 3: 4.33 s per loop


In [47]:
# Time the numba variant of the naive implementation on the same adjacency list.
%timeit pr_naive_numba = compute_pagerank_naive_numba(links)

took 20 iterations to converge
took 20 iterations to converge
took 20 iterations to converge
took 20 iterations to converge
1 loop, best of 3: 4.46 s per loop


In [51]:
# Time the sparse-matrix implementation on the CSR matrix H.
%timeit pr_sparse = compute_pagerank_sparse(H)

took 15 iterations to converge
took 15 iterations to converge
took 15 iterations to converge
took 15 iterations to converge
1 loop, best of 3: 182 ms per loop


In [52]:
# Time the numba variant of the sparse implementation on the CSR matrix H.
# NOTE(review): the "slowest run" warning in the output is presumably the first call
# paying a one-off numba compilation cost — confirm before quoting the timing.
%timeit pr_sparse_numba = compute_pagerank_sparse_numba(H)

took 15 iterations to converge
took 15 iterations to converge
took 15 iterations to converge
took 15 iterations to converge
The slowest run took 29.02 times longer than the fastest. This could mean that an intermediate result is being cached.
1 loop, best of 3: 21.3 ms per loop


In [53]:
# Compute the naive result in a regular cell so pr_naive is available for the comparison further down.
pr_naive = compute_pagerank_naive(links)

took 20 iterations to converge


In [54]:
# Compute the sparse result in a regular cell so pr_sparse is available for the comparison further down.
pr_sparse = compute_pagerank_sparse(H)

took 15 iterations to converge


In [56]:
# Compute the sparse numba result in a regular cell so pr_sparse_numba is available for the comparison further down.
pr_sparse_numba = compute_pagerank_sparse_numba(H)

took 15 iterations to converge


In [57]:
# Cross-check the results within np.allclose's default tolerances:
# naive vs. sparse, and sparse vs. sparse numba.
# The naive numba result is not part of this comparison.
np.allclose(pr_naive, pr_sparse), np.allclose(pr_sparse, pr_sparse_numba)

(True, True)