In [27]:
import os

import numpy as np
def pagerank(fn_chuck,size,max_iter=300,eps=1e-10):
    '''
    Power-iteration PageRank over a matrix that is stored on disk in row chunks.

    The size*size matrix must be column-stochastic (every column sums to 1.0).
    It is split by rows into several .npy files whose names end in
    '_<start>_<end>.npy', e.g. 'xxx_0_1000.npy', 'xxx_1000_1500.npy'; such a
    file holds the matrix rows start (inclusive) to end (exclusive).
    params:
        fn_chuck:string, name of a text file that lists one chunk file name
                 per line, like 'xxx_0_1000.npy\\nxxx_1000_1500.npy'.
                 Blank lines are ignored.
        size:int, the square matrix size
        max_iter:int, the maximum number of iterations
        eps:float, stop once the mean absolute change between two
            successive iterates is not greater than this value
    returns:
        pagerank value: ndarray with size rows and 1 column (the newest iterate)
        error: mean absolute difference between the last two iterates
    '''
    # start from the uniform distribution; `scratch` receives the next iterate
    current=np.full((size,1),1.0/size)
    scratch=np.zeros((size,1))

    # close the list file deterministically and skip empty lines
    with open(fn_chuck) as list_file:
        chunk_fn_list=[line.strip() for line in list_file if line.strip()]

    # The row range of every chunk never changes, so parse it once up front.
    # Only the base name is inspected, so directories containing '.' or '_'
    # (e.g. './h_0_30.npy') are handled.
    chunks=[]
    for fn in chunk_fn_list:
        stem=os.path.splitext(os.path.basename(fn))[0]
        chunk_start,chunk_end=stem.split('_')[-2:]
        chunks.append((fn,int(chunk_start),int(chunk_end)))

    error=np.abs(current-scratch).mean()
    n_iter=0
    while n_iter<max_iter and error>eps:
        # only one chunk is held in memory at a time
        for fn,chunk_start,chunk_end in chunks:
            scratch[chunk_start:chunk_end]=np.dot(np.load(fn),current)
        error=np.abs(current-scratch).mean()
        # swap buffers: `current` is now the newest iterate
        current,scratch=scratch,current
        # without this increment max_iter would never stop the loop
        n_iter+=1
    return current,error

In [22]:
size=100
# `np.float` was only an alias of the builtin `float` and has been removed
# from NumPy, so use `float` directly
h=np.arange(size*size).reshape(size,size).astype(float)
# normalise so that every column sums to 1.0
h=h/h.sum(axis=0)
# save to 3 files named h_<first row>_<end row>.npy
chunk_names=[]
for start,end in [(0,30),(30,60),(60,100)]:
    name='h_%d_%d.npy'%(start,end)
    np.save(name,h[start:end])
    chunk_names.append(name)
# write the list file here so the cell does not rely on a hand-made file
with open('fn_chunk.txt','w') as f:
    f.write('\n'.join(chunk_names)+'\n')

# call the function defined in this notebook; `import pagerank` would rebind
# the name to a module and break the later direct calls
x,error = pagerank('fn_chunk.txt',size)
print(x)


In [15]:
# Show the raw matrix next to its column-normalised version.
# NOTE(review): `hh` is not defined in any visible cell; judging by the output
# it is h/h.sum(axis=0) for a 4x4 `h` — confirm and define it explicitly.
h,hh

(array([[  0.,   1.,   2.,   3.],
        [  4.,   5.,   6.,   7.],
        [  8.,   9.,  10.,  11.],
        [ 12.,  13.,  14.,  15.]]),
 array([[ 0.        ,  0.03571429,  0.0625    ,  0.08333333],
        [ 0.16666667,  0.17857143,  0.1875    ,  0.19444444],
        [ 0.33333333,  0.32142857,  0.3125    ,  0.30555556],
        [ 0.5       ,  0.46428571,  0.4375    ,  0.41666667]]))

In [13]:
# Column sums of h: the divisors used to make every column sum to 1.0.
h.sum(axis=0)

array([ 24.,  28.,  32.,  36.])

In [28]:
# Run the chunked power iteration on the 100x100 matrix; 'fn_chunk.txt' must
# list the chunk files, one per line.
x,error=pagerank('fn_chunk.txt',100)

In [26]:
# Mean absolute residual |x - h.x|: close to 0 when x is a fixed point of h.
# NOTE(review): `h` must be the 100x100 matrix here, not the 4x4 one shown above.
np.mean(np.abs(x - h.dot(x)))

1.8529880493417744e-10

In [29]:
# Second value returned by pagerank: mean absolute change between the last
# two iterates.
error

6.1776171743181678e-13