In [1]:
import pandas as pd
import numpy as np
# Load the edge list as two int32 columns. The first 4 lines of the file are
# skipped (presumably a comment header -- confirm against the data file).
graph_df = pd.read_table(
    'web-NotreDame.txt',
    skiprows=4,
    names=['from_node', 'to_node'],
    dtype={'from_node': np.int32, 'to_node': np.int32},
)
# Keep only the edges whose two endpoints both have an id below 10000.
graph_df = graph_df.loc[(graph_df[['from_node', 'to_node']] < 10000).all(axis=1)]

In [2]:
# Summarize the filtered edge list: entry count, column dtypes, memory usage.
graph_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 37841 entries, 0 to 63636
Data columns (total 2 columns):
from_node    37841 non-null int32
to_node      37841 non-null int32
dtypes: int32(2)
memory usage: 591.3 KB


In [3]:
# Dense 10000 x 10000 float64 matrix, initially all zeros (about 800 MB).
M = np.zeros(shape=(10_000, 10_000), dtype=np.float64)

In [4]:
# Restrict to the two edge columns first. This cell overwrites `graph_df` with
# the merged frame, so without this projection a second run would group and
# merge a frame that already carries a `weight` column. With it, re-running the
# cell reproduces the same result.
graph_df = graph_df[['from_node', 'to_node']]

# One row per source node; `weight` starts as the number of edge rows leaving
# that node and is then inverted to 1 / count.
grouped_df = (
    graph_df
    .groupby(['from_node'])
    .count()
    .reset_index()
    .rename(columns={'to_node': 'weight'})
)
grouped_df['weight'] = 1 / grouped_df['weight']

# Attach the per-source weight to every edge row.
graph_df = graph_df.merge(grouped_df, on=['from_node'])

In [5]:
# Scatter the edge weights into M: the destination node selects the row and
# the source node selects the column, i.e. M[to_node, from_node] = weight.
rows, cols = graph_df['to_node'].values, graph_df['from_node'].values
M[rows, cols] = graph_df['weight'].values

In [6]:
def pagerank(M, alpha, num_iter):
    """Run a fixed number of damped power-iteration steps on ``M``.

    Starting from a vector of ones, each step applies

        pr <- alpha * (M @ pr) + (1 - alpha)

    The scores are not normalized at any point.

    Parameters
    ----------
    M : array-like, expected to be square (n, n)
        Transition matrix; ``M @ pr`` must yield a vector of length
        ``M.shape[0]`` so it can be fed into the next step.
    alpha : float
        Damping factor applied to the propagated scores.
    num_iter : int
        Number of update steps to perform; 0 returns the initial ones vector.

    Returns
    -------
    numpy.ndarray
        Score vector of length ``M.shape[0]`` after ``num_iter`` steps.
    """
    pr = np.ones(M.shape[0])
    beta = 1 - alpha
    for _ in range(num_iter):
        # `*` and `@` share precedence and associate left-to-right, so the
        # unparenthesized `alpha * M @ pr` would first build a scaled copy of
        # the whole matrix on every iteration. Scale the product vector instead.
        pr = alpha * (M @ pr) + beta
    return pr

## Step 3.3

In [7]:
# Display the 20 x 20 window of M covering rows and columns 10..29.
M[10:30, 10:30]

array([[ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ],
       [ 0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  

In [8]:
# Score every node: damping factor 0.85, 15 update steps.
pr = pagerank(M, alpha=0.85, num_iter=15)

In [9]:
# NOTE(review): `dtype` is not used by any cell visible here; it is kept in
# case a later cell reads it.
dtype = [('id', 'int32'), ('pagerank', 'float32')]

# Pair each node id (0..9999) with its score.
pr_df = pd.DataFrame({
    'id': np.arange(10000),
    'pagerank': pr,
})

In [10]:
# Rank the nodes from highest to lowest score. `reset_index(drop=True)`
# renumbers the rows without first turning the old index into a column, which
# replaces the former `.reset_index().drop('index', 1)`. That positional `axis`
# argument to `drop` is deprecated and rejected by pandas 2.x.
pr_df = pr_df.sort_values(by=['pagerank'], ascending=False).reset_index(drop=True)
pr_df

Unnamed: 0,id,pagerank
0,0,224.702638
1,1973,189.250314
2,1790,53.438593
3,1828,50.954873
4,1,27.975911
5,238,26.779136
6,140,23.520898
7,14,22.232264
8,16,21.591054
9,162,18.283386
