In [4]:
#Exercise on Python and PageRank
from scipy.sparse import coo_matrix
import numpy as np
from numpy.linalg import norm

In [16]:
def myPageRank(G_file_name, epsilon = 0.1, beta = 0.8): 
    M = preprocessor(G_file_name)[1]   
    N = preprocessor(G_file_name)[0]
    A = beta * M + (1 - beta) / N
    Pi = np.array([1 / N for x in range(int(N))])
    while True:
        ancientPi = Pi
        Pi = A.dot(ancientPi)
        if norm((Pi - ancientPi), ord=1) < epsilon:
            break
    return Pi     
 

def preprocessor(G_file_name):
    #G is a list a lines of the kind i j denoting that there is an edge between node i and j (from j to i)   
    with open(G_file_name, "r") as file:
        lines = file.readlines()
    file.close()
    i_list = np.array([])
    j_list = np.array([])
    for line in lines:
        i_list = np.append(i_list, int(line.split()[0]) - 1)
    for line in lines:
        j_list = np.append(j_list, int(line.split()[1]) - 1)
    i_list, j_list = remove_dead_ends(i_list, j_list)
    N = max(np.append(i_list, j_list)) + 1
    data = [0 for x in range(len(i_list))]
    for j in j_list:
        k = j_list.tolist().count(j) #number of successors of the page j
        for index in [x for x, y in enumerate(j_list) if y == j]:
            data[index] = 1 / k
    M = coo_matrix((data, (i_list, j_list)), shape=(N, N)).toarray()
    return (N, M)


def remove_dead_ends(list1, list2):
    differences = list(set(list1.tolist()) - set(list2.tolist()))
    indices_to_remove = [i for i, x in enumerate(list1.tolist()) if x in differences]
    i_list = []
    j_list = []
    for index in range(1, len(list1) + 1):
        if index not in indices_to_remove:
            i_list = np.append(i_list, list1[index - 1])
            j_list = np.append(j_list, list2[index - 1])
    return np.array(i_list), np.array(j_list)

In [21]:
print(myPageRank("first_matrix.txt"))
print(myPageRank("second_matrix.txt", epsilon = 0.00000000000000000001, beta = 1))

print(myPageRank("dead_ends_matrix.txt"))
result_dead_ends_matrix = myPageRank("dead_ends_matrix.txt", beta = 1)
print(result_dead_ends_matrix)

if sum(result_dead_ends_matrix) == 1:
    print("Dead ends problem has been resolved correctly")
else:
    print("Dead ends problem has not yet been resolved correctly")

[ 0.25866667  0.17866667  0.56266667]
[ 0.28571429  0.14285714  0.14285714  0.14285714  0.14285714  0.14285714]
[ 0.03402667  0.03402667  0.05524148]
[ 0.02083333  0.02083333  0.04166667]
Dead ends problem has not yet been resolved correctly


In [19]:
#extract the web pages to construct a graph
import re
import os

In [20]:
path = "/Users/sun-haozhe/Documents/Python workspace/SD201/TP1/toyset/"
dictionary = {}
i = 1
hyperlinks = []

new_file_name = "web_graph.txt"
new_file = open(new_file_name, "w")

for file_name in os.listdir(path):
    dictionary[file_name] = i
    i += 1 
    with open(path + file_name, "r") as file:
        file_text = file.read()
        hyperlinks = np.append(hyperlinks, re.findall('a href="([^\'" >]+)', file_text) )        
    file.close()
    
for file_name in os.listdir(path):
    for html in hyperlinks:
        new_file.write(str(dictionary[html]) + " " + str(dictionary[file_name]) + "\n")   

In [9]:
#run the PageRank algorithm on the web graph
PageRankVector = myPageRank(new_file_name)
print(PageRankVector)

debug, page rank turn0
debug, page rank turn1
[ 0.03426624  0.00659118  0.01047817  0.01190282  0.0459668   0.06625482
  0.01829174  0.01485764  0.03478509  0.01195558  0.01389908  0.01095745
  0.01915797  0.01481807  0.01144992  0.0104518   0.0210619   0.0085083
  0.01234254  0.01097064  0.02362976  0.01532373  0.02014291  0.02211278
  0.01240848  0.01142354  0.04639332  0.02498846  0.00556667  0.0046213
  0.01565793  0.02500165  0.01629548  0.01186326  0.01288776  0.01186326
  0.00797626  0.02938112  0.01670881  0.03033968  0.01577663  0.00991976
  0.0050742   0.04399692  0.03473233  0.01480488  0.00411564  0.01091789
  0.00945367  0.01729361  0.0070177   0.01772013  0.02211278  0.01963725
  0.0138727 ]
