In [1]:
import re
import heapq
import numpy as np
from fractions import Fraction

import pyspark
from pyspark import SparkContext, SparkConf

In [2]:
# Display the installed PySpark version.
pyspark.__version__

'3.2.1'

In [3]:
# Configure a local-mode Spark application named 'pagerank'.
conf = SparkConf().setAppName('pagerank').setMaster('local')
# getOrCreate returns the already-running context when this cell is executed
# again, instead of raising because a second SparkContext was requested.
sc = SparkContext.getOrCreate(conf=conf)

22/04/10 23:33:30 WARN Utils: Your hostname, umar resolves to a loopback address: 127.0.1.1; using 10.127.80.168 instead (on interface wlp2s0)
22/04/10 23:33:30 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/04/10 23:33:31 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
# Book Example

# ExM = [
#     (1, 2),
#     (1, 3),
#     (1, 4),
#     (2, 1),
#     (2, 4),
#     (3, 1),
#     (4, 2),
#     (4, 3)
# ]

# ExM = sc.parallelize(ExM)
# ExM.collect()

## For Small Graph

In [5]:
# Load the small graph's edge list as an RDD of raw text lines and peek at the first three.
readLines = sc.textFile('./graph-small.txt')
readLines.take(3)

# Each line holds one directed edge, tab-separated:
# source node --> destination node

                                                                                

['100\t1', '13\t1', '28\t1']

In [6]:
# Number of passes of the rank-update loop.
NUM_ITER = 40
# Weight applied to the link term M.dot(r); the remaining (1 - B) is divided by the node count to form e.
B = 0.8

In [7]:
# Parse every line into a tuple of ints (src, dest) and drop duplicate edges.
# The delimiter class keeps the original separators (space, literal '|', tab)
# but now matches runs of them, and lines are stripped first, so repeated or
# trailing separators no longer produce empty tokens that make int() raise.
# Blank lines are skipped for the same reason.
nodePairs = (
    readLines
    .filter(lambda l: l.strip())
    .map(lambda l: tuple(int(num) for num in re.split(r'[ |\t]+', l.strip())))
    .distinct()
)
nodePairs.take(10)

[(100, 1),
 (13, 1),
 (28, 1),
 (89, 1),
 (82, 1),
 (30, 1),
 (79, 1),
 (65, 1),
 (88, 1),
 (25, 1)]

In [9]:
# Flip every edge so the destination becomes the key: (dest, src)
reversePairs = nodePairs.map(lambda edge: edge[::-1])
reversePairs.take(10)

[(1, 100),
 (1, 13),
 (1, 28),
 (1, 89),
 (1, 82),
 (1, 30),
 (1, 79),
 (1, 65),
 (1, 88),
 (1, 25)]

In [10]:
# First step of counting the distinct nodes: flatten every pair
# into a single stream of node ids (duplicates are still present here).
numElem = nodePairs.flatMap(lambda pair: pair)
numElem.take(10)

[100, 1, 13, 1, 28, 1, 89, 1, 82, 1]

In [11]:
# Replace the RDD of node ids with the number of distinct ids.
# NOTE: numElem changes from an RDD to an int here, so this cell cannot be
# re-run on its own without first re-running the cell that builds the RDD.
numElem = numElem.distinct().count()
numElem

100

In [12]:
# Number of outgoing edges for each source node.

# countByKey counts the (src, dest) pairs per src key and returns the
# counts to the driver as a dictionary {src: out-degree}.
nodeOutgoingEdges = nodePairs.countByKey()
nodeOutgoingEdges

defaultdict(int,
            {100: 11,
             13: 9,
             28: 9,
             89: 12,
             82: 12,
             30: 9,
             79: 14,
             65: 9,
             88: 8,
             25: 12,
             46: 13,
             73: 8,
             59: 17,
             50: 11,
             11: 14,
             24: 9,
             38: 11,
             44: 9,
             3: 13,
             58: 15,
             14: 11,
             31: 15,
             85: 10,
             17: 12,
             98: 6,
             51: 7,
             94: 9,
             53: 4,
             64: 8,
             33: 12,
             56: 9,
             1: 14,
             26: 7,
             45: 16,
             2: 8,
             86: 7,
             67: 11,
             12: 10,
             96: 11,
             83: 15,
             66: 8,
             77: 11,
             49: 9,
             4: 9,
             92: 9,
             87: 7,
             20: 12,
             78: 9,
 

In [13]:
def makeVector(nodes, nodeOutgoingEdges, numElem):
    """Build one 1 x numElem row of link weights.

    Args:
        nodes: Iterable of node ids; each id ``k`` is written to column ``k - 1``.
        nodeOutgoingEdges: Mapping from node id to its number of outgoing edges.
        numElem: Width of the returned row.

    Returns:
        A ``(1, numElem)`` float array holding ``1 / nodeOutgoingEdges[k]`` in
        column ``k - 1`` for every ``k`` in ``nodes`` that is a key of
        ``nodeOutgoingEdges``, and zeros elsewhere.
    """
    row = np.zeros((1, numElem))
    for src in nodes:
        # Ids without an out-degree entry contribute nothing.
        if src in nodeOutgoingEdges:
            row[0, src - 1] = 1. / nodeOutgoingEdges[src]
    return row

In [14]:
def toRatio(v):
    """Print each value of ``v`` as a fraction, each followed by ' | '.

    Every value is converted with ``Fraction(...).limit_denominator()`` and
    written to stdout on a single line. Nothing is returned.
    """
    for value in v:
        approx = Fraction(value).limit_denominator()
        print(approx, end=' | ')

In [15]:
# Link matrix stored row by row: for every destination node, a 1 x numElem
# vector holding 1 / out-degree(src) in column (src - 1) for each src that
# links to it. The RDD is cached because the iteration cell evaluates it on
# every pass; without the cache the rows are rebuilt each time.
M = (
    reversePairs
    .groupByKey()
    .mapValues(lambda srcs: makeVector(srcs, nodeOutgoingEdges, numElem))
    .cache()
)
# Peek at a single row instead of collecting the whole matrix to the driver.
M.take(1)

[(1,
  array([[0.        , 0.        , 0.07692308, 0.        , 0.        ,
          0.        , 0.        , 0.        , 0.        , 0.        ,
          0.07142857, 0.        , 0.11111111, 0.09090909, 0.        ,
          0.        , 0.08333333, 0.        , 0.        , 0.        ,
          0.        , 0.        , 0.        , 0.11111111, 0.08333333,
          0.        , 0.        , 0.11111111, 0.        , 0.11111111,
          0.06666667, 0.        , 0.08333333, 0.        , 0.        ,
          0.        , 0.        , 0.09090909, 0.        , 0.        ,
          0.        , 0.        , 0.        , 0.11111111, 0.        ,
          0.07692308, 0.        , 0.        , 0.        , 0.09090909,
          0.14285714, 0.        , 0.25      , 0.        , 0.        ,
          0.11111111, 0.        , 0.06666667, 0.05882353, 0.        ,
          0.        , 0.        , 0.        , 0.125     , 0.11111111,
          0.        , 0.        , 0.        , 0.        , 0.        ,
          0.   

In [16]:
# Initial rank vector: a (numElem, 1) column with every entry equal to 1 / numElem.
r = np.full((numElem, 1), 1. / numElem)
r

array([[0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.01],
       [0.

In [17]:
# Constant term (1 - B) / numElem that is added to every node's score on each
# iteration. It is kept at full precision: rounding it to 4 decimals distorts
# the term whenever (1 - B) / numElem is not a multiple of 1e-4 and turns it
# into 0 for large node counts.
e = (1. - B) / numElem
e

0.002

In [18]:
# Power iteration: r <- B * (M . r) + e, repeated NUM_ITER times.
# Normalise r to a column vector so the cell can be re-run even if r was
# flattened elsewhere in the notebook.
r = np.asarray(r, dtype=float).reshape(-1, 1)
for _ in range(NUM_ITER):
    # One Spark job per pass: fetch {node: B * (row . r)} in a single collect
    # instead of collecting keys and values separately.
    scaled = M.mapValues(lambda row: (row.dot(r) * B)[0][0]).collectAsMap()

    # Every node starts at the constant term e and then receives its link
    # contribution. Sizing the vector by numElem (not by the number of
    # collected keys) keeps r aligned with the 1 x numElem rows of M even when
    # a node has no incoming edge and therefore no row in M.
    r_next = np.full((numElem, 1), e, dtype=float)
    for node, score in scaled.items():
        r_next[node - 1, 0] += score
    r = r_next

# Node id -> score after the last pass, ordered by node id.
r_dict = {node: r[node - 1, 0] for node in range(1, numElem + 1)}

In [19]:
# Print the final scores as fractions. A flattened view is passed instead of
# rebinding r, so r keeps its column shape and the iteration cell above can
# still be re-run.
toRatio(r.reshape(-1))

19621/653903 | 2835/484441 | 7691/988432 | 161/21278 | 3305/216167 | 5323/829073 | 3878/907379 | 5908/898305 | 12679/984352 | 4847/637758 | 3280/654459 | 3167/595859 | 12269/673928 | 18246/533963 | 560/87191 | 3417/579472 | 2489/398384 | 7881/592594 | 1767/316582 | 1788/358897 | 313/50164 | 5290/346363 | 609/147677 | 3953/549305 | 199/26944 | 2396/167081 | 29522/993333 | 3310/521659 | 5793/805837 | 2785/395793 | 8137/659945 | 276/55271 | 4635/870214 | 5304/831715 | 4297/330628 | 3373/634804 | 3007/789611 | 974/136229 | 10940/891257 | 31501/936691 | 7997/995956 | 276/25463 | 2221/347558 | 6633/520219 | 1677/373343 | 5111/999916 | 708/118301 | 19992/787183 | 399/72083 | 3297/774230 | 8231/998408 | 3244/244469 | 18859/527802 | 4141/355493 | 4789/615979 | 6665/847644 | 15726/890639 | 3685/717304 | 2329/634629 | 6186/689683 | 6394/333363 | 7238/986795 | 1019/236427 | 6838/886169 | 17099/894430 | 23046/789373 | 1771/230057 | 4748/940911 | 5526/829433 | 6676/635915 | 5801/805940 | 4114/942431

In [21]:
# numbers = np.array([10,2,8,4,5,6,9])
# Small sample mapping (key -> score) for trying out the helpers below.
A = {1: 1, 2: 10, 3: 6, 6: 8, 10: 9}

def getTop5(A, n=5):
    """Return the keys of ``A`` with the largest values, highest first.

    Args:
        A: Mapping from key to a comparable score.
        n: Maximum number of keys to return. Defaults to 5, which is the
            behaviour of the original fixed-size version.

    Returns:
        A list of at most ``n`` keys ordered by descending value.
    """
    return heapq.nlargest(n, A, key=A.get)


def getLast5(A, n=5):
    """Return the keys of ``A`` with the smallest values, lowest first.

    Args:
        A: Mapping from key to a comparable score.
        n: Maximum number of keys to return. Defaults to 5, which is the
            behaviour of the original fixed-size version.

    Returns:
        A list of at most ``n`` keys ordered by ascending value.
    """
    return heapq.nsmallest(n, A, key=A.get)


# getTop5(numbers), getLast5(numbers)
# getTop5(A), getLast5(A)

In [22]:
# Five node ids with the highest scores in r_dict (small graph), highest first.
getTop5(r_dict)

[53, 14, 40, 1, 27]

In [23]:
# Five node ids with the lowest scores in r_dict (small graph), lowest first.
getLast5(r_dict)

[85, 59, 81, 37, 89]

## For Full Graph

In [24]:
# Rebuild the edge RDDs and node statistics from the full graph file.
readLines = sc.textFile('./graph-full.txt')
# Parse each line into a tuple of ints (src, dest) and drop duplicate edges.
# Lines are stripped and split on runs of the original separators (space,
# literal '|', tab), and blank lines are skipped, so stray whitespace cannot
# produce an empty token that makes int() raise.
nodePairs = (
    readLines
    .filter(lambda l: l.strip())
    .map(lambda l: tuple(int(num) for num in re.split(r'[ |\t]+', l.strip())))
    .distinct()
)
# Same edges keyed by destination: (dest, src)
reversePairs = nodePairs.map(lambda p: tuple(reversed(p)))
# Count the distinct node ids in one expression so numElem is only ever an int.
numElem = nodePairs.flatMap(lambda p: p).distinct().count()
print(numElem)
# {src: number of outgoing edges}
nodeOutgoingEdges = nodePairs.countByKey()

1000


In [25]:
# Link matrix for the full graph, one (dest, 1 x numElem row) pair per
# destination node. Cached because the iteration cell evaluates it on every
# pass; without the cache the rows are rebuilt each time.
M = (
    reversePairs
    .groupByKey()
    .mapValues(lambda srcs: makeVector(srcs, nodeOutgoingEdges, numElem))
    .cache()
)

In [26]:
# Initial rank vector: a (numElem, 1) column with every entry equal to 1 / numElem.
r = np.ones((numElem, 1)) / numElem
# Constant term added to every node's score on each iteration. Kept at full
# precision: rounding it to 4 decimals distorts the term whenever
# (1 - B) / numElem is not a multiple of 1e-4 and zeroes it for large graphs.
e = (1. - B) / numElem

In [27]:
# Power iteration: r <- B * (M . r) + e, repeated NUM_ITER times.
# Normalise r to a column vector so the cell can be re-run even if r was
# flattened elsewhere in the notebook.
r = np.asarray(r, dtype=float).reshape(-1, 1)
for _ in range(NUM_ITER):
    # One Spark job per pass: fetch {node: B * (row . r)} in a single collect
    # instead of collecting keys and values separately.
    scaled = M.mapValues(lambda row: (row.dot(r) * B)[0][0]).collectAsMap()

    # Every node starts at the constant term e and then receives its link
    # contribution. Sizing the vector by numElem (not by the number of
    # collected keys) keeps r aligned with the 1 x numElem rows of M even when
    # a node has no incoming edge and therefore no row in M.
    r_next = np.full((numElem, 1), e, dtype=float)
    for node, score in scaled.items():
        r_next[node - 1, 0] += score
    r = r_next

# Node id -> score after the last pass, ordered by node id.
r_dict = {node: r[node - 1, 0] for node in range(1, numElem + 1)}

In [28]:
# Five node ids with the highest scores in r_dict (full graph), highest first.
getTop5(r_dict)

[263, 537, 965, 243, 285]

In [29]:
# Five node ids with the lowest scores in r_dict (full graph), lowest first.
getLast5(r_dict)

[558, 93, 62, 424, 408]