# PageRank

In [153]:
import findspark

# Run findspark's initialisation before pyspark is imported further down;
# presumably this is what makes the local Spark install importable — confirm
# against the findspark documentation.
findspark.init()

In [154]:
import pyspark

# Reuse the SparkContext if one already exists, otherwise create one.
sc = pyspark.SparkContext.getOrCreate()
# NOTE(review): SQLContext is reportedly deprecated in Spark 3.x in favour of
# SparkSession — confirm before upgrading. `spark` is not used by any of the
# cells visible in this notebook section.
spark = pyspark.SQLContext(sc)

In [155]:
# Sample graphs as adjacency lines: "<node>: <space-separated out-links>".

# Every node has at least one out-link.
links = [
    "A: B C D",
    "B: A D",
    "C: A",
    "D: B C",
]

# Node C has no out-links at all.
deadEnds = [
    "A: B C D",
    "B: A D",
    "C:",
    "D: B C",
]

# Node C links only to itself.
spiderTraps = [
    "A: B C D",
    "B: A D",
    "C: C",
    "D: B C",
]


In [156]:
# Tunables. With beta == 1.0 the taxation step leaves every rank unchanged,
# because the (1 - beta) term is multiplied away.
ITERATIONS = 15
beta = 1.0

# Distribute the chosen sample graph and give each input line an equal
# starting share of the total rank.
data = sc.parallelize(deadEnds)
initialRank = 1.0 / data.count()

In [157]:
def convert(line, rank=None):
    """Parse one adjacency line into a ``(node, (neighbours, rank))`` record.

    Args:
        line: Text of the form ``"<node>: <n1> <n2> ..."``. The neighbour
            list may be empty or consist only of whitespace.
        rank: Rank to store in the record. When ``None`` (the default) the
            module-level ``initialRank`` is used.

    Returns:
        A tuple ``(node, (neighbours, rank))`` where ``neighbours`` is a list
        of neighbour names (possibly empty).

    Raises:
        ValueError: If ``line`` does not contain exactly one ``':'``.
    """
    key, raw_neighbours = line.split(':')
    # split() without an argument collapses runs of whitespace and returns []
    # for a blank string, so "C:", "C: " and "A:  B  C " never produce
    # empty-string neighbours.
    neighbours = raw_neighbours.split()
    if rank is None:
        rank = initialRank
    return key.strip(), (neighbours, rank)


In [158]:
def extract_probabilities(line):
    """Split a node's rank evenly among its neighbours.

    Takes a ``(key, (neighbours, rank))`` record and returns a list whose
    first item is the node's own record with its rank reset to ``0``,
    followed by one ``(neighbour, share)`` pair per neighbour, where
    ``share`` is ``rank / len(neighbours)``. A node without neighbours yields
    only its own reset record.
    """
    node, (out_links, score) = line
    emitted = [(node, (out_links, 0))]
    for target in out_links:
        # The division stays inside the loop so an empty neighbour list
        # never divides by zero.
        emitted.append((target, score / len(out_links)))
    return emitted

In [159]:
def is_main_part(line):
    """Return True when *line* is a node's ``(neighbours, rank)`` record.

    Records are tuples; rank contributions are plain numbers. Checking for a
    tuple (rather than "anything that is not exactly ``float``") keeps ints
    and float subclasses from being mistaken for records.
    """
    return isinstance(line, tuple)


def sum_probabilities(first_line, second_line):
    """Merge two values that share a key during the reduce step.

    Each argument is either a node record ``(neighbours, rank)`` or a numeric
    rank contribution. The arguments may arrive in either order.

    Args:
        first_line: A record or a numeric contribution.
        second_line: A record, a numeric contribution, or ``None``.

    Returns:
        ``first_line`` unchanged when ``second_line`` is ``None``; the record
        with the contribution added to its rank when exactly one argument is
        a record; otherwise the sum of the two contributions.
    """
    if second_line is None:
        return first_line

    if is_main_part(first_line):
        record, chance = first_line, second_line
    elif is_main_part(second_line):
        record, chance = second_line, first_line
    else:
        # Two bare contributions: just accumulate them.
        return first_line + second_line

    neighbours, rank = record
    return neighbours, rank + chance

In [160]:
def taxation(line):
    """Blend a node's rank with the uniform starting rank.

    Takes a ``(key, (neighbours, rank))`` record and returns the same record
    with the rank replaced by ``beta * rank + (1 - beta) * initialRank``,
    using the module-level ``beta`` and ``initialRank``.
    """
    node, (out_links, score) = line
    uniform_share = (1-beta)*initialRank
    return node, (out_links, beta * score + uniform_share)

In [161]:
# Parse each adjacency line into a (node, (neighbours, rank)) record.
data = data.map(convert)

# One rank update per pass:
#   1. flatMap emits each node's reset record plus a share for every neighbour
#      (a single flatMap replaces the former map + identity flatMap pair);
#   2. reduceByKey folds the shares back into the node's record;
#   3. taxation applies the beta blend.
for _ in range(ITERATIONS):
    data = (
        data.flatMap(extract_probabilities)
        .reduceByKey(sum_probabilities)
        .map(taxation)
    )

# Highest rank first; pair is (node, (neighbours, rank)).
data = data.sortBy(lambda pair: -pair[1][1])
data.collect()

[('B', (['A', 'D'], 0.00240619428600049)),
 ('D', (['B', 'C'], 0.00240619428600049)),
 ('C', ([], 0.00240619428600049)),
 ('A', (['B', 'C', 'D'], 0.001650987720184256))]