In [None]:
from pyspark.sql import SparkSession

# Simple PageRank on a small hard-coded link graph, computed with Spark RDDs.
spark = SparkSession.builder \
    .appName("Simple PageRank") \
    .getOrCreate()

damping_factor = 0.85  # weight of rank received via links vs. the uniform base share
num_pages = 3
tolerance = 1.0e-6     # stop once the L1 change between two iterations is below this
max_iterations = 10

# Directed edges as (source_page, destination_page); pages are numbered
# 0 .. num_pages - 1.
links = [
    (0, 1),  # Page A links to B
    (0, 2),  # Page A links to C
    (1, 2),  # Page B links to C
    (2, 0)   # Page C links to A
]

links_rdd = spark.sparkContext.parallelize(links)

# (source, [destination, ...]) built once and cached: the graph never changes
# between iterations, and the list length is the source's out-degree, so the
# edge list does not have to be rescanned for every edge.
adjacency_rdd = links_rdd.groupByKey().mapValues(list).cache()

# One zero contribution per page. Without it, a page that nobody links to
# would be missing from the reduceByKey result and silently lose its rank.
zero_contributions_rdd = spark.sparkContext.parallelize(
    [(page_id, 0.0) for page_id in range(num_pages)]
).cache()

# Every page starts with the same rank.
ranks_rdd = spark.sparkContext.parallelize([(i, 1.0 / num_pages) for i in range(num_pages)])


def _spread_rank(entry):
    """Split a page's rank evenly across the pages it links to.

    ``entry`` is one record of ``adjacency_rdd.join(ranks_rdd)``, i.e.
    ``(source, (destinations, rank))``. Returns a list of
    ``(destination, share)`` pairs, one per outgoing link.
    """
    _source, (destinations, source_rank) = entry
    share = source_rank / len(destinations)
    return [(destination, share) for destination in destinations]


for iteration in range(max_iterations):
    # Each page sends rank / out_degree along every outgoing link.
    # NOTE: pages without outgoing links are absent from adjacency_rdd, so
    # their rank is not redistributed to other pages.
    contributions_rdd = adjacency_rdd.join(ranks_rdd).flatMap(_spread_rank)

    # Sum contributions for each page and apply the damping formula.
    # Cached because it is used by the norm below and by the next iteration.
    new_ranks_rdd = contributions_rdd.union(zero_contributions_rdd) \
        .reduceByKey(lambda x, y: x + y) \
        .mapValues(lambda total: (1 - damping_factor) / num_pages + damping_factor * total) \
        .cache()

    # Total difference (L1 norm) between the new and the previous ranks.
    # sum() (unlike reduce()) also works when the RDD is empty.
    norm_diff = new_ranks_rdd.join(ranks_rdd) \
        .map(lambda x: abs(x[1][0] - x[1][1])) \
        .sum()

    # The action above materialised new_ranks_rdd, so the previous ranks can
    # be released. Update before the convergence check so the ranks printed
    # below are always the most recent ones.
    ranks_rdd.unpersist()
    ranks_rdd = new_ranks_rdd

    # Check for convergence
    if norm_diff < tolerance:
        print(f"Convergence reached after {iteration + 1} iterations.")
        break

# Bring the (small) result to the driver and print it as Page A, Page B, ...
final_ranks = ranks_rdd.collect()
for page, rank in sorted(final_ranks):
    print(f"Page {chr(page + 65)}: {rank:.4f}")


Page A: 0.3889
Page B: 0.2144
Page C: 0.3967
