In [1]:
# Install the Python packages this notebook needs into the runtime.
# NOTE(review): pyspark is not version-pinned; the recorded output of this cell
# shows pyspark 2.4.5 / py4j 0.10.7 being installed — consider pinning so that
# a later re-run reproduces the same environment.
!pip install pyspark
!pip install -U -q PyDrive
# Install a headless Java 8 JDK (presumably the JVM that Spark is launched on).
!apt install openjdk-8-jdk-headless -qq
import os
# Point JAVA_HOME at the Java 8 installation directory.
# NOTE(review): the path assumes the amd64 Debian/Ubuntu package layout — confirm
# it matches the runtime image.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/9a/5a/271c416c1c2185b6cb0151b29a91fff6fcaed80173c8584ff6d20e46b465/pyspark-2.4.5.tar.gz (217.8MB)
[K     |████████████████████████████████| 217.8MB 61kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 37.4MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.5-py2.py3-none-any.whl size=218257927 sha256=e0f8ffeef7d8e56e3e0ae637e6877e96812df4df0f1abb88f81eba92fc013f44
  Stored in directory: /root/.cache/pip/wheels/bf/db/04/61d66a5939364e756eb1c1be4ec5bdce6e04047fc7929a3c3c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.5
open

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate the Colab user and create the PyDrive client.
# The steps are order-dependent: the user is authenticated first, then the
# application-default credentials are attached to the GoogleAuth object that
# GoogleDrive is constructed from.
# NOTE(review): `drive` is not referenced by any other cell visible in this
# notebook — confirm whether this cell is still needed.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry, DenseMatrix

In [0]:
# Spark configuration: serve the Spark web UI on port 4050.
conf = SparkConf().set("spark.ui.port", "4050")

# Create the context, or reuse the one that is already running.
# Calling the SparkContext constructor directly makes this cell fail on a
# second execution in the same kernel ("Cannot run multiple SparkContexts at
# once"); getOrCreate keeps the cell safe to re-run. When a context already
# exists it is returned as-is and `conf` is not re-applied.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# Create the session (or fetch the existing one).
spark = SparkSession.builder.getOrCreate()

In [0]:
# Open the two edge-list files as RDDs of text lines (paths are relative to
# the working directory).
# NOTE(review): `small_data` is not used by any other cell visible in this
# notebook; only `full_data` feeds the computation below.
small_data = sc.textFile('graph-small.txt')
full_data = sc.textFile('graph-full.txt')
# Weight applied to the link-following term of the PageRank update; the
# remaining (1 - BETA) share is spread evenly over all nodes as teleportation.
BETA = 0.8

In [0]:
def _parse_edge(line):
  """Convert one 'source<TAB>destination' line into a 0-based (source, destination) tuple."""
  fields = line.split('\t')
  return (int(fields[0]) - 1, int(fields[1]) - 1)

# Unique directed edges; duplicate lines in the input collapse to one edge.
source_dest_pair = full_data.map(_parse_edge).distinct()

# Adjacency entries as (row=destination, col=source, 1).
edges = source_dest_pair.map(lambda pair: (pair[1], pair[0], 1))

# Out-degree of every source node, then the diagonal entries (node, node, 1/out-degree).
out_degree = source_dest_pair.map(lambda pair: (pair[0], 1)).reduceByKey(lambda a, b: a + b)
degrees = out_degree.map(lambda node_count: (node_count[0], node_count[0], 1 / node_count[1]))

In [0]:
# Number of nodes: the largest 0-based node id that appears as either a source
# or a destination, plus one.
num_nodes = source_dest_pair.flatMap(lambda pair: pair).max() + 1

# Give both matrices explicit num_nodes x num_nodes dimensions. When the sizes
# are omitted, CoordinateMatrix infers each dimension from the largest index
# present in its own entries, so a highest-id node with no incoming (or no
# outgoing) edge would silently produce a smaller, non-square matrix that can
# no longer be multiplied with the rank vector.
edge_matrix = CoordinateMatrix(edges, num_nodes, num_nodes).toBlockMatrix()
degree_inverse_matrix = CoordinateMatrix(degrees, num_nodes, num_nodes).toBlockMatrix()

# M[dest, src] = 1 / out-degree(src) for every edge src -> dest.
M = edge_matrix.multiply(degree_inverse_matrix)

In [0]:
# Size every vector/matrix from the transition matrix instead of repeating a
# hard-coded node count, so the notebook keeps working when the input graph
# changes.
NUM_NODES = M.numRows()
assert M.numCols() == NUM_NODES, "transition matrix M must be square"

# (row, col, value) entries for:
#   r_init        - uniform starting rank 1/N in a single column,
#   beta_init     - BETA on the diagonal (BETA times the identity),
#   teleport_init - constant teleport share (1 - BETA)/N in a single column.
r_init = [(i, 0, 1 / NUM_NODES) for i in range(NUM_NODES)]
beta_init = [(i, i, BETA) for i in range(NUM_NODES)]
teleport_init = [(i, 0, (1 - BETA) / NUM_NODES) for i in range(NUM_NODES)]

In [0]:
def _to_block_matrix(entries):
  """Distribute a local list of (row, col, value) entries and return it as a BlockMatrix."""
  return CoordinateMatrix(sc.parallelize(entries)).toBlockMatrix()

# Rank vector, BETA-scaled diagonal matrix and teleport vector, in that order.
r, beta, teleport = (
  _to_block_matrix(entries) for entries in (r_init, beta_init, teleport_init)
)

In [0]:
# Number of power-iteration steps.
NUM_ITERATIONS = 40

# beta * M is the same in every iteration, so build it once and cache it
# instead of re-deriving the full matrix product inside the loop.
damped_M = beta.multiply(M)
damped_M.cache()

# PageRank update: r <- teleport + (beta * M) * r
# NOTE: `r` is overwritten in place, so re-running this cell continues from the
# previous result rather than from the initial uniform vector.
for _ in range(NUM_ITERATIONS):
  r = teleport.add(damped_M.multiply(r))

In [0]:
# Bring the distributed rank vector back to the driver as a dense
# single-column NumPy array.
local_ranks = r.toLocalMatrix()
r_numpy = np.array(local_ranks.toArray())

# Row orderings by ascending score and by descending score (the latter obtained
# by sorting the negated scores).
ascending_order = np.argsort(r_numpy, axis = 0)
descending_order = np.argsort(-r_numpy, axis = 0)

# Keep the first five rows of each ordering: the five lowest and five highest.
min_args = ascending_order[:5]
max_args = descending_order[:5]

In [289]:
# One output row per node; shared by both listings.
row_template = "Node id: {}, PageRank score: {}"

print("The top 5 node ids with the PageRank scores:")
for node_index in max_args[:, 0]:
  # Indices are 0-based internally; add 1 to report the id as it appears in the input file.
  print(row_template.format(node_index + 1, r_numpy[node_index, 0]))

print("\n\n")

print("The bottom 5 node ids with the PageRank scores:")
for node_index in min_args[:, 0]:
  print(row_template.format(node_index + 1, r_numpy[node_index, 0]))

The top 5 node ids with the PageRank scores:
Node id: 263, PageRank score: 0.002020291181518219
Node id: 537, PageRank score: 0.00194334157145315
Node id: 965, PageRank score: 0.0019254478071662631
Node id: 243, PageRank score: 0.001852634016241731
Node id: 285, PageRank score: 0.0018273721700645144



The bottom 5 node ids with the PageRank scores:
Node id: 558, PageRank score: 0.0003286018525215297
Node id: 93, PageRank score: 0.0003513568937516577
Node id: 62, PageRank score: 0.00035314810510596274
Node id: 424, PageRank score: 0.00035481538649301454
Node id: 408, PageRank score: 0.00038779848719291705
