In [1]:
# Environment setup: install the Python packages and the JDK used by the rest of the notebook.
# NOTE(review): pyspark is installed without a version pin; the recorded output of this
# cell shows pyspark-2.4.5 / py4j-0.10.7 being resolved — consider pinning for reproducibility.
!pip install pyspark
!pip install -U -q PyDrive
# Headless Java 8 JDK installed through apt (presumably the JVM PySpark launches — confirm).
!apt install openjdk-8-jdk-headless -qq
import os
# Export JAVA_HOME for this kernel process, pointing at the Java 8 installation path.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"

Collecting pyspark
[?25l  Downloading https://files.pythonhosted.org/packages/9a/5a/271c416c1c2185b6cb0151b29a91fff6fcaed80173c8584ff6d20e46b465/pyspark-2.4.5.tar.gz (217.8MB)
[K     |████████████████████████████████| 217.8MB 60kB/s 
[?25hCollecting py4j==0.10.7
[?25l  Downloading https://files.pythonhosted.org/packages/e3/53/c737818eb9a7dc32a7cd4f1396e787bd94200c3997c72c1dbe028587bd76/py4j-0.10.7-py2.py3-none-any.whl (197kB)
[K     |████████████████████████████████| 204kB 44.3MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-2.4.5-py2.py3-none-any.whl size=218257927 sha256=4f97ffaeb19e1b5b7e32192297e47ab1bdd1e92f7795ed850f3b8cf8303f6eef
  Stored in directory: /root/.cache/pip/wheels/bf/db/04/61d66a5939364e756eb1c1be4ec5bdce6e04047fc7929a3c3c
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.7 pyspark-2.4.5
open

In [0]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# 1) Run the Colab authentication flow for the current user.
auth.authenticate_user()
# 2) Attach the application-default Google credentials to a PyDrive auth object.
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# 3) Build the Drive client from that auth object.
# NOTE(review): `drive` is not referenced by any of the cells visible here — confirm
# whether this authentication step is still needed.
drive = GoogleDrive(gauth)

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

import pyspark
from pyspark.sql import *
from pyspark.sql.types import *
from pyspark.sql.functions import *
from pyspark import SparkContext, SparkConf
from pyspark.mllib.linalg.distributed import CoordinateMatrix, MatrixEntry, DenseMatrix

In [0]:
# Spark configuration: the only setting applied here is the web-UI port (4050).
conf = SparkConf().set("spark.ui.port", "4050")

# Create the SparkContext, or reuse the one already running in this kernel.
# Calling the SparkContext constructor directly raises an error when the cell is
# executed a second time; getOrCreate makes the cell safe to re-run.
# Note that `conf` only takes effect when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# SparkSession bound to the active context.
spark = SparkSession.builder.getOrCreate()

In [0]:
# Open the two graph files through the Spark context; the paths are relative.
# NOTE(review): only `full_data` is consumed by the cells visible here;
# `small_data` appears unused — confirm before removing.
small_data = sc.textFile('graph-small.txt')
full_data = sc.textFile('graph-full.txt')

# NOTE(review): LAMBDA and NU are both set to 1 and are not referenced by any
# cell visible here — presumably scaling constants for the iteration; confirm.
LAMBDA = 1
NU = 1

In [0]:
def _parse_edge(line):
  """Convert one tab-separated "source<TAB>dest" line into a (source, dest) tuple.

  Both ids are parsed as integers and shifted down by one, so ids that start
  at 1 in the file become zero-based indices.
  """
  fields = line.split('\t')
  return (int(fields[0]) - 1, int(fields[1]) - 1)


# Unique (source, dest) index pairs; repeated edges in the file collapse to one.
source_dest_pair = full_data.map(_parse_edge).distinct()

# (row, col, value) triples with value 1 for every edge, in both orientations.
edges = source_dest_pair.map(lambda pair: (pair[0], pair[1], 1))
edges_transpose = source_dest_pair.map(lambda pair: (pair[1], pair[0], 1))

In [0]:
# Link matrix built from the (source, dest, 1) triples: entry (i, j) is 1 when
# there is an edge from node i to node j.
# NOTE(review): no explicit dimensions are passed, so the matrix size is presumably
# inferred from the largest indices present — confirm it matches the 1000-row
# vectors built in the following cells.
L = CoordinateMatrix(edges).toBlockMatrix()
# Same edges with row and column swapped, i.e. the transpose of L.
L_transpose = CoordinateMatrix(edges_transpose).toBlockMatrix()

In [0]:
# Initial hub vector as (row, col, value) triples: 1000 rows, a single column
# (col 0), every value set to 1.
h_init = [(row, 0, 1) for row in range(1000)]

In [0]:
# Distribute the initial triples and convert them to a BlockMatrix so the vector
# can be multiplied with L / L_transpose.
# NOTE(review): the iteration cell reassigns `h`, so re-run this cell first to
# restart the iteration from the all-ones vector.
h = CoordinateMatrix(sc.parallelize(h_init)).toBlockMatrix()

In [0]:
# Number of alternating authority/hub updates to run.
NUM_ITERATIONS = 40


def _scale_by_max(vector):
  """Return `vector` rescaled so that its largest entry becomes 1.

  The matrix is collected to the driver to find its maximum, then multiplied
  on the left by a diagonal matrix holding 1 / max. The diagonal's size is
  taken from `vector.numRows()` rather than being hard-coded.

  Args:
    vector: a BlockMatrix (used here with a single column).

  Returns:
    A BlockMatrix with the same shape as `vector`.

  Raises:
    ValueError: if the largest entry is 0, which would otherwise silently
      produce inf/NaN scores.
  """
  largest = float(vector.toLocalMatrix().toArray().max())
  if largest == 0:
    raise ValueError("cannot normalise a score vector whose maximum is 0")
  size = vector.numRows()
  diagonal = sc.parallelize([(k, k, 1 / largest) for k in range(size)])
  return CoordinateMatrix(diagonal, size, size).toBlockMatrix().multiply(vector)


# Stays None only if NUM_ITERATIONS is 0.
a = None

for _step in range(NUM_ITERATIONS):
  # Authority update: a = L^T h, scaled so max(a) == 1.
  a = _scale_by_max(L_transpose.multiply(h))
  # Hub update: h = L a, scaled so max(h) == 1.
  h = _scale_by_max(L.multiply(a))

In [0]:
# Collect the final hub and authority vectors to the driver as NumPy arrays.
# These are presumably (1000, 1) column arrays, given how `h` was initialised — confirm.
h_numpy = np.array(h.toLocalMatrix().toArray())
a_numpy = np.array(a.toLocalMatrix().toArray())
# Row indices of the five smallest scores (ascending argsort along axis 0).
# The slices stay 2-D; the printing cell relies on that when it reads args[0].
h_min_args = np.argsort(h_numpy, axis = 0)[:5]
a_min_args = np.argsort(a_numpy, axis = 0)[:5]
# Row indices of the five largest scores: argsort of the negated scores.
h_max_args = np.argsort(-h_numpy, axis = 0)[:5]
a_max_args = np.argsort(-a_numpy, axis = 0)[:5]

In [30]:
def _print_ranking(heading, label, order, scores):
  """Print one ranking section followed by a blank spacer.

  Args:
    heading: title line printed before the entries.
    label: score name used in each entry line ("hubbiness" or "authority").
    order: 2-D index array as produced by np.argsort(..., axis=0)[:5]; each
      row holds one zero-based node index.
    scores: 2-D score array that `order` indexes into.
  """
  print(heading)
  for args in order:
    # Node ids are reported one-based, undoing the zero-based shift used for indexing.
    print("Node id: {}, {} score: {}".format(args[0] + 1, label, scores[args][0][0]))
  print("\n\n")


_print_ranking("The 5 node ids with the highest hubbiness scores:", "hubbiness", h_max_args, h_numpy)
_print_ranking("The 5 node ids with the lowest hubbiness scores:", "hubbiness", h_min_args, h_numpy)
# The authority sections previously printed the label "hubbiness score" by mistake.
_print_ranking("The 5 node ids with the highest authority scores:", "authority", a_max_args, a_numpy)
_print_ranking("The 5 node ids with the lowest authority scores:", "authority", a_min_args, a_numpy)

The 5 node ids with the highest hubbiness scores:
Node id: 840, hubbiness score: 1.0
Node id: 155, hubbiness score: 0.9499618624906543
Node id: 234, hubbiness score: 0.8986645288972264
Node id: 389, hubbiness score: 0.863417110184379
Node id: 472, hubbiness score: 0.8632841092495217



The 5 node ids with the lowest hubbiness scores:
Node id: 23, hubbiness score: 0.042066854890936534
Node id: 835, hubbiness score: 0.05779059354433016
Node id: 141, hubbiness score: 0.06453117646225179
Node id: 539, hubbiness score: 0.06602659373418492
Node id: 889, hubbiness score: 0.07678413939216454



The 5 node ids with the highest authority scores:
Node id: 893, hubbiness score: 1.0
Node id: 16, hubbiness score: 0.9635572849634398
Node id: 799, hubbiness score: 0.9510158161074016
Node id: 146, hubbiness score: 0.9246703586198444
Node id: 473, hubbiness score: 0.899866197360405



The 5 node ids with the lowest authority scores:
Node id: 19, hubbiness score: 0.05608316377607618
Node id: 135, hubbine