# 306 Spark - Page Rank

The goal of this lab is to implement the Page Rank algorithm using Spark and RDDs.

First, load the datasets on S3:

- ```datasets/pr-test.txt``` is a dataset that can be used for testing purposes
- ```pr-4.txt``` can be downloaded from [here](https://big.csr.unibo.it/downloads/bigdata/pr-4.txt) (2.7GB)

In [None]:
%%configure -f
{"executorMemory":"6G", "numExecutors":2, "executorCores":2, "conf": {"spark.dynamicAllocation.enabled": "false"}}

In [None]:
// S3 bucket holding the datasets (alternative bucket kept for reference)
//val bucketname = "unibo-bd2122-egallinucci"
val bucketname = "eg-myfirstbucket"

// Dataset to process; switch to "pr-test" for a quick test run
//val mat = "pr-test"
val mat = "pr-4"
val path_mat = s"s3a://${bucketname}/datasets/${mat}.txt"

// Number of partitions: 2 for the test dataset, 50 for any other dataset
val nPartitions = mat match { case "pr-test" => 2; case _ => 50 }

// Last expression of the cell: a reminder string built from the application id
s"SPARK UI: Enable forwarding of port 20888 and connect to http://localhost:20888/proxy/${sc.applicationId}/"

In [None]:
// Unpersist every RDD that the SparkContext currently tracks as persistent
sc.getPersistentRDDs.values.foreach(rdd => rdd.unpersist())

## 306-1 PageRank Map-Reduce

In [None]:
import org.apache.spark.storage.StorageLevel._
import org.apache.spark.HashPartitioner

// Hash partitioner with nPartitions partitions, used for both the matrix and the vector
val p = new HashPartitioner(nPartitions)

// Matrix M read from text lines of the form "i,j,mij" and keyed by the column index j.
// The result is partitioned with p and persisted in serialized form (memory, then disk).
val rddM = sc.
    textFile(path_mat).
    map(r => {
        val f = r.split(",") // split each line once instead of once per field
        (f(1).toInt, (f(0).toInt, f(2).toDouble)) // (j,(i,mij))
    }).
    partitionBy(p).
    persist(MEMORY_AND_DISK_SER)

In [None]:
// n = number of distinct keys (column indices j) found in rddM
val n = rddM.keys.distinct().count()
// Vector V with one entry per index 0..n-1, each set to 1/n, partitioned with p: (j,vj)
val rddV = sc.
    parallelize(List.range(0L, n).map(idx => (idx.toInt, 1.0 / n))).
    partitionBy(p)

In [None]:
// Number of multiplication steps still to run
var iterations = 3

// Current vector as (index, value) pairs; starts from rddV
var rddV1 = rddV

while (iterations > 0) {
    // Pair every matrix entry (j,(i,mij)) with the vector entry vj that shares its key j
    val joined = rddM.join(rddV1)
    // Each pair contributes mij*vj to component i of the next vector
    val contributions = joined.map({ case (_, ((i, mij), vj)) => (i, mij * vj) })
    // Sum the contributions per component, using partitioner p for the result
    rddV1 = contributions.reduceByKey(p, (a, b) => a + b)

    iterations -= 1
}

// Bring the final vector back to the driver
rddV1.collect()

## 306-2 PageRank Map-Reduce - Broadcasting V

In [None]:
// Number of multiplication steps still to run
var iterations = 3
// Driver-side copy of the vector: index -> value
var v = rddV.collectAsMap()

while(iterations > 0){
    // Ship the current vector to the executors
    val bV = sc.broadcast(v)

    // For every matrix entry (j,(i,mij)) emit (i, mij*vj) and sum per i.
    // An index missing from the broadcast map counts as 0.0 instead of throwing on
    // Option.get; this matches the join-based version, where matrix entries without a
    // matching vector entry are dropped.
    val rddV1 = rddM.
        map({case(j,(i,mij)) => (i, mij * bV.value.getOrElse(j, 0.0))}).
        reduceByKey(p,_+_)

    iterations = iterations - 1
    // collectAsMap runs the job; the broadcast is destroyed only after it has completed
    val v1 = rddV1.collectAsMap()
    bV.destroy()
    v = v1
}
// Final vector as a driver-side map
v

In [None]:
// Release all persisted RDDs before building the row-keyed matrix
sc.getPersistentRDDs.values.foreach(cached => cached.unpersist())

In [None]:
// Matrix M read from text lines of the form "i,j,mij", this time keyed by the row index i.
// The result is partitioned with p and persisted in serialized form (memory, then disk).
val rddM1 = sc.
    textFile(path_mat).
    map(r => {
        val f = r.split(",") // split each line once instead of once per field
        (f(0).toInt, (f(1).toInt, f(2).toDouble)) // (i,(j,mij))
    }).
    partitionBy(p).
    persist(MEMORY_AND_DISK_SER)

// n = number of distinct keys (row indices i) found in rddM1
// NOTE(review): the 306-1 cell counts distinct column indices instead; presumably both equal
// the number of nodes in the dataset - verify.
val n = rddM1.map({case (k,v) => k}).distinct().count()
// Vector V with one entry per index 0..n-1, each set to 1/n, partitioned with p: (j,vj)
val rddV = sc.
    parallelize(List.range(0,n).map(j => (j.toInt, 1/n.toDouble))).
    partitionBy(p)

In [None]:
// Number of multiplication steps still to run
var iterations = 3
// Driver-side copy of the vector: index -> value
var v = rddV.collectAsMap()

while(iterations > 0){
    // Ship the current vector to the executors
    val bV = sc.broadcast(v)

    // rddM1 is already keyed by i, so only the values change: (j,mij) becomes mij*vj,
    // and the products are then summed per i.
    // An index missing from the broadcast map counts as 0.0 instead of throwing on Option.get.
    val rddV1 = rddM1.
        mapValues({case(j,mij) => mij * bV.value.getOrElse(j, 0.0)}).
        reduceByKey(p,_+_)

    iterations = iterations - 1
    // collectAsMap runs the job; the broadcast is destroyed only after it has completed
    val v1 = rddV1.collectAsMap()
    bV.destroy()
    v = v1
}
// Final vector as a driver-side map
v

## 306-3 Matrix-Matrix multiplication

Use the same datasets to implement a matrix-matrix multiplication