In [None]:
val sc = new SparkContext(...)
  // 假定邻居页面的List存储为Spark objectFile
  val links = sc.objectFile[(String, Seq[String])]("links")
    .partitionBy(new HashPartitioner(100))
    .persist()

  //设置页面的初始rank值为1.0
  var ranks = links.mapValues(_ => 1.0)

  //迭代10次
  for (i <- 0 until 10) {
    val contributions = links.join(ranks).flatMap {
      case (pageId, (links, rank)) =>
        //注意此时的links为模式匹配获得的值，类型为Seq[String]，并非前面读取出来的页面List
        links.map(dest => (dest, rank / links.size))
    }
    //简化了的rank计算公式
    ranks = contributions.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _)
  }
  ranks.saveAsTextFile("ranks")

In [1]:
val linkage = sc.textFile("soc-LiveJournal1.txt")

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.174.130:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1572968146256)
SparkSession available as 'spark'


linkage: org.apache.spark.rdd.RDD[String] = soc-LiveJournal1.txt MapPartitionsRDD[1] at textFile at <console>:25


In [5]:
val lineLengths = linkage.map(s => s.length)

lineLengths: org.apache.spark.rdd.RDD[Int] = MapPartitionsRDD[2] at map at <console>:26


In [8]:
linkage.take(10).foreach(println)

# FromNodeId	ToNodeId
0	1
0	2
0	3
0	4
0	5
0	6
0	7
0	8
0	9


In [9]:
lineLengths.take(10).foreach(println)

21
3
3
3
3
3
3
3
3
3


In [18]:
val test = sc.textFile("test.txt").option("header", "true")

<console>: 25: error: value option is not a member of org.apache.spark.rdd.RDD[String]

In [21]:
var header = test.first()
var data = test.filter(row => row != header)
data.take(10).foreach(println)

0	1
0	2
0	3
0	4
0	5
0	6
0	7
0	8
0	9
0	10


header: String = # FromNodeId	ToNodeId
data: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[10] at filter at <console>:30


In [30]:
var quiz1 = data.map(_.split('\t')).map(pair => (pair(0),pair(1)))

quiz1: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[16] at map at <console>:26


In [31]:
quiz1.take(10).foreach(println)

(0,1)
(0,2)
(0,3)
(0,4)
(0,5)
(0,6)
(0,7)
(0,8)
(0,9)
(0,10)


In [29]:
quiz1.take(1).type()

<console>: 2: error: identifier expected but 'type' found.

In [73]:
val file = spark.read.textFile("a.txt").rdd
var header = file.first()
var lines = file.filter(row => row != header)
val pairs = lines.map{ s =>
      val parts = s.split("\\s+")               // Splits a line into an array of 2 elements according space(s)
             (parts(0), parts(1))                 // create the parts<url, url> for each line in the file
    }
val links = pairs.distinct().groupByKey()
links.collect().foreach(println)

(url_4,CompactBuffer(url_3, url_1))
(url_2,CompactBuffer(url_1))
(url_3,CompactBuffer(url_2, url_1))
(url_1,CompactBuffer(url_4))


file: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[146] at rdd at <console>:31
header: String = a	b
lines: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[147] at filter at <console>:33
pairs: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[148] at map at <console>:34
links: org.apache.spark.rdd.RDD[(String, Iterable[String])] = ShuffledRDD[152] at groupByKey at <console>:38


In [74]:
var ranks = links.mapValues(v => 1.0)

ranks: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[153] at mapValues at <console>:26


In [75]:
ranks.collect().foreach(println)

(url_4,1.0)
(url_2,1.0)
(url_3,1.0)
(url_1,1.0)


In [None]:
for (i <- 1 to iters) {
    val contribs = links.join(ranks)         // join  -> RDD1
         .values                           // extract values from RDD1 -> RDD2          
         .flatMap{ case (urls, rank) =>    // RDD2 -> conbrib RDD
                 val size = urls.size        
                     urls.map(url => (url, rank / size))   // the ranks are distributed equally amongs the various URLs
             }
    ranks = contribs.reduceByKey(_ + _).mapValues(0.15 + 0.85 * _) // ranks RDD
}

In [72]:
val Tfile = spark.read.textFile("test.txt").rdd
var Theader = Tfile.first()
var Tlines = Tfile.filter(row => row != Theader)
val Tpairs = Tlines.map{ s =>
      val parts = s.split("\t")               // Splits a line into an array of 2 elements according space(s)
             (parts(0), parts(1))                 // create the parts<url, url> for each line in the file
    }
val Tlinks = Tpairs.distinct().groupByKey().cache()
Tlinks.collect().foreach(println)

(0,CompactBuffer(2, 9, 15, 19, 17, 13, 3, 7, 11, 5, 10, 1, 8, 12, 6, 16, 14, 18, 4))


Tfile: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[136] at rdd at <console>:31
Theader: String = # FromNodeId	ToNodeId
Tlines: org.apache.spark.rdd.RDD[String] = MapPartitionsRDD[137] at filter at <console>:33
Tpairs: org.apache.spark.rdd.RDD[(String, String)] = MapPartitionsRDD[138] at map at <console>:34
Tlinks: org.apache.spark.rdd.RDD[(String, Iterable[String])] = ShuffledRDD[142] at groupByKey at <console>:38


In [78]:
var Tranks = Tlinks.mapValues(v => 1.0)

Tranks: org.apache.spark.rdd.RDD[(String, Double)] = MapPartitionsRDD[155] at mapValues at <console>:26


In [79]:
Tranks.collect().foreach(println)

(0,1.0)
