In [1]:
%scala
// Split the book title on single spaces and distribute the words over two partitions.
val myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
val words = spark.sparkContext.parallelize(myCollection, numSlices = 2)

In [2]:
%scala
// Emit a single 1 per partition and add them up.
words.mapPartitions(_ => Iterator(1)).sum()

In [3]:
%scala
/** Prefixes every element of a partition with the index of that partition.
  *
  * The iterator is mapped lazily rather than being copied into a List first,
  * so the whole partition is never held in memory at once.
  *
  * @param partitionIndex     index of the partition being processed
  * @param withinPartIterator the elements of that partition
  * @return an iterator of strings of the form "Partition: <index> => <value>"
  */
def indexedFunc(partitionIndex: Int, withinPartIterator: Iterator[String]): Iterator[String] = {
  withinPartIterator.map(value => s"Partition: $partitionIndex => $value")
}
// Apply indexedFunc to every partition of words and collect the tagged strings.
words.mapPartitionsWithIndex(indexedFunc).collect()

In [4]:
%scala
// Write the contents of each partition to its own file under /tmp.
words.foreachPartition { iter =>
  import java.io._
  import scala.util.Random
  // A random integer keeps the file names of different partitions apart.
  val randomFileName = new Random().nextInt()
  val pw = new PrintWriter(new File(s"/tmp/random-file-${randomFileName}.txt"))
  try {
    // Elements are written back to back, without any separator.
    iter.foreach(word => pw.write(word))
  } finally {
    // Release the file handle even if writing fails part-way through.
    pw.close()
  }
}

In [5]:
%scala
// Put two strings into two partitions, then turn each partition into an array and collect.
sc.parallelize(Seq("Hello", "World"), numSlices = 2).glom().collect()

In [6]:
%scala
// Pair every lower-cased word with the count 1.
words.map(w => (w.toLowerCase, 1))

In [7]:
%scala
// Key every word by the first character of its lower-cased form.
words.keyBy(w => w.toLowerCase.toSeq(0))

In [8]:
%scala
// Key words by their first lower-cased character, then upper-case only the values.
words
  .map(w => (w.toLowerCase.toSeq(0), w))
  .mapValues(_.toUpperCase)
  .collect()

In [9]:
%scala
// Key words by their first lower-cased character, then flat-map over each
// upper-cased value so that every character becomes its own (key, char) row.
words
  .map(w => (w.toLowerCase.toSeq(0), w))
  .flatMapValues(_.toUpperCase)
  .collect()

In [10]:
%scala
// Collect only the keys (first lower-cased character of each word) ...
words.map(w => (w.toLowerCase.toSeq(0), w)).keys.collect()
// ... and then only the values (the original words).
words.map(w => (w.toLowerCase.toSeq(0), w)).values.collect()

In [11]:
%scala
// Fetch every value stored under the key "spark".
words.map(w => (w.toLowerCase, 1)).lookup("spark")

In [12]:
%scala
// All lower-cased characters of all words ...
val chars = words.flatMap(_.toLowerCase.toSeq)
// ... each paired with the count 1.
val KVcharacters = chars.map((_, 1))
/** Returns the larger of the two integers. */
def maxFunc(left: Int, right: Int) = if (left >= right) left else right
/** Returns the sum of the two integers. */
def addFunc(left: Int, right: Int) = {
  left + right
}
// The integers 1 through 30, spread over five partitions.
val nums = sc.parallelize(1 to 30, numSlices = 5)

In [13]:
%scala
// Exact count of rows per key.
KVcharacters.countByKey()
// Approximate count, bounded by a timeout and a confidence level.
val timeout = 1000L // milliseconds
val confidence = 0.95
KVcharacters.countByKeyApprox(timeout = timeout, confidence = confidence)

In [14]:
%scala
// Group all values under their key, then add up each group with addFunc.
KVcharacters
  .groupByKey()
  .map { case (letter, counts) => (letter, counts.reduce(addFunc)) }
  .collect()

In [15]:
%scala
// Add up the values of each key with addFunc.
KVcharacters.reduceByKey((l, r) => addFunc(l, r)).collect()

In [16]:
%scala
// Start from 0, use maxFunc as the first operator and addFunc as the second.
nums.aggregate(0)((acc, n) => maxFunc(acc, n), (a, b) => addFunc(a, b))

In [17]:
%scala
// Same operators as the aggregate call, applied through treeAggregate at its default depth.
nums.treeAggregate(0)(maxFunc, addFunc, depth = 2)

In [18]:
%scala
// Per key: start from 0, use addFunc as the first operator and maxFunc as the second.
KVcharacters.aggregateByKey(0)((acc, v) => addFunc(acc, v), (a, b) => maxFunc(a, b)).collect()

In [19]:
%scala
// Note that the three combiner steps are defined as function values rather than methods.
// Wrap the first value seen for a key in a one-element list.
val valToCombiner = (value: Int) => value :: Nil
// Prepend a further value to an existing list.
val mergeValuesFunc = (vals: List[Int], valToAppend: Int) => vals.::(valToAppend)
// Concatenate two lists that were built separately.
val mergeCombinerFunc = (vals1: List[Int], vals2: List[Int]) => vals1 ++ vals2
val outputPartitions = 6
KVcharacters
  .combineByKey(valToCombiner, mergeValuesFunc, mergeCombinerFunc, outputPartitions)
  .collect()

In [20]:
%scala
// Fold the values of each key with addFunc, starting from 0.
KVcharacters.foldByKey(0)(addFunc).collect()

In [21]:
%scala
// Collect the distinct lower-cased characters, give each one a random fraction,
// then sample the keyed words per key (with replacement, fixed seed).
val distinctChars = words.flatMap(_.toLowerCase.toSeq).distinct.collect()
import scala.util.Random
val sampleMap = distinctChars.map((_, new Random().nextDouble())).toMap
words
  .map(w => (w.toLowerCase.toSeq(0), w))
  .sampleByKey(withReplacement = true, fractions = sampleMap, seed = 6L)
  .collect()

In [22]:
%scala
// Same sampling as sampleByKey, but through sampleByKeyExact.
words
  .map(w => (w.toLowerCase.toSeq(0), w))
  .sampleByKeyExact(withReplacement = true, fractions = sampleMap, seed = 6L)
  .collect()

In [23]:
%scala
import scala.util.Random
// Distinct lower-cased characters, kept as an RDD (not collected).
val distinctChars = words.flatMap(_.toLowerCase.toSeq).distinct
// Three RDDs over the same keys, each with its own random values.
val charRDD = distinctChars.map((_, new Random().nextDouble()))
val charRDD2 = distinctChars.map((_, new Random().nextDouble()))
val charRDD3 = distinctChars.map((_, new Random().nextDouble()))
// Group the three RDDs by key and look at the first five groups.
charRDD.cogroup(charRDD2, charRDD3).take(5)

In [24]:
%scala
// distinctChars is an RDD at this point (it is built with flatMap/distinct and never
// collected in the preceding cell), so it is keyed directly. Wrapping it in
// sc.parallelize does not compile, because parallelize expects a local Seq.
val keyedChars = distinctChars.map(c => (c, new Random().nextDouble()))
val outputPartitions = 10
// Join on the character key, first with the default partitioning ...
KVcharacters.join(keyedChars).count()
// ... then with an explicit number of output partitions.
KVcharacters.join(keyedChars, outputPartitions).count()


In [25]:
%scala
// Ten integers in two partitions, zipped element by element with words.
val numRange = sc.parallelize(0 to 9, numSlices = 2)
words.zip(numRange).collect()

In [26]:
%scala
// Collapse words into a single partition.
words.coalesce(numPartitions = 1)

In [27]:
%scala
// Redistribute words over ten partitions.
words.repartition(numPartitions = 10)

In [28]:
%scala
// Read every CSV file under the streaming directory, with a header row and inferred types.
val df = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("dbfs:/mnt/defg/streaming/*.csv")
// Reduce to ten partitions and drop down to the underlying RDD of rows.
val rdd = df.coalesce(10).rdd

In [29]:
%scala
// Print column 6 of the first five rows.
rdd.map(row => row(6)).take(5).foreach(println)
// Key every row by column 6, read as a Double.
// NOTE(review): the meaning of column 6 depends on the CSV schema, which is not visible here.
val keyedRDD = rdd.keyBy(r => r(6).asInstanceOf[Double])
import org.apache.spark.HashPartitioner
// Hash-partition the keyed rows into ten partitions.
keyedRDD.partitionBy(new HashPartitioner(10))

In [30]:
%scala
import org.apache.spark.{Partitioner}
/** Partitioner that buckets Double keys by thousands into 20 partitions.
  *
  * A key k is sent to partition (k / 1000).toInt, clamped to the valid range
  * [0, numPartitions - 1]. Without the clamp, a negative key or a key of 20000
  * or more would produce a partition index outside the range declared by
  * numPartitions.
  */
class DomainPartitioner extends Partitioner {
  def numPartitions: Int = 20

  /** Returns the partition index for `key`, which must be a Double. */
  def getPartition(key: Any): Int = {
    val bucket = (key.asInstanceOf[Double] / 1000).toInt
    math.min(math.max(bucket, 0), numPartitions - 1)
  }

  // All instances partition identically, so they compare equal.
  override def equals(other: Any): Boolean = other.isInstanceOf[DomainPartitioner]

  override def hashCode: Int = numPartitions
}
// Repartition the keyed rows with the custom DomainPartitioner.
val res = keyedRDD.partitionBy(new DomainPartitioner())

In [31]:
%scala
// For every partition, count the distinct values of column 6 among its rows.
// An empty partition yields 0, so the result is a plain array of Ints; the earlier
// `if` without an `else` produced `()` for empty partitions instead.
res
  .glom()
  .collect()
  .map(arr => arr.map(_._2(6)).toSet.size)

In [32]:
%scala
// Repartition with DomainPartitioner and sort by key inside each resulting partition.
keyedRDD.repartitionAndSortWithinPartitions(new DomainPartitioner())

In [33]:
%scala
/** Serializable holder for one mutable Int, with a chainable setter. */
class SomeClass extends Serializable {
  var someValue: Int = 0

  /** Stores `i` in someValue and returns this instance so calls can be chained. */
  def setSomeValue(i: Int): SomeClass = {
    this.someValue = i
    this
  }
}
// Wrap each of the integers 1 through 10 in its own SomeClass instance.
sc.parallelize(1 to 10).map(n => new SomeClass().setSomeValue(n))

In [34]:
%scala
// Split the book title on single spaces and distribute the words over two partitions.
val myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
val words = spark.sparkContext.parallelize(myCollection, numSlices = 2)

In [35]:
# Python counterpart of the Scala cell: split the title into words, two partitions.
my_collection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
words = spark.sparkContext.parallelize(my_collection, numSlices=2)

In [36]:
%scala
// Lookup table from selected words to an integer value.
val supplementalData = Map(
  ("Spark", 1000),
  ("Definitive", 200),
  ("Big", -300),
  ("Simple", 100)
)

In [37]:
%scala
// Broadcast the lookup table and read it back once.
val suppBroadcast = spark.sparkContext.broadcast(supplementalData)
suppBroadcast.value
// Pair each word with its value from the broadcast table, defaulting to 0.
val suppWords = words.map(w => (w, suppBroadcast.value.getOrElse(w, 0)))
// Sort the pairs by that value.
suppWords.sortBy(_._2).collect()

In [38]:
%scala
/** One flight-summary record: destination country, origin country and a count. */
case class Flight(
  DEST_COUNTRY_NAME: String,
  ORIGIN_COUNTRY_NAME: String,
  count: BigInt
)
// Load the 2010 summary parquet data as a typed Dataset of Flight.
val flights = spark.read.parquet("/mnt/defg/chapter-1-data/parquet/2010-summary.parquet/").as[Flight]

In [39]:
%scala
import org.apache.spark.util.LongAccumulator
// Create a long accumulator and register it without a name.
val accUnnamed = new LongAccumulator()
sc.register(accUnnamed)

In [40]:
%scala
// Create a long accumulator and register it under the name "China" ...
val accChina = new LongAccumulator()
sc.register(accChina, "China")
// ... and create a second one of the same name via longAccumulator.
val accChina2 = sc.longAccumulator("China")

In [41]:
%scala
/** Adds the flight's count to accChina once for each endpoint that equals "China".
  *
  * The destination is checked before the origin; a row where both are "China"
  * contributes its count twice.
  */
def accChinaFunc(flight_row: Flight) = {
  val endpoints = Seq(flight_row.DEST_COUNTRY_NAME, flight_row.ORIGIN_COUNTRY_NAME)
  endpoints
    .filter(_ == "China")
    .foreach(_ => accChina.add(flight_row.count.toLong))
}

In [42]:
%scala
// Run accChinaFunc on every flight record.
flights.foreach(row => accChinaFunc(row))

In [43]:
%scala
// Inspect the current value of the accChina accumulator.
accChina.value

In [44]:
%scala
import scala.collection.mutable.ArrayBuffer
// Empty mutable buffer of BigInt values.
val arr = ArrayBuffer.empty[BigInt]
import scala.collection.mutable.ArrayBuffer
import org.apache.spark.util.AccumulatorV2
/** Accumulator that sums only the even values added to it.
  *
  * Odd inputs are ignored by add. merge adds the other accumulator's value as is.
  */
class EvenAccumulator extends AccumulatorV2[BigInt, BigInt] {
  private var num: BigInt = 0

  /** Sets the running total back to zero. */
  def reset(): Unit = {
    this.num = 0
  }

  /** Adds `intValue` to the total if it is even; otherwise does nothing. */
  def add(intValue: BigInt): Unit = {
    if (intValue % 2 == 0) {
      this.num += intValue
    }
  }

  /** Adds the total of `other` to this accumulator's total. */
  def merge(other: AccumulatorV2[BigInt, BigInt]): Unit = {
    this.num += other.value
  }

  /** Returns the current total. */
  def value(): BigInt = {
    this.num
  }

  /** Returns a new accumulator that carries the same total as this one.
    *
    * A copy has to preserve the state; returning an empty accumulator here would
    * make every copy silently lose the current total.
    */
  def copy(): AccumulatorV2[BigInt, BigInt] = {
    val duplicate = new EvenAccumulator
    duplicate.num = this.num
    duplicate
  }

  /** True when the total is zero. */
  def isZero(): Boolean = {
    this.num == 0
  }
}
// Create the accumulator and register it under the name "evenAcc".
// register returns Unit, so its result is not bound to a value.
val acc = new EvenAccumulator
sc.register(acc, "evenAcc")
// Value before any flight has been processed.
acc.value
// Feed every flight's count into the accumulator.
flights.foreach(flight_row => acc.add(flight_row.count))
// Value after processing.
acc.value