In [1]:
%scala
/**
 * Typed view of one flight record.
 *
 * The field names are used as-is when a DataFrame is converted with `.as[Flight]`,
 * so they are kept in the upper-snake-case form used by the data rather than camelCase.
 *
 * @param DEST_COUNTRY_NAME   name of the destination country
 * @param ORIGIN_COUNTRY_NAME name of the origin country
 * @param count               numeric count attached to the record (presumably the number
 *                            of flights on the route -- TODO confirm against the dataset)
 */
case class Flight(
  DEST_COUNTRY_NAME: String,
  ORIGIN_COUNTRY_NAME: String,
  count: BigInt
)

In [2]:
%scala
// Read the flight data from a Parquet file into an untyped DataFrame.
// NOTE(review): the path has no leading "/" and no scheme, so it is resolved relative to
// the default filesystem's working directory -- confirm this is the intended location.
val flightsDF = spark.read.parquet("FileStore/tables/6jc7prea1497642653485/part_r_00000_1a9822ba_b8fb_4d8e_844a_ea30d0801b9e_gz-11168.parquet")
// Typed Dataset[Flight] view over the same data.
val flights = flightsDF.as[Flight]

In [3]:
%scala
// Fetch two Flight records from the typed Dataset.
flights.take(2)

In [4]:
%scala
// Read a single field of the first record through the case-class accessor.
flights.first.DEST_COUNTRY_NAME

In [5]:
%scala
/**
 * Tells whether a flight record has the same origin and destination country.
 *
 * @param flight_row the flight record to inspect
 * @return `true` when `ORIGIN_COUNTRY_NAME` equals `DEST_COUNTRY_NAME`
 */
def originIsDestination(flight_row: Flight): Boolean =
  // The comparison is the method's value; no explicit `return` is needed.
  flight_row.ORIGIN_COUNTRY_NAME == flight_row.DEST_COUNTRY_NAME

In [6]:
%scala
// Apply the predicate on the Dataset and fetch the first matching record.
flights.filter(flight_row => originIsDestination(flight_row)).first()

In [7]:
%scala
// Same predicate, but applied to a local Array: collect() brings every record to the
// driver first, so the filter here runs on the local collection rather than the Dataset.
flights.collect().filter(flight_row => originIsDestination(flight_row))

In [8]:
%scala
// Project each Flight to its destination country name.
val destinations = flights.map(f => f.DEST_COUNTRY_NAME)

In [9]:
%scala
// Bring ten destination names back as a local array.
val localDestinations = destinations.take(10)

In [10]:
%scala
// Extra per-count metadata to join against the flights Dataset.
case class FlightMetadata(count: BigInt, randomData: BigInt)
// Build 500 rows: the range value paired with a random Long, then rename the tuple
// columns (_1, _2) to match the FlightMetadata field names before converting.
val flightsMeta = spark.range(500)
.map(x => (x, scala.util.Random.nextLong))
.withColumnRenamed("_1", "count")
.withColumnRenamed("_2", "randomData")
.as[FlightMetadata]
// Typed join on equal `count`; joinWith keeps both sides as whole objects
// (referenced as _1 and _2 in later cells).
val flights2 = flights
.joinWith(flightsMeta,
flights.col("count") === flightsMeta.col("count"))

In [11]:
%scala
// Reach into the first element of the joined pair to select a Flight field.
flights2.selectExpr("_1.DEST_COUNTRY_NAME")

In [12]:
%scala
// Fetch two joined records.
flights2.take(2)

In [13]:
%scala
// Column-name join on "count"; this rebinds flights2 from the earlier joinWith result.
val flights2 = flights.join(flightsMeta, Seq("count"))

In [14]:
%scala
// Same column-name join, with the metadata side explicitly converted to a DataFrame first.
val flights2 = flights.join(flightsMeta.toDF(), Seq("count"))

In [15]:
%scala
// REST API URL for this application, built from the Spark UI base URL and the application id.
// uiWebUrl is an Option (empty when the UI is disabled), so the URL is built with map
// instead of calling .get, which would throw NoSuchElementException on None.
// The value is bound to a name because a bare string expression in the middle of a cell
// is evaluated and then discarded.
val applicationApiUrl = sc.uiWebUrl.map(uiUrl => s"$uiUrl/api/v1/applications/${sc.applicationId}")

// Row count per destination country using the untyped (column-name) grouping API.
flights.groupBy("DEST_COUNTRY_NAME").count()

In [16]:
%scala
// Row count per destination country using the typed (function-keyed) grouping API.
flights.groupByKey(x => x.DEST_COUNTRY_NAME).count()

In [17]:
%scala
// Show the plan for the typed grouping, for comparison with the untyped groupBy plan.
flights.groupByKey(x => x.DEST_COUNTRY_NAME).count().explain

In [18]:
%scala
/**
 * Per-group function for `flatMapGroups`: skips the leading flights whose `count` is
 * below 5 and pairs every remaining flight with the group's country name.
 *
 * Despite the name, nothing is summed here.
 *
 * @param countryName the grouping key
 * @param values      the flights belonging to that key
 * @return an iterator of `(countryName, flight)` pairs
 */
def grpSum(countryName: String, values: Iterator[Flight]): Iterator[(String, Flight)] = {
  val fromFirstLargeCount = values.dropWhile(flight => flight.count < 5)
  fromFirstLargeCount.map(flight => (countryName, flight))
}

// Run the per-group function over flights keyed by destination country.
flights
  .groupByKey(flight => flight.DEST_COUNTRY_NAME)
  .flatMapGroups(grpSum)
  .take(5)
/**
 * Value mapper for `mapValues`: replaces every flight with the constant 1.
 *
 * @param f the flight being mapped (ignored)
 * @return always 1
 */
def grpSum2(f: Flight): Integer = 1

// Map each grouped value to 1, then count the entries per destination country.
flights
  .groupByKey(flight => flight.DEST_COUNTRY_NAME)
  .mapValues(grpSum2)
  .count()
  .take(5)
/**
 * Combines two flights into one whose `count` is the sum of both.
 *
 * The destination is taken from the left operand and the origin is set to null.
 *
 * @param left  first flight; supplies the destination country
 * @param right second flight
 * @return a new Flight carrying the summed count
 */
def sum2(left: Flight, right: Flight) = {
  val combinedCount = left.count + right.count
  Flight(left.DEST_COUNTRY_NAME, null, combinedCount)
}

// Reduce each destination-country group down to a single summed Flight.
flights
  .groupByKey(flight => flight.DEST_COUNTRY_NAME)
  .reduceGroups((first, second) => sum2(first, second))
  .take(5)

In [19]:
%scala
// Show the plan for the untyped groupBy, for comparison with the groupByKey plan.
flights.groupBy("DEST_COUNTRY_NAME").count().explain

In [20]:
%scala
/**
 * One purchase line.
 *
 * @param customerId identifier of the buying customer
 * @param amount     quantity; multiplied by `unitCost` where the total is computed
 * @param unitCost   cost of a single unit
 * @param itemId     identifier of the purchased item
 */
case class Transaction(
  customerId: BigInt,
  amount: Integer,
  unitCost: Double,
  itemId: BigInt
)

/** A customer id together with a total cost. */
case class Receipt(
  customerId: BigInt,
  totalCost: Double
)

// Small in-memory sample: three transactions, all for customer 1.
val localTransactions = Seq(
  Transaction(customerId = 1, amount = 5, unitCost = 5.5, itemId = 37),
  Transaction(customerId = 1, amount = 10, unitCost = 8.24, itemId = 67),
  Transaction(customerId = 1, amount = 1, unitCost = 3.5, itemId = 22)
)
// The same sample data as a typed Spark Dataset[Transaction].
val SparkTransactions = localTransactions.toDF().as[Transaction]
/**
 * Tells whether a transaction's total cost (`amount * unitCost`) is strictly greater than 15.
 *
 * @param transaction the transaction to evaluate
 * @return `true` when the total cost exceeds 15
 */
def isBigTransaction(transaction: Transaction): Boolean = {
  val totalCost = transaction.amount * transaction.unitCost
  totalCost > 15
}

In [21]:
%scala
// The same predicate works on the local Seq ...
localTransactions.filter(isBigTransaction(_))
// ... and on the Spark Dataset.
SparkTransactions.filter(isBigTransaction(_))

In [22]:
%scala
import org.apache.spark.SparkConf
// Build a configuration carrying only an application name.
// NOTE(review): myConf is not passed to anything in this cell; the context accessed
// below comes from the existing `spark` session.
val myConf = new SparkConf().setAppName("My Application")
// Access the SparkContext behind the session.
spark.sparkContext

In [23]:
%scala
import org.apache.spark.SparkContext
// Obtain the active SparkContext, creating one only if none exists yet.
val sc = SparkContext.getOrCreate()

In [24]:
%scala
// Split the title on single spaces to get a local array of words.
val myCollection = "Spark The Definitive Guide : Big Data Processing Made Simple".split(" ")
// Distribute the words as an RDD with 2 partitions.
val words = spark.sparkContext.parallelize(myCollection, 2)
// Give the RDD a name, then read the name back.
words.setName("myWords")
words.name
// Number of distinct words.
words.distinct().count()

In [25]:
%scala
/**
 * Tells whether a word begins with an upper-case "S".
 *
 * @param individual the word to test
 * @return `true` when the word starts with "S" (case-sensitive)
 */
def startsWithS(individual: String): Boolean = individual.startsWith("S")
// Keep only the words that start with "S" ...
val onlyS = words.filter(word => startsWithS(word))
// ... and bring them back to the driver.
onlyS.collect()

In [26]:
%scala
// Map each word to (word, first character, starts-with-"S" flag).
// NOTE(review): word(0) throws on an empty string; the title above yields no empty
// tokens, but other inputs to split(" ") could -- confirm before reusing.
val words2 = words.map(word => (word, word(0), word.startsWith("S")))
// Keep the records whose flag (third tuple element) is true.
words2.filter(record => record._3).take(5)

In [27]:
%scala
// Flatten every word into its individual characters.
val characters = words.flatMap(word => word.toSeq)
characters.take(5)
// Sort by negated length, i.e. longest words first, and take the top two.
words.sortBy(word => word.length() * -1).take(2)
// Randomly split the RDD into two parts with equal weights.
val fiftyFiftySplit = words.randomSplit(Array[Double](0.5, 0.5))

In [28]:
%scala
// Sum the integers 1 through 20 with a reduce.
sc.parallelize(1 to 20).reduce(_ + _)

In [29]:
%scala
/**
 * Picks the longer of two words, for use as a reduce function.
 *
 * On a tie the left word wins, so the result of a reduce over words with equal
 * lengths depends on the order in which the pairs are combined.
 *
 * @param leftWord  first candidate
 * @param rightWord second candidate
 * @return `leftWord` when it is at least as long as `rightWord`, otherwise `rightWord`
 */
def wordLengthReducer(leftWord: String, rightWord: String): String =
  // if/else is an expression; its value is the result, so no `return` is needed.
  if (leftWord.length >= rightWord.length) leftWord else rightWord
// Reduce the RDD to its longest word.
words.reduce(wordLengthReducer)
// Total number of words.
words.count()

In [30]:
%scala
// Approximate count: stop after the given timeout (milliseconds) and report the
// estimate at the given confidence level.
val confidence = 0.95
val timeoutMilliseconds = 400
words.countApprox(timeoutMilliseconds, confidence)

In [31]:
%scala
// Approximate distinct count; the argument is the requested relative accuracy.
words.countApproxDistinct(0.05)


In [32]:
%scala
// Approximate distinct count using the two-argument overload.
// NOTE(review): the arguments are presumably the normal and sparse precision values
// (p, sp) -- confirm against the RDD API documentation.
words.countApproxDistinct(4, 10)

In [33]:
%scala
// Count occurrences of each distinct word; the result is returned to the driver.
words.countByValue()

In [34]:
%scala
// Approximate per-word counts; arguments mirror countApprox (timeout, confidence).
words.countByValueApprox(1000, 0.95)

In [35]:
%scala
// First element of the RDD.
words.first()

In [36]:
%scala
// Largest and smallest values of the range 1 to 20.
sc.parallelize(1 to 20).max()
sc.parallelize(1 to 20).min()

In [37]:
%scala
// Several ways of pulling five elements: as stored, smallest by ordering, largest by ordering.
words.take(5)
words.takeOrdered(5)
words.top(5)
// Fixed-size random sample, drawn with replacement and a fixed seed.
val withReplacement = true
val numberToTake = 6
val randomSeed = 100L
words.takeSample(withReplacement, numberToTake, randomSeed)

In [38]:
%scala
// Write the words as text under a local-filesystem path.
words.saveAsTextFile("file:/tmp/bookTitle")

In [39]:
%scala
import org.apache.hadoop.io.compress.BZip2Codec
// Same text output, compressed with the BZip2 codec.
words.saveAsTextFile("file:/tmp/bookTitleCompressed", classOf[BZip2Codec])

In [40]:
%scala
// Write the RDD as serialized objects under a local-filesystem path.
words.saveAsObjectFile("file:/tmp/my/sequenceFilePath")

In [41]:
%scala
// Mark the RDD for caching, then inspect the storage level now associated with it.
words.cache()
words.getStorageLevel

In [42]:
%scala
// Dataset -> RDD.
spark.range(10).rdd
// DataFrame -> RDD of Row objects.
spark.range(10).toDF().rdd
// Extract the Long value in column 0 from each Row.
spark.range(10).toDF().rdd.map(rowObject => rowObject.getLong(0))
// RDD -> DataFrame.
spark.range(10).rdd.toDF()

In [43]:
spark.range(10).rdd.toDF()