## RDD example

In [4]:
// The notebook kernel pre-defines the SparkContext as `sc`.
// Load the CSV as plain text: the resulting RDD holds one String per line.
val rdd = sc.textFile(path = "/data/mnm_dataset.csv")

rdd = /data/mnm_dataset.csv MapPartitionsRDD[5] at textFile at <console>:24


/data/mnm_dataset.csv MapPartitionsRDD[5] at textFile at <console>:24

In [5]:
// The first element of the RDD is the CSV header line; keep it so it can be
// excluded from the data lines later.
val header = rdd.first()

header = State,Color,Count


State,Color,Count

In [6]:
// Keep only the data lines. Every line whose text equals the header is
// dropped, not just the first line of the file.
val rows = rdd.filter(line => line != header)

rows = MapPartitionsRDD[6] at filter at <console>:25


MapPartitionsRDD[6] at filter at <console>:25

In [8]:
// Split each comma-separated line and keep its first three fields as a tuple.
// All three fields stay Strings; the third one is not parsed as a number here.
val mnmData = rows.map { line =>
  val cols = line.split(",")
  (cols(0), cols(1), cols(2))
}

mnmData = MapPartitionsRDD[7] at map at <console>:24


MapPartitionsRDD[7] at map at <console>:24

In [25]:
// Count how many rows exist for each (state, color) pair and order the pairs
// from the most frequent to the least frequent. The third field of each input
// tuple is ignored: every row contributes 1 to its pair's total.
// The descending order is requested from Spark via `ascending = false` rather
// than by negating the sort key.
val countMnmColorByState =
  mnmData
    .map { case (state, color, _) => ((state, color), 1L) }
    .reduceByKey(_ + _)
    .sortBy(_._2, ascending = false)
    .map { case ((state, color), total) => (state, color, total) }

countMnmColorByState = MapPartitionsRDD[61] at map at <console>:30


MapPartitionsRDD[61] at map at <console>:30

In [27]:
// Bring the first ten result tuples back to the driver and print one per line.
countMnmColorByState.take(10).foreach(row => println(row))

(CA,Yellow,1807)
(WA,Green,1779)
(OR,Orange,1743)
(TX,Green,1737)
(TX,Red,1725)
(CA,Green,1723)
(CO,Yellow,1721)
(CA,Brown,1718)
(CO,Green,1713)
(NV,Orange,1712)


## DataFrame example

In [21]:
import org.apache.spark.sql.functions._

In [24]:
// The notebook kernel pre-defines the SparkSession as `spark`.
// Read the CSV with its first line treated as column names and with column
// types inferred from the data.
val mnmDF = spark.read
  .options(Map("header" -> "true", "inferSchema" -> "true"))
  .csv("/data/mnm_dataset.csv")

mnmDF = [State: string, Color: string ... 1 more field]


[State: string, Color: string ... 1 more field]

In [22]:
// For every (State, Color) group, count the rows that have a Count value,
// expose that number as "Total", and list the groups from largest to smallest.
val countMnMDF =
  mnmDF
    .groupBy(col("State"), col("Color"))
    .agg(count(col("Count")).as("Total"))
    .orderBy(desc("Total"))

countMnMDF = [State: string, Color: string ... 1 more field]


[State: string, Color: string ... 1 more field]

In [28]:
// Print the first ten rows as a table without shortening long cell values.
countMnMDF.show(numRows = 10, truncate = false)

+-----+------+-----+
|State|Color |Total|
+-----+------+-----+
|CA   |Yellow|1807 |
|WA   |Green |1779 |
|OR   |Orange|1743 |
|TX   |Green |1737 |
|TX   |Red   |1725 |
|CA   |Green |1723 |
|CO   |Yellow|1721 |
|CA   |Brown |1718 |
|CO   |Green |1713 |
|NV   |Orange|1712 |
+-----+------+-----+
only showing top 10 rows

