In [1]:
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

val transactionsRDD = sc.textFile("test.csv")
val schemaString = "transaction_id customer_id total_amount item_count description"

val fields = schemaString.split(" ").
    map(fieldName => StructField(fieldName, StringType, nullable = true))
val schema = StructType(fields)

val rowRDD = transactionsRDD.
    map(_.split(",")).
    map(attributes => Row(attributes(0), attributes(1), attributes(2), attributes(3), attributes(4).trim))
val transactionsDF = spark.createDataFrame(rowRDD, schema)
transactionsDF.createOrReplaceTempView("transactions")

val results = spark.sql("""
    SELECT *
    FROM transactions
    """)

Intitializing Scala interpreter ...

Spark Web UI available at http://192.168.174.130:4040
SparkContext available as 'sc' (version = 2.4.4, master = local[*], app id = local-1572854802460)
SparkSession available as 'spark'


import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
transactionsRDD: org.apache.spark.rdd.RDD[String] = test.csv MapPartitionsRDD[1] at textFile at <console>:28
schemaString: String = transaction_id customer_id total_amount item_count description
fields: Array[org.apache.spark.sql.types.StructField] = Array(StructField(transaction_id,StringType,true), StructField(customer_id,StringType,true), StructField(total_amount,StringType,true), StructField(item_count,StringType,true), StructField(description,StringType,true))
schema: org.apache.spark.sql.types.StructType = StructType(StructField(transaction_id,StringType,true), StructField(customer_id,StringType,true), StructField(total_amount,StringType,true), StructField(item_count,StringType,true), StructField(description,Strin...

In [3]:
results.show()

+--------------+-----------+------------+----------+--------------------+
|transaction_id|customer_id|total_amount|item_count|         description|
+--------------+-----------+------------+----------+--------------------+
|             1|      13931|   947.27325|        10|7tk17k7r14ixnkgbw9xc|
|             2|      24567|   119.68945|        10|yg5osqfmtqlreme6b...|
|             3|      10311|    563.8975|        10|9yc74al32gyx4hmpu...|
|             4|      14630|    737.5072|         4|bdnglgyulcln7zl9i...|
|             5|      38528|   520.28046|         3|9nvr6vb2ouppqqv8l...|
|             6|       4505|    694.3501|         6|j6eoz2r5swlpjpxkq...|
|             7|      12466|    283.8117|         8|wrwfqu49uakdxqq2c...|
|             8|      16572|    331.2615|         6|7bq937yzswhhlejos...|
|             9|      18696|    380.0146|         1|v77phaie3detj3tm5...|
|            10|      11084|    781.5895|         1|f5sj0wmi9pmmmkc5w...|
|            11|       8115|    958.57

In [4]:
val T1 = spark.sql("""
    SELECT *
    FROM transactions
    WHERE total_amount > 200
    """)
transactionsDF.registerTempTable("T1")
T1.show()

+--------------+-----------+------------+----------+--------------------+
|transaction_id|customer_id|total_amount|item_count|         description|
+--------------+-----------+------------+----------+--------------------+
|             1|      13931|   947.27325|        10|7tk17k7r14ixnkgbw9xc|
|             3|      10311|    563.8975|        10|9yc74al32gyx4hmpu...|
|             4|      14630|    737.5072|         4|bdnglgyulcln7zl9i...|
|             5|      38528|   520.28046|         3|9nvr6vb2ouppqqv8l...|
|             6|       4505|    694.3501|         6|j6eoz2r5swlpjpxkq...|
|             7|      12466|    283.8117|         8|wrwfqu49uakdxqq2c...|
|             8|      16572|    331.2615|         6|7bq937yzswhhlejos...|
|             9|      18696|    380.0146|         1|v77phaie3detj3tm5...|
|            10|      11084|    781.5895|         1|f5sj0wmi9pmmmkc5w...|
|            11|       8115|    958.5794|         2|igxkh8nzn7iralfn4...|
|            13|      36239|    302.17

T1: org.apache.spark.sql.DataFrame = [transaction_id: string, customer_id: string ... 3 more fields]


In [6]:
var T2 = spark.sql("""
    SELECT item_count, sum(total_amount) as sum, avg(total_amount) as avg, min(total_amount) as min, max(total_amount) as max
    FROM T1
    GROUP BY item_count
    """)
transactionsDF.registerTempTable("T2")
T2.show()

+----------+----------+------------------+---------+---------+
|item_count|       sum|               avg|      min|      max|
+----------+----------+------------------+---------+---------+
|         7|  907.9976|          907.9976| 907.9976| 907.9976|
|         3|1468.43426|         734.21713|520.28046| 948.1538|
|         8| 613.33373|        306.666865| 283.8117|329.52203|
|         5| 464.89387|154.96462333333332|206.60675| 235.0295|
|         6| 1025.6116|          512.8058| 331.2615| 694.3501|
|         9|  302.1748|          302.1748| 302.1748| 302.1748|
|         1| 1161.6041|         580.80205| 380.0146| 781.5895|
|        10| 1630.8602| 543.6200666666667|119.68945|947.27325|
|         4|1380.91705|        690.458525|643.40985| 737.5072|
|         2| 1805.0976|          902.5488| 846.5182| 958.5794|
+----------+----------+------------------+---------+---------+



T2: org.apache.spark.sql.DataFrame = [item_count: string, sum: double ... 3 more fields]
