# DataFrame and Dataset Examples in Spark REPL

このノートブックは、次のチュートリアルの内容(前半のみ)を実行したものです → https://jp.hortonworks.com/tutorial/dataframe-and-dataset-examples-in-spark-repl/

- docker-compose -f hdfs.yml exec namenode bash
  - curl -LO https://raw.githubusercontent.com/hortonworks/data-tutorials/master/tutorials/hdp/dataFrame-and-dataset-examples-in-spark-repl/assets/people.txt
  - curl -LO https://raw.githubusercontent.com/hortonworks/data-tutorials/master/tutorials/hdp/dataFrame-and-dataset-examples-in-spark-repl/assets/people.json
  - hdfs dfs -mkdir /tmp
  - hdfs dfs -put people.txt /tmp/people.txt
  - hdfs dfs -put people.json /tmp/people.json

In [1]:
// Load the JSON file from HDFS into a DataFrame; the output below shows the resulting schema (age: bigint, name: string).
val df = spark.read.json("hdfs://namenode/tmp/people.json")

Waiting for a Spark session to start...

df = [age: bigint, name: string]


[age: bigint, name: string]

In [2]:
// Print the DataFrame's rows as a table. show() is side-effecting, so it is called with
// parentheses, matching every other show() call in this notebook.
df.show()

+----+-------+
| age|   name|
+----+-------+
|null|Michael|
|  30|   Andy|
|  19| Justin|
+----+-------+



In [3]:
// Project the name column together with age incremented by one; a null age stays null (see Michael in the output).
df.select(df("name"), df("age") + 1).show()

+-------+---------+
|   name|(age + 1)|
+-------+---------+
|Michael|     null|
|   Andy|       31|
| Justin|       20|
+-------+---------+



In [4]:
// Keep only the rows whose age is greater than 21; the row with a null age does not appear in the output.
df.filter(df("age") > 21).show()

+---+----+
|age|name|
+---+----+
| 30|Andy|
+---+----+



In [5]:
// Count the rows for each distinct age value; null forms its own group in the output.
df.groupBy("age").count().show()

+----+-----+
| age|count|
+----+-----+
|  19|    1|
|null|    1|
|  30|    1|
+----+-----+



In [6]:
import org.apache.spark.sql._
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._

In [7]:
// Read the plain-text people file from HDFS as an RDD of text lines.
val peopleRDD = spark.sparkContext.textFile("hdfs://namenode/tmp/people.txt")

peopleRDD = hdfs://namenode/tmp/people.txt MapPartitionsRDD[23] at textFile at <console>:34


hdfs://namenode/tmp/people.txt MapPartitionsRDD[23] at textFile at <console>:34

In [8]:
// Column names for the schema built in the next cell, separated by a single space.
val schemaString = "name age"

name age

schemaString = name age


In [9]:
// Derive one nullable string column per space-separated name in schemaString,
// then wrap the resulting fields in a StructType to use as the DataFrame schema.
val fields = for {
  columnName <- schemaString.split(" ")
} yield StructField(columnName, StringType, nullable = true)
val schema = StructType(fields)

fields = Array(StructField(name,StringType,true), StructField(age,StringType,true))
schema = StructType(StructField(name,StringType,true), StructField(age,StringType,true))


StructType(StructField(name,StringType,true), StructField(age,StringType,true))

In [10]:
// Split each line on commas and wrap the first two fields in a Row; only the second field is trimmed.
// NOTE(review): assumes every line has at least two comma-separated fields — confirm against people.txt.
val rowRDD = peopleRDD.map(_.split(",")).map(attributes => Row(attributes(0), attributes(1).trim))

rowRDD = MapPartitionsRDD[25] at map at <console>:34


MapPartitionsRDD[25] at map at <console>:34

In [11]:
// Pair the Row RDD with the explicit schema to obtain a DataFrame; both columns are strings (see output below).
val peopleDF = spark.createDataFrame(rowRDD, schema)

peopleDF = [name: string, age: string]


[name: string, age: string]

In [12]:
// Register the DataFrame as the temporary view "people" so that it can be queried through spark.sql.
peopleDF.createOrReplaceTempView("people")

In [13]:
// Run a SQL query against the "people" view; the result contains only the name column.
val results = spark.sql("SELECT name FROM people")

results = [name: string]


[name: string]

In [14]:
// Prefix each name with a label and print the result; the mapped column is shown as "value" in the output.
results.map { row => "Name: " + row.get(0) }.show()

+-------------+
|        value|
+-------------+
|Name: Michael|
|   Name: Andy|
| Name: Justin|
+-------------+

