In [1]:
from pyspark.sql.types import (
    LongType,
    StringType,
    StructField,
    StructType,
)

# Sample data: three swimmers as (id, name, age, eyeColor) tuples,
# distributed as an RDD through the SparkContext `sc`.
rdd = sc.parallelize(
    [
        (123, 'Katie', 19, 'brown'),
        (234, 'Michael', 22, 'green'),
        (345, 'Simone', 23, 'blue'),
    ]
)

# Explicit schema with one StructField per tuple position.
# Each StructField takes a name, a dataType and a nullable flag.
schema = StructType(
    fields=[
        StructField(name='id', dataType=LongType(), nullable=True),
        StructField(name='name', dataType=StringType(), nullable=True),
        StructField(name='age', dataType=LongType(), nullable=True),
        StructField(name='eyeColor', dataType=StringType(), nullable=True),
    ]
)

In [2]:
# Apply the schema to the RDD to build a DataFrame.
df = spark.createDataFrame(data=rdd, schema=schema)

# Register the DataFrame as a temporary view named 'swimmers'
# so that it can be queried by name with spark.sql.
df.createOrReplaceTempView(name='swimmers')

In [3]:
# Print the DataFrame schema as a tree: column name, type and nullability.
df.printSchema()

root
 |-- id: long (nullable = true)
 |-- name: string (nullable = true)
 |-- age: long (nullable = true)
 |-- eyeColor: string (nullable = true)



In [4]:
# Number of rows in the DataFrame (displayed as the cell's output).
df.count()

3

In [5]:
# Filter: keep only the id and age columns of rows where age is 22,
# with both the columns and the condition given as strings.
(
    df
    .select('id', 'age')
    .filter('age = 22')
    .show()
)

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [6]:
# Same query as the previous cell, written with attribute-style column
# references (df.id, df.age) and a Python comparison instead of strings.
(
    df
    .select(df.id, df.age)
    .filter(df.age == 22)
    .show()
)

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+



In [7]:
# Names and eye colors of swimmers whose eyeColor starts with 'b',
# using a SQL LIKE pattern as the filter expression.
(
    df
    .select("name", "eyeColor")
    .filter("eyeColor like 'b%'")
    .show()
)

+------+--------+
|  name|eyeColor|
+------+--------+
| Katie|   brown|
|Simone|    blue|
+------+--------+



In [8]:
# The spark.sql way: count the rows through the 'swimmers' view.
spark.sql(
    "select count(*) from swimmers"
).show()

+--------+
|count(1)|
+--------+
|       3|
+--------+



In [9]:
# SQL form of the age filter: id and age of swimmers aged 22.
spark.sql(
    "select id, age from swimmers where age = 22"
).show()

+---+---+
| id|age|
+---+---+
|234| 22|
+---+---+

