In [19]:
import pyspark.sql.functions
from pyspark.sql import SparkSession
from pyspark.sql.functions import asc, desc

In [20]:
# Obtain the Spark session used by every cell below, named "grab_dataframe".
spark = (
    SparkSession.builder
    .appName("grab_dataframe")
    .getOrCreate()
)

In [21]:
# Read the sample people file (JSON) into a DataFrame.
df = spark.read.json(path="./../sample_data/people.json")

In [22]:
# Print the DataFrame's rows as a text table.
df.show()

+----+-----+
| age| name|
+----+-----+
|null|  Joe|
|  39| John|
|  49|Jason|
+----+-----+



### Indexing with `[]` returns a Column object

In [23]:
# Indexing by column name yields a Column object rather than the column's data.
type(df["name"])

pyspark.sql.column.Column

### The `select` method returns a DataFrame containing that column

In [24]:
# Only a cell's final expression is echoed by the notebook, so the type has to
# be printed explicitly to show that select() returns a DataFrame.
print(type(df.select("name")))

# Display the single-column DataFrame produced by select().
df.select("name").show()

+-----+
| name|
+-----+
|  Joe|
| John|
|Jason|
+-----+



### The `head` method returns the first N rows of the DataFrame as a list of `Row` objects

In [26]:
# Return the first two rows as a list of Row objects.
df.head(2)

[Row(age=None, name='Joe'), Row(age=39, name='John')]

In [35]:
# head(3) returns a list of rows; keep only the first one.
row = df.head(3)[0]

In [39]:
# Select several columns at once; the result lists them in the order requested.
df.select("name", "age").show()

+-----+----+
| name| age|
+-----+----+
|  Joe|null|
| John|  39|
|Jason|  49|
+-----+----+



### `withColumn` returns a new DataFrame with the new column added

In [42]:
# Rebind df so the cells below see the extra column; a row whose age is null
# gets a null double_age as well.
df = df.withColumn("double_age", df["age"] * 2)
df.show()

+----+-----+----------+
| age| name|double_age|
+----+-----+----------+
|null|  Joe|      null|
|  39| John|        78|
|  49|Jason|        98|
+----+-----+----------+



### `withColumnRenamed` returns a new DataFrame with a column renamed

In [44]:
# The renamed DataFrame is only displayed here, not assigned, so df itself
# keeps its "age" column in the cells that follow.
df.withColumnRenamed("age", "new_age")

DataFrame[new_age: bigint, name: string, double_age: bigint]

### Query the DataFrame with SQL

In [53]:
# Expose the DataFrame under the name "people" so SQL statements can refer to it.
df.createOrReplaceTempView("people")

# Run each statement in turn and print its result table.
for statement in (
    "select * from people",
    "select * from people where age > 40",
    "select * from people order by age desc",
    "select * from people order by age asc",
    "select * from people where age is not null",
    "select * from people where age is null",
):
    spark.sql(statement).show()

# Build the same "age > 40" filter from a Python variable instead of a literal.
age = 40
threshold_query = "select * from people where age > {0}".format(age)
spark.sql(threshold_query).show()

+----+-----+----------+
| age| name|double_age|
+----+-----+----------+
|null|  Joe|      null|
|  39| John|        78|
|  49|Jason|        98|
+----+-----+----------+

+---+-----+----------+
|age| name|double_age|
+---+-----+----------+
| 49|Jason|        98|
+---+-----+----------+

+----+-----+----------+
| age| name|double_age|
+----+-----+----------+
|  49|Jason|        98|
|  39| John|        78|
|null|  Joe|      null|
+----+-----+----------+

+----+-----+----------+
| age| name|double_age|
+----+-----+----------+
|null|  Joe|      null|
|  39| John|        78|
|  49|Jason|        98|
+----+-----+----------+

+---+-----+----------+
|age| name|double_age|
+---+-----+----------+
| 39| John|        78|
| 49|Jason|        98|
+---+-----+----------+

+----+----+----------+
| age|name|double_age|
+----+----+----------+
|null| Joe|      null|
+----+----+----------+

+---+-----+----------+
|age| name|double_age|
+---+-----+----------+
| 49|Jason|        98|
+---+-----+----------+



In [59]:
# DataFrame-API counterparts of the SQL queries above; asc and desc are
# imported in the notebook's import cell.

# filter() accepts the same predicate text used after WHERE in the SQL cell.
df.filter("age > 40").show()
df.filter("age is not null").show()

# Sort by age in each direction; in the recorded output the null age comes
# last when descending and first when ascending.
df.orderBy(desc("age")).show()
df.orderBy(asc("age")).show()



+---+-----+----------+
|age| name|double_age|
+---+-----+----------+
| 49|Jason|        98|
+---+-----+----------+

+---+-----+----------+
|age| name|double_age|
+---+-----+----------+
| 39| John|        78|
| 49|Jason|        98|
+---+-----+----------+

+----+-----+----------+
| age| name|double_age|
+----+-----+----------+
|  49|Jason|        98|
|  39| John|        78|
|null|  Joe|      null|
+----+-----+----------+

+----+-----+----------+
| age| name|double_age|
+----+-----+----------+
|null|  Joe|      null|
|  39| John|        78|
|  49|Jason|        98|
+----+-----+----------+

