# SparkSession
A SparkSession can be used to create DataFrames, register DataFrames as tables, execute SQL over tables, cache tables, and read Parquet files.
It is the entry point to programming Spark with the Dataset and DataFrame API.

In [1]:
from datetime import datetime
from pyspark import SparkContext
from pyspark.sql import SparkSession, SQLContext

In [2]:
# Build (or reuse) the session; the timestamp embedded in the application
# name makes each run's name distinct.
spark = (
    SparkSession.builder
    .appName(f"pyspark-dataframe-demo-{datetime.today()}")
    .master("spark://spark-master:7077")
    .getOrCreate()
)

# SQLContext's first positional parameter is a SparkContext, not a
# SparkSession; pass the session explicitly so the wrapper shares it.
sqlContext = SQLContext(spark.sparkContext, sparkSession=spark)

# Uncomment to inspect the effective Spark configuration:
# spark.sparkContext.getConf().getAll()



In [3]:
sc = spark.sparkContext
sc

# DataFrame
A distributed collection of data grouped into named columns.

## From a list of tuples or dictionaries

In [4]:
l = [("Alice", 1)]
spark.createDataFrame(l).collect()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/spark/python/lib/py4j-0.10.9.5-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/opt/conda/lib/python3.8/socket.py", line 669, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
spark.createDataFrame(l, ["name", "age"]).collect()

In [None]:
d = [{"name": "Alice", "age": 1}]
spark.createDataFrame(d).collect()

In [None]:
spark.createDataFrame(d).show()

## From RDDs

In [None]:
l = [("Alice", 1)]
rdd = sc.parallelize(l)
spark.createDataFrame(rdd).collect()

In [None]:
# with list of column names
df = spark.createDataFrame(rdd, ["name", "age"])
df.collect()

In [None]:
# with Row definition
from pyspark.sql import Row
# Row("name", "age") defines the field names; calling Person(...) below
# supplies the values for those fields positionally.
Person = Row("name", "age")
# Turn every tuple of the RDD into a Person row.
person = rdd.map(lambda r: Person(*r))
# No schema argument is passed here; the DataFrame is built from the rows alone.
df2 = spark.createDataFrame(person)
df2.collect()

In [None]:
df2.printSchema()

In [None]:
# with schema definition
# Import the needed types by name instead of `*` so the notebook namespace
# stays predictable (StringType is used again by a later cast example).
from pyspark.sql.types import IntegerType, StringType, StructField, StructType

# Explicit schema: the third StructField argument (True) marks the column
# as nullable.
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
])
df3 = spark.createDataFrame(rdd, schema)
df3.collect()

In [None]:
df3.printSchema()

In [None]:
# with string definition, New in version 2.0.
rdd = sc.parallelize(l)
# The schema is given as a string that names and types each column.
print(spark.createDataFrame(rdd, "a: string, b: int").collect())

# NOTE: rebinds `rdd` — it now holds only the second element of each tuple.
rdd = rdd.map(lambda row: row[1])
# For an RDD of bare values the schema string is just a type name.
print(spark.createDataFrame(rdd, "int").collect())

## From pandas

In [None]:
import pandas
# Round trip: Spark DataFrame -> pandas DataFrame -> Spark DataFrame.
print(spark.createDataFrame(df.toPandas()).collect())
# Build directly from a pandas DataFrame created inline (no column names given).
print(spark.createDataFrame(pandas.DataFrame([["Alice", 2]])).collect())

# SQLContext

In [None]:
df.show()

In [None]:
# New in version 2.0
# Expose `df` to SQL under the name "table1".
df.createOrReplaceTempView("table1")
# NOTE: rebinds `df2`; the query renames the columns to N and A.
df2 = spark.sql("SELECT name as N, age as A from table1")
df2.show()

In [None]:
# SQLContext way of registering DataFrames under table names.
# "table1" was already created as a temp view of `df` in the previous cell.
sqlContext.registerDataFrameAsTable(df, "table1")
sqlContext.registerDataFrameAsTable(df2, "table2")
# List the table names currently known to the context.
sqlContext.tableNames()


In [None]:
sqlContext.tables().show()

In [None]:
df3 = sqlContext.tables()
df3.filter("tableName = 'table1'").show()

In [None]:
sqlContext.dropTempTable("table1")
sqlContext.dropTempTable("table2")

In [None]:
sqlContext.tableNames()

## UDF: User Defined Function

In [None]:
# Register a Python lambda as a SQL function. No return type is passed, so
# the default applies (string, per the PySpark documentation).
# SQLContext.registerFunction is deprecated since Spark 2.3 in favour of
# spark.udf.register, which takes the same arguments.
spark.udf.register("stringLengthString", lambda x: len(x))
# Query through the same session the function was registered on.
spark.sql("SELECT stringLengthString('test')").collect()

In [None]:
from pyspark.sql.types import IntegerType

# Same UDF, this time with an explicit IntegerType return type.
# SQLContext.registerFunction is deprecated since Spark 2.3 in favour of
# spark.udf.register, which takes the same arguments.
spark.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
# Query through the same session the function was registered on.
spark.sql("SELECT stringLengthInt('test')").collect()

In [None]:
sqlContext.udf.register("stringLengthInt", lambda x: len(x), IntegerType())
sqlContext.sql("SELECT stringLengthInt('test')").collect()

# Working with DataFrame

In [None]:
# NOTE: rebinds `l`, `rdd` and `df`; the cells below use this
# three-column (name, age, height) DataFrame.
l = [("Alice", 2, 12), ("Bob", 5, 25)]
rdd = sc.parallelize(l)
# Schema string names and types the three tuple positions.
df = sqlContext.createDataFrame(rdd, "name: string, age: int, height: int")
df.show()

In [None]:
df.createOrReplaceTempView("people")

df2 = spark.sql("select * from people")
df2.show()

In [None]:
df.repartition(10).rdd.getNumPartitions()

In [None]:
# Union `df` with itself, then repartition the result by the "age" column.
# No partition count is passed here; the next cell passes one explicitly.
data = df.union(df).repartition("age")
data.show()

In [None]:
data = data.repartition(7, "age")
data.show()

In [None]:
data.rdd.getNumPartitions()

In [None]:
data = data.repartition("name", "age")
data.show()

In [None]:
# withColumn(colName, col)
# Returns a new DataFrame by adding a column or replacing the existing column that has the same name.
df.withColumn("age2", df.age + 2).show()

In [None]:
df.withColumnRenamed("age", "age2").show()

In [None]:
df.select(df.age.cast("string").alias("ages")).show()

In [None]:
df.select(df.age.cast(StringType()).alias("ages")).show()

## Aggregate
Aggregate on the entire DataFrame without groups (shorthand for `df.groupBy().agg()`).

In [None]:
df.agg({"age": "max"}).show()

In [None]:
from pyspark.sql import functions as F
df.agg(F.min(df.age)).show()

In [None]:
gdf = df.groupBy(df.name)
gdf.agg({"*": "count"}).show()

In [None]:
from pyspark.sql import functions as F
gdf.agg(F.min(df.age)).show()

In [None]:
gdf.agg(F.min(df.age)).explain()

## Alias

In [None]:
# Import only `col`: a wildcard import from pyspark.sql.functions would
# shadow Python built-ins such as sum, min, max, abs and round.
from pyspark.sql.functions import col

# Alias both sides of the self-join so each side's columns can be
# referenced by a qualified name.
df_as1 = df.alias("df_as1")
df_as2 = df.alias("df_as2")
joined_df = df_as1.join(df_as2, col("df_as1.name") == col("df_as2.name"), "inner")
# Keep the name from both sides and the age from the right-hand side.
joined_df = joined_df.select("df_as1.name", "df_as2.name", "df_as2.age")
joined_df.toPandas()

In [None]:
joined_df.explain()

## Stats

In [None]:
df.show()

In [None]:
df.printSchema()

In [None]:
df.schema

In [None]:
df.storageLevel

In [None]:
df.count()

In [None]:
df.groupBy().sum("age").show()

In [None]:
df.groupBy().sum("age", "height").show()

In [None]:
df.groupBy().avg("age").show()

In [None]:
df.groupBy().avg("age", "height").show()

In [None]:
df.columns

In [None]:
df.name

In [None]:
df["name"]

In [None]:
df.age + 1

In [None]:
# cube(*col): Create a multi-dimensional cube for the current DataFrame using the specified columns, so we can run aggregation on them.
df.cube("name", df.age).count().orderBy("name", "age").show()

In [None]:
df.describe(["age"]).show()

In [None]:
df.describe().show()

In [None]:
df.distinct().count()

In [None]:
df.dtypes

In [None]:
df.explain()

In [None]:
df.explain(True)

In [None]:
df.groupBy().avg().show()

In [None]:
df.groupBy("name").agg({"age": "mean"}).show()

In [None]:
df.groupBy(df.name).avg().show()

In [None]:
df.groupBy(["name", df.age]).count().show()

In [None]:
df.groupBy().max("age").show()

In [None]:
df.groupBy().max("age", "height").show()

In [None]:
df.groupBy().mean("age").show()

In [None]:
df.groupBy().mean("age", "height").show()

## Join

In [None]:
df.select("age", "name").show()

In [None]:
df2.select("name", "height").show()

In [None]:
df.drop("age").show()

In [None]:
df.drop(df.age).show()

In [None]:
df.join(df2, df.name == df2.name, "inner").drop(df.name).drop(df.age).show()

In [None]:
df.join(df2, "name", "inner").drop("age", "height").show()

In [None]:
from pyspark.sql import Row
# NOTE: rebinds `df` to three rows; the first two are exact duplicates and
# the third differs only in age.
df = sc.parallelize([
    Row(name="Alice", age=5, height=80),
    Row(name="Alice", age=5, height=80),
    Row(name="Alice", age=10, height=80)
  ]).toDF()
# No column subset is given, so rows are compared on all columns.
df.dropDuplicates().show()

In [None]:
df.dropDuplicates(["name", "height"]).show()

In [None]:
df.join(df2, df.name == df2.name, 'outer').select(df.name, df2.height).show()

In [None]:
df.join(df2, 'name', 'outer').select('name', df.height).show()

In [None]:
# Join on a list of column conditions (name and age).
# NOTE(review): the list is presumably combined with AND — confirm against
# the DataFrame.join documentation.
cond = [df.name == df2.name, df.age == df2.age]
df.join(df2, cond, 'outer').select(df.name, df2.age).show()

In [None]:
df.join(df2, 'name').select(df.name, df2.height).show()

In [None]:
df.join(df2, ['name', 'age']).select(df.name, df.age).show()

## Filter

In [None]:
# Rebuild the two-row (name, age, height) DataFrame; the Join section
# rebound `df` to Row-based sample data.
l = [("Alice", 2, 12), ("Bob", 5, 25)]
rdd = sc.parallelize(l)
df = sqlContext.createDataFrame(rdd, "name: string, age: int, height: int")
df.show()


In [None]:
df.filter(df.age > 3).show()

In [None]:
df.filter("age > 3").show()

In [None]:
df.where("age=2").show()

In [None]:
df.first()

In [None]:
df.head()

In [None]:
df.limit(1).collect()

In [None]:
df.limit(0).collect()

In [None]:
# orderBy
# Explicit import: a wildcard import from pyspark.sql.functions would shadow
# Python built-ins such as sum, min, max, abs and round.
from pyspark.sql.functions import asc, desc

# Three spellings of "sort by age, descending".
print(df.sort(df.age.desc()).collect())
print(df.sort("age", ascending=False).collect())
print(df.orderBy(df.age.desc()).collect())

# The asc()/desc() helpers, and per-column direction flags
# (0 = descending, 1 = ascending).
print(df.sort(asc("age")).collect())
print(df.sort(desc("age"), "name").collect())
print(df.orderBy(["age", "name"], ascending=[0, 1]).collect())

In [None]:
# endswith() compares against a literal suffix, not a regular expression.
print(df.filter(df.name.endswith("ice")).collect())
# "ice$" is therefore matched literally, "$" included; neither sample name
# ("Alice", "Bob") ends with it.
print(df.filter(df.name.endswith("ice$")).collect())

In [None]:
# Nested rows: column `r` holds an inner Row with fields `a` and `b`;
# the next cell reads those fields by name with getField().
from pyspark.sql import Row
df1 = sc.parallelize([Row(r=Row(a=1, b="b"))]).toDF()
df1.show()

In [None]:
df1.select(df1.r.getField("b")).show()
df1.select(df1.r.getField("a")).show()

In [None]:
# RDD contains list and dictionary
# One row with two columns: `l` is built from a Python list and `d` from a
# Python dict. NOTE: rebinds `df1`.
df1 = sc.parallelize([([1, 2], {"key": "value"})]).toDF(["l", "d"])
df1.show()

In [None]:
df1.select(df1.l.getItem(0), df1.d.getItem("key")).show()
df1.select(df1.l[0], df1.d["key"]).show()

In [None]:
from pyspark.sql import Row

# Two rows, one with a missing height, used by the null-handling filters in
# the next cell. NOTE: rebinds `df1`.
# The Python 2 style u"" prefixes are dropped: str literals are already
# Unicode in Python 3, so the values are unchanged.
df1 = sc.parallelize([Row(name="Tom", height=80), Row(name="Alice", height=None)]).toDF()
df1.show()

In [None]:
print(df1.filter(df1.height.isNotNull()).collect())
print(df1.filter(df1.height.isNull()).collect())

In [None]:
print(df[df.name.isin("Bob", "Mike")].collect())
print(df[df.age.isin(1, 2, 3)].collect())

In [None]:
df.filter(df.name.like("Al%")).collect()

In [None]:
from pyspark.sql import functions as F
# Conditional column: 1 where age > 3, otherwise 0.
df.select(df.name, F.when(df.age > 3, 1).otherwise(0)).show()