In [1]:
import findspark
findspark.init()  # called before the pyspark imports, as in the original cell
from pyspark import SparkContext
from pyspark.sql import SparkSession

# Build (or reuse) the session first and take the SparkContext from it.
# Constructing SparkContext("local") directly fails when this cell is executed
# a second time in the same kernel, because only one SparkContext may be active;
# getOrCreate() makes the cell safe to re-run.
spark = SparkSession.builder.master("local").getOrCreate()
sc = spark.sparkContext

# Create Data Frame

In [2]:
# From a local Python list of (drinker, beer, score) tuples
a = [
    ("Chris", "Berliner", 5),
    ("Peter", "Bud Light", 9),
    ("John", "Corona Extra", 6),
]
spark.createDataFrame(a, schema=["drinker", "beer", "score"]).collect()

[Row(drinker='Chris', beer='Berliner', score=5),
 Row(drinker='Peter', beer='Bud Light', score=9),
 Row(drinker='John', beer='Corona Extra', score=6)]

In [3]:
# Turn the list into an RDD; with no schema given, the resulting columns
# are named _1, _2, _3 (see the output below).
rdd = sc.parallelize(a)
spark.createDataFrame(data=rdd).collect()

[Row(_1='Chris', _2='Berliner', _3=5),
 Row(_1='Peter', _2='Bud Light', _3=9),
 Row(_1='John', _2='Corona Extra', _3=6)]

In [4]:
# From RDD, supplying only the column names
df = spark.createDataFrame(rdd, schema=["drinker", "beer", "score"])
df.collect()

[Row(drinker='Chris', beer='Berliner', score=5),
 Row(drinker='Peter', beer='Bud Light', score=9),
 Row(drinker='John', beer='Corona Extra', score=6)]

In [5]:
# From RDD with an explicit schema

# NOTE(review): wildcard import kept as-is; other cells may rely on names it provides.
from pyspark.sql.types import *

# One StructField per (column name, Spark type) pair; every column is nullable.
schema = StructType([
    StructField(col_name, col_type, True)
    for col_name, col_type in (
        ("drinker", StringType()),
        ("beer", StringType()),
        ("score", IntegerType()),
    )
])
df3 = spark.createDataFrame(rdd, schema)
df3.collect()

[Row(drinker='Chris', beer='Berliner', score=5),
 Row(drinker='Peter', beer='Bud Light', score=9),
 Row(drinker='John', beer='Corona Extra', score=6)]

In [6]:
# Read from CSV file: the first line is the header, column types are inferred,
# and fields are comma-separated.
dfa = (
    spark.read.format('csv')
    .options(header='true', inferSchema='true', sep=",")
    .load("data/advertising.csv")
)
dfa.show()

+-----+-----+---------+-----+
|   TV|Radio|Newspaper|Sales|
+-----+-----+---------+-----+
|230.1| 37.8|     69.2| 22.1|
| 44.5| 39.3|     45.1| 10.4|
| 17.2| 45.9|     69.3| 12.0|
|151.5| 41.3|     58.5| 16.5|
|180.8| 10.8|     58.4| 17.9|
|  8.7| 48.9|     75.0|  7.2|
| 57.5| 32.8|     23.5| 11.8|
|120.2| 19.6|     11.6| 13.2|
|  8.6|  2.1|      1.0|  4.8|
|199.8|  2.6|     21.2| 15.6|
| 66.1|  5.8|     24.2| 12.6|
|214.7| 24.0|      4.0| 17.4|
| 23.8| 35.1|     65.9|  9.2|
| 97.5|  7.6|      7.2| 13.7|
|204.1| 32.9|     46.0| 19.0|
|195.4| 47.7|     52.9| 22.4|
| 67.8| 36.6|    114.0| 12.5|
|281.4| 39.6|     55.8| 24.4|
| 69.2| 20.5|     18.3| 11.3|
|147.3| 23.9|     19.1| 14.6|
+-----+-----+---------+-----+
only showing top 20 rows



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.toDF">
<img align=left src="images/pyspark-pictures-dataframes-page58.svg" width=360 height=203 />
</a>

**toDF(*cols)**. Returns a new DataFrame with the newly specified column names.

Parameters
- cols – list of new column names (string)

In [7]:
# toDF: rename all columns positionally ('from' -> 'seller', 'to' -> 'buyer')
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.toDF("seller", "buyer", "amt")
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+------+-----+---+
|seller|buyer|amt|
+------+-----+---+
| Alice|  Bob|0.1|
|   Bob|Carol|0.2|
| Carol| Dave|0.3|
+------+-----+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.toPandas">
<img align=left src="images/pyspark-pictures-dataframes-page60.svg" width=360 height=203 />
</a>

**toPandas()** Returns the contents of this DataFrame as Pandas pandas.DataFrame. This is only available if Pandas is installed and available.

**Note** 
- This method should only be used if the resulting Pandas’s DataFrame is expected to be small, as all the data is loaded into the driver’s memory.
- It is very useful for DataFrame visualisation when columns have very long content.


In [8]:
# toPandas: convert the Spark DataFrame to a pandas DataFrame
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.toPandas()
x.show()
print(type(y))
y  # last expression: rendered by the notebook as a table

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

<class 'pandas.core.frame.DataFrame'>


Unnamed: 0,from,to,amt
0,Alice,Bob,0.1
1,Bob,Carol,0.2
2,Carol,Dave,0.3


# DataFrame to RDD

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.rdd">
<img align=left src="images/pyspark-pictures-dataframes-page42.svg" width=360 height=203 />
</a>

**property rdd** Returns the content as a pyspark.RDD of Row. 

Use map(tuple) to convert it to an RDD of tuples.

In [9]:
# rdd: the underlying RDD holds Row objects
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.rdd
x.show()
print(y.collect())

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

[Row(from='Alice', to='Bob', amt=0.1), Row(from='Bob', to='Carol', amt=0.2), Row(from='Carol', to='Dave', amt=0.3)]


In [10]:
# rdd + map(tuple): convert each Row into a plain Python tuple
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.rdd.map(tuple)
x.show()
print(y.collect())

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

[('Alice', 'Bob', 0.1), ('Bob', 'Carol', 0.2), ('Carol', 'Dave', 0.3)]


# Showing dataframe and metadata

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.show">
<img align=left src="images/pyspark-pictures-dataframes-page52.svg" width=360 height=203 />
</a>

**show(n=20, truncate=True, vertical=False)**.
Prints the first n rows to the console.

Parameters:
- n – Number of rows to show.
- truncate – If set to True, truncate strings longer than 20 chars by default. If set to a number greater than one, truncates long strings to length truncate and align cells right.
- vertical – If set to True, print output rows vertically (one line per column value).

In [11]:
# show: vertical=True prints one "column | value" line per field of each record
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
x.show(vertical=True)

-RECORD 0-----
 from | Alice 
 to   | Bob   
 amt  | 0.1   
-RECORD 1-----
 from | Bob   
 to   | Carol 
 amt  | 0.2   
-RECORD 2-----
 from | Carol 
 to   | Dave  
 amt  | 0.3   



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.printSchema">
<img align=left src="images/pyspark-pictures-dataframes-page40.svg" width=360 height=203 />
</a>

**printSchema()**. Prints out the schema in the tree format.

In [12]:
# printSchema: print column names, types and nullability as a tree
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
x.show()
x.printSchema()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

root
 |-- from: string (nullable = true)
 |-- to: string (nullable = true)
 |-- amt: double (nullable = true)



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.schema">
<img align=left src="images/pyspark-pictures-dataframes-page49.svg" width=360 height=203 />
</a>

**property schema**. Returns the schema of this DataFrame as a pyspark.sql.types.StructType.

In [13]:
# schema: the DataFrame's schema as a StructType object
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.schema
x.show()
print(y)

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

StructType(List(StructField(from,StringType,true),StructField(to,StringType,true),StructField(amt,DoubleType,true)))


<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.columns">
<img align=left src="images/pyspark-pictures-dataframes-page8.svg" width=360 height=203 />
</a>

**property columns**, Returns all column names as a list.

In [14]:
# columns: a plain Python list of the column names, held on the driver
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.columns
x.show()
print(y)

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

['from', 'to', 'amt']


<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.describe">
<img align=left src="images/pyspark-pictures-dataframes-page14.svg" width=360 height=203 />
</a>

**describe(*cols)**. Computes basic statistics for numeric and string columns.

This includes count, mean, stddev, min, and max. If no columns are given, this function computes statistics for all numerical or string columns.

In [15]:
# describe: summary statistics (count, mean, stddev, min, max) for every column
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
x.show()
x.describe().show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-------+-----+----+-------------------+
|summary| from|  to|                amt|
+-------+-----+----+-------------------+
|  count|    3|   3|                  3|
|   mean| null|null|0.20000000000000004|
| stddev| null|null|0.09999999999999998|
|    min|Alice| Bob|                0.1|
|    max|Carol|Dave|                0.3|
+-------+-----+----+-------------------+



# Selecting columns

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.select">
<img align=left src="images/pyspark-pictures-dataframes-page50.svg" width=360 height=203 />
</a>

**select(*cols)**. Projects a set of expressions and returns a new DataFrame.

Parameters
- cols – list of column names (string) or expressions (Column). If one of the column names is ‘*’, that column is expanded to include all columns in the current DataFrame.

In [16]:
# 1 - Pandas-like select: pass the wanted column names as a single list
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.select(["from", "amt"])
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+---+
| from|amt|
+-----+---+
|Alice|0.1|
|  Bob|0.2|
|Carol|0.3|
+-----+---+



In [17]:
# 2 - Column names passed as separate parameters
x.select(
    "from",
    "amt",
).show()

+-----+---+
| from|amt|
+-----+---+
|Alice|0.1|
|  Bob|0.2|
|Carol|0.3|
+-----+---+



In [18]:
# 3 - Using F.col to build Column expressions
from pyspark.sql import functions as F

x.select(
    F.col("from"),
    F.col("amt"),
).show()


+-----+---+
| from|amt|
+-----+---+
|Alice|0.1|
|  Bob|0.2|
|Carol|0.3|
+-----+---+



In [19]:
# 4 - Build a list of F.col expressions and unpack it into select()
x.select(*[
    F.col("from"),
    F.col("amt"),
]).show()

+-----+---+
| from|amt|
+-----+---+
|Alice|0.1|
|  Bob|0.2|
|Carol|0.3|
+-----+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.selectExpr">
<img align=left src="images/pyspark-pictures-dataframes-page51.svg" width=360 height=203 />
</a>

**selectExpr(*expr)**. Projects a set of SQL expressions and returns a new DataFrame. 

This is a variant of select() that accepts SQL expressions.

In [20]:
# selectExpr: each output column is given as a SQL expression string
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.selectExpr([
    'substr(from,1,1)',  # first character of 'from'
    'amt+10',
])
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+------------------+----------+
|substr(from, 1, 1)|(amt + 10)|
+------------------+----------+
|                 A|      10.1|
|                 B|      10.2|
|                 C|      10.3|
+------------------+----------+



# Projection operations

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.where">
<img align=left src="images/pyspark-pictures-dataframes-page63.svg" width=360 height=203 />
</a>

**where(condition)**
Filters rows using the given condition.

**filter()** is an alias for where().

Parameters
- condition – a Column of types.BooleanType or a string of SQL expression.

In [21]:
# where: keep only the rows matching a SQL condition string
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.where("amt > 0.1")
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+



In [22]:
# filter: same condition as the where() cell above it, same result
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.filter("amt > 0.1")
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.distinct">
<img align=left src="images/pyspark-pictures-dataframes-page15.svg" width=360 height=203 />
</a>

**distinct()**. Returns a new DataFrame containing the distinct rows in this DataFrame.

In [23]:
# distinct: the ("Bob", "Carol", 0.2) row appears twice in x and once in y
x = spark.createDataFrame(
    [
        ("Alice", "Bob", 0.1),
        ("Bob", "Carol", 0.2),
        ("Carol", "Dave", 0.3),
        ("Bob", "Carol", 0.2),
    ],
    schema=["from", "to", "amt"],
)
y = x.distinct()
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
|  Bob|Carol|0.2|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|  Bob|Carol|0.2|
|Alice|  Bob|0.1|
|Carol| Dave|0.3|
+-----+-----+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.dropna">
<img align=left src="images/pyspark-pictures-dataframes-page18.svg" width=360 height=203 />
</a>

**dropna(how='any', thresh=None, subset=None)**
Returns a new DataFrame omitting rows with null values. DataFrame.dropna() and DataFrameNaFunctions.drop() are aliases of each other.

Parameters
- how – ‘any’ or ‘all’. If ‘any’, drop a row if it contains any nulls. If ‘all’, drop a row only if all its values are null.
- thresh – int, default None If specified, drop rows that have less than thresh non-null values. This overwrites the how parameter.
- subset – optional list of column names to consider.

In [24]:
# dropna: drop rows with a null in 'from' or 'to'; nulls in 'amt' are not considered
x = spark.createDataFrame(
    [
        (None, "Bob", 0.1),
        ("Bob", "Carol", None),
        ("Carol", None, 0.3),
        ("Bob", "Carol", 0.2),
    ],
    schema=["from", "to", "amt"],
)
y = x.dropna(how="any", subset=["from", "to"])
x.show()
y.show()

+-----+-----+----+
| from|   to| amt|
+-----+-----+----+
| null|  Bob| 0.1|
|  Bob|Carol|null|
|Carol| null| 0.3|
|  Bob|Carol| 0.2|
+-----+-----+----+

+----+-----+----+
|from|   to| amt|
+----+-----+----+
| Bob|Carol|null|
| Bob|Carol| 0.2|
+----+-----+----+



# Add columns to Data Frame

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.withColumn">
<img align=left src="images/pyspark-pictures-dataframes-page64.svg" width=360 height=203 />
</a>

**withColumn(colName, col)**. Returns a new DataFrame by adding a column or replacing the existing column that has the same name.

The column expression must be an expression over this DataFrame; attempting to add a column from some other DataFrame will raise an error.

Parameters
- colName – string, name of the new column.
- col – a Column expression for the new column.

**Note**: This method introduces a projection internally. Therefore, calling it multiple times, for instance, via loops in order to add multiple columns can generate big plans which can cause performance issues and even StackOverflowException. To avoid this, use select() with the multiple columns at once.

In [25]:

# withColumn: add a boolean 'conf' column that is true where 'amt' is not null
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", None), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.withColumn("conf", x["amt"].isNotNull())
x.show()
y.show()

+-----+-----+----+
| from|   to| amt|
+-----+-----+----+
|Alice|  Bob| 0.1|
|  Bob|Carol|null|
|Carol| Dave| 0.3|
+-----+-----+----+

+-----+-----+----+-----+
| from|   to| amt| conf|
+-----+-----+----+-----+
|Alice|  Bob| 0.1| true|
|  Bob|Carol|null|false|
|Carol| Dave| 0.3| true|
+-----+-----+----+-----+



In [26]:
# lit(val) creates a Column of a literal value; here every row gets constant = 1
from pyspark.sql.functions import lit

x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", None), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.withColumn("constant", lit(1))
x.show()
y.show()

+-----+-----+----+
| from|   to| amt|
+-----+-----+----+
|Alice|  Bob| 0.1|
|  Bob|Carol|null|
|Carol| Dave| 0.3|
+-----+-----+----+

+-----+-----+----+--------+
| from|   to| amt|constant|
+-----+-----+----+--------+
|Alice|  Bob| 0.1|       1|
|  Bob|Carol|null|       1|
|Carol| Dave| 0.3|       1|
+-----+-----+----+--------+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.withColumnRenamed">
<img align=left src="images/pyspark-pictures-dataframes-page65.svg" width=360 height=203 />
</a>

**withColumnRenamed(existing, new)**. Returns a new DataFrame by renaming an existing column. This is a no-op if schema doesn’t contain the given column name.

Parameters
- existing – string, name of the existing column to rename.
- new – string, new name of the column.

In [27]:
# withColumnRenamed: rename 'amt' to 'amount', leaving the other columns untouched
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.withColumnRenamed("amt", "amount")
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+------+
| from|   to|amount|
+-----+-----+------+
|Alice|  Bob|   0.1|
|  Bob|Carol|   0.2|
|Carol| Dave|   0.3|
+-----+-----+------+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.drop">
<img align=left src="images/pyspark-pictures-dataframes-page16.svg" width=360 height=203 />
</a>

**drop(cols)** Returns a new DataFrame that drops the specified column. This is a no-op if schema doesn’t contain the given column name(s).

Parameters
- cols – a string name of the column to drop, or a Column to drop, or a list of string name of the columns to drop.

In [28]:
# drop: remove the 'amt' column
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.drop("amt")
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+
| from|   to|
+-----+-----+
|Alice|  Bob|
|  Bob|Carol|
|Carol| Dave|
+-----+-----+



# Aggregates

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.groupBy">
<img align=left src="images/pyspark-pictures-dataframes-page28.svg" width=360 height=203 />
</a>

**groupBy(*cols)**.
Groups the DataFrame using the specified columns, so we can run aggregation on them. See GroupedData for all the available aggregate functions. groupby() is an alias for groupBy().

Parameters
- cols – list of columns to group by. Each element should be a column name (string) or an expression (Column).

The available aggregate functions can be:
- built-in aggregation functions, such as avg, max, min, sum, count
- group aggregate pandas UDFs, created  with pandas_udf()

**Note:** 
- There is no partial aggregation with group aggregate UDFs, i.e., a full shuffle is required. 
- Also, all the data of a group will be loaded into memory, so the user should be aware of the potential OOM risk if data is skewed and certain groups are too large to fit in memory.
- If exprs is a single dict mapping from string to string, then 
    - the key is the column to perform aggregation on, and 
    - the value is the aggregate function.
- Alternatively, exprs can also be a list of aggregate Column expressions. 



In [29]:
# groupBy: on its own it returns a GroupedData object, not a DataFrame,
# so printing it shows only the object's repr.
x = spark.createDataFrame(
    [
        ("Alice", "Bob", 1),
        ("Alice", "Carol", 2),
        ("Carol", "Dave", 3),
        ("Carol", "Bob", 4),
    ],
    schema=["from", "to", "amt"],
)
y = x.groupBy("from")
x.show()
print(y)

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|  1|
|Alice|Carol|  2|
|Carol| Dave|  3|
|Carol|  Bob|  4|
+-----+-----+---+

<pyspark.sql.group.GroupedData object at 0x00000182D130FD30>


<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.groupBy">
<img align=left src="images/pyspark-pictures-dataframes-page29.svg" width=360 height=203 />
</a>

In [30]:
# groupBy(col1).avg(col2): mean 'amt' per 'from' value.
# Reuses the DataFrame x created in the preceding groupBy cell.
y = (
    x.groupBy("from")
    .avg("amt")
)
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|  1|
|Alice|Carol|  2|
|Carol| Dave|  3|
|Carol|  Bob|  4|
+-----+-----+---+

+-----+--------+
| from|avg(amt)|
+-----+--------+
|Carol|     3.5|
|Alice|     1.5|
+-----+--------+



In [31]:
from pyspark.sql import functions as F

# Aggregate over the whole DataFrame (no groupBy), shown in two forms:
# a {column: aggregate-name} dict, and an aggregate Column expression.
print(x.agg({"amt": "max"}).collect())
print(x.agg(F.min(x["amt"])).collect())


[Row(max(amt)=4)]
[Row(min(amt)=1)]


# Top K queries

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.orderBy">
<img align=left src="images/pyspark-pictures-dataframes-page38.svg" width=360 height=203 />
</a>

**orderBy(*cols, \*\*kwargs)**. Returns a new DataFrame sorted by the specified column(s).

Parameters

- cols – list of Column or column names to sort by.
- ascending – boolean or list of boolean (default True). Sort ascending vs. descending. Specify list for multiple sort orders. If a list is specified, length of the list must equal length of the cols.

In [32]:
# orderBy: sort by 'to' in descending order
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
# Form 1: column-name list plus an `ascending` flag list.
y = x.orderBy(["to"], ascending=[False])
# Form 2: a descending Column expression. This reassigns y, so only this
# result is displayed below.
y = x.orderBy(F.desc("to"))
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Carol| Dave|0.3|
|  Bob|Carol|0.2|
|Alice|  Bob|0.1|
+-----+-----+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.limit">
<img align=left src="images/pyspark-pictures-dataframes-page34.svg" width=360 height=203 />
</a>

**limit(num)**. Limits the result count to the number specified.

In [33]:
# limit: y takes the first 2 rows; z takes the 2 rows with the largest 'amt'
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = x.limit(2)
z = (
    x.orderBy(["amt"], ascending=False)
    .limit(2)
)
x.show()
y.show()
z.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Carol| Dave|0.3|
|  Bob|Carol|0.2|
+-----+-----+---+



# Join

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.join">
<img align=left src="images/pyspark-pictures-dataframes-page33.svg" width=360 height=203 />
</a>

**join(other, on=None, how=None)**. Joins with another DataFrame, using the given join expression.

Parameters
- other – Right side of the join
- on – a string for the join column name, a list of column names, a join expression (Column), or a list of Columns. If on is a string or a list of strings indicating the name of the join column(s), the column(s) must exist on both sides, and this performs an equi-join.
- how – str, default inner. Must be one of: inner, cross, outer, full, fullouter, full_outer, left, leftouter, left_outer, right, rightouter, right_outer, semi, leftsemi, left_semi, anti, leftanti and left_anti.

In [34]:
# join: inner join of transactions (x) with people (y) where x.to == y.name,
# then keep the transaction columns plus the recipient's age
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = spark.createDataFrame(
    [("Alice", 20), ("Bob", 40), ("Dave", 80)],
    schema=["name", "age"],
)
z = (
    x.join(y, on=x.to == y.name, how="inner")
    .select("from", "to", "amt", "age")
)
x.show()
y.show()
z.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+---+
| name|age|
+-----+---+
|Alice| 20|
|  Bob| 40|
| Dave| 80|
+-----+---+

+-----+----+---+---+
| from|  to|amt|age|
+-----+----+---+---+
|Alice| Bob|0.1| 40|
|Carol|Dave|0.3| 80|
+-----+----+---+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.intersect">
<img align=left src="images/pyspark-pictures-dataframes-page31.svg" width=360 height=203 />
</a>

**intersect(other)**. Return a new DataFrame containing rows only in both this DataFrame and another DataFrame. This is equivalent to INTERSECT in SQL.

In [35]:
# intersect: rows present in both x and y (all columns must match)
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Alice", 0.2), ("Carol", "Dave", 0.1)],
    schema=["from", "to", "amt"],
)
z = x.intersect(y)
x.show()
y.show()
z.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Alice|0.2|
|Carol| Dave|0.1|
+-----+-----+---+

+-----+---+---+
| from| to|amt|
+-----+---+---+
|Alice|Bob|0.1|
+-----+---+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.subtract">
<img align=left src="images/pyspark-pictures-dataframes-page56.svg" width=360 height=203 />
</a>

**subtract(other)**. Return a new DataFrame containing rows in this DataFrame but not in another DataFrame. This is equivalent to EXCEPT DISTINCT in SQL.

In [36]:
# subtract: rows of x that do not appear in y
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
y = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.1)],
    schema=["from", "to", "amt"],
)
z = x.subtract(y)
x.show()
y.show()
z.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.1|
+-----+-----+---+

+-----+----+---+
| from|  to|amt|
+-----+----+---+
|Carol|Dave|0.3|
+-----+----+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.unionAll">
<img align=left src="images/pyspark-pictures-dataframes-page61.svg" width=360 height=203 />
</a>

**unionAll(other)**. Return a new DataFrame containing union of rows in this and another DataFrame. This is equivalent to UNION ALL in SQL. To do a SQL-style set union (that does deduplication of elements), use this function followed by distinct(). Also as standard in SQL, this function resolves columns by position (not by name).

In [37]:
# unionAll: concatenate the rows of x and y; the shared ("Bob", "Carol", 0.2)
# row is kept twice (no de-duplication)
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2)],
    schema=["from", "to", "amt"],
)
y = spark.createDataFrame(
    [("Bob", "Carol", 0.2), ("Carol", "Dave", 0.1)],
    schema=["from", "to", "amt"],
)
z = x.unionAll(y)
x.show()
y.show()
z.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|  Bob|Carol|0.2|
|Carol| Dave|0.1|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|  Bob|Carol|0.2|
|Carol| Dave|0.1|
+-----+-----+---+



# SQL like Operations on DataFrames

**sql(sqlQuery)**. Returns a DataFrame representing the result of the given query.

**createOrReplaceTempView(name)**. Registers this DataFrame as a temporary table using the given name.
The lifetime of this temporary table is tied to the SparkSession that was used to create this DataFrame.

In [38]:
# Register the drinker/beer/score DataFrame `df` (created in an earlier cell)
# as the temp view "rates", then query it with SQL, aliasing each column.
df.createOrReplaceTempView(name="rates")
df2 = spark.sql(
    "SELECT drinker AS d, beer as b, score as s from rates"
)
df2.collect()

[Row(d='Chris', b='Berliner', s=5),
 Row(d='Peter', b='Bud Light', s=9),
 Row(d='John', b='Corona Extra', s=6)]

In [39]:
# createOrReplaceTempView: expose x to SQL as TRANSACTIONS and filter it with a query
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
x.createOrReplaceTempView("TRANSACTIONS")
y = spark.sql('SELECT * FROM TRANSACTIONS WHERE amt > 0.1')
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+



# Statistics

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.count">
<img align=left src="images/pyspark-pictures-dataframes-page10.svg" width=360 height=203 />
</a>

**count()**. Returns the number of rows in this DataFrame.

In [40]:
# count: number of rows in the DataFrame
x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
x.show()
print(x.count())

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

3


<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.corr">
<img align=left src="images/pyspark-pictures-dataframes-page9.svg" width=360 height=203 />
</a>

**corr(col1, col2, method=None)**. Calculates the correlation of two columns of a DataFrame as a double value. Currently only supports the Pearson Correlation Coefficient. DataFrame.corr() and DataFrameStatFunctions.corr() are aliases of each other.

Parameters
- col1 – The name of the first column
- col2 – The name of the second column
- method – The correlation method. Currently only supports “pearson”

In [41]:
# corr: correlation between the 'amt' and 'fee' columns, returned as a float
x = spark.createDataFrame(
    [
        ("Alice", "Bob", 0.1, 0.001),
        ("Bob", "Carol", 0.2, 0.02),
        ("Carol", "Dave", 0.3, 0.02),
    ],
    schema=["from", "to", "amt", "fee"],
)
y = x.corr("amt", "fee")
x.show()
print(y)

+-----+-----+---+-----+
| from|   to|amt|  fee|
+-----+-----+---+-----+
|Alice|  Bob|0.1|0.001|
|  Bob|Carol|0.2| 0.02|
|Carol| Dave|0.3| 0.02|
+-----+-----+---+-----+

0.8660254037844389


<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.cov">
<img align=left src="images/pyspark-pictures-dataframes-page11.svg" width=360 height=203 />
</a>

**cov(col1, col2)**. Calculate the sample covariance for the given columns, specified by their names, as a double value. DataFrame.cov() and DataFrameStatFunctions.cov() are aliases.

Parameters
- col1 – The name of the first column
- col2 – The name of the second column

In [42]:
# cov: sample covariance of the 'amt' and 'fee' columns, returned as a float
x = spark.createDataFrame(
    [
        ("Alice", "Bob", 0.1, 0.001),
        ("Bob", "Carol", 0.2, 0.02),
        ("Carol", "Dave", 0.3, 0.02),
    ],
    schema=["from", "to", "amt", "fee"],
)
y = x.cov("amt", "fee")
x.show()
print(y)

+-----+-----+---+-----+
| from|   to|amt|  fee|
+-----+-----+---+-----+
|Alice|  Bob|0.1|0.001|
|  Bob|Carol|0.2| 0.02|
|Carol| Dave|0.3| 0.02|
+-----+-----+---+-----+

0.0009500000000000001


# Other

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.persist">
<img align=left src="images/pyspark-pictures-dataframes-page39.svg" width=360 height=203 />
</a>

**persist(storageLevel=StorageLevel(True, True, False, False, 1))**. Sets the storage level to persist the contents of the DataFrame across operations after the first time it is computed. This can only be used to assign a new storage level if the DataFrame does not have a storage level set yet. If no storage level is specified defaults to (MEMORY_AND_DISK). 

**Note**: The default storage level has changed to MEMORY_AND_DISK to match Scala in 2.0.

In [44]:
# persist: mark x for caching with an explicit storage level
from pyspark.storagelevel import StorageLevel

x = spark.createDataFrame(
    [("Alice", "Bob", 0.1), ("Bob", "Carol", 0.2), ("Carol", "Dave", 0.3)],
    schema=["from", "to", "amt"],
)
# Same flags as StorageLevel(True, True, False, True, 1), spelled out by name.
x.persist(
    storageLevel=StorageLevel(
        useDisk=True,
        useMemory=True,
        useOffHeap=False,
        deserialized=True,
        replication=1,
    )
)
x.show()
x.is_cached

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+



True

<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.unpersist">
<img align=left src="images/pyspark-pictures-dataframes-page62.svg" width=360 height=203 />
</a>

**unpersist(blocking=False)**. Marks the DataFrame as non-persistent, and removes all blocks for it from memory and disk.

In [45]:
# unpersist: drop a previously cached DataFrame from the cache
# cache() returns the same DataFrame, so it can be chained onto creation.
x = spark.createDataFrame(
    [
        ('Alice', "Bob", 0.1),
        ("Bob", "Carol", 0.2),
        ("Carol", "Dave", 0.3),
    ],
    ['from', 'to', 'amt'],
).cache()
x.count()  # run an action on the cached DataFrame before showing it
x.show()
print(x.is_cached)  # flag while cached
x.unpersist()
print(x.is_cached)  # flag after unpersist()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

True
False


<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.replace">
<img align=left src="images/pyspark-pictures-dataframes-page45.svg" width=360 height=203 />
</a>

**replace(to_replace, value=&lt;no value&gt;, subset=None)**. Returns a new DataFrame replacing a value with another value. DataFrame.replace() and DataFrameNaFunctions.replace() are aliases of each other. Values to_replace and value must have the same type and can only be numerics, booleans, or strings. Value can have None. When replacing, the new value will be cast to the type of the existing column. For numeric replacements all values to be replaced should have unique floating point representation. In case of conflicts (for example with {42: -1, 42.0: 1}) an arbitrary replacement will be used.

Parameters
- to_replace – bool, int, long, float, string, list or dict. Value to be replaced. If the value is a dict, then value is ignored or can be omitted, and to_replace must be a mapping between a value and a replacement.
- value – bool, int, long, float, string, list or None. The replacement value must be a bool, int, long, float, string or None. If value is a list, value should be of the same length and type as to_replace. If value is a scalar and to_replace is a sequence, then value is used as a replacement for each item in to_replace.
- subset – optional list of column names to consider. Columns specified in subset that do not have matching data type are ignored. For example, if value is a string, and subset contains a non-string column, then the non-string column is simply ignored.

In [46]:
# replace: swap one value for another, restricted to the listed columns
x = spark.createDataFrame(
    [
        ('Alice', "Bob", 0.1),
        ("Bob", "Carol", 0.2),
        ("Carol", "Dave", 0.3),
    ],
    ['from', 'to', 'amt'],
)
# Returns a new DataFrame; x itself is left unchanged (shown first for comparison).
y = x.replace(to_replace='Dave', value='David', subset=['from', 'to'])
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol|David|0.3|
+-----+-----+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.sample">
<img align=left src="images/pyspark-pictures-dataframes-page47.svg" width=360 height=203 />
</a>

**sample(withReplacement=None, fraction=None, seed=None)**. Returns a sampled subset of this DataFrame.

Parameters
- withReplacement – Sample with replacement or not (default False).
- fraction – Fraction of rows to generate, range [0.0, 1.0].
- seed – Seed for sampling (default a random seed).

In [47]:
# sample: draw a random subset of the rows
x = spark.createDataFrame(
    [
        ('Alice', "Bob", 0.1),
        ("Bob", "Carol", 0.2),
        ("Carol", "Dave", 0.3),
    ],
    ['from', 'to', 'amt'],
)
# A fixed seed makes the draw repeatable on a fresh "Restart & Run All";
# without it every execution of this cell can return a different subset.
# Note: the number of rows returned is not guaranteed to be exactly
# fraction * count.
y = x.sample(withReplacement=False, fraction=0.5, seed=10)
x.show()
y.show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+---+---+
| from| to|amt|
+-----+---+---+
|Alice|Bob|0.1|
+-----+---+---+



<a href="http://spark.apache.org/docs/3.0.0/api/python/pyspark.sql.html#pyspark.sql.DataFrame.randomSplit">
<img align=left src="images/pyspark-pictures-dataframes-page41.svg" width=360 height=203 />
</a>

**randomSplit(weights, seed=None)**. Randomly splits this DataFrame with the provided weights.

Parameters
- weights – list of doubles as weights with which to split the DataFrame. Weights will be normalized if they don’t sum up to 1.0.
- seed – The seed for sampling.

In [48]:
# randomSplit: partition the rows at random into one DataFrame per weight
x = spark.createDataFrame(
    [
        ('Alice', "Bob", 0.1),
        ('Alice', "Bob", 0.4),
        ('Alice', "Bob", 0.5),
        ('Alice', "Bob", 0.6),
        ("Bob", "Carol", 0.2),
        ("Carol", "Dave", 0.3),
    ],
    ['from', 'to', 'amt'],
)
# Two equal weights -> a list of two DataFrames; seed=10 fixes the split.
y = x.randomSplit(weights=[0.5, 0.5], seed=10)
x.show()
y[0].show()
y[1].show()

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|Alice|  Bob|0.4|
|Alice|  Bob|0.5|
|Alice|  Bob|0.6|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+-----+---+
| from|   to|amt|
+-----+-----+---+
|Alice|  Bob|0.1|
|  Bob|Carol|0.2|
|Carol| Dave|0.3|
+-----+-----+---+

+-----+---+---+
| from| to|amt|
+-----+---+---+
|Alice|Bob|0.4|
|Alice|Bob|0.5|
|Alice|Bob|0.6|
+-----+---+---+

