In [1]:
# Spark Imports
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
from pyspark.sql.types import (StructField,StringType, 
                               IntegerType,StructType)

In [2]:
# Regular Python imports.
from datetime import datetime
import random

In [3]:
# Initialize the seed for the Random Number Generator from the current time.
# random.seed() only accepts None, int, float, str, bytes or bytearray;
# passing the datetime object itself raises TypeError on Python 3.11+, so
# seed with its POSIX timestamp (a float) instead.
# NOTE(review): this seeds the RNG of this (driver) Python process only;
# Spark UDFs presumably run in separate Python worker processes with their
# own RNG state -- confirm before relying on this seed for the UDF output.
random.seed(datetime.now().timestamp())

In [4]:
# Create the Spark session for this notebook (getOrCreate() reuses an
# already running session instead of starting a second one).
spark = (
    SparkSession.builder
    .appName('aggs')
    .getOrCreate()
)

In [5]:
# Load the sales data. header=True takes the column names from the first
# row of the file; inferSchema=True asks Spark to work out the column types
# from the data rather than us declaring a schema by hand.
df = spark.read.csv(
    'Data/sales_info.csv',
    header=True,
    inferSchema=True,
)

In [6]:
# Number of rows that were read from the CSV file (count() is an action,
# so this triggers the actual read).
df.count()

12

In [7]:
# Show the schema that was inferred: the column names (taken from the
# header row) together with the type detected for each column.
df.printSchema()

root
 |-- Company: string (nullable = true)
 |-- Person: string (nullable = true)
 |-- Sales: double (nullable = true)



In [8]:
# Print the contents of the DataFrame as a table.
df.show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   GOOG|  Frank|340.0|
|   MSFT|   Tina|600.0|
|   MSFT|    Amy|124.0|
|   MSFT|Vanessa|243.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
+-------+-------+-----+



In [9]:
# groupBy() returns a GroupedData object, not a DataFrame. You can't call
# show() on it directly, but it exposes many aggregation functions.
grouped = df.groupBy("Company")

In [10]:
# Running an aggregate on the grouped data gives a DataFrame again, which
# (unlike the grouped data itself) can be shown. Mathematical aggregates
# such as mean() only apply to numerical columns, hence only avg(Sales).
company_means = grouped.mean()
company_means.show()

+-------+-----------------+
|Company|       avg(Sales)|
+-------+-----------------+
|   APPL|            370.0|
|   GOOG|            220.0|
|     FB|            610.0|
|   MSFT|322.3333333333333|
+-------+-----------------+



In [11]:
# Other aggregates available on grouped data include max, min, count,
# mean and avg. Here: the number of rows per company.
company_counts = grouped.count()
company_counts.show()

+-------+-----+
|Company|count|
+-------+-----+
|   APPL|    4|
|   GOOG|    3|
|     FB|    2|
|   MSFT|    3|
+-------+-----+



In [12]:
# A User-Defined Function (UDF) wraps a plain Python function so it can be
# used in DataFrame operations: it is called with the value of the input
# column for each row, and the results form a new column of the declared type.
def _random_up_to(value):
    """Return a random integer between 0 and ``value`` (both inclusive).

    ``value`` is truncated to an int first: the 'Sales' column is a double,
    and ``random.randint`` rejects float arguments on Python 3.12+.
    A null (None) input yields None instead of raising.
    """
    if value is None:
        return None
    return random.randint(0, int(value))


# Flag the UDF as non-deterministic so Spark's optimizer does not assume
# that repeated evaluations of it return the same result.
random_udf = f.udf(_random_up_to, IntegerType()).asNondeterministic()

In [13]:
# Give every row an extra 'Random' column holding a random integer between
# 0 and that row's 'Sales' value, then group the rows by company.
sales_with_random = df.withColumn('Random', random_udf(f.col('Sales')))
new_grouped = sales_with_random.groupBy("Company")

In [14]:
# When the new column should not be derived from another column but hold
# a fixed, column-independent value, wrap that value with lit().
sales_with_constant = df.withColumn('Twelve', f.lit(12))
sales_with_constant.show()

+-------+-------+-----+------+
|Company| Person|Sales|Twelve|
+-------+-------+-----+------+
|   GOOG|    Sam|200.0|    12|
|   GOOG|Charlie|120.0|    12|
|   GOOG|  Frank|340.0|    12|
|   MSFT|   Tina|600.0|    12|
|   MSFT|    Amy|124.0|    12|
|   MSFT|Vanessa|243.0|    12|
|     FB|   Carl|870.0|    12|
|     FB|  Sarah|350.0|    12|
|   APPL|   John|250.0|    12|
|   APPL|  Linda|130.0|    12|
|   APPL|   Mike|750.0|    12|
|   APPL|  Chris|350.0|    12|
+-------+-------+-----+------+



In [15]:
# agg() accepts a {column name: aggregate function name} dict, so several
# aggregations over different numerical columns run in a single call.
aggregations = {'Sales': 'sum', 'Random': 'avg'}
new_grouped.agg(aggregations).show()

+-------+----------+------------------+
|Company|sum(Sales)|       avg(Random)|
+-------+----------+------------------+
|   APPL|    1480.0|             207.0|
|   GOOG|     660.0| 84.33333333333333|
|     FB|    1220.0|             510.0|
|   MSFT|     967.0|231.33333333333334|
+-------+----------+------------------+



In [16]:
# Persist the DataFrame with cache(): without it, every computation on the
# DataFrame would evaluate the columns again and the 'Random' column would
# be re-rolled each time. cache() itself is lazy -- the data is only
# stored once the first action runs.
sales_with_random_col = df.withColumn('Random', random_udf(f.col('Sales')))
rnd_sales = sales_with_random_col.cache()

In [17]:
# Since the data is cached, the random column won't be re-rolled on later
# uses. Remember that a DataFrame is just a set of operations; only
# certain actions (such as show()) cause the whole thing to be computed.
rnd_sales.show()

+-------+-------+-----+------+
|Company| Person|Sales|Random|
+-------+-------+-----+------+
|   GOOG|    Sam|200.0|    51|
|   GOOG|Charlie|120.0|    32|
|   GOOG|  Frank|340.0|    92|
|   MSFT|   Tina|600.0|   118|
|   MSFT|    Amy|124.0|    98|
|   MSFT|Vanessa|243.0|   148|
|     FB|   Carl|870.0|   664|
|     FB|  Sarah|350.0|   277|
|   APPL|   John|250.0|   202|
|   APPL|  Linda|130.0|    40|
|   APPL|   Mike|750.0|   363|
|   APPL|  Chris|350.0|   229|
+-------+-------+-----+------+



In [18]:
# select() also accepts the predefined functions from the
# pyspark.sql.functions module, of which there are plenty.
# Here: the number of distinct values in the 'Sales' column.
distinct_sales = df.select(f.countDistinct('Sales'))
distinct_sales.show()

+---------------------+
|count(DISTINCT Sales)|
+---------------------+
|                   11|
+---------------------+



In [19]:
# Columns can be aliased to clearer names: `stddev_samp` is an ugly name.
# The raw result is still hard to read, with a long tail of decimals.
raw_sales_std = df.select(f.stddev('Sales').alias('STDDEV'))
# format_number(col, 2) rounds to two decimal places (not two significant
# digits) and returns the formatted value as a string column.
sales_std = raw_sales_std.select(f.format_number('STDDEV', 2).alias('ST2'))
sales_std.show()

+------+
|   ST2|
+------+
|250.09|
+------+



In [20]:
# You can order by one or several keys. Earlier keys take precedence and
# later keys are only used to break ties. A plain column name sorts
# ascending; for descending order you have to pass a Column object and
# call .desc() on it.
sort_keys = ['Company', f.col('Sales').desc(), df['Person'].asc()]
df.orderBy(*sort_keys).show()

+-------+-------+-----+
|Company| Person|Sales|
+-------+-------+-----+
|   APPL|   Mike|750.0|
|   APPL|  Chris|350.0|
|   APPL|   John|250.0|
|   APPL|  Linda|130.0|
|     FB|   Carl|870.0|
|     FB|  Sarah|350.0|
|   GOOG|  Frank|340.0|
|   GOOG|    Sam|200.0|
|   GOOG|Charlie|120.0|
|   MSFT|   Tina|600.0|
|   MSFT|Vanessa|243.0|
|   MSFT|    Amy|124.0|
+-------+-------+-----+

