# Aggregations and Grouping

## Prerrequisites

Install Spark and Java in VM

In [12]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [13]:
ls -l # check the .tgz is there

total 391016
drwxr-xr-x 1 root root      4096 Jan 11 17:02 [0m[01;34msample_data[0m/
-rw-r--r-- 1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz


In [14]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [15]:
!pip install -q findspark

In [16]:
!pip install py4j

# For maps
!pip install folium
!pip install plotly



Define the environment

In [17]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

Start Spark Session

---

In [18]:
import findspark
findspark.init("spark-3.5.0-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession

# create the session
spark = SparkSession \
        .builder \
        .appName("Aggregations and Grouping") \
        .master("local[*]") \
        .getOrCreate()

spark.version

'3.5.0'

In [19]:
spark

In [20]:
# For Pandas conversion optimization
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [21]:
# Import sql functions
from pyspark.sql.functions import *

Download datasets

In [22]:
!mkdir -p dataset
!wget -q https://raw.githubusercontent.com/paponsro/spark_edem_2324/master/dataset/movies.json -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/vehicles.csv -P /dataset
!wget -q https://github.com/masfworld/datahack_docker/raw/master/zeppelin/data/characters.csv -P /dataset
!ls /dataset

characters.csv	movies.json  vehicles.csv


Read JSON file

---

In [23]:
moviesDF = spark.read \
    .option("inferSchema", True) \
    .json("/dataset/movies.json")

In [24]:
moviesDF.show(2, False)
print(moviesDF.schema.fields)
moviesDF.columns

+-------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+------+----------------------+------------+--------+---------------+
|Creative_Type|Director|Distributor|IMDB_Rating|IMDB_Votes|MPAA_Rating|Major_Genre|Production_Budget|Release_Date|Rotten_Tomatoes_Rating|Running_Time_min|Source|Title                 |US_DVD_Sales|US_Gross|Worldwide_Gross|
+-------------+--------+-----------+-----------+----------+-----------+-----------+-----------------+------------+----------------------+----------------+------+----------------------+------------+--------+---------------+
|NULL         |NULL    |Gramercy   |6.1        |1071      |R          |NULL       |8000000          |12-Jun-98   |NULL                  |NULL            |NULL  |The Land Girls        |NULL        |146083  |146083         |
|NULL         |NULL    |Strand     |6.9        |207       |R          |Drama      |300000           |7-Aug-9

['Creative_Type',
 'Director',
 'Distributor',
 'IMDB_Rating',
 'IMDB_Votes',
 'MPAA_Rating',
 'Major_Genre',
 'Production_Budget',
 'Release_Date',
 'Rotten_Tomatoes_Rating',
 'Running_Time_min',
 'Source',
 'Title',
 'US_DVD_Sales',
 'US_Gross',
 'Worldwide_Gross']

## Examples

Count

In [25]:
# df rows counting, including NULLS
moviesDF.count()

3201

In [26]:
# using sql functions, NOT including NULLS
genresCountDF = moviesDF.select(count(col("Major_Genre")))
genresCountDF.show()

+------------------+
|count(Major_Genre)|
+------------------+
|              2926|
+------------------+



In [27]:
directorsCountDF = moviesDF.select(count(moviesDF.Director))
directorsCountDF.show()

+---------------+
|count(Director)|
+---------------+
|           1870|
+---------------+



In [28]:
moviesDF.select(count(moviesDF.Major_Genre).alias("countMajor"), count(moviesDF.Director)).show()

+----------+---------------+
|countMajor|count(Director)|
+----------+---------------+
|      2926|           1870|
+----------+---------------+



In [29]:
#using SQL syntax
moviesDF.select(expr("count(Director)")).show()
moviesDF.selectExpr("count(Director) as count").show()

+---------------+
|count(Director)|
+---------------+
|           1870|
+---------------+

+-----+
|count|
+-----+
| 1870|
+-----+



In [30]:
# using SQL
moviesDF.createOrReplaceTempView("movies")

In [31]:
spark.sql("select count(Director) from movies").show()

+---------------+
|count(Director)|
+---------------+
|           1870|
+---------------+



In [32]:
spark.sql("select count(Director) as countDirector, count(Major_Genre) from movies").show()

+-------------+------------------+
|countDirector|count(Major_Genre)|
+-------------+------------------+
|         1870|              2926|
+-------------+------------------+



Count Distinct

In [33]:
moviesDF.select(countDistinct(moviesDF.Major_Genre)).show()

+---------------------------+
|count(DISTINCT Major_Genre)|
+---------------------------+
|                         12|
+---------------------------+



In [34]:
spark.sql("select count(distinct Major_Genre) from movies").show()

+---------------------------+
|count(DISTINCT Major_Genre)|
+---------------------------+
|                         12|
+---------------------------+



Min and max

In [35]:
moviesDF.select(min(moviesDF.Production_Budget), max(moviesDF.Production_Budget)).show()

+----------------------+----------------------+
|min(Production_Budget)|max(Production_Budget)|
+----------------------+----------------------+
|                   218|             300000000|
+----------------------+----------------------+



In [36]:
spark.sql("select min(Production_Budget) from movies").show()

+----------------------+
|min(Production_Budget)|
+----------------------+
|                   218|
+----------------------+



Sum

In [37]:
moviesDF.select(sum(moviesDF.US_DVD_Sales).alias("salesUS")).show()
moviesDF.selectExpr("sum(US_DVD_Sales) as sales").show()

+-----------+
|    salesUS|
+-----------+
|19684472405|
+-----------+

+-----------+
|      sales|
+-----------+
|19684472405|
+-----------+



Average

In [38]:
moviesDF.select(avg(moviesDF.Production_Budget)).show()
spark.sql("select avg(Production_Budget) from movies").show()

+----------------------+
|avg(Production_Budget)|
+----------------------+
|    3.10691714484375E7|
+----------------------+

+----------------------+
|avg(Production_Budget)|
+----------------------+
|    3.10691714484375E7|
+----------------------+



Stats

In [39]:
moviesDF.select(mean(moviesDF.Rotten_Tomatoes_Rating)).show()
moviesDF.select(stddev(moviesDF.Rotten_Tomatoes_Rating)).show()

+---------------------------+
|avg(Rotten_Tomatoes_Rating)|
+---------------------------+
|          54.33692373976734|
+---------------------------+

+------------------------------+
|stddev(Rotten_Tomatoes_Rating)|
+------------------------------+
|             28.07659263787602|
+------------------------------+



### Grouping

---

In [40]:
countByGenreGF = moviesDF.groupBy(moviesDF.Major_Genre).count().orderBy("count")
countByGenreGF.show()

+-------------------+-----+
|        Major_Genre|count|
+-------------------+-----+
|Concert/Performance|    5|
|       Black Comedy|   36|
|            Western|   36|
|        Documentary|   43|
|            Musical|   53|
|    Romantic Comedy|  137|
|             Horror|  219|
|  Thriller/Suspense|  239|
|          Adventure|  274|
|               NULL|  275|
|             Action|  420|
|             Comedy|  675|
|              Drama|  789|
+-------------------+-----+



In [41]:
spark.sql("select Major_Genre, count(Major_Genre) as count from movies where Major_Genre is not null group by Major_Genre order by count").show()

+-------------------+-----+
|        Major_Genre|count|
+-------------------+-----+
|Concert/Performance|    5|
|       Black Comedy|   36|
|            Western|   36|
|        Documentary|   43|
|            Musical|   53|
|    Romantic Comedy|  137|
|             Horror|  219|
|  Thriller/Suspense|  239|
|          Adventure|  274|
|             Action|  420|
|             Comedy|  675|
|              Drama|  789|
+-------------------+-----+



In [42]:
avgRatingByGenreDF = moviesDF.groupBy(col("Major_Genre")).avg("IMDB_Rating").orderBy(col("avg(IMDB_Rating)").desc())
avgRatingByGenreDF.show()

+-------------------+------------------+
|        Major_Genre|  avg(IMDB_Rating)|
+-------------------+------------------+
|        Documentary| 6.997297297297298|
|            Western| 6.842857142857142|
|       Black Comedy|6.8187500000000005|
|              Drama| 6.773441734417339|
|               NULL|  6.50082644628099|
|            Musical|             6.448|
|  Thriller/Suspense| 6.360944206008582|
|          Adventure| 6.345019920318729|
|Concert/Performance|             6.325|
|             Action| 6.114795918367349|
|    Romantic Comedy| 5.873076923076922|
|             Comedy| 5.853858267716529|
|             Horror|5.6760765550239185|
+-------------------+------------------+



In [43]:
moviesDF.groupBy(col("Major_Genre")).agg(avg("IMDB_Rating") \
    .alias("avg")).orderBy(col("avg").desc()).show()

+-------------------+------------------+
|        Major_Genre|               avg|
+-------------------+------------------+
|        Documentary| 6.997297297297298|
|            Western| 6.842857142857142|
|       Black Comedy|6.8187500000000005|
|              Drama| 6.773441734417339|
|               NULL|  6.50082644628099|
|            Musical|             6.448|
|  Thriller/Suspense| 6.360944206008582|
|          Adventure| 6.345019920318729|
|Concert/Performance|             6.325|
|             Action| 6.114795918367349|
|    Romantic Comedy| 5.873076923076922|
|             Comedy| 5.853858267716529|
|             Horror|5.6760765550239185|
+-------------------+------------------+



In [44]:
aggregationsByGenreDF = moviesDF.groupBy("Major_Genre") \
    .agg(
        count("*").alias("N_Movies"),
        avg("IMDB_Rating").alias("rating")
    ) \
    .orderBy(col("rating").desc()).show()

+-------------------+--------+------------------+
|        Major_Genre|N_Movies|            rating|
+-------------------+--------+------------------+
|        Documentary|      43| 6.997297297297298|
|            Western|      36| 6.842857142857142|
|       Black Comedy|      36|6.8187500000000005|
|              Drama|     789| 6.773441734417339|
|               NULL|     275|  6.50082644628099|
|            Musical|      53|             6.448|
|  Thriller/Suspense|     239| 6.360944206008582|
|          Adventure|     274| 6.345019920318729|
|Concert/Performance|       5|             6.325|
|             Action|     420| 6.114795918367349|
|    Romantic Comedy|     137| 5.873076923076922|
|             Comedy|     675| 5.853858267716529|
|             Horror|     219|5.6760765550239185|
+-------------------+--------+------------------+



## Exercises
   1. Sum up all the worldwide profits of ALL the movies in the DF. Then sum the worldwide profits per director
   2. Count how many distinct directors we have
   3. Show the mean and standard deviation of US gross revenue for the movies (all the movies)
   4. Compute the average IMDB rating and the average US gross revenue PER DIRECTOR
   5. Sum up ALL the profits of ALL the movies in the DF. Then sum ALL the profits per director. Can you see null values? Why? How you can solve it?


Exercise 1



In [47]:
moviesDF.select(sum(moviesDF.US_DVD_Sales).alias("worldwide_profits")).show()
moviesDF.selectExpr("sum(US_DVD_Sales) as sales").show()

#Profits from all the movies
total_worldwide_profit = moviesDF.select(sum("Worldwide_Gross").alias("Total_Worldwide_Profit")).collect()[0]["Total_Worldwide_Profit"]

#Pofits by director
profits_per_director = moviesDF.groupBy("Director").agg(sum("Worldwide_Gross").alias("Total_Profit"))

# Show the total worldwide profit and profits per director
print(f"Total Worldwide Profit: {total_worldwide_profit}")
profits_per_director.show()



+-----------------+
|worldwide_profits|
+-----------------+
|      19684472405|
+-----------------+

+-----------+
|      sales|
+-----------+
|19684472405|
+-----------+

Total Worldwide Profit: 272586820052
+-------------------+------------+
|           Director|Total_Profit|
+-------------------+------------+
|        John Milius|    52735525|
|       Jim Jarmusch|     6030230|
|    Sheldon Lettich|    29090445|
|     Chan-wook Park|    23471871|
|         King Vidor|    73702232|
|         Jared Hess|   145338949|
|     Michael Curtiz|    25462500|
|      Steven Seagal|    38590458|
|  Bradley Rust Grey|       25572|
|          Jeff Burr|     1355728|
|       Bob Rafelson|    45283350|
|Paul Michael Glaser|    38122000|
|      Griffin Dunne|    68450430|
|        Mark Waters|   563693780|
|  Richard Fleischer|    91054152|
|     Akira Kurosawa|      320592|
|          Mira Nair|    79806867|
|  Denzel Washington|    53593730|
| Billy Bob Thornton|    18120267|
|     Emilio Estevez|

Exercise 2

In [52]:
distinct_directos_count=moviesDF.select(countDistinct("Director")).show()

moviesDF.groupBy("Director").count().show()

spark.sql("SELECT Director, COUNT(*) FROM movies GROUP BY Director").show()



+------------------------+
|count(DISTINCT Director)|
+------------------------+
|                     550|
+------------------------+

+-------------------+-----+
|           Director|count|
+-------------------+-----+
|        John Milius|    2|
|       Jim Jarmusch|    1|
|    Sheldon Lettich|    1|
|     Chan-wook Park|    1|
|         King Vidor|    4|
|         Jared Hess|    2|
|     Michael Curtiz|    3|
|      Steven Seagal|    1|
|  Bradley Rust Grey|    1|
|          Jeff Burr|    1|
|       Bob Rafelson|    2|
|Paul Michael Glaser|    1|
|      Griffin Dunne|    2|
|        Mark Waters|    5|
|  Richard Fleischer|    4|
|     Akira Kurosawa|    2|
|          Mira Nair|    6|
|  Denzel Washington|    2|
| Billy Bob Thornton|    1|
|     Emilio Estevez|    1|
+-------------------+-----+
only showing top 20 rows

+-------------------+--------+
|           Director|count(1)|
+-------------------+--------+
|        John Milius|       2|
|       Jim Jarmusch|       1|
|    Sheldo

Exercise 3

In [56]:
from pyspark.sql.functions import mean, stddev

# Calculate the mean and sample standard deviation of US gross revenue
moviesDF.agg(
    mean("US_Gross").alias("Mean_US_Gross"),
    stddev("US_Gross").alias("StdDev_US_Gross")
).show()

----------------------------------------------------------


moviesDF.select(mean(moviesDF.US_Gross)).show()
moviesDF.select(stddev(moviesDF.US_Gross)).show()


+--------------------+-------------------+
|       Mean_US_Gross|    StdDev_US_Gross|
+--------------------+-------------------+
|4.4002085163744524E7|6.255531139066214E7|
+--------------------+-------------------+

+--------------------+
|       avg(US_Gross)|
+--------------------+
|4.4002085163744524E7|
+--------------------+

+-------------------+
|   stddev(US_Gross)|
+-------------------+
|6.255531139066214E7|
+-------------------+



Exercise 4

In [83]:
# Group by director and calculate the averages
moviesDF.groupBy("Director").agg(
    avg("IMDB_Rating").alias("Average_IMDB_Rating"),
    avg("US_Gross").alias("Average_US_Gross")
).show()


# Calculate the average IMDB rating and US gross revenue per director
spark.sql("""
    SELECT
        Director,
        AVG(IMDB_Rating) AS Average_IMDB_Rating,
        AVG(US_Gross) AS Average_US_Gross
    FROM movies
    GROUP BY Director
""").show()


+-------------------+-------------------+----------------+
|           Director|Average_IMDB_Rating|Average_US_Gross|
+-------------------+-------------------+----------------+
|        John Milius|               6.05|    2.63677625E7|
|       Jim Jarmusch|               NULL|       3330230.0|
|    Sheldon Lettich|                4.7|     2.9090445E7|
|     Chan-wook Park|                7.7|        211667.0|
|         King Vidor|              7.375|     1.5675558E7|
|         Jared Hess|  6.300000000000001|    6.23694745E7|
|     Michael Curtiz|                7.5|       8487500.0|
|      Steven Seagal|                3.8|     3.8590458E7|
|  Bradley Rust Grey|                6.2|         25572.0|
|          Jeff Burr|                5.6|       1355728.0|
|       Bob Rafelson|               6.25|       6641675.0|
|Paul Michael Glaser|                6.4|        3.8122E7|
|      Griffin Dunne|                5.5|    2.34819955E7|
|        Mark Waters|  6.380000000000001|    6.52360426E

Exercise 5

In [75]:

movies_with_total_profit = moviesDF.withColumn("Total_Profit", col("Worldwide_Gross") + col("US_Gross"))

# Cambiar el allias
total_profits = movies_with_total_profit.select(sum("Total_Profit").alias("Profits_Totales"))

# Show the result
total_profits.show()



+---------------+
|Profits_Totales|
+---------------+
|   413129480065|
+---------------+

