# Counting M&Ms 
Write a notebook that reads a file with thousands of entries, then computes and aggregates the counts for each color and state.

In [0]:
# from the PySpark module, import the SparkSession and the column functions used in this notebook (col, count)

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, count

## SparkSession in Databricks
In a Databricks notebook, the SparkSession is created for you when the notebook is attached to a cluster. Whether the session is pre-created or obtained through the builder as in the next cell, it is accessible through a variable called *spark*, and through this variable you can access all its public fields and methods.

In [0]:
# Obtain the SparkSession through the builder API. getOrCreate() hands back
# the session that already exists (if any); otherwise it starts a new one
# under the application name "mnmCount".

spark = SparkSession.builder.appName("mnmCount").getOrCreate()

[0;31m---------------------------------------------------------------------------[0m
[0;31mNameError[0m                                 Traceback (most recent call last)
[0;32m<command-2601478670424854>[0m in [0;36m<cell line: 3>[0;34m()[0m
[1;32m      1[0m [0;31m# Build a SparkSession using the SparkSession APIs.[0m[0;34m[0m[0;34m[0m[0;34m[0m[0m
[1;32m      2[0m [0;34m[0m[0m
[0;32m----> 3[0;31m spark = (SparkSession
[0m[1;32m      4[0m         [0;34m.[0m[0mbuilder[0m[0;34m[0m[0;34m[0m[0m
[1;32m      5[0m         [0;34m.[0m[0mappName[0m[0;34m([0m[0;34m"mnmCount"[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m

[0;31mNameError[0m: name 'SparkSession' is not defined

In [0]:
%fs

ls dbfs:/databricks-datasets/learning-spark-v2/mnm_dataset.csv

path,name,size,modificationTime
dbfs:/databricks-datasets/learning-spark-v2/mnm_dataset.csv,mnm_dataset.csv,1284872,1587070414000


### Next:
Read the file into a Spark DataFrame using the CSV format by inferring the schema and specifying that the file contains a header, which provides column names for comma-separated fields.

In [0]:
# Location of the M&M dataset on DBFS.
mnm_file = "dbfs:/databricks-datasets/learning-spark-v2/mnm_dataset.csv"

# Load the CSV: the first line supplies the column names and Spark infers
# each column's type from the data.
mnm_df = (
    spark.read
    .format("csv")
    .options(header="true", inferSchema="true")
    .load(mnm_file)
)

# Preview the first 10 rows with the notebook's display() helper
# (mnm_df.show(10) is the plain-text alternative).
display(mnm_df.limit(10))

State,Color,Count
TX,Red,20
NV,Blue,66
CO,Blue,79
OR,Blue,71
WA,Yellow,93
WY,Blue,16
CA,Yellow,53
WA,Green,60
OR,Green,71
TX,Green,68


In [0]:
# Number of rows in the DataFrame.

mnm_df.count()

# Recorded result: 99,999 rows

Out[2]: 99999

### Next:
- Select the fields "State", "Color" and "Count" from the previous DataFrame.
- Since we want one group per state and M&M color, we use `groupBy()`.
- Aggregate the result by counting the "Count" entries within each group.
- Order the result by the total, in descending order.

In [0]:
# Keep only the columns of interest and bucket the rows by (State, Color).
grouped_by_state_color = (
    mnm_df
    .select("State", "Color", "Count")
    .groupBy("State", "Color")
)

# count("Count") tallies the non-null "Count" entries in each bucket and
# exposes the tally as "Total"; the largest totals come first.
# NOTE(review): this counts rows, it does not add up the "Count" values.
# If the goal is the total number of M&Ms per group, a sum would be
# needed instead -- confirm the intent.
count_mnm_df = (
    grouped_by_state_color
    .agg(count("Count").alias("Total"))
    .sort("Total", ascending=False)
)

display(count_mnm_df)

State,Color,Total
CA,Yellow,1807
WA,Green,1779
OR,Orange,1743
TX,Green,1737
TX,Red,1725
CA,Green,1723
CO,Yellow,1721
CA,Brown,1718
CO,Green,1713
NV,Orange,1712


In [0]:
# Number of (State, Color) groups in the aggregated DataFrame.
group_rows = count_mnm_df.count()
print("Total Rows = %d" % group_rows)

Total Rows = 60


### Next:
We just want to see the data for a single State -> California
- Select from all rows in the df
- Filter only CA State
- groupBy() per state and color as above
- aggregate the counts
- order the result in descending order

In [0]:
# Restrict the raw rows to California before grouping.
ca_rows = (
    mnm_df
    .select("State", "Color", "Count")
    .filter(mnm_df["State"] == "CA")
)

# Same aggregation as for the full dataset: one row per (State, Color)
# with the number of entries as "Total", largest first.
ca_count_mnm_df = (
    ca_rows
    .groupBy("State", "Color")
    .agg(count("Count").alias("Total"))
    .sort("Total", ascending=False)
)

display(ca_count_mnm_df)

State,Color,Total
CA,Yellow,1807
CA,Green,1723
CA,Brown,1718
CA,Orange,1657
CA,Red,1656
CA,Blue,1603


In [0]:
# Note: the California rows can also be obtained by filtering the full,
# already-aggregated DataFrame directly, without repeating the grouping.
# `col` comes from the notebook's import cell at the top, so this cell no
# longer carries its own import.

display(
    count_mnm_df
    .filter(col("State") == "CA")
    .orderBy(col("Total"), ascending=False)
)

State,Color,Total
CA,Yellow,1807
CA,Green,1723
CA,Brown,1718
CA,Orange,1657
CA,Red,1656
CA,Blue,1603


In [0]:
# Running this command detaches the notebook from the cluster; the cluster will then need to be restarted.

spark.stop()