In [85]:
# install Java8
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
# download spark 3.5.0
!wget -q https://apache.osuosl.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz

In [86]:
ls -l # check the .tgz is there

total 782060
drwxr-xr-x  1 root root      4096 Jan 11 17:02 [0m[01;34msample_data[0m/
drwxr-xr-x 13 1000 1000      4096 Sep  9 02:08 [01;34mspark-3.5.0-bin-hadoop3[0m/
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz
-rw-r--r--  1 root root 400395283 Sep  9 02:10 spark-3.5.0-bin-hadoop3.tgz.1
-rw-r--r--  1 root root     26898 Jan 12 22:18 starbucks.csv


In [87]:
# unzip it
!tar xf spark-3.5.0-bin-hadoop3.tgz

In [88]:
!pip install -q findspark

In [89]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"
os.environ["PYSPARK_SUBMIT_ARGS"] = "--master local[*] pyspark-shell"

Start Spark Session

In [90]:
import findspark
findspark.init("spark-3.5.0-bin-hadoop3")# SPARK_HOME

from pyspark.sql import SparkSession

# create the session
spark = SparkSession \
        .builder \
        .appName("Window Partitioning") \
        .master("local[*]") \
        .getOrCreate()

spark.version

'3.5.0'

In [91]:
spark

In [92]:
# For Pandas conversion optimization
spark.conf.set("spark.sql.execution.arrow.enabled", "true")

In [93]:
# Import sql functions
from pyspark.sql.functions import *
from pyspark.sql.window import Window
from pyspark.sql.functions import col, row_number

In [94]:
starbucksDF = spark.read.option("header", "true").csv("/content/starbucks.csv")
starbucksDF.show(30)


+--------------------+--------------------+------------------+--------+--------------+--------------+-----------------+------------+-------------------------+----------------+------------------+-----------+-------------+-----------------+----------------+----------------+------------+-------------+
|   Beverage_category|            Beverage|     Beverage_prep|Calories| Total Fat (g)|Trans Fat (g) |Saturated Fat (g)| Sodium (mg)| Total Carbohydrates (g) |Cholesterol (mg)| Dietary Fibre (g)| Sugars (g)| Protein (g) |Vitamin A (% DV) |Vitamin C (% DV)| Calcium (% DV) |Iron (% DV) |Caffeine (mg)|
+--------------------+--------------------+------------------+--------+--------------+--------------+-----------------+------------+-------------------------+----------------+------------------+-----------+-------------+-----------------+----------------+----------------+------------+-------------+
|              Coffee|       Brewed Coffee|             Short|       3|           0.1|             0

In [95]:
starbucksDF.printSchema()

root
 |-- Beverage_category: string (nullable = true)
 |-- Beverage: string (nullable = true)
 |-- Beverage_prep: string (nullable = true)
 |-- Calories: string (nullable = true)
 |--  Total Fat (g): string (nullable = true)
 |-- Trans Fat (g) : string (nullable = true)
 |-- Saturated Fat (g): string (nullable = true)
 |--  Sodium (mg): string (nullable = true)
 |--  Total Carbohydrates (g) : string (nullable = true)
 |-- Cholesterol (mg): string (nullable = true)
 |--  Dietary Fibre (g): string (nullable = true)
 |--  Sugars (g): string (nullable = true)
 |--  Protein (g) : string (nullable = true)
 |-- Vitamin A (% DV) : string (nullable = true)
 |-- Vitamin C (% DV): string (nullable = true)
 |--  Calcium (% DV) : string (nullable = true)
 |-- Iron (% DV) : string (nullable = true)
 |-- Caffeine (mg): string (nullable = true)



In [96]:
starbucksDF.count()

242

In [97]:
#Cuantas bebidas hay por categoria de bebida
bebidasDF = starbucksDF.groupBy("Beverage_category").agg(count("Beverage"))
bebidasDF.show()

+--------------------+---------------+
|   Beverage_category|count(Beverage)|
+--------------------+---------------+
|           Smoothies|              9|
|Frappuccino® Blen...|             13|
|Frappuccino® Ligh...|             12|
|Frappuccino® Blen...|             36|
|    Tazo® Tea Drinks|             52|
|              Coffee|              4|
|Shaken Iced Bever...|             18|
|Signature Espress...|             40|
|Classic Espresso ...|             58|
+--------------------+---------------+



In [98]:
#Cuantas preparaciones de bebidas hay por bebidas
preparacionDF = starbucksDF.groupBy("Beverage").agg(countDistinct("Beverage_prep"))
preparacionDF.show()

+--------------------+-----------------------------+
|            Beverage|count(DISTINCT Beverage_prep)|
+--------------------+-----------------------------+
|         Caffè Latte|                            6|
|   Caramel Macchiato|                            6|
|       Brewed Coffee|                            4|
|            Espresso|                            2|
|Mocha (Without Wh...|                            5|
|Caffè Mocha (With...|                            6|
|             Caramel|                            3|
|Iced Brewed Coffe...|                            5|
|     Caffè Americano|                            4|
|Shaken Iced Tazo®...|                            3|
|Tazo® Full-Leaf R...|                            6|
|               Mocha|                            3|
|Java Chip (Withou...|                            5|
|Strawberries & Cr...|                            5|
|Vanilla Bean (Wit...|                            4|
|Tazo® Green Tea L...|                        

In [99]:
# Cuantas calorias promedio tienen las bebidas
caloriasDF = starbucksDF.groupBy("Beverage").agg(round(avg("Calories"),2).alias("Calorias promedio"))
caloriasDF.show()

+--------------------+-----------------+
|            Beverage|Calorias promedio|
+--------------------+-----------------+
|         Caffè Latte|           139.17|
|   Caramel Macchiato|           184.17|
|       Brewed Coffee|             4.25|
|            Espresso|              7.5|
|Mocha (Without Wh...|           272.22|
|Caffè Mocha (With...|            210.0|
|             Caramel|            150.0|
|Iced Brewed Coffe...|           122.22|
|     Caffè Americano|            13.75|
|Shaken Iced Tazo®...|            140.0|
|Tazo® Full-Leaf R...|           148.33|
|               Mocha|           156.67|
|Java Chip (Withou...|           327.78|
|Strawberries & Cr...|           248.89|
|Vanilla Bean (Wit...|            197.5|
|Tazo® Green Tea L...|            275.0|
|              Coffee|            210.0|
|Hot Chocolate (Wi...|           236.67|
|           Java Chip|            220.0|
|Tazo® Chai Tea Latte|           196.67|
+--------------------+-----------------+
only showing top

In [100]:
#Cual es la cantidad maxima y minima de las bebidas
bmascaloriasDF = starbucksDF.agg(max("Calories").alias("Calorias maximas"),min("Calories").alias("Calorias minimas"))
bmascaloriasDF.show()

+----------------+----------------+
|Calorias maximas|Calorias minimas|
+----------------+----------------+
|              90|               0|
+----------------+----------------+



In [116]:
#Cual es la suma total de carbohidratos y grasas
totalDF2 = starbucksDF.select(col("Beverage"), col("Beverage_prep"), col(" Total Fat (g)"), col(" Total Carbohydrates (g) ")).withColumn("Total", col(" Total Fat (g)")+ col(" Total Carbohydrates (g) "))
totalDF2.show()


+--------------------+------------------+--------------+-------------------------+-----+
|            Beverage|     Beverage_prep| Total Fat (g)| Total Carbohydrates (g) |Total|
+--------------------+------------------+--------------+-------------------------+-----+
|       Brewed Coffee|             Short|           0.1|                        5|  5.1|
|       Brewed Coffee|              Tall|           0.1|                       10| 10.1|
|       Brewed Coffee|            Grande|           0.1|                       10| 10.1|
|       Brewed Coffee|             Venti|           0.1|                       10| 10.1|
|         Caffè Latte| Short Nonfat Milk|           0.1|                       75| 75.1|
|         Caffè Latte|           2% Milk|           3.5|                       85| 88.5|
|         Caffè Latte|           Soymilk|           2.5|                       65| 67.5|
|         Caffè Latte|  Tall Nonfat Milk|           0.2|                      120|120.2|
|         Caffè Latte

In [102]:
#Seleccionar las cuatro bebidas que mas tienen por azucar y cafeina

In [113]:
azucar = Window.partitionBy("Beverage").orderBy(col(" Sugars (g)").desc())
azucarDF = starbucksDF.withColumn("rank_azucar", row_number().over(azucar)).filter(col("rank_azucar")<=4)
azucarDF.show(20)

cafeina = Window.partitionBy("Beverage").orderBy(col("Caffeine (mg)").desc())
cafeinaDF = starbucksDF.withColumn("rank_cafeina", row_number().over(cafeina)).filter(col("rank_cafeina")<=4)
cafeinaDF.show(20)




+--------------------+--------------------+------------------+--------+--------------+--------------+-----------------+------------+-------------------------+----------------+------------------+-----------+-------------+-----------------+----------------+----------------+------------+-------------+-----------+
|   Beverage_category|            Beverage|     Beverage_prep|Calories| Total Fat (g)|Trans Fat (g) |Saturated Fat (g)| Sodium (mg)| Total Carbohydrates (g) |Cholesterol (mg)| Dietary Fibre (g)| Sugars (g)| Protein (g) |Vitamin A (% DV) |Vitamin C (% DV)| Calcium (% DV) |Iron (% DV) |Caffeine (mg)|rank_azucar|
+--------------------+--------------------+------------------+--------+--------------+--------------+-----------------+------------+-------------------------+----------------+------------------+-----------+-------------+-----------------+----------------+----------------+------------+-------------+-----------+
|           Smoothies|Banana Chocolate ...|Grande Nonfat Milk|  