In [3]:
""" PySpark setup """

# Find the latest version of spark 3.0 from http://www-us.apache.org/dist/spark/ and enter as the spark version environment variable
import os
spark_version = 'spark-3.0.2'
os.environ['SPARK_VERSION'] = spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www-us.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop2.7.tgz
!tar xf $SPARK_VERSION-bin-hadoop2.7.tgz
!pip install -q findspark

# Set environment variables
os.environ["JAVA_HOME"] = '/usr/lib/jvm/java-11-openjdk-amd64'
os.environ["SPARK_HOME"] = f'/content/{spark_version}-bin-hadoop2.7'

# Locate Spark
import findspark
findspark.init()

Hit:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Hit:3 http://security.ubuntu.com/ubuntu bionic-security InRelease
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Hit:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Hit:7 http://archive.ubuntu.com/ubuntu bionic InRelease
Hit:8 http://archive.ubuntu.com/ubuntu bionic-updates InRelease
Hit:9 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease
Hit:11 http://archive.ubuntu.com/ubuntu bionic-backports InRelease
Hit:13 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Hit:14 http://ppa.launchpad.net/deadsnakes/ppa/ubuntu bionic InRelease
Hit:15 http://ppa.launchpad.net/graphics-drivers/ppa/ubuntu bionic 

In [4]:
# Dependencies
from pyspark import SparkFiles
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Spark session
spark = SparkSession.builder.appName('app').getOrCreate()
spark

In [7]:
# Add food data from S3
food_url = 'https://s3.amazonaws.com/dataviz-curriculum/day_1/food.csv'
spark.sparkContext.addFile(food_url)

# Define food schema
fields = [
  StructField('food', StringType(), nullable=True),
  StructField('price', IntegerType(), nullable=True)
]
schema = StructType(fields=fields)

# Read in food data
food_df = spark.read.csv(SparkFiles.get('food.csv'), schema=schema, sep=',', header=True)
food_df.printSchema() # schema
food_df.show() # head

root
 |-- food: string (nullable = true)
 |-- price: integer (nullable = true)

+-------+-----+
|   food|price|
+-------+-----+
|  pizza|    0|
|  sushi|   12|
|chinese|   10|
+-------+-----+



In [8]:
# Add new discount column
food_df = food_df.withColumn('discount', food_df['price'] / 2)
food_df = food_df.withColumnRenamed('discount', 'half_price') # rename new column
food_df.show()

+-------+-----+----------+
|   food|price|half_price|
+-------+-----+----------+
|  pizza|    0|       0.0|
|  sushi|   12|       6.0|
|chinese|   10|       5.0|
+-------+-----+----------+



In [9]:
# Add wine data
wine_url = 'https://s3.amazonaws.com/dataviz-curriculum/day_1/wine.csv'
spark.sparkContext.addFile(wine_url)

# Read in wine data
wine_df = spark.read.csv(SparkFiles.get('wine.csv'), sep=',', header=True)
wine_df.printSchema()
wine_df.show(5)

root
 |-- country: string (nullable = true)
 |-- description: string (nullable = true)
 |-- designation: string (nullable = true)
 |-- points: string (nullable = true)
 |-- price: string (nullable = true)
 |-- province: string (nullable = true)
 |-- region_1: string (nullable = true)
 |-- region_2: string (nullable = true)
 |-- variety: string (nullable = true)
 |-- winery: string (nullable = true)

+-------+--------------------+--------------------+------+-----+--------------+-----------------+-----------------+------------------+--------------------+
|country|         description|         designation|points|price|      province|         region_1|         region_2|           variety|              winery|
+-------+--------------------+--------------------+------+-----+--------------+-----------------+-----------------+------------------+--------------------+
|     US|This tremendous 1...|   Martha's Vineyard|    96|  235|    California|      Napa Valley|             Napa|Cabernet Sauvi

In [10]:
# Show average points
wine_df.select(avg('points')).show()

+-----------------+
|      avg(points)|
+-----------------+
|87.88834105383143|
+-----------------+



In [12]:
# Sort wines under $20 by descending points (with SQL filtering)
selected_cols = ['country', 'designation', 'points', 'price', 'variety']
cheap_wines = wine_df.filter('price < 20').select(selected_cols).orderBy(wine_df['points'].desc()) # transformation
cheap_wines.show(5) # action

+--------+--------------------+------+-----+--------------+
| country|         designation|points|price|       variety|
+--------+--------------------+------+-----+--------------+
|      US|                null|    94|   19|  Muscat Blanc|
|   Spain|Cardenal Cisneros...|    94|   15| Pedro Xim̩nez|
|      US|         Dijon Clone|    94|   18|    Chardonnay|
|      US|Stone's Throw Vin...|    94|   18|      Riesling|
|Portugal|                null|    94|   19|Portuguese Red|
+--------+--------------------+------+-----+--------------+
only showing top 5 rows



In [13]:
# Sort California wines over $15 by price (with Python filtering)
wine_df.filter((wine_df['province'] == 'California') & (wine_df['price'] > 15)).orderBy(wine_df['price']).show(5)

+-------+--------------------+--------------------+------+-----+----------+--------------------+-----------+--------------------+---------------+
|country|         description|         designation|points|price|  province|            region_1|   region_2|             variety|         winery|
+-------+--------------------+--------------------+------+-----+----------+--------------------+-----------+--------------------+---------------+
|     US|Sweet in blackber...|     Single Vineyard|    87|  100|California|          Yountville|       Napa|  Cabernet Sauvignon|    Ghost Block|
|     US|This lush wine ep...|            Red Wine|    94|  100|California|         Napa Valley|       Napa|Bordeaux-style Re...|         Viader|
|     US|A beautiful, rich...|           J. Schram|    94|  100|California|         North Coast|North Coast|     Sparkling Blend|    Schramsberg|
|     US|Pricy, but flashy...|     Single Vineyard|    90|  100|California|          Yountville|       Napa|  Cabernet Sauvi