In [None]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

In [None]:
# Import packages
from pyspark.sql import SparkSession
from pyspark.sql import Row
from pyspark.sql.types import StructType,StructField,StringType, DateType,IntegerType
from pyspark import SparkFiles

# Build the notebook's SparkSession under the application name "TempViews";
# getOrCreate() hands back an already-running session if one exists.
spark = (
    SparkSession.builder
    .appName("TempViews")
    .getOrCreate()
)

Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
22/11/09 15:13:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/11/09 15:13:55 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [None]:
# Location of the candy power ranking CSV; it is read straight from the raw
# GitHub URL rather than from a local copy.
url = "https://raw.githubusercontent.com/datasets/five-thirty-eight-datasets/master/datasets/candy-power-ranking/data/candy-data.csv"

In [None]:
# Register the remote CSV with Spark, then use SparkFiles.get() to resolve the
# path of the file that was added through `SparkContext.addFile`.
spark.sparkContext.addFile(url)
candy_data = spark.read.option('header', 'true').csv(
    SparkFiles.get("candy-data.csv"),
    inferSchema=True,
    sep=',',
    # Spark datetime patterns are case-sensitive: `MM` is month-of-year,
    # whereas the previous `mm` means minute-of-hour.
    timestampFormat="MM/dd/yy",
)

# Show DataFrame
candy_data.show()

+--------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|      competitorname|chocolate|fruity|caramel|peanutyalmondy|nougat|crispedricewafer|hard|bar|pluribus|sugarpercent|pricepercent|winpercent|
+--------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|           100 grand|        1|     0|      1|             0|     0|               1|   0|  1|       0|  0.73199999|  0.86000001| 66.971725|
|        3 musketeers|        1|     0|      0|             0|     1|               0|   0|  1|       0|  0.60399997|  0.51099998| 67.602936|
|            one dime|        0|     0|      0|             0|     0|               0|   0|  0|       0|       0.011|       0.116| 32.261086|
|         one quarter|        0|     0|      0|             0|     0|               0|   0|  0|       0|       0.011|  0.51099998| 46.116505|
|     

In [None]:
# Create a temporary view. A temp view is a prerequisite for querying the DataFrame with Spark SQL.


In [None]:
# Using SQL in Spark just requires calling spark.sql(<sql>) and then showing the result.
# Here we are peeking at the data using SparkSQL


+------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|    competitorname|chocolate|fruity|caramel|peanutyalmondy|nougat|crispedricewafer|hard|bar|pluribus|sugarpercent|pricepercent|winpercent|
+------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|         100 grand|        1|     0|      1|             0|     0|               1|   0|  1|       0|  0.73199999|  0.86000001| 66.971725|
|      3 musketeers|        1|     0|      0|             0|     1|               0|   0|  1|       0|  0.60399997|  0.51099998| 67.602936|
|          one dime|        0|     0|      0|             0|     0|               0|   0|  0|       0|       0.011|       0.116| 32.261086|
|       one quarter|        0|     0|      0|             0|     0|               0|   0|  0|       0|       0.011|  0.51099998| 46.116505|
|         air heads|

In [None]:
# Get the 10 highest "winpercent" values in descending order.


+--------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|      competitorname|chocolate|fruity|caramel|peanutyalmondy|nougat|crispedricewafer|hard|bar|pluribus|sugarpercent|pricepercent|winpercent|
+--------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|reeses peanut but...|        1|     0|      0|             1|     0|               0|   0|  0|       0|  0.72000003|  0.65100002|  84.18029|
|   reeses miniatures|        1|     0|      0|             1|     0|               0|   0|  0|       0| 0.034000002|  0.27900001| 81.866257|
|                twix|        1|     0|      1|             0|     0|               1|   0|  1|       0|       0.546|  0.90600002| 81.642914|
|             kit kat|        1|     0|      0|             0|     0|               1|   0|  1|       0|  0.31299999|  0.51099998|   76.7686|
|     

In [None]:
# Get the candies that have chocolate and caramel and order by the lowest sugar percentage.


+--------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|      competitorname|chocolate|fruity|caramel|peanutyalmondy|nougat|crispedricewafer|hard|bar|pluribus|sugarpercent|pricepercent|winpercent|
+--------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|           milk duds|        1|     0|      1|             0|     0|               0|   0|  0|       1|  0.30199999|  0.51099998| 55.064072|
|                twix|        1|     0|      1|             0|     0|               1|   0|  1|       0|       0.546|  0.90600002| 81.642914|
|            snickers|        1|     0|      1|             1|     1|               0|   0|  1|       0|       0.546|  0.65100002| 76.673782|
|           baby ruth|        1|     0|      1|             1|     1|               0|   0|  1|       0|  0.60399997|  0.76700002| 56.914547|
|    s

In [None]:
# Demonstrate how to write a SQL query on multiple lines. 


+--------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|      competitorname|chocolate|fruity|caramel|peanutyalmondy|nougat|crispedricewafer|hard|bar|pluribus|sugarpercent|pricepercent|winpercent|
+--------------------+---------+------+-------+--------------+------+----------------+----+---+--------+------------+------------+----------+
|           milk duds|        1|     0|      1|             0|     0|               0|   0|  0|       1|  0.30199999|  0.51099998| 55.064072|
|                twix|        1|     0|      1|             0|     0|               1|   0|  1|       0|       0.546|  0.90600002| 81.642914|
|            snickers|        1|     0|      1|             1|     1|               0|   0|  1|       0|       0.546|  0.65100002| 76.673782|
|           baby ruth|        1|     0|      1|             1|     1|               0|   0|  1|       0|  0.60399997|  0.76700002| 56.914547|
|    s