In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
            Get:2 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 14.2 kB/1140% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 14.2 kB/114                                                                               Ign:3 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu2004/x86_64  InRelease
0% [Waiting for headers] [1 InRelease 48.9 kB/114 kB 43%] [Waiting for headers]                                                                               Hit:4 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2004/x86_64  InRelease
0% [Waiting for headers] [1 InRelease 66.3 kB/114 kB 58%] [Waiting for headers]                                                                               Hit:5 https://developer.download.nvidia.c

In [2]:
# Create the SparkSession used by every later cell, or reuse one that is
# already running in this kernel.
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("Retail Transactions")
    .getOrCreate()
)

In [3]:
# Fetch the retail transactions CSV from its S3 URL and load it into a DataFrame.
from pyspark import SparkFiles

url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/2/retail_transactions.csv"

# Register the remote file with Spark, then resolve the local copy by file name.
spark.sparkContext.addFile(url)
csv_path = SparkFiles.get("retail_transactions.csv")

# The first row holds the column names; no schema is inferred here.
df = spark.read.csv(csv_path, header=True, sep=",")

# Preview the loaded DataFrame
df.show()

+-----------+----------------+------------------+
|customer_id|transaction_date|transaction_amount|
+-----------+----------------+------------------+
|     CS5295|       11-Feb-13|                35|
|     CS4768|       15-Mar-15|                39|
|     CS2122|       26-Feb-13|                52|
|     CS1217|       16-Nov-11|                99|
|     CS1850|       20-Nov-13|                78|
|     CS5539|       26-Mar-14|                81|
|     CS2724|       06-Feb-12|                93|
|     CS5902|       30-Jan-15|                89|
|     CS6040|       08-Jan-13|                76|
|     CS3802|       20-Aug-13|                75|
|     CS3494|       02-Jul-13|                94|
|     CS3780|       25-Mar-13|                80|
|     CS1171|       03-Nov-12|                59|
|     CS2892|       12-May-13|                43|
|     CS5552|       29-Dec-14|                78|
|     CS6043|       15-Jan-14|                98|
|     CS4147|       08-Jul-13|                81|


In [4]:
# Replace the string-typed "transaction_amount" column with a `float` version
# so the aggregations in the following cells operate on numbers.
amount_as_float = df["transaction_amount"].cast('float')
amountUpdated = df.withColumn("transaction_amount", amount_as_float)

In [5]:
# Average "transaction_amount" for each "customer_id".
avg_by_customer = amountUpdated.groupBy('customer_id').avg()
avg_by_customer.show()

+-----------+-----------------------+
|customer_id|avg(transaction_amount)|
+-----------+-----------------------+
|     CS6001|      69.57894736842105|
|     CS3128|                   64.0|
|     CS5241|      64.05263157894737|
|     CS3155|                  74.32|
|     CS2754|      74.45454545454545|
|     CS3950|      65.73684210526316|
|     CS5130|                  66.64|
|     CS3935|      67.34782608695652|
|     CS3083|      62.88235294117647|
|     CS3587|      69.80952380952381|
|     CS2282|      69.14285714285714|
|     CS1664|                  78.16|
|     CS1802|      70.85185185185185|
|     CS2412|                  67.75|
|     CS4335|                   66.0|
|     CS3997|      69.63157894736842|
|     CS4915|                   67.0|
|     CS3993|                  69.25|
|     CS2811|      65.29411764705883|
|     CS6062|      64.33333333333333|
+-----------+-----------------------+
only showing top 20 rows



In [6]:
# Largest "transaction_amount" for each "customer_id".
max_by_customer = amountUpdated.groupBy('customer_id').max()
max_by_customer.show()

+-----------+-----------------------+
|customer_id|max(transaction_amount)|
+-----------+-----------------------+
|     CS6001|                  105.0|
|     CS3128|                  100.0|
|     CS5241|                   94.0|
|     CS3155|                  105.0|
|     CS2754|                  104.0|
|     CS3950|                  105.0|
|     CS5130|                  104.0|
|     CS3935|                  104.0|
|     CS3083|                   94.0|
|     CS3587|                  105.0|
|     CS2282|                   97.0|
|     CS1664|                  105.0|
|     CS1802|                  104.0|
|     CS2412|                  102.0|
|     CS4335|                   96.0|
|     CS3997|                   99.0|
|     CS4915|                  102.0|
|     CS3993|                  103.0|
|     CS2811|                   96.0|
|     CS6062|                  100.0|
+-----------+-----------------------+
only showing top 20 rows



In [7]:
# Total "transaction_amount" for each "customer_id".
sum_by_customer = amountUpdated.groupBy('customer_id').sum()
sum_by_customer.show()

+-----------+-----------------------+
|customer_id|sum(transaction_amount)|
+-----------+-----------------------+
|     CS6001|                 1322.0|
|     CS3128|                 1472.0|
|     CS5241|                 1217.0|
|     CS3155|                 1858.0|
|     CS2754|                 1638.0|
|     CS3950|                 1249.0|
|     CS5130|                 1666.0|
|     CS3935|                 1549.0|
|     CS3083|                 1069.0|
|     CS3587|                 1466.0|
|     CS2282|                 1452.0|
|     CS1664|                 1954.0|
|     CS1802|                 1913.0|
|     CS2412|                 1084.0|
|     CS4335|                  660.0|
|     CS3997|                 1323.0|
|     CS4915|                 1541.0|
|     CS3993|                 1385.0|
|     CS2811|                 1110.0|
|     CS6062|                  772.0|
+-----------+-----------------------+
only showing top 20 rows



In [8]:
# Average "transaction_amount" for each "transaction_date".
avg_by_date = amountUpdated.groupBy('transaction_date').avg()
avg_by_date.show()

+----------------+-----------------------+
|transaction_date|avg(transaction_amount)|
+----------------+-----------------------+
|       01-Feb-15|      62.19277108433735|
|       20-Feb-12|      66.10344827586206|
|       22-Jul-12|                   67.0|
|       17-Jan-15|      62.89655172413793|
|       28-Aug-11|      63.70754716981132|
|       29-Jul-12|                   64.7|
|       17-Jan-14|                   62.3|
|       12-Jun-13|      68.32530120481928|
|       23-Oct-12|      65.56666666666666|
|       02-Nov-12|      65.55445544554455|
|       06-Mar-14|       64.3529411764706|
|       11-Sep-12|      68.19540229885058|
|       29-Dec-13|      67.68656716417911|
|       14-Feb-13|     63.857142857142854|
|       18-Aug-13|       67.5764705882353|
|       09-Jan-14|      67.68888888888888|
|       28-May-11|      72.83720930232558|
|       26-Oct-13|      66.19354838709677|
|       21-Nov-11|      65.57142857142857|
|       12-Feb-13|                   64.5|
+----------

In [9]:
# Total "transaction_amount" for each "transaction_date".
sum_by_date = amountUpdated.groupBy('transaction_date').sum()
sum_by_date.show()

+----------------+-----------------------+
|transaction_date|sum(transaction_amount)|
+----------------+-----------------------+
|       01-Feb-15|                 5162.0|
|       20-Feb-12|                 5751.0|
|       22-Jul-12|                 6164.0|
|       17-Jan-15|                 3648.0|
|       28-Aug-11|                 6753.0|
|       29-Jul-12|                 5176.0|
|       17-Jan-14|                 4361.0|
|       12-Jun-13|                 5671.0|
|       23-Oct-12|                 5901.0|
|       02-Nov-12|                 6621.0|
|       06-Mar-14|                 5470.0|
|       11-Sep-12|                 5933.0|
|       29-Dec-13|                 4535.0|
|       14-Feb-13|                 4470.0|
|       18-Aug-13|                 5744.0|
|       09-Jan-14|                 6092.0|
|       28-May-11|                 6264.0|
|       26-Oct-13|                 6156.0|
|       21-Nov-11|                 5508.0|
|       12-Feb-13|                 4902.0|
+----------

In [10]:
# Largest "transaction_amount" for each "transaction_date".
max_by_date = amountUpdated.groupBy('transaction_date').max()
max_by_date.show()

+----------------+-----------------------+
|transaction_date|max(transaction_amount)|
+----------------+-----------------------+
|       01-Feb-15|                  105.0|
|       20-Feb-12|                  105.0|
|       22-Jul-12|                  105.0|
|       17-Jan-15|                  105.0|
|       28-Aug-11|                  105.0|
|       29-Jul-12|                  104.0|
|       17-Jan-14|                  104.0|
|       12-Jun-13|                  105.0|
|       23-Oct-12|                  105.0|
|       02-Nov-12|                  105.0|
|       06-Mar-14|                  105.0|
|       11-Sep-12|                  105.0|
|       29-Dec-13|                  105.0|
|       14-Feb-13|                  103.0|
|       18-Aug-13|                  105.0|
|       09-Jan-14|                  105.0|
|       28-May-11|                  105.0|
|       26-Oct-13|                  105.0|
|       21-Nov-11|                  105.0|
|       12-Feb-13|                  105.0|
+----------