In [1]:
import os
# Find the latest version of spark 3.x  from http://www.apache.org/dist/spark/ and enter as the spark version
# For example:
# spark_version = 'spark-3.4.0'
spark_version = 'spark-3.4.0'
os.environ['SPARK_VERSION']=spark_version

# Install Spark and Java
!apt-get update
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q http://www.apache.org/dist/spark/$SPARK_VERSION/$SPARK_VERSION-bin-hadoop3.tgz
!tar xf $SPARK_VERSION-bin-hadoop3.tgz
!pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_version}-bin-hadoop3"

# Start a SparkSession
import findspark
findspark.init()

0% [Working]            Get:1 http://security.ubuntu.com/ubuntu focal-security InRelease [114 kB]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 8,192 B/114                                                                               Get:2 https://cloud.r-project.org/bin/linux/ubuntu focal-cran40/ InRelease [3,622 B]
0% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 95.3 kB/1140% [Connecting to archive.ubuntu.com (185.125.190.39)] [1 InRelease 114 kB/114 0% [Waiting for headers] [Waiting for headers] [Connecting to ppa.launchpad.net                                                                               Hit:3 http://archive.ubuntu.com/ubuntu focal InRelease
                                                                               0% [Waiting for headers] [Waiting for headers] [Waiting for headers]                                                                    Get:4 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu

In [2]:
# Build the SparkSession for this notebook, reusing an active one if it exists
from pyspark.sql import SparkSession
session_builder = SparkSession.builder.appName("PCard Transactions")
spark = session_builder.getOrCreate()

In [3]:
# Register the S3-hosted CSV with the SparkContext, then load it as a DataFrame
from pyspark import SparkFiles
url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/2/pcard_transactions.csv"
spark.sparkContext.addFile(url)
# SparkFiles.get resolves the local path of the file added above
csv_path = SparkFiles.get("pcard_transactions.csv")
df = spark.read.csv(csv_path, sep=",", header=True)

# Preview the first rows of the DataFrame
df.show()

+----------+-------------+--------------------+--------------------+------------------------+--------------------+------+--------------------+--------------------+--------------------+----------------------------+
|Year-Month|Agency Number|         Agency Name|Cardholder Last Name|Cardholder First Initial|         Description|Amount|              Vendor|    Transaction Date|         Posted Date|Merchant Category Code (MCC)|
+----------+-------------+--------------------+--------------------+------------------------+--------------------+------+--------------------+--------------------+--------------------+----------------------------+
|    201307|         1000|OKLAHOMA STATE UN...|               Mason|                       C|    GENERAL PURCHASE|   890|               NACAS|07/30/2013 12:00:...|07/31/2013 12:00:...|        CHARITABLE AND SO...|
|    201307|         1000|OKLAHOMA STATE UN...|               Mason|                       C|        ROOM CHARGES|368.96|      SHERATON HOTEL|07

In [4]:
# Get the data types of the columns.
# The CSV was read without schema inference, so every column is reported as a string.
df.printSchema()

root
 |-- Year-Month: string (nullable = true)
 |-- Agency Number: string (nullable = true)
 |-- Agency Name: string (nullable = true)
 |-- Cardholder Last Name: string (nullable = true)
 |-- Cardholder First Initial: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Amount: string (nullable = true)
 |-- Vendor: string (nullable = true)
 |-- Transaction Date: string (nullable = true)
 |-- Posted Date: string (nullable = true)
 |-- Merchant Category Code (MCC): string (nullable = true)



In [5]:
# Build a new DataFrame in which the string "Amount" column is replaced by a float version.
amount_as_float = df["Amount"].cast('float')
amountUpdated = df.withColumn("Amount", amount_as_float)

In [6]:
# Maximum "Amount" for each Agency Name.
# max() with no arguments covers the numeric columns; "Amount" is the only one here.
agency_max = amountUpdated.groupBy('Agency Name').max()
agency_max.show()

+--------------------+-----------+
|         Agency Name|max(Amount)|
+--------------------+-----------+
|BOLL WEEVIL ERADI...|    1639.99|
|   ROSE STATE COLEGE|     4580.0|
|EMPLOYMENT SECURI...|   12872.19|
|DEPARTMENT OF TOU...|   61000.72|
|DEPARTMENT OF LIB...|     4000.0|
|S. W. OKLA. ST. U...|   11073.09|
|OKLAHOMA MILITARY...|  183367.94|
| COMPSOURCE OKLAHOMA|     4995.0|
|COUNCIL ON LAW EN...|    4547.92|
|MERIT PROTECTION ...|     2550.0|
|OKLAHOMA ABSTRACT...|      405.3|
|SPEECH-LANGUAGE P...|      418.7|
|     STATE TREASURER|     4875.0|
|  HISTORICAL SOCIETY|    90108.0|
|OK. LAW ENFORCE. ...|    7063.03|
|DEPARTMENT OF VET...|   12395.03|
|CONSERVATION COMM...|     4520.0|
|MARGINALLY PROD. ...|     111.99|
|STATE ELECTION BOARD|    58875.0|
|OKLAHOMA ACCOUNTA...|    2165.68|
+--------------------+-----------+
only showing top 20 rows



In [7]:
# Total "Amount" for each Agency Name.
# sum() with no arguments covers the numeric columns; "Amount" is the only one here.
agency_totals = amountUpdated.groupBy('Agency Name').sum()
agency_totals.show()

+--------------------+------------------+
|         Agency Name|       sum(Amount)|
+--------------------+------------------+
|BOLL WEEVIL ERADI...|  18689.0999147892|
|   ROSE STATE COLEGE| 233808.7006199453|
|EMPLOYMENT SECURI...| 416517.2709596157|
|DEPARTMENT OF TOU...| 5971885.291795868|
|DEPARTMENT OF LIB...| 179487.1599612832|
|S. W. OKLA. ST. U...| 1423548.999507472|
|OKLAHOMA MILITARY...|3697283.2271535005|
| COMPSOURCE OKLAHOMA|351460.51985532045|
|COUNCIL ON LAW EN...|189667.93968850374|
|MERIT PROTECTION ...| 7984.070019245148|
|OKLAHOMA ABSTRACT...|1821.1800107955933|
|SPEECH-LANGUAGE P...| 4198.130038261414|
|     STATE TREASURER| 84117.73010754585|
|  HISTORICAL SOCIETY|1587215.1306678057|
|OK. LAW ENFORCE. ...|  33194.2096862793|
|DEPARTMENT OF VET...| 4242096.650280096|
|CONSERVATION COMM...| 166412.8598972559|
|MARGINALLY PROD. ...|111.98999786376953|
|STATE ELECTION BOARD| 167896.3599333465|
|OKLAHOMA ACCOUNTA...|18057.079823493958|
+--------------------+------------

In [8]:
# Total "Amount" per Agency Name, this time expressed through the `agg()` function.
# The grouped object is kept in `agency_groups` so later cells can reuse it.
agency_groups = amountUpdated.groupBy('Agency Name')
agency_totals_agg = agency_groups.agg({'Amount':'Sum'})
agency_totals_agg.show()

+--------------------+------------------+
|         Agency Name|       sum(Amount)|
+--------------------+------------------+
|BOLL WEEVIL ERADI...|  18689.0999147892|
|   ROSE STATE COLEGE| 233808.7006199453|
|EMPLOYMENT SECURI...| 416517.2709596157|
|DEPARTMENT OF TOU...| 5971885.291795868|
|DEPARTMENT OF LIB...| 179487.1599612832|
|S. W. OKLA. ST. U...| 1423548.999507472|
|OKLAHOMA MILITARY...|3697283.2271535005|
| COMPSOURCE OKLAHOMA|351460.51985532045|
|COUNCIL ON LAW EN...|189667.93968850374|
|MERIT PROTECTION ...| 7984.070019245148|
|OKLAHOMA ABSTRACT...|1821.1800107955933|
|SPEECH-LANGUAGE P...| 4198.130038261414|
|     STATE TREASURER| 84117.73010754585|
|  HISTORICAL SOCIETY|1587215.1306678057|
|OK. LAW ENFORCE. ...|  33194.2096862793|
|DEPARTMENT OF VET...| 4242096.650280096|
|CONSERVATION COMM...| 166412.8598972559|
|MARGINALLY PROD. ...|111.98999786376953|
|STATE ELECTION BOARD| 167896.3599333465|
|OKLAHOMA ACCOUNTA...|18057.079823493958|
+--------------------+------------

In [9]:
# Average "Amount" per Agency Name via agg(), reusing the existing agency grouping.
agency_averages = agency_groups.agg({'Amount':'Avg'})
agency_averages.show()

+--------------------+------------------+
|         Agency Name|       avg(Amount)|
+--------------------+------------------+
|BOLL WEEVIL ERADI...|109.93588185170117|
|   ROSE STATE COLEGE|285.48070893766214|
|EMPLOYMENT SECURI...| 355.3901629348257|
|DEPARTMENT OF TOU...|346.55787440783826|
|DEPARTMENT OF LIB...|201.21878919426368|
|S. W. OKLA. ST. U...|190.11071040430983|
|OKLAHOMA MILITARY...| 651.0447661830428|
| COMPSOURCE OKLAHOMA|224.43200501616886|
|COUNCIL ON LAW EN...| 352.5426388262151|
|MERIT PROTECTION ...|469.65117760265576|
|OKLAHOMA ABSTRACT...|182.11800107955932|
|SPEECH-LANGUAGE P...|31.097259542677136|
|     STATE TREASURER| 364.1460177815838|
|  HISTORICAL SOCIETY| 550.3519870554111|
|OK. LAW ENFORCE. ...| 349.4127335397821|
|DEPARTMENT OF VET...| 550.3498508406974|
|CONSERVATION COMM...|263.31148717920235|
|MARGINALLY PROD. ...|111.98999786376953|
|STATE ELECTION BOARD| 739.6315415565925|
|OKLAHOMA ACCOUNTA...|126.27328547897872|
+--------------------+------------

In [10]:
# Number of "Amount" entries (transactions) per Agency Name via agg(), reusing the agency grouping.
agency_counts = agency_groups.agg({'Amount':'count'})
agency_counts.show()

+--------------------+-------------+
|         Agency Name|count(Amount)|
+--------------------+-------------+
|BOLL WEEVIL ERADI...|          170|
|   ROSE STATE COLEGE|          819|
|EMPLOYMENT SECURI...|         1172|
|DEPARTMENT OF TOU...|        17232|
|DEPARTMENT OF LIB...|          892|
|S. W. OKLA. ST. U...|         7488|
|OKLAHOMA MILITARY...|         5679|
| COMPSOURCE OKLAHOMA|         1566|
|COUNCIL ON LAW EN...|          538|
|MERIT PROTECTION ...|           17|
|OKLAHOMA ABSTRACT...|           10|
|SPEECH-LANGUAGE P...|          135|
|     STATE TREASURER|          231|
|  HISTORICAL SOCIETY|         2884|
|OK. LAW ENFORCE. ...|           95|
|DEPARTMENT OF VET...|         7708|
|CONSERVATION COMM...|          632|
|MARGINALLY PROD. ...|            1|
|STATE ELECTION BOARD|          227|
|OKLAHOMA ACCOUNTA...|          143|
+--------------------+-------------+
only showing top 20 rows



In [11]:
# Group the data on the cardholder last name and get the maximum transaction per cardholder using the agg() function.
# The column name is spelled exactly as in the schema ('Cardholder Last Name').
# A differently-cased name only resolves under Spark's default case-insensitive
# analysis and would also be echoed into the output header.
lastname_groups = amountUpdated.groupBy('Cardholder Last Name')
lastname_groups.agg({'Amount':'Max'}).show()

+--------------------+-----------+
|CardHolder Last Name|max(Amount)|
+--------------------+-----------+
|           Fairbanks|    3310.23|
|           Worcester|       6.97|
|              Eppler|     235.98|
|               Abner|     1380.6|
|               Tyler|     3939.5|
|                Silk|     3288.0|
|             Mudgett|     2342.0|
|             JOHNSON|  103101.57|
|              Grimes|     2000.0|
|               Roach|   17118.46|
|             Custard|    1195.01|
|            Rawlings|    5400.75|
|              ARNETT|     4995.0|
|           BARKSDALE|      680.0|
|                  XU|     2970.0|
|            HOWERTON|    3508.46|
|           MCCANLIES|   31279.29|
|              GILROY|    2476.95|
|            Callaham|     3775.0|
|             Creager|    1062.24|
+--------------------+-----------+
only showing top 20 rows



In [12]:
# Average transaction "Amount" per cardholder via agg(), reusing the last-name grouping.
cardholder_averages = lastname_groups.agg({'Amount':'Avg'})
cardholder_averages.show()

+--------------------+------------------+
|CardHolder Last Name|       avg(Amount)|
+--------------------+------------------+
|           Fairbanks|441.81742568697246|
|           Worcester| 5.734999895095825|
|              Eppler|53.098823252846216|
|               Abner| 178.2785699026925|
|               Tyler| 435.9507018306799|
|                Silk| 293.2611743702608|
|             Mudgett| 187.5479070086812|
|             JOHNSON| 421.0230957194851|
|              Grimes|398.16411837409527|
|               Roach| 737.5033726484879|
|             Custard|234.92732082520212|
|            Rawlings|498.77307654619216|
|              ARNETT|289.10142089865707|
|           BARKSDALE| 133.1273700814498|
|                  XU|  455.950713865611|
|            HOWERTON|398.45714359493047|
|           MCCANLIES|  1878.50646034409|
|              GILROY|246.38892486531248|
|            Callaham| 401.3934002552392|
|             Creager|122.80977259982716|
+--------------------+------------