In [1]:
# Import findspark and initialize. 
import findspark
findspark.init()

In [2]:
# Start a Spark session
from pyspark.sql import SparkSession
spark = SparkSession.builder.appName("PCard Transactions").getOrCreate()

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/08 11:00:46 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Read in data from S3 Buckets
from pyspark import SparkFiles
url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/2/pcard_transactions.csv"
spark.sparkContext.addFile(url)
df = spark.read.csv(SparkFiles.get("pcard_transactions.csv"), sep=",", header=True)

# Show DataFrame
df.show()

                                                                                

+----------+-------------+--------------------+--------------------+------------------------+--------------------+------+--------------------+--------------------+--------------------+----------------------------+
|Year-Month|Agency Number|         Agency Name|Cardholder Last Name|Cardholder First Initial|         Description|Amount|              Vendor|    Transaction Date|         Posted Date|Merchant Category Code (MCC)|
+----------+-------------+--------------------+--------------------+------------------------+--------------------+------+--------------------+--------------------+--------------------+----------------------------+
|    201307|         1000|OKLAHOMA STATE UN...|               Mason|                       C|    GENERAL PURCHASE|   890|               NACAS|07/30/2013 12:00:...|07/31/2013 12:00:...|        CHARITABLE AND SO...|
|    201307|         1000|OKLAHOMA STATE UN...|               Mason|                       C|        ROOM CHARGES|368.96|      SHERATON HOTEL|07

In [4]:
# Get the data types of the columns. 
df.printSchema()

root
 |-- Year-Month: string (nullable = true)
 |-- Agency Number: string (nullable = true)
 |-- Agency Name: string (nullable = true)
 |-- Cardholder Last Name: string (nullable = true)
 |-- Cardholder First Initial: string (nullable = true)
 |-- Description: string (nullable = true)
 |-- Amount: string (nullable = true)
 |-- Vendor: string (nullable = true)
 |-- Transaction Date: string (nullable = true)
 |-- Posted Date: string (nullable = true)
 |-- Merchant Category Code (MCC): string (nullable = true)



In [5]:
# Create a new DataFrame with an "Amount" column as a float datatype. 
amountUpdated = df.withColumn("Amount", df["Amount"].cast('float'))

In [6]:
# Group the data on the Agency Name and get the maximum of the "Amount."
amountUpdated.groupBy('Agency Name').max().show()



+--------------------+-----------+
|         Agency Name|max(Amount)|
+--------------------+-----------+
|OKLAHOMA STATE UN...|   27967.38|
|BOLL WEEVIL ERADI...|    1639.99|
|EMPLOYMENT SECURI...|   12872.19|
|OKLAHOMA MILITARY...|  183367.94|
|MERIT PROTECTION ...|     2550.0|
|OKLAHOMA ABSTRACT...|      405.3|
|  HISTORICAL SOCIETY|    90108.0|
|STATE ELECTION BOARD|    58875.0|
|OKLAHOMA ACCOUNTA...|    2165.68|
|  STATE ARTS COUNCIL|     2045.4|
| DEPARTMENT OF MINES|     1850.0|
|CORPORATION COMMI...|     6038.0|
|            GOVERNOR|    3071.95|
|DEPARTMENT OF EME...|     3200.0|
|ST. BD. OF CHIROP...|     2466.0|
|   ETHICS COMMISSION|     1027.8|
|J. M. DAVIS ARMS ...|     635.93|
|DISTRICT ATTORNEY...|     5000.0|
|OKLAHOMA AERONAUT...|     3000.0|
|PARDON AND PAROLE...|      577.0|
+--------------------+-----------+
only showing top 20 rows



                                                                                

In [7]:
# Group the data on the Agency Name and get the total of the "Amount."
amountUpdated.groupBy('Agency Name').sum().show()

+--------------------+-------------------+
|         Agency Name|        sum(Amount)|
+--------------------+-------------------+
|OKLAHOMA STATE UN...|3.377883993597806E7|
|BOLL WEEVIL ERADI...|   18689.0999147892|
|EMPLOYMENT SECURI...|  416517.2709596157|
|OKLAHOMA MILITARY...| 3697283.2271535005|
|MERIT PROTECTION ...|  7984.070019245148|
|OKLAHOMA ABSTRACT...| 1821.1800107955933|
|  HISTORICAL SOCIETY| 1587215.1306678057|
|STATE ELECTION BOARD|  167896.3599333465|
|OKLAHOMA ACCOUNTA...| 18057.079823493958|
|  STATE ARTS COUNCIL|  47705.35992574692|
| DEPARTMENT OF MINES|  28480.51993075013|
|CORPORATION COMMI...| 208050.81010117382|
|            GOVERNOR|   63720.0095512867|
|DEPARTMENT OF EME...| 58851.640033721924|
|ST. BD. OF CHIROP...|  25892.29986524582|
|   ETHICS COMMISSION| 21327.539993047714|
|J. M. DAVIS ARMS ...|  5135.909971237183|
|DISTRICT ATTORNEY...|  156613.7489424944|
|OKLAHOMA AERONAUT...|  49949.04994082451|
|PARDON AND PAROLE...| 3854.1600222587585|
+----------



In [8]:
# Group the data on the Agency Name and get the total of the "Amount" using the `agg()` function.
agency_groups = amountUpdated.groupBy('Agency Name')
agency_groups.agg({'Amount':'Sum'}).show()

+--------------------+-------------------+
|         Agency Name|        sum(Amount)|
+--------------------+-------------------+
|OKLAHOMA STATE UN...|3.377883993597806E7|
|BOLL WEEVIL ERADI...|   18689.0999147892|
|EMPLOYMENT SECURI...|  416517.2709596157|
|OKLAHOMA MILITARY...| 3697283.2271535005|
|MERIT PROTECTION ...|  7984.070019245148|
|OKLAHOMA ABSTRACT...| 1821.1800107955933|
|  HISTORICAL SOCIETY| 1587215.1306678057|
|STATE ELECTION BOARD|  167896.3599333465|
|OKLAHOMA ACCOUNTA...| 18057.079823493958|
|  STATE ARTS COUNCIL|  47705.35992574692|
| DEPARTMENT OF MINES|  28480.51993075013|
|CORPORATION COMMI...| 208050.81010117382|
|            GOVERNOR|   63720.0095512867|
|DEPARTMENT OF EME...| 58851.640033721924|
|ST. BD. OF CHIROP...|  25892.29986524582|
|   ETHICS COMMISSION| 21327.539993047714|
|J. M. DAVIS ARMS ...|  5135.909971237183|
|DISTRICT ATTORNEY...|  156613.7489424944|
|OKLAHOMA AERONAUT...|  49949.04994082451|
|PARDON AND PAROLE...| 3854.1600222587585|
+----------

In [9]:
# Group the data on the Agency Name and get the average of the "Amount" using the agg() function.
agency_groups.agg({'Amount':'Avg'}).show()

+--------------------+------------------+
|         Agency Name|       avg(Amount)|
+--------------------+------------------+
|OKLAHOMA STATE UN...|291.20944813119587|
|BOLL WEEVIL ERADI...|109.93588185170117|
|EMPLOYMENT SECURI...| 355.3901629348257|
|OKLAHOMA MILITARY...| 651.0447661830428|
|MERIT PROTECTION ...|469.65117760265576|
|OKLAHOMA ABSTRACT...|182.11800107955932|
|  HISTORICAL SOCIETY| 550.3519870554111|
|STATE ELECTION BOARD| 739.6315415565925|
|OKLAHOMA ACCOUNTA...|126.27328547897872|
|  STATE ARTS COUNCIL| 190.0611949232945|
| DEPARTMENT OF MINES|212.54119351306068|
|CORPORATION COMMI...|229.13084812904606|
|            GOVERNOR|199.12502984777092|
|DEPARTMENT OF EME...|246.24117168921308|
|ST. BD. OF CHIROP...|190.38455783268984|
|   ETHICS COMMISSION|156.82014700770378|
|J. M. DAVIS ARMS ...|100.70411708308201|
|DISTRICT ATTORNEY...|  241.315483732657|
|OKLAHOMA AERONAUT...|229.12408229736013|
|PARDON AND PAROLE...|192.70800111293792|
+--------------------+------------



In [10]:
# Group the data on the Agency Name and get the count of transactions of the "Amount" using the agg() function.
agency_groups.agg({'Amount':'count'}).show()

+--------------------+-------------+
|         Agency Name|count(Amount)|
+--------------------+-------------+
|OKLAHOMA STATE UN...|       115995|
|BOLL WEEVIL ERADI...|          170|
|EMPLOYMENT SECURI...|         1172|
|OKLAHOMA MILITARY...|         5679|
|MERIT PROTECTION ...|           17|
|OKLAHOMA ABSTRACT...|           10|
|  HISTORICAL SOCIETY|         2884|
|STATE ELECTION BOARD|          227|
|OKLAHOMA ACCOUNTA...|          143|
|  STATE ARTS COUNCIL|          251|
| DEPARTMENT OF MINES|          134|
|CORPORATION COMMI...|          908|
|            GOVERNOR|          320|
|DEPARTMENT OF EME...|          239|
|ST. BD. OF CHIROP...|          136|
|   ETHICS COMMISSION|          136|
|J. M. DAVIS ARMS ...|           51|
|DISTRICT ATTORNEY...|          649|
|OKLAHOMA AERONAUT...|          218|
|PARDON AND PAROLE...|           20|
+--------------------+-------------+
only showing top 20 rows



In [11]:
# Group the data on the cardholder last name and get the maximum transaction per cardholder using the agg() function.
lastname_groups = amountUpdated.groupBy('CardHolder Last Name')
lastname_groups.agg({'Amount':'Max'}).show()



+--------------------+-----------+
|CardHolder Last Name|max(Amount)|
+--------------------+-----------+
|           Fairbanks|    3310.23|
|           Worcester|       6.97|
|              GILROY|    2476.95|
|            Callaham|     3775.0|
|             Creager|    1062.24|
|               Dunne|    3812.62|
|             Edmonds|     892.27|
|            Wooliver|     1833.2|
|               Gerlt|     3582.0|
|             Guthals|     2113.5|
|              Ownbey|    1335.09|
|              Porter|    3655.99|
|                Frie|    1341.65|
|                Deal|     518.52|
|            McDowell|     2000.0|
|               Scott|     4157.0|
|              Wilson|     5000.0|
|            Wilbourn|    1463.21|
|           Robertson|    7909.28|
|                Goad|     3900.0|
+--------------------+-----------+
only showing top 20 rows



                                                                                

In [12]:
# Get the average transaction per cardholder using the agg() function.
lastname_groups.agg({'Amount':'Avg'}).show()



+--------------------+------------------+
|CardHolder Last Name|       avg(Amount)|
+--------------------+------------------+
|           Fairbanks|441.81742568697246|
|           Worcester| 5.734999895095825|
|              GILROY|246.38892486531248|
|            Callaham| 401.3934002552392|
|             Creager|122.80977259982716|
|               Dunne|503.36100047429403|
|             Edmonds|118.90054634267634|
|            Wooliver|427.08141817365373|
|               Gerlt| 246.3202815844979|
|             Guthals|179.90105317768297|
|              Ownbey|452.52363204956055|
|              Porter|182.50486135746235|
|                Frie|361.63952436901275|
|                Deal|123.70600147247315|
|            McDowell| 383.4832420606871|
|               Scott| 314.9947040113684|
|              Wilson| 283.8881809467646|
|            Wilbourn|171.29541704389783|
|           Robertson|329.73286159785397|
|                Goad| 299.3217814126234|
+--------------------+------------

                                                                                