In [1]:
# Import findspark and initialize it.
# This cell runs before the first pyspark import (next cell); presumably
# findspark.init() is what makes that import resolvable — keep this cell first.
import findspark
findspark.init()

In [2]:
# Imports: `time` for the runtime measurements, SparkSession as the SQL entry point
import time
from pyspark.sql import SparkSession

# Build the "SparkSQL" session (or pick up an existing one) with a 2g driver
# and a raised limit on how many fields are rendered in debug strings.
spark = (
    SparkSession.builder
    .appName("SparkSQL")
    .config("spark.sql.debug.maxToStringFields", 2000)
    .config("spark.driver.memory", "2g")
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


23/02/08 13:15:24 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Fetch the delayed-flights CSV from the S3 bucket through the SparkContext
from pyspark import SparkFiles

url = "https://2u-data-curriculum-team.s3.amazonaws.com/dataviz-classroom/v1.2/22-big-data/3/DelayedFlights.csv"
spark.sparkContext.addFile(url)

# Look up where the added file landed, then parse it as a comma-separated
# file whose first row holds the column names.
local_csv_path = SparkFiles.get("DelayedFlights.csv")
df = spark.read.csv(local_csv_path, header=True, sep=",")

# Preview the loaded rows
df.show()

                                                                                

+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
| id|Year|Month|DayofMonth|DayOfWeek|DepTime|CRSDepTime|ArrTime|CRSArrTime|UniqueCarrier|FlightNum|TailNum|ActualElapsedTime|CRSElapsedTime|AirTime|ArrDelay|DepDelay|Origin|Dest|Distance|TaxiIn|TaxiOut|Cancelled|CancellationCode|Diverted|CarrierDelay|WeatherDelay|NASDelay|SecurityDelay|LateAircraftDelay|
+---+----+-----+----------+---------+-------+----------+-------+----------+-------------+---------+-------+-----------------+--------------+-------+--------+--------+------+----+--------+------+-------+---------+----------------+--------+------------+------------+--------+-------------+-----------------+
|  0|2008|    1|         3|        4|   2003|      1955|   2211|      2225|       

In [4]:
# Register the CSV-backed DataFrame as the SQL temp view "delays"
df.createOrReplaceTempView('delays')

# Start the timer. perf_counter() is the clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-measurement.
start_time = time.perf_counter()

# Total distance and row count for every unique (Origin, Dest) combination.
# `group by 1,2` groups by the first two select-list columns (Origin, Dest).
spark.sql("""select Origin, Dest ,sum(Distance), count(*) from delays group by 1,2""").show()

# Print out the elapsed runtime in seconds.
print("--- %s seconds ---" % (time.perf_counter() - start_time))



+------+----+-------------+--------+
|Origin|Dest|sum(Distance)|count(1)|
+------+----+-------------+--------+
|   LAS| LIT|      72520.0|      56|
|   PHL| MCO|     986706.0|    1146|
|   SMF| BUR|     178284.0|     498|
|   SNA| PHX|     218010.0|     645|
|   MCI| IAH|     156249.0|     243|
|   BFL| SAN|       4515.0|      21|
|   ROC| CLE|      39935.0|     163|
|   SPI| ORD|      34104.0|     196|
|   ATL| GSP|      54621.0|     357|
|   SFO| PMD|      26860.0|      85|
|   LAX| OXR|       6958.0|     142|
|   ORD| PDX|     909497.0|     523|
|   PBI| DCA|      99412.0|     116|
|   FSD| ATL|      56286.0|      59|
|   MLI| MCO|      25900.0|      25|
|   SJC| ONT|     145521.0|     437|
|   AUS| ELP|     112992.0|     214|
|   ICT| IAH|      97560.0|     180|
|   CLE| MCI|     129084.0|     186|
|   CPR| DEN|      31510.0|     137|
+------+----+-------------+--------+
only showing top 20 rows

--- 4.247038841247559 seconds ---


                                                                                

In [5]:
# Save the DataFrame in parquet format, replacing any previous output directory
df.write.mode("overwrite").parquet("parquet_delay_basic")

                                                                                

In [6]:
# Load the parquet copy written in the previous cell into its own DataFrame
p_df = spark.read.parquet("parquet_delay_basic")

In [7]:
# Expose the parquet-backed DataFrame to Spark SQL under the name "p_delays"
p_df.createOrReplaceTempView("p_delays")

In [8]:
# Start the timer. perf_counter() is the clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-measurement.
start_time = time.perf_counter()

# Same (Origin, Dest) aggregation as before, now against the parquet-backed
# view "p_delays" so the runtime can be compared with the CSV-backed run.
spark.sql("""select Origin, Dest ,sum(Distance), count(*) from p_delays group by 1,2""").show()

# Print out the elapsed runtime in seconds.
print("--- %s seconds ---" % (time.perf_counter() - start_time))



+------+----+-------------+--------+
|Origin|Dest|sum(Distance)|count(1)|
+------+----+-------------+--------+
|   FSD| ATL|      56286.0|      59|
|   ATL| GSP|      54621.0|     357|
|   MSP| AVL|      24969.0|      29|
|   ORD| PDX|     909497.0|     523|
|   BQN| MCO|      48547.0|      43|
|   MCI| IAH|     156249.0|     243|
|   EWR| STT|      73530.0|      45|
|   PHL| MCO|     986706.0|    1146|
|   SNA| PHX|     218010.0|     645|
|   LAS| LIT|      72520.0|      56|
|   SMF| BUR|     178284.0|     498|
|   ROC| CLE|      39935.0|     163|
|   SPI| ORD|      34104.0|     196|
|   LAX| OXR|       6958.0|     142|
|   SFO| PMD|      26860.0|      85|
|   PBI| DCA|      99412.0|     116|
|   JFK| ORD|     494320.0|     668|
|   TPA| CVG|      90441.0|     117|
|   CVG| BDL|     109065.0|     165|
|   BTV| MCO|      19120.0|      16|
+------+----+-------------+--------+
only showing top 20 rows

--- 1.615084171295166 seconds ---


                                                                                

In [9]:
# Save the data as parquet again, this time split into one partition per
# Origin value, replacing any previous output directory.
df.write.parquet("delayed_partitioned", mode="overwrite", partitionBy="Origin")

                                                                                

In [10]:
# Load the Origin-partitioned parquet output into its own DataFrame
p_df_p = spark.read.parquet("delayed_partitioned")

                                                                                

In [11]:
# Expose the partitioned DataFrame to Spark SQL under the name "p_delays_p"
p_df_p.createOrReplaceTempView("p_delays_p")

In [12]:
# Start the timer. perf_counter() is the clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-measurement.
start_time = time.perf_counter()

# Same (Origin, Dest) aggregation once more, now against the
# Origin-partitioned parquet view "p_delays_p".
spark.sql("""select Origin, Dest ,sum(Distance), count(*) from p_delays_p group by 1,2""").show()

# Print out the elapsed runtime in seconds.
print("--- %s seconds ---" % (time.perf_counter() - start_time))



+------+----+-------------+--------+
|Origin|Dest|sum(Distance)|count(1)|
+------+----+-------------+--------+
|   ATL| GSP|      54621.0|     357|
|   ORD| PDX|     909497.0|     523|
|   LAX| OXR|       6958.0|     142|
|   EWR| STT|      73530.0|      45|
|   ATL| HDN|      30820.0|      23|
|   ORD| FWA|      89961.0|     573|
|   DFW| PNS|     228916.0|     379|
|   DFW| SDF|     142202.0|     194|
|   LAX| PIT|      49128.0|      23|
|   ORD| CAE|     225108.0|     338|
|   ORD| BUF|     332992.0|     704|
|   DFW| HOU|     102258.0|     414|
|   DEN| ANC|     235788.0|      98|
|   EWR| CMH|     134442.0|     291|
|   DEN| ABQ|     225105.0|     645|
|   DEN| RAP|      69230.0|     230|
|   LAX| SBP|      46965.0|     303|
|   PHX| TUL|     163625.0|     175|
|   IAH| LIT|      72930.0|     195|
|   LAX| MCO|     540948.0|     244|
+------+----+-------------+--------+
only showing top 20 rows

--- 5.463308811187744 seconds ---


                                                                                

In [13]:
# Start the timer. perf_counter() is the clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-measurement.
start_time = time.perf_counter()

# Distinct (Origin, TailNum) pairs for tail number N712SW, read from the
# NON-partitioned parquet view "p_delays".
# NOTE(review): this cell was originally labelled as a filter that "selects
# your partition choice", but the query neither reads the Origin-partitioned
# view (p_delays_p) nor filters on Origin, so it does not exercise the
# partitioning at all. If the intent was to show a partition-selecting filter,
# it should query p_delays_p with a `where Origin = ...` predicate — confirm.
spark.sql("""Select distinct Origin, TailNum from p_delays where TailNum='N712SW' """).show()

# Print out the elapsed runtime in seconds.
print("--- %s seconds ---" % (time.perf_counter() - start_time))

+------+-------+
|Origin|TailNum|
+------+-------+
|   MSY| N712SW|
|   MDW| N712SW|
|   ABQ| N712SW|
|   BUR| N712SW|
|   PIT| N712SW|
|   MCO| N712SW|
|   TPA| N712SW|
|   BNA| N712SW|
|   SNA| N712SW|
|   SMF| N712SW|
|   JAN| N712SW|
|   SFO| N712SW|
|   AUS| N712SW|
|   PHX| N712SW|
|   SAN| N712SW|
|   SLC| N712SW|
|   DAL| N712SW|
|   OAK| N712SW|
|   HOU| N712SW|
|   DEN| N712SW|
+------+-------+
only showing top 20 rows

--- 0.5014269351959229 seconds ---


In [14]:
# Start the timer. perf_counter() is the clock meant for measuring elapsed
# intervals; time.time() is wall-clock time and can jump if the system clock
# is adjusted mid-measurement.
start_time = time.perf_counter()

# Filter the partitioned view on a column that has nothing to do with the
# partition choice: the data is partitioned by Origin, the filter is on TailNum.
spark.sql("""Select distinct Dest, TailNum from p_delays_p where TailNum='N712SW' """).show()

# Print out the elapsed runtime in seconds.
print("--- %s seconds ---" % (time.perf_counter() - start_time))



+----+-------+
|Dest|TailNum|
+----+-------+
| RNO| N712SW|
| MDW| N712SW|
| STL| N712SW|
| BNA| N712SW|
| SNA| N712SW|
| SMF| N712SW|
| OMA| N712SW|
| ONT| N712SW|
| PHX| N712SW|
| LAS| N712SW|
| LAX| N712SW|
| MCI| N712SW|
| RDU| N712SW|
| ABQ| N712SW|
| BUR| N712SW|
| MCO| N712SW|
| TPA| N712SW|
| ELP| N712SW|
| BUF| N712SW|
| BHM| N712SW|
+----+-------+
only showing top 20 rows

--- 2.738379955291748 seconds ---


