In [58]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Assemble the Spark configuration for the standalone master in one chained expression.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Assignment_2_Batch")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the GCS connector classes with Hadoop so that gs:// paths can be resolved.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set("fs.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem")
conf.set("fs.AbstractFileSystem.gs.impl", "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS")

# Google Cloud Storage locations of the two input CSV files.
airport_info_gsc_file_path = 'gs://dejads_a2_input_steven/AIRPORT_COORDINATES.csv'
month_1_delay_gsc_file_path = 'gs://dejads_a2_input_steven/ONTIME_REPORTING_01.csv'

In [59]:
# Read AIRPORT_COORDINATES.csv from GCS as a batch DataFrame; the first line is the header.
# No schema is supplied, so every column is loaded as a string.
df_airport = spark.read.load(airport_info_gsc_file_path, format="csv", header="true")

In [60]:
# Read ONTIME_REPORTING_01.csv from GCS as a batch DataFrame; the first line is the header.
# No schema is supplied, so every column is loaded as a string.
df_delay_01 = spark.read.load(month_1_delay_gsc_file_path, format="csv", header="true")

In [3]:
# Inspect the airport lookup table: column types first, then the first 20 rows untruncated.
df_airport.printSchema()
df_airport.show(n=20, truncate=False)

root
 |-- ORIGIN_AIRPORT_ID: string (nullable = true)
 |-- DISPLAY_AIRPORT_NAME: string (nullable = true)
 |-- LATITUDE: string (nullable = true)
 |-- LONGITUDE: string (nullable = true)

+-----------------+------------------------+-----------+------------+
|ORIGIN_AIRPORT_ID|DISPLAY_AIRPORT_NAME    |LATITUDE   |LONGITUDE   |
+-----------------+------------------------+-----------+------------+
|10001            |Afognak Lake Airport    |58.10944444|-152.9066667|
|10003            |Bear Creek Mining Strip |65.54805556|-161.0716667|
|10004            |Lik Mining Camp         |68.08333333|-163.1666667|
|10005            |Little Squaw Airport    |67.57      |-148.1838889|
|10006            |Kizhuyak Bay            |57.74527778|-152.8827778|
|10007            |Klawock Seaplane Base   |55.55472222|-133.1016667|
|10008            |Elizabeth Island Airport|59.15694444|-151.8291667|
|10009            |Augustin Island         |59.36277778|-153.4305556|
|10010            |Columbia County        

In [5]:
# Inspect the on-time report: column types first, then the first 20 rows untruncated.
df_delay_01.printSchema()
df_delay_01.show(n=20, truncate=False)

root
 |-- MONTH: string (nullable = true)
 |-- DAY_OF_MONTH: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: string (nullable = true)
 |-- ORIGIN_AIRPORT_ID: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST_AIRPORT_ID: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- CRS_DEP_TIME: string (nullable = true)
 |-- DEP_TIME: string (nullable = true)
 |-- DEP_DELAY_NEW: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- CRS_ARR_TIME: string (nullable = true)
 |-- ARR_TIME: string (nullable = true)
 |-- ARR_DELAY_NEW: string (nullable = true)
 |-- ARR_TIME_BLK: string (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- CANCELLATION_CODE: string (nullable 

In [66]:
# Number of distinct airport IDs present in the lookup table.
df_airport.select("ORIGIN_AIRPORT_ID").dropDuplicates().count()

6573

In [67]:
# Total number of rows in the airport table. A total larger than the number of
# distinct ORIGIN_AIRPORT_ID values means some airport IDs appear more than once.
df_airport.count()

18133

In [70]:
# Concise de-duplication: keep one row per airport ID and count what is left.
df_airport.dropDuplicates(subset=['ORIGIN_AIRPORT_ID']).count()

6573

In [72]:
from pyspark.sql.window import Window
from pyspark.sql.functions import row_number, col

# Window-based de-duplication: keep exactly one record per airport ID.
#
# Ordering the window by the partition key itself makes every row inside a
# partition tie, so which duplicate receives row_num == 1 would depend on
# execution order and could differ between runs. Ordering by the remaining
# columns makes the surviving record reproducible (the choice is still
# arbitrary, but it is stable).
airport_window = Window.partitionBy('ORIGIN_AIRPORT_ID')\
                       .orderBy('DISPLAY_AIRPORT_NAME', 'LATITUDE', 'LONGITUDE')

# Number the rows within each airport ID, keep the first one, then drop the helper column.
df_airport_clean = df_airport.withColumn('row_num', row_number().over(airport_window))\
                             .where(col('row_num') == 1)\
                             .drop('row_num')

In [73]:
# Sanity check: after de-duplication the row count should equal the number of
# distinct ORIGIN_AIRPORT_ID values.
df_airport_clean.count()

6573

In [77]:
# Columns of the on-time report that are kept when displaying delayed flights.
info_columns = [
    'DAY_OF_MONTH',
    'DAY_OF_WEEK',
    'OP_UNIQUE_CARRIER',
    'TAIL_NUM',
    'OP_CARRIER_FL_NUM',
    'ORIGIN_AIRPORT_ID',
    'ORIGIN',
    'DEST',
    'DEP_TIME',
    'ARR_TIME',
    'DEP_DELAY_NEW',
]

In [88]:
# Keep only the flights whose departure delay is not zero (all columns retained).
df_delay_per_airport = df_delay_01.filter(df_delay_01["DEP_DELAY_NEW"] != 0).select("*")

# Join the flight delay table with the airport info table on the origin airport ID.
# A left join keeps every delayed flight even if no airport record matches.
joinExpression = (
    df_delay_per_airport["ORIGIN_AIRPORT_ID"] == df_airport_clean["ORIGIN_AIRPORT_ID"]
)
df_delay_01_w_airport_name = df_delay_per_airport.join(
    df_airport_clean,
    on=joinExpression,
    how="left",
)

In [89]:
# Number of flights in the month-1 report with a non-zero departure delay.
df_delay_01.filter(df_delay_01["DEP_DELAY_NEW"] != 0).select("*").count()

185482

In [85]:
# Row count of the filtered delay table, i.e. the left side of the airport join.
df_delay_per_airport.count()

185482

In [86]:
# Row count after the left join with the de-duplicated airport table; it should
# equal the count of df_delay_per_airport, otherwise the join duplicated flights.
df_delay_01_w_airport_name.count()

185482

In [90]:
# Display columns for the joined table: the original selection plus the airport name.
info_columns_new = [*info_columns, 'DISPLAY_AIRPORT_NAME']
# ORIGIN_AIRPORT_ID is left out; presumably because both join inputs carry a column
# with that name, so selecting it by name on the joined frame would be ambiguous.
info_columns_new.remove('ORIGIN_AIRPORT_ID')

In [91]:
# Preview the first 10 delayed flights together with the origin airport name.
df_delay_01_w_airport_name.select(*info_columns_new).show(n=10, truncate=False)

+------------+-----------+-----------------+--------+-----------------+------+----+--------+--------+-------------+------------------------------------------+
|DAY_OF_MONTH|DAY_OF_WEEK|OP_UNIQUE_CARRIER|TAIL_NUM|OP_CARRIER_FL_NUM|ORIGIN|DEST|DEP_TIME|ARR_TIME|DEP_DELAY_NEW|DISPLAY_AIRPORT_NAME                      |
+------------+-----------+-----------------+--------+-----------------+------+----+--------+--------+-------------+------------------------------------------+
|13          |7          |9E               |N8688C  |3280             |ATL   |CSG |1714    |1756    |29.00        |Atlanta Municipal                         |
|16          |3          |9E               |N981EV  |3280             |ATL   |CSG |1902    |1944    |137.00       |Atlanta Municipal                         |
|23          |3          |9E               |N8976E  |3280             |ATL   |CSG |1827    |1913    |102.00       |Atlanta Municipal                         |
|24          |4          |9E               |N8

In [97]:
from pyspark.sql.functions import asc, desc

# Count delayed departures per airport display name, largest count first.
df_delay_01_summary = (
    df_delay_01_w_airport_name
    .groupBy('DISPLAY_AIRPORT_NAME')
    .count()
    .orderBy('count', ascending=False)
)

In [98]:
# Top 10 airport names by number of delayed departures.
df_delay_01_summary.show(n=10, truncate=False)

+--------------------------------+-----+
|DISPLAY_AIRPORT_NAME            |count|
+--------------------------------+-----+
|Chicago O'Hare International    |10285|
|Atlanta Municipal               |9232 |
|Dallas Fort Worth Regional      |7827 |
|Stapleton International         |7080 |
|Los Angeles International       |6267 |
|Douglas Municipal               |6118 |
|LaGuardia                       |5320 |
|McCarran International          |4858 |
|Phoenix Sky Harbor International|4837 |
|San Francisco International     |4655 |
+--------------------------------+-----+
only showing top 10 rows



In [4]:
# Persist the per-airport delay summary to Cloud Storage as CSV.
# This cell previously wrote `df`, a name that is never defined in this notebook,
# so it raised NameError on a fresh kernel; df_delay_01_summary is the result
# computed above.
# NOTE(review): the destination below looks like a placeholder - confirm the bucket.
df_delay_01_summary.write.format("csv").save("gs://dejads_output/lab61.csv") # use correct bucket name

In [57]:
# Stop the Spark session created at the top of the notebook. Run this last:
# no DataFrame defined above can be evaluated once the session is stopped.
spark.stop()