In [8]:
from pyspark.sql import SparkSession
from pyspark import SparkConf

# Describe the Spark application: master URL, application name, and the
# memory/core settings for the driver and executors.
sparkConf = (
    SparkConf()
    .setMaster("spark://spark-master:7077")
    .setAppName("Assignment_2_Batch")
    .set("spark.driver.memory", "2g")
    .set("spark.executor.cores", "1")
    .set("spark.driver.cores", "1")
)

# The SparkSession is the entry point to the Spark SQL engine; it is built
# from (or attached to an existing session using) the configuration above.
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

# Register the Google Cloud Storage connector classes for the gs:// scheme
# on the session's Hadoop configuration.
conf = spark.sparkContext._jsc.hadoopConfiguration()
conf.set(
    "fs.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem",
)
conf.set(
    "fs.AbstractFileSystem.gs.impl",
    "com.google.cloud.hadoop.fs.gcs.GoogleHadoopFS",
)

# Input CSV locations in Google Cloud Storage.
airport_info_gsc_file_path = "gs://dejads_a2_input_steven/airports_list.csv"
month_1_delay_gsc_file_path = "gs://dejads_a2_input_steven/ONTIME_REPORTING_01.csv"

In [9]:
# Load the airport reference CSV from GCS as a batch DataFrame; the first row
# of the file supplies the column names.
df_airport = spark.read.csv(airport_info_gsc_file_path, header=True)

root
 |-- ORIGIN_AIRPORT_ID: string (nullable = true)
 |-- DISPLAY_AIRPORT_NAME: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- NAME: string (nullable = true)

+-----------------+--------------------+--------------------+--------------------+
|ORIGIN_AIRPORT_ID|DISPLAY_AIRPORT_NAME|    ORIGIN_CITY_NAME|                NAME|
+-----------------+--------------------+--------------------+--------------------+
|            12992|         Adams Field|     Little Rock, AR|NORTH LITTLE ROCK...|
|            10257|Albany International|          Albany, NY|ALBANY INTERNATIO...|
|            10140|Albuquerque Inter...|     Albuquerque, NM|ALBUQUERQUE INTER...|
|            10299|Anchorage Interna...|       Anchorage, AK|ANCHORAGE TED STE...|
|            10397|   Atlanta Municipal|         Atlanta, GA|ATLANTA HARTSFIEL...|
|            10423|Austin - Bergstro...|          Austin, TX|AUSTIN BERGSTROM ...|
|            10599|  Birmingham Airport|      Birmingham, AL|B

In [10]:
# Inspect the airport data: column names/types first, then the first 20 rows
# with full (untruncated) cell contents.
df_airport.printSchema()
df_airport.show(n=20, truncate=False)

root
 |-- ORIGIN_AIRPORT_ID: string (nullable = true)
 |-- DISPLAY_AIRPORT_NAME: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- NAME: string (nullable = true)

+-----------------+------------------------------------------+---------------------+-------------------------------------------------------+
|ORIGIN_AIRPORT_ID|DISPLAY_AIRPORT_NAME                      |ORIGIN_CITY_NAME     |NAME                                                   |
+-----------------+------------------------------------------+---------------------+-------------------------------------------------------+
|12992            |Adams Field                               |Little Rock, AR      |NORTH LITTLE ROCK AIRPORT, AR US                       |
|10257            |Albany International                      |Albany, NY           |ALBANY INTERNATIONAL AIRPORT, NY US                    |
|10140            |Albuquerque International Sunport         |Albuquerque, NM      |ALBUQUERQUE INTERNAT

In [11]:
# Load the month-1 on-time reporting CSV from GCS; the first row of the file
# supplies the column names.
df_delay_01 = spark.read.csv(month_1_delay_gsc_file_path, header=True)

In [12]:
# Inspect the delay data: column names/types first, then the first 20 rows
# with full (untruncated) cell contents.
df_delay_01.printSchema()
df_delay_01.show(n=20, truncate=False)

root
 |-- MONTH: string (nullable = true)
 |-- DAY_OF_MONTH: string (nullable = true)
 |-- DAY_OF_WEEK: string (nullable = true)
 |-- OP_UNIQUE_CARRIER: string (nullable = true)
 |-- TAIL_NUM: string (nullable = true)
 |-- OP_CARRIER_FL_NUM: string (nullable = true)
 |-- ORIGIN_AIRPORT_ID: string (nullable = true)
 |-- ORIGIN: string (nullable = true)
 |-- ORIGIN_CITY_NAME: string (nullable = true)
 |-- DEST_AIRPORT_ID: string (nullable = true)
 |-- DEST: string (nullable = true)
 |-- DEST_CITY_NAME: string (nullable = true)
 |-- CRS_DEP_TIME: string (nullable = true)
 |-- DEP_TIME: string (nullable = true)
 |-- DEP_DELAY_NEW: string (nullable = true)
 |-- DEP_DEL15: string (nullable = true)
 |-- DEP_TIME_BLK: string (nullable = true)
 |-- CRS_ARR_TIME: string (nullable = true)
 |-- ARR_TIME: string (nullable = true)
 |-- ARR_DELAY_NEW: string (nullable = true)
 |-- ARR_TIME_BLK: string (nullable = true)
 |-- CANCELLED: string (nullable = true)
 |-- CANCELLATION_CODE: string (nullable 

In [14]:
# Number of columns in the delay DataFrame, counted from its schema.
len(df_delay_01.schema.names)

33

In [22]:
from pyspark.sql.functions import col

# Keep only the rows whose DEP_DELAY_NEW value equals 1 (all columns retained).
# NOTE(review): the CSV is read without schema inference, so DEP_DELAY_NEW is a
# string column and this comparison relies on Spark's implicit type coercion.
# NOTE(review): the variable name suggests "delayed flights per airport"; if so,
# the DEP_DEL15 flag may be the intended filter column — confirm with the author.
df_delay_per_airport = df_delay_01.filter(col("DEP_DELAY_NEW") == 1)

In [24]:
# Preview the first 20 filtered rows, truncating long cell values.
df_delay_per_airport.show(n=20, truncate=True)

+-----+------------+-----------+-----------------+--------+-----------------+-----------------+------+--------------------+---------------+----+--------------------+------------+--------+-------------+---------+------------+------------+--------+-------------+------------+---------+-----------------+----------------+-------------------+--------+--------------+-------------+-------------+---------+--------------+-------------------+----+
|MONTH|DAY_OF_MONTH|DAY_OF_WEEK|OP_UNIQUE_CARRIER|TAIL_NUM|OP_CARRIER_FL_NUM|ORIGIN_AIRPORT_ID|ORIGIN|    ORIGIN_CITY_NAME|DEST_AIRPORT_ID|DEST|      DEST_CITY_NAME|CRS_DEP_TIME|DEP_TIME|DEP_DELAY_NEW|DEP_DEL15|DEP_TIME_BLK|CRS_ARR_TIME|ARR_TIME|ARR_DELAY_NEW|ARR_TIME_BLK|CANCELLED|CANCELLATION_CODE|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|DISTANCE|DISTANCE_GROUP|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|_c32|
+-----+------------+-----------+-----------------+--------+-----------------+-----------------+------+----------------

In [4]:
# Persist the filtered delay records to Google Cloud Storage as CSV.
#
# The original cell wrote a variable named `df`, which is not defined in any
# visible cell of this notebook and would raise NameError on a fresh
# "Restart & Run All". `df_delay_per_airport` is the only derived DataFrame
# produced above, so it is written here.
# NOTE(review): confirm this is the DataFrame that should be exported.
#
# - header=true keeps the column names in the output files.
# - mode("overwrite") lets the notebook be re-run without failing because the
#   output path already exists. It replaces existing data at that path.
(
    df_delay_per_airport.write.format("csv")
    .option("header", "true")
    .mode("overwrite")
    .save("gs://dejads_output/lab61.csv")  # use correct bucket name
)

In [7]:
# Stop the Spark session (`spark`) now that all reads and writes are done.
# Any cell that uses `spark` after this point needs the session to be
# re-created by re-running the setup cell.
spark.stop()