In [1]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

In [2]:
# Obtain the Spark session for this notebook: reuse the active one if present,
# otherwise create a new session registered under the app name "ExampleApp".
spark = (
    SparkSession.builder
    .appName("ExampleApp")
    .getOrCreate()
)

In [3]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

In [4]:
# Explicit schema with three required (non-nullable) columns:
# two string columns ('name', 'city') and one integer column ('age').
people_schema = StructType(
    [
        StructField(name='name', dataType=StringType(), nullable=False),
        StructField(name='age', dataType=IntegerType(), nullable=False),
        StructField(name='city', dataType=StringType(), nullable=False),
    ]
)

In [5]:
# Read the gzip-compressed CSV, using its first row as the column names.
# No schema is supplied, so the reader keeps every column as a string.
aa_dfw_df = spark.read.csv(
    'data/AA_DFW_2014_Departures_Short.csv.gz',
    header=True,
)
# Preview the first five rows.
aa_dfw_df.show(5)

+-----------------+-------------+-------------------+-----------------------------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|
+-----------------+-------------+-------------------+-----------------------------+
|       01/01/2014|         0005|                HNL|                          519|
|       01/01/2014|         0007|                OGG|                          505|
|       01/01/2014|         0035|                SLC|                          174|
|       01/01/2014|         0043|                DTW|                          153|
|       01/01/2014|         0052|                PIT|                          137|
+-----------------+-------------+-------------------+-----------------------------+
only showing top 5 rows



In [6]:
# Append an 'airport' column holding the lower-cased 'Destination Airport' value.
# Note: this rebinds aa_dfw_df to the widened DataFrame.
aa_dfw_df = aa_dfw_df.withColumn(
    'airport',
    F.lower(F.col('Destination Airport')),
)
aa_dfw_df.show(5)

+-----------------+-------------+-------------------+-----------------------------+-------+
|Date (MM/DD/YYYY)|Flight Number|Destination Airport|Actual elapsed time (Minutes)|airport|
+-----------------+-------------+-------------------+-----------------------------+-------+
|       01/01/2014|         0005|                HNL|                          519|    hnl|
|       01/01/2014|         0007|                OGG|                          505|    ogg|
|       01/01/2014|         0035|                SLC|                          174|    slc|
|       01/01/2014|         0043|                DTW|                          153|    dtw|
|       01/01/2014|         0052|                PIT|                          137|    pit|
+-----------------+-------------+-------------------+-----------------------------+-------+
only showing top 5 rows



In [7]:
# Drop the original 'Destination Airport' column; the lower-cased 'airport'
# column added earlier carries the same information.
# The column is dropped by name rather than via aa_dfw_df['Destination Airport']
# so that this cell can be re-run safely: DataFrame.drop ignores names that are
# not in the schema, whereas looking the column up on a DataFrame that no longer
# has it raises an AnalysisException.
aa_dfw_df = aa_dfw_df.drop('Destination Airport')
aa_dfw_df.show(5)

+-----------------+-------------+-----------------------------+-------+
|Date (MM/DD/YYYY)|Flight Number|Actual elapsed time (Minutes)|airport|
+-----------------+-------------+-----------------------------+-------+
|       01/01/2014|         0005|                          519|    hnl|
|       01/01/2014|         0007|                          505|    ogg|
|       01/01/2014|         0035|                          174|    slc|
|       01/01/2014|         0043|                          153|    dtw|
|       01/01/2014|         0052|                          137|    pit|
+-----------------+-------------+-----------------------------+-------+
only showing top 5 rows



In [8]:
# Optional: save the transformed DataFrame as Parquet, overwriting any existing output.
# Left disabled; uncomment the line below to write the file.
# aa_dfw_df.write.parquet('AA_DFW_2014_Departures_Short.parquet', mode='overwrite')

In [9]:
# Expose the DataFrame to Spark SQL as the session-scoped temporary view
# 'flights_2014' (replacing any existing view with that name).
aa_dfw_df.createOrReplaceTempView(name='flights_2014')

In [10]:
# Compute the mean of the elapsed-time column over the 'flights_2014' view.
# The aggregate query yields exactly one row, so avg_duration is that single
# Row object (not a bare number).
avg_duration = (
    spark.sql('SELECT avg(`Actual elapsed time (Minutes)`) FROM flights_2014')
    .first()
)
print(avg_duration)

Row(avg(Actual elapsed time (Minutes))=141.11749513352586)
