# Example - Flight Data Preprocessing and loading 

In [6]:
import findspark
findspark.init()
from pyspark import SparkContext
from pyspark.sql import SparkSession
sc = SparkContext("local")
spark = SparkSession.builder.getOrCreate()


## Step-1: First you need download and move the file data

The following command runs on Unix-based operating systems. 

Download it from this link https://storage.googleapis.com/met-cs-777-data/flights.csv.bz2 
File size is 135 MB 



In [2]:
%%bash 
# Dataset is stored on Google cloud
# https://storage.googleapis.com/met-cs-777-data/flights.csv.bz2 
# or from inside S3
# s3://metcs777/flights.csv.bz2

# Uncomment the following lines to download the datasets 
wget -q https://storage.googleapis.com/met-cs-777-data/flights.csv.bz2 
mv flights.csv.bz2 ./data/
ls -la ./data/

total 158792
drwxrwxrwx 1 dimitar dimitar       512 Feb 27 20:08 .
drwxrwxrwx 1 dimitar dimitar       512 Feb 27 20:08 ..
-rwxrwxrwx 1 dimitar dimitar     49184 Jan 20 01:31 Alices-Adventures-in-Wonderland-by-Lewis-Carroll.txt.bz2
-rwxrwxrwx 1 dimitar dimitar  19358733 Jan 20 01:31 IMDB Dataset.csv.bz2
-rwxrwxrwx 1 dimitar dimitar      4062 Jan 20 01:31 advertising.csv
-rwxrwxrwx 2 dimitar dimitar 141250930 Mar 11  2022 flights.csv.bz2
-rwxrwxrwx 1 dimitar dimitar      3020 Feb 14 17:44 netflix-subscription.csv
-rwxrwxrwx 1 dimitar dimitar         0 Jan 20 01:31 placeholder
-rwxrwxrwx 1 dimitar dimitar   1921037 Sep  6 21:17 taxi-data-sorted-verysmall.csv


In [5]:
%%bash
# Now, let us get the Airports and Airlines datasets 
# And Link these data sets. 

# AirLine dataset https://storage.googleapis.com/met-cs-777-data/airlines.csv
# Airport dataset https://storage.googleapis.com/met-cs-777-data/airports.csv
wget -q https://storage.googleapis.com/met-cs-777-data/airlines.csv

mv  airlines.csv   ./data/

wget -q https://storage.googleapis.com/met-cs-777-data/airports.csv

mv  airports.csv  ./data/ 
ls -la ./data/

total 158816
drwxrwxrwx 1 dimitar dimitar       512 Feb 27 20:09 .
drwxrwxrwx 1 dimitar dimitar       512 Feb 27 20:09 ..
-rwxrwxrwx 1 dimitar dimitar     49184 Jan 20 01:31 Alices-Adventures-in-Wonderland-by-Lewis-Carroll.txt.bz2
-rwxrwxrwx 1 dimitar dimitar  19358733 Jan 20 01:31 IMDB Dataset.csv.bz2
-rwxrwxrwx 1 dimitar dimitar      4062 Jan 20 01:31 advertising.csv
-rwxrwxrwx 1 dimitar dimitar       359 Mar 11  2022 airlines.csv
-rwxrwxrwx 1 dimitar dimitar     23867 Mar 11  2022 airports.csv
-rwxrwxrwx 2 dimitar dimitar 141250930 Mar 11  2022 flights.csv.bz2
-rwxrwxrwx 1 dimitar dimitar      3020 Feb 14 17:44 netflix-subscription.csv
-rwxrwxrwx 1 dimitar dimitar         0 Jan 20 01:31 placeholder
-rwxrwxrwx 1 dimitar dimitar   1921037 Sep  6 21:17 taxi-data-sorted-verysmall.csv


In [7]:
lines = sc.textFile("./data/flights.csv.bz2")

# First line is the header. 
lines.first()

'YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY'

In [8]:
# First line is the header 
lines.take(2)

['YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY',
 '2015,1,1,4,AS,98,N407AS,ANC,SEA,0005,2354,-11,21,0015,205,194,169,1448,0404,4,0430,0408,-22,0,0,,,,,,']

In [9]:
# Remove the header from the RDD
linesHeader = lines.first()
header = sc.parallelize([linesHeader])
linesWithOutHeader = lines.subtract(header)
linesWithOutHeader.first()

'2015,1,1,4,AS,730,N423AS,ANC,SEA,0505,0457,-8,16,0513,205,199,179,1448,0912,4,0930,0916,-14,0,0,,,,,,'

In [10]:
# The data is about the flights from different airports which includes following attributes
#[u'YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY,TAXI_OUT,WHEELS_OFF,SCHEDULED_TIME,ELAPSED_TIME,AIR_TIME,DISTANCE,WHEELS_ON,TAXI_IN,SCHEDULED_ARRIVAL,ARRIVAL_TIME,ARRIVAL_DELAY,DIVERTED,CANCELLED,CANCELLATION_REASON,AIR_SYSTEM_DELAY,SECURITY_DELAY,AIRLINE_DELAY,LATE_AIRCRAFT_DELAY,WEATHER_DELAY']
flights = linesWithOutHeader.map(lambda x: x.split(','))
flights.first()

['2015',
 '1',
 '1',
 '4',
 'AS',
 '730',
 'N423AS',
 'ANC',
 'SEA',
 '0505',
 '0457',
 '-8',
 '16',
 '0513',
 '205',
 '199',
 '179',
 '1448',
 '0912',
 '4',
 '0930',
 '0916',
 '-14',
 '0',
 '0',
 '',
 '',
 '',
 '',
 '',
 '']

In [11]:
# We expect to have 31 data elements. 
# We go ahead and remove all rows that do not include 31 elements
dataFiltered=flights.filter(lambda x: len(x)==31)
dataFiltered.first()

['2015',
 '1',
 '1',
 '4',
 'AS',
 '730',
 'N423AS',
 'ANC',
 'SEA',
 '0505',
 '0457',
 '-8',
 '16',
 '0513',
 '205',
 '199',
 '179',
 '1448',
 '0912',
 '4',
 '0930',
 '0916',
 '-14',
 '0',
 '0',
 '',
 '',
 '',
 '',
 '',
 '']

In [12]:
# YEAR,MONTH,DAY,DAY_OF_WEEK,AIRLINE,FLIGHT_NUMBER,TAIL_NUMBER,ORIGIN_AIRPORT,DESTINATION_AIRPORT,SCHEDULED_DEPARTURE,DEPARTURE_TIME,DEPARTURE_DELAY, CANCELLED
# We only need the following elements
mainFlightsData = dataFiltered.map(lambda p: (p[0], p[1] , p[2] , p[3], p[4] , p[5] , p[6], p[7] , p[8] , p[9], p[10], p[11], p[24] ))

# Cache this RDD, we will use it alot
mainFlightsData.cache()

# Show the first 
mainFlightsData.first()

# Note: this new RDD will have only 13 elements (max index 12) 

# 0 YEAR,
# 1 MONTH,
# 2 DAY,
# 3 DAY_OF_WEEK,
# 4 AIRLINE, 
# 5 FLIGHT_NUMBER,
# 6 TAIL_NUMBER,
# 7 ORIGIN_AIRPORT,
# 8 DESTINATION_AIRPORT,
# 9 SCHEDULED_DEPARTURE,
# 10 DEPARTURE_TIME,
# 11 DEPARTURE_DELAY, 
# 12 CANCELLED

('2015',
 '1',
 '1',
 '4',
 'AS',
 '730',
 'N423AS',
 'ANC',
 'SEA',
 '0505',
 '0457',
 '-8',
 '0')

In [15]:
airlines = sc.textFile("./data/airlines.csv")
airlines.take(2)

['IATA_CODE,AIRLINE', 'UA,United Air Lines Inc.']

In [16]:
airports = sc.textFile("./data/airports.csv")
airports.take(2)

['IATA_CODE,AIRPORT,CITY,STATE,COUNTRY,LATITUDE,LONGITUDE',
 'ABE,Lehigh Valley International Airport,Allentown,PA,USA,40.65236,-75.44040']

In [17]:
# Remove the header from the RDD
airlinesHeader = airlines.first()
header1 = sc.parallelize([airlinesHeader])
airlinesWithOutHeader = airlines.subtract(header1)
airlinesWithOutHeader.first()

'UA,United Air Lines Inc.'

In [18]:
# Remove the header from the RDD
airportsHeader = airports.first()
header1 = sc.parallelize([airportsHeader])
airportsWithOutHeader = airports.subtract(header1)
airportsWithOutHeader.first()

'ABI,Abilene Regional Airport,Abilene,TX,USA,32.41132,-99.68190'