In [2]:
import os
import pandas as pd
import numpy as np


spark_version = 'spark-3.0.3'
os.environ['SPARK_VERSION']=spark_version
# Name of the Spark distribution; shared by the download, the unpack step and
# SPARK_HOME so the version only has to be changed in one place.
spark_dist = f"{spark_version}-bin-hadoop2.7"

# Install Java, then download and unpack Spark.
# - Superseded Spark releases are served from archive.apache.org, not
#   downloads.apache.org (which only keeps current releases).
# - {spark_version} / {spark_dist} are interpolated by IPython from the Python
#   variables above.
# - wget -nc skips the download when the tarball is already present (cell re-run).
!apt-get update -qq > /dev/null
!apt-get install openjdk-11-jdk-headless -qq > /dev/null
!wget -q -nc https://archive.apache.org/dist/spark/{spark_version}/{spark_dist}.tgz
!tar xf {spark_dist}.tgz
# %pip installs into the environment of the running kernel.
%pip install -q findspark

# Set Environment Variables
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"
os.environ["SPARK_HOME"] = f"/content/{spark_dist}"

# Make the unpacked PySpark importable: findspark.init() locates Spark through
# SPARK_HOME and adds it to sys.path. The SparkSession itself is created later.
import findspark
findspark.init()

0% [Working]            Get:1 https://cloud.r-project.org/bin/linux/ubuntu bionic-cran40/ InRelease [3,626 B]
Ign:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  InRelease
Get:3 http://security.ubuntu.com/ubuntu bionic-security InRelease [88.7 kB]
Ign:4 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  InRelease
Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release [696 B]
Hit:6 https://developer.download.nvidia.com/compute/machine-learning/repos/ubuntu1804/x86_64  Release
Get:7 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu1804/x86_64  Release.gpg [836 B]
Get:8 http://ppa.launchpad.net/c2d4u.team/c2d4u4.0+/ubuntu bionic InRelease [15.9 kB]
Hit:9 http://archive.ubuntu.com/ubuntu bionic InRelease
Get:10 http://archive.ubuntu.com/ubuntu bionic-updates InRelease [88.7 kB]
Hit:11 http://ppa.launchpad.net/cran/libgit2/ubuntu bionic InRelease
Get:12 http://archive.ubunt

In [3]:
# Download the Postgres JDBC driver that will allow Spark to interact with Postgres.
# -nc (no-clobber) skips the download when the jar is already present, so re-running
# this cell does not leave duplicate copies such as postgresql-42.2.16.jar.1.
!wget -nc https://jdbc.postgresql.org/download/postgresql-42.2.16.jar

--2022-01-29 21:33:51--  https://jdbc.postgresql.org/download/postgresql-42.2.16.jar
Resolving jdbc.postgresql.org (jdbc.postgresql.org)... 72.32.157.228, 2001:4800:3e1:1::228
Connecting to jdbc.postgresql.org (jdbc.postgresql.org)|72.32.157.228|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1002883 (979K) [application/java-archive]
Saving to: ‘postgresql-42.2.16.jar’


2022-01-29 21:33:52 (11.6 MB/s) - ‘postgresql-42.2.16.jar’ saved [1002883/1002883]



In [4]:
from pyspark.sql import SparkSession
# Build (or reuse) the Spark session for this notebook. The Postgres JDBC jar is
# put on the driver classpath so the JDBC write further down can load the driver.
spark = (
    SparkSession.builder
    .appName("Final-Project")
    .config("spark.driver.extraClassPath","/content/postgresql-42.2.16.jar")
    .getOrCreate()
)

In [5]:
from pyspark.sql import *
from pyspark.sql.functions import col, when
from pyspark.sql import SparkSession
from pyspark.sql.functions import lpad
from pyspark.sql.functions import concat
from pyspark.sql.functions import lit
from pyspark.sql.functions import substring

In [6]:
from pyspark import SparkFiles
#url = "https://finalprojectstorage10.s3.us-east-2.amazonaws.com/2018_clean_final.csv"
# Register the remote CSV with Spark, then read the local copy that
# SparkFiles exposes under the file's base name.
url = "https://storage.googleapis.com/uftairlinedbbucket/2018_clean_final.csv"
spark.sparkContext.addFile(url)
local_csv_path = SparkFiles.get("2018_clean_final.csv")

# header=True takes the column names from the first row; no schema is inferred,
# so every column is read as a string.
csv_reader = spark.read.option("encoding", "UTF-8")
df = csv_reader.csv(local_csv_path, header=True)
df.show()

+----------+------------------+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-------------+-------------+---------+--------------+-------------------+--------------------+------------+-------------+-----------+--------------------+-------------+--------------+---------+-----+---+----------+------------+
|   FL_DATE|        OP_CARRIER|CRS_DEP_TIME|DEP_DELAY|TAXI_OUT|WHEELS_OFF|WHEELS_ON|TAXI_IN|CRS_ARR_TIME|ARR_DELAY|CRS_ELAPSED_TIME|ACTUAL_ELAPSED_TIME|AIR_TIME|DISTANCE|CARRIER_DELAY|WEATHER_DELAY|NAS_DELAY|SECURITY_DELAY|LATE_AIRCRAFT_DELAY|         Org_Airport|Org_latitude|Org_longitude|Origin_city|        Dest_Airport|Dest_latitude|Dest_longitude|Dest_city|month|Day|MONTH_abbr|DELAY_STATUS|
+----------+------------------+------------+---------+--------+----------+---------+-------+------------+---------+----------------+-------------------+--------+--------+-------------+-------------+------

In [7]:
from pyspark.sql.functions import *

#Timestamp function to fix formatting
def padTimeStamp(x,y):
  """Combine a date string and an HHMM clock value into "<date> HH:MM".

  Args:
      x: Date part of the timestamp as a string (callers pass FL_DATE).
          When it is None there is nothing to build a timestamp from, so
          None is returned instead of raising inside the UDF.
      y: Time of day written as up to four digits (HHMM), optionally with a
          fractional suffix such as "1517.0". None is treated as "0",
          i.e. midnight.

  Returns:
      The string "<x> HH:MM", or None when x is None.
  """
  if x is None:
    return None
  if y is None:
    y = "0"

  # Keep only the digits before the decimal point. Removing every ".0"
  # substring instead would turn "930.00" into "9300".
  hhmm = str(y).strip().split(".", 1)[0].zfill(4)
  return x + " " + hhmm[0:2] + ":" + hhmm[2:4]

#Wrap padTimeStamp as a Spark UDF, because a plain Python function cannot be applied to DataFrame columns directly
padTimeStampUDF = udf(lambda flight_date, hhmm: padTimeStamp(flight_date, hhmm))

#Initial dataframe: the four HHMM clock columns are combined with FL_DATE through the
#UDF into "<date> HH:MM" strings; every other column is carried over unchanged.
src_cols = df.columns
flight_date = df["FL_DATE"]

ml_selection = [
    "FL_DATE",
    "OP_CARRIER",
    padTimeStampUDF(flight_date, df["CRS_DEP_TIME"]).alias("CRS_DEPARTURE_TIMESTAMP"),
    src_cols[3], src_cols[4],   # DEP_DELAY, TAXI_OUT in the layout printed by df.show()
    padTimeStampUDF(flight_date, df["WHEELS_OFF"]).alias("WHEELS_OFF_TIMESTAMP"),
    padTimeStampUDF(flight_date, df["WHEELS_ON"]).alias("WHEELS_ON_TIMESTAMP"),
    src_cols[7],                # TAXI_IN in the layout printed by df.show()
    padTimeStampUDF(flight_date, df["CRS_ARR_TIME"]).alias("CRS_ARRIVAL_TIMESTAMP"),
]
# Remaining columns by position, 9 through 30 (ARR_DELAY ... DELAY_STATUS in that layout).
ml_selection += [src_cols[i] for i in range(9, 31)]

initial_ml_df = df.select(*ml_selection)
initial_ml_df.printSchema()
#initial_ml_df.show()

root
 |-- FL_DATE: string (nullable = true)
 |-- OP_CARRIER: string (nullable = true)
 |-- CRS_DEPARTURE_TIMESTAMP: string (nullable = true)
 |-- DEP_DELAY: string (nullable = true)
 |-- TAXI_OUT: string (nullable = true)
 |-- WHEELS_OFF_TIMESTAMP: string (nullable = true)
 |-- WHEELS_ON_TIMESTAMP: string (nullable = true)
 |-- TAXI_IN: string (nullable = true)
 |-- CRS_ARRIVAL_TIMESTAMP: string (nullable = true)
 |-- ARR_DELAY: string (nullable = true)
 |-- CRS_ELAPSED_TIME: string (nullable = true)
 |-- ACTUAL_ELAPSED_TIME: string (nullable = true)
 |-- AIR_TIME: string (nullable = true)
 |-- DISTANCE: string (nullable = true)
 |-- CARRIER_DELAY: string (nullable = true)
 |-- WEATHER_DELAY: string (nullable = true)
 |-- NAS_DELAY: string (nullable = true)
 |-- SECURITY_DELAY: string (nullable = true)
 |-- LATE_AIRCRAFT_DELAY: string (nullable = true)
 |-- Org_Airport: string (nullable = true)
 |-- Org_latitude: string (nullable = true)
 |-- Org_longitude: string (nullable = true)
 |-

In [8]:
#Final df: rename every column to its upper-case target name and cast the
#string values read from the CSV to date / timestamp / integer types.
#NOTE(review): the latitude/longitude columns are cast to integer, which looks
#like it drops the fractional degrees - confirm this is intended.
ml_column_exprs = [
    # flight identity and schedule
    "cast(FL_DATE as date) FLIGHT_DT",
    "OP_CARRIER as AIRLINE_CARRIER_CODE",
    "cast(CRS_DEPARTURE_TIMESTAMP as timestamp) CRS_DEPARTURE_TIMESTAMP",
    "cast(DEP_DELAY as integer) as DEPARTURE_DELAY_MINUTES",
    "cast(TAXI_OUT as integer) TAXI_OUT_MINUTES",
    "cast(WHEELS_OFF_TIMESTAMP as timestamp) WHEELS_OFF_TIMESTAMP",
    "cast(WHEELS_ON_TIMESTAMP as timestamp) WHEELS_ON_TIMESTAMP",
    "cast(TAXI_IN as integer) TAXI_IN_MINUTES",
    "cast(CRS_ARRIVAL_TIMESTAMP as timestamp) CRS_ARRIVAL_TIMESTAMP",
    "cast(ARR_DELAY as integer) ARRIVAL_DELAY_MINUTES",
    # durations and distance
    "cast(CRS_ELAPSED_TIME as integer) CRS_ELAPSED_TIME_MINUTES",
    "cast(ACTUAL_ELAPSED_TIME as integer) ACTUAL_ELAPSED_TIME_MINUTES",
    "cast(AIR_TIME as integer) AIR_TIME_MINUTES",
    "cast(DISTANCE as integer) DISTANCE_MILES",
    # delay breakdown
    "cast(CARRIER_DELAY as integer) CARRIER_DELAY_MINUTES",
    "cast(WEATHER_DELAY as integer) WEATHER_DELAY_MINUTES",
    "cast(NAS_DELAY as integer) NAS_DELAY_MINUTES",
    "cast(SECURITY_DELAY as integer) SECURITY_DELAY_MINUTES",
    "cast(LATE_AIRCRAFT_DELAY as integer) LATE_AIRCRAFT_DELAY_MINUTES",
    # origin airport
    "Org_Airport as ORG_AIRPORT",
    "cast(Org_latitude as integer) ORG_LATITUDE",
    "cast(Org_longitude as integer) ORG_LONGITUDE",
    "Origin_city as ORIGIN_CITY",
    # destination airport
    "Dest_Airport as DEST_AIRPORT",
    "cast(Dest_latitude as integer) DEST_LATITUDE",
    "cast(Dest_longitude as integer) DEST_LONGITUDE",
    "Dest_city as DEST_CITY",
    # calendar fields and label
    "month as MONTH",
    "Day as DAY",
    "MONTH_abbr as MONTH_ABBR",
    "cast(DELAY_STATUS as integer) DELAY_STATUS",
]
final_ml_df = initial_ml_df.selectExpr(*ml_column_exprs)
#final_ml_df.show()

In [9]:
# Print the schema of final_ml_df to check the renamed columns and their cast types.
final_ml_df.printSchema()

root
 |-- FLIGHT_DT: date (nullable = true)
 |-- AIRLINE_CARRIER_CODE: string (nullable = true)
 |-- CRS_DEPARTURE_TIMESTAMP: timestamp (nullable = true)
 |-- DEPARTURE_DELAY_MINUTES: integer (nullable = true)
 |-- TAXI_OUT_MINUTES: integer (nullable = true)
 |-- WHEELS_OFF_TIMESTAMP: timestamp (nullable = true)
 |-- WHEELS_ON_TIMESTAMP: timestamp (nullable = true)
 |-- TAXI_IN_MINUTES: integer (nullable = true)
 |-- CRS_ARRIVAL_TIMESTAMP: timestamp (nullable = true)
 |-- ARRIVAL_DELAY_MINUTES: integer (nullable = true)
 |-- CRS_ELAPSED_TIME_MINUTES: integer (nullable = true)
 |-- ACTUAL_ELAPSED_TIME_MINUTES: integer (nullable = true)
 |-- AIR_TIME_MINUTES: integer (nullable = true)
 |-- DISTANCE_MILES: integer (nullable = true)
 |-- CARRIER_DELAY_MINUTES: integer (nullable = true)
 |-- WEATHER_DELAY_MINUTES: integer (nullable = true)
 |-- NAS_DELAY_MINUTES: integer (nullable = true)
 |-- SECURITY_DELAY_MINUTES: integer (nullable = true)
 |-- LATE_AIRCRAFT_DELAY_MINUTES: integer (nulla

In [10]:
# Configure settings for RDS
# Credentials must not live in the notebook: the password is read from the
# AIRLINEDB_PASSWORD environment variable, or prompted for (without echo) when
# that variable is unset or empty.
# NOTE(review): the password that used to be hardcoded here has been exposed
# and should be rotated.
import os
from getpass import getpass

mode = "append"
# Connection URL and user keep their previous values as defaults and can be
# overridden through the environment.
jdbc_url = os.environ.get("AIRLINEDB_JDBC_URL", "jdbc:postgresql://34.74.181.190:5432/airlinedb_final")
config = {"user": os.environ.get("AIRLINEDB_USER", "airlinedb"),
          "password": os.environ.get("AIRLINEDB_PASSWORD") or getpass("Postgres password: "),
          "driver":"org.postgresql.Driver"}

In [13]:
#Write the machine-learning dataframe to its Postgres table over JDBC, using
#the url, save mode and connection properties defined in the settings cell.
#NOTE(review): if the save mode is "append", re-running this cell inserts the same rows again.
target_table = 'project.machine_learning_flight_data'
final_ml_df.write.jdbc(jdbc_url, target_table, mode, config)

In [12]:
# Print this runtime's public IP address as reported by ipecho.net.
# NOTE(review): presumably used to allow-list the notebook host on the database
# side - confirm, and consider removing this cell and its output before sharing.
!curl ipecho.net/plain

35.232.115.217