In [15]:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder.master("local[5]")
    .appName("Airport_Data_Project")
    .getOrCreate()
)

df = spark.read.csv("./Airports2.csv", header=True, inferSchema=True)

df.printSchema()
df.show(truncate=False)
print(df.rdd.getNumPartitions())



root
 |-- Origin_airport: string (nullable = true)
 |-- Destination_airport: string (nullable = true)
 |-- Origin_city: string (nullable = true)
 |-- Destination_city: string (nullable = true)
 |-- Passengers: integer (nullable = true)
 |-- Seats: integer (nullable = true)
 |-- Flights: integer (nullable = true)
 |-- Distance: integer (nullable = true)
 |-- Fly_date: date (nullable = true)
 |-- Origin_population: integer (nullable = true)
 |-- Destination_population: integer (nullable = true)
 |-- Org_airport_lat: string (nullable = true)
 |-- Org_airport_long: string (nullable = true)
 |-- Dest_airport_lat: string (nullable = true)
 |-- Dest_airport_long: string (nullable = true)

+--------------+-------------------+-------------+----------------+----------+-----+-------+--------+----------+-----------------+----------------------+----------------+-----------------+----------------+-----------------+
|Origin_airport|Destination_airport|Origin_city  |Destination_city|Passengers|Seats|F

                                                                                

In [20]:
# Question1 : Find the airport highest no of flight departures
from pyspark.sql import functions as func

highestNumOfFlightDepartures = (
    df.groupBy(func.col("Origin_airport").alias("Airport"))
    .agg(func.sum(func.col("Flights")).alias("Number_Of_Flights"))
    .orderBy(func.desc("Number_Of_Flights"))
)
highestNumOfFlightDepartures.show(10, truncate=False)



+-------+-----------------+
|Airport|Number_Of_Flights|
+-------+-----------------+
|ORD    |6908482          |
|ATL    |6558015          |
|DFW    |5994638          |
|LAX    |4099901          |
|DTW    |3452613          |
|PHX    |3213108          |
|MSP    |3204923          |
|IAH    |3195062          |
|STL    |3181102          |
|CLT    |2840773          |
+-------+-----------------+
only showing top 10 rows



                                                                                

In [19]:
# Question2 : Find the Airport with Highest Number of Passenger Arrivals
from pyspark.sql import functions as func

highestNoOfPassengersArrival = (
    df.groupBy(func.col("Origin_airport").alias("Airport"))
    .agg(func.sum(func.col("Passengers")).alias("Total_Passengers_Arrived"))
    .orderBy(func.desc("Total_Passengers_Arrived"))
)
highestNoOfPassengersArrival.show(10, truncate=False)



+-------+------------------------+
|Airport|Total_Passengers_Arrived|
+-------+------------------------+
|ATL    |577124268               |
|ORD    |529018110               |
|DFW    |457153720               |
|LAX    |393005676               |
|PHX    |295857703               |
|LAS    |270590248               |
|DTW    |250983023               |
|MSP    |245197238               |
|SFO    |243779917               |
|IAH    |228367851               |
+-------+------------------------+
only showing top 10 rows



                                                                                

In [18]:
# Question3 : Find the Airport with Most Flight Traffic
from pyspark.sql import functions as func

origin_airport_traffic = (
    df.groupBy(func.col("Origin_airport"))
    .agg(func.sum(func.col("Flights")).alias("Origin_airport_Traffic"))
    .orderBy(func.desc("Origin_airport_Traffic"))
)

destination_airport_traffic = (
    df.groupBy(func.col("Destination_airport"))
    .agg(func.sum(func.col("Flights")).alias("Destination_airport_traffic"))
    .orderBy(func.desc("Destination_airport_traffic"))
)

AirportWithMostTraffic = (
    origin_airport_traffic.join(
        destination_airport_traffic,
        destination_airport_traffic["Destination_airport"]
        == origin_airport_traffic["Origin_airport"],
        "inner",
    )
    .select(
        func.col("Origin_airport").alias("Airport"),
        (
            func.col("Origin_airport_Traffic") + func.col("Destination_airport_traffic")
        ).alias("Total_Airport_Traffic"),
    )
    .orderBy(func.desc("Total_Airport_Traffic"))
)

AirportWithMostTraffic.show(10, truncate=False)



+-------+---------------------+
|Airport|Total_Airport_Traffic|
+-------+---------------------+
|ORD    |13804767             |
|ATL    |13102774             |
|DFW    |11982524             |
|LAX    |8196603              |
|DTW    |6900655              |
|PHX    |6421985              |
|MSP    |6405105              |
|IAH    |6391830              |
|STL    |6358741              |
|CLT    |5677880              |
+-------+---------------------+
only showing top 10 rows



                                                                                

In [17]:
# Question 4 : Find the Airport with Most Passenger Footfall
from pyspark.sql import functions as func

Origin_Airport_Passengers_Visit = (
    df.groupBy(func.col("Origin_airport"))
    .agg(func.sum(func.col("Passengers")).alias("Origin_Total_Passengers"))
    .orderBy(func.desc("Origin_Total_Passengers"))
)

Destination_Airport_Passengers_Visit = (
    df.groupBy(func.col("Destination_airport"))
    .agg(func.sum(func.col("Passengers")).alias("Destination_Total_Passengers"))
    .orderBy(func.desc("Destination_Total_Passengers"))
)

Airport_With_Most_Passeners_Visit = (
    Origin_Airport_Passengers_Visit.join(
        Destination_Airport_Passengers_Visit,
        Destination_Airport_Passengers_Visit["Destination_airport"]
        == Origin_Airport_Passengers_Visit["Origin_airport"],
        "inner",
    )
    .select(
        func.col("Origin_airport").alias("Airport"),
        (
            func.col("Origin_Total_Passengers")
            + func.col("Destination_Total_Passengers")
        ).alias("Total_Passengers_Visit"),
    )
    .orderBy(func.desc("Total_Passengers_Visit"))
)

Airport_With_Most_Passeners_Visit.show(10, truncate=False)



+-------+----------------------+
|Airport|Total_Passengers_Visit|
+-------+----------------------+
|ATL    |1155078415            |
|ORD    |1057666258            |
|DFW    |915476247             |
|LAX    |782482278             |
|PHX    |591438147             |
|LAS    |539736139             |
|DTW    |502450897             |
|MSP    |490971274             |
|SFO    |486063162             |
|IAH    |457473254             |
+-------+----------------------+
only showing top 10 rows



                                                                                

In [4]:
# Question 5 : Find the Occupancy Rate for Most Popular Routes
from pyspark.sql import functions as func

OverAll_Flight_Data = (
    df.groupBy(
        func.least(
            func.col("Origin_airport"),
            func.col("Destination_airport"),
        ).alias("Airport_1"),
        func.greatest(
            func.col("Origin_airport"),
            func.col("Destination_airport"),
        ).alias("Airport_2"),
    )
    .agg(
        func.sum(func.col("Passengers")).alias("Total_Passenger_Arrived"),
        func.sum(func.col("Seats")).alias("Total_Seats"),
        func.sum(func.col("Flights")).alias("Total_Flights"),
    )
    .orderBy(
        func.desc(
            func.col("Airport_1"),
        ),
        func.desc(
            func.col("Airport_2"),
        ),
    )
)

Occupancy_Rate_Of_Most_Popular_Route = (
    OverAll_Flight_Data.withColumn(
        "Occupancy_Rate",
        (func.col("Total_Passenger_Arrived") * 100 / func.col("Total_Seats")),
    )
    .filter(
        (func.col("Occupancy_Rate").isNotNull())
        & (func.col("Occupancy_Rate") > 0)
        & (func.col("Occupancy_Rate") <= 100)
    )
    .orderBy(
        func.desc(
            func.col("Total_Passenger_Arrived"),
        ),
        func.desc(
            func.col("Total_Seats"),
        ),
        func.desc(
            func.col("Total_Flights"),
        ),
        func.desc(func.col("Occupancy_Rate")),
    )
    .limit(10)
)

Occupancy_Rate_Of_Most_Popular_Route.show(truncate=False)



+---------+---------+-----------------------+-----------+-------------+-----------------+
|Airport_1|Airport_2|Total_Passenger_Arrived|Total_Seats|Total_Flights|Occupancy_Rate   |
+---------+---------+-----------------------+-----------+-------------+-----------------+
|HNL      |OGG      |62109354               |96640901   |784873       |64.26818599300931|
|HNL      |LAX      |57596315               |72737189   |276821       |79.1841364669729 |
|LAS      |LAX      |52511530               |80532768   |588151       |65.20517213564546|
|LAX      |SFO      |51119989               |79405656   |636449       |64.3782717442697 |
|ATL      |MCO      |46237919               |59608322   |319861       |77.56956989998812|
|LAX      |ORD      |45915774               |64140975   |366256       |71.58571256517382|
|JFK      |LAX      |43871972               |63078113   |333413       |69.55181427193295|
|LAS      |PHX      |42979048               |64844100   |460104       |66.28058373853597|
|ATL      

                                                                                

In [7]:
# Question 6: Find the Number of Flights for Long Distance Journeys
from pyspark.sql import functions as func

Long_Journey_Flights = (
    df.groupBy(
        func.least(func.col("Origin_airport"), func.col("Destination_airport")).alias(
            "Airport_1"
        ),
        func.greatest(
            func.col("Origin_airport"), func.col("Destination_airport")
        ).alias("Airport_2"),
    )
    .agg(
        func.mean(func.col("Distance")).alias("Distance"),
        func.sum(func.col("Flights")).alias("Total_Flights"),
    )
    .filter((func.col("Total_Flights") > 0) & (func.col("Total_Flights").isNotNull()))
    .orderBy(func.desc(func.col("Distance")), func.desc(func.col("Total_Flights")))
    .limit(10)
)

Long_Journey_Flights.show(truncate=False)



+---------+---------+--------+-------------+
|Airport_1|Airport_2|Distance|Total_Flights|
+---------+---------+--------+-------------+
|BDL      |HNL      |5018.0  |3            |
|HNL      |JFK      |4983.0  |430          |
|HIK      |JFK      |4983.0  |3            |
|HNL      |LGA      |4976.0  |1            |
|EWR      |HNL      |4962.0  |8320         |
|JFK      |OGG      |4924.0  |1            |
|HNL      |PHL      |4919.0  |1            |
|HNL      |MIA      |4862.0  |9            |
|HIK      |MIA      |4862.0  |1            |
|HNL      |ISO      |4860.0  |1            |
+---------+---------+--------+-------------+



                                                                                

In [12]:
# Question 7 : Find the Average Distances for Routes with Most Flights
from pyspark.sql import functions as func

Average_Distance_For_Most_Flights = (
    df.groupBy(
        func.least(func.col("Origin_airport"), func.col("Destination_airport")).alias(
            "Airport_1"
        ),
        func.greatest(
            func.col("Origin_airport"), func.col("Destination_airport")
        ).alias("Airport_2"),
    )
    .agg(
        func.avg(func.col("Distance")).alias("Average_Distance"),
        func.sum(func.col("Flights")).alias("Total_Flights"),
    )
    .filter((func.col("Total_Flights") > 0) & (func.col("Total_Flights").isNotNull()))
    .orderBy(
        func.desc(func.col("Total_Flights")),
        func.desc(func.col("Average_Distance")),
    )
    .limit(10)
)

Average_Distance_For_Most_Flights.show(truncate=False)



+---------+---------+------------------+-------------+
|Airport_1|Airport_2|Average_Distance  |Total_Flights|
+---------+---------+------------------+-------------+
|HNL      |OGG      |100.0             |784873       |
|LAX      |SFO      |337.0             |636449       |
|LAS      |LAX      |236.0             |588151       |
|PDX      |SEA      |129.0             |565707       |
|LAX      |PHX      |370.0             |515093       |
|BOS      |LGA      |185.0             |470737       |
|MSP      |ORD      |334.0             |467514       |
|LAS      |PHX      |255.96021840873635|460104       |
|DCA      |LGA      |214.0             |439107       |
|LAX      |SAN      |109.0             |431076       |
+---------+---------+------------------+-------------+



                                                                                