In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col,avg

In [None]:
# Start (or reuse) the Spark session for this notebook under the app name "FlightAnalysis".
spark = (
    SparkSession.builder
    .appName("FlightAnalysis")
    .getOrCreate()
)

In [None]:
file_path = "/content/Airlines.csv"  # Update with the actual file path

# Read the CSV, treating its first line as the header and letting Spark infer column types.
df = (
    spark.read
    .option("header", "true")
    .csv(file_path, inferSchema=True)
)

In [None]:
# Preview the first five rows to check the column layout before aggregating.
df.show(n=5)

+---+-------+------+-----------+---------+---------+----+------+-----+
| id|Airline|Flight|AirportFrom|AirportTo|DayOfWeek|Time|Length|Delay|
+---+-------+------+-----------+---------+---------+----+------+-----+
|  1|     CO|   269|        SFO|      IAH|        3|  15|   205|    1|
|  2|     US|  1558|        PHX|      CLT|        3|  15|   222|    1|
|  3|     AA|  2400|        LAX|      DFW|        3|  20|   165|    1|
|  4|     AA|  2466|        SFO|      DFW|        3|  20|   195|    1|
|  5|     AS|   108|        ANC|      SEA|        3|  30|   202|    0|
+---+-------+------+-----------+---------+---------+----+------+-----+
only showing top 5 rows



In [None]:
# Mean of the Delay column for each airline, exposed as the AvgDelay column.
airline_mean_delay = avg(col('Delay')).alias('AvgDelay')
carrier_avg_delay = df.groupBy('Airline').agg(airline_mean_delay)

In [None]:
# Print the per-airline averages; 20 rows is show()'s default limit, spelled out here.
carrier_avg_delay.show(n=20)

+-------+-------------------+
|Airline|           AvgDelay|
+-------+-------------------+
|     UA|0.32390745501285345|
|     AA|0.38847029963203084|
|     EV| 0.4022084837222599|
|     B6|0.46703842756183744|
|     DL|  0.450475877912701|
|     OO| 0.4528992716997652|
|     F9|0.44903965303593557|
|     YV| 0.2429143897996357|
|     US|0.33597101449275363|
|     MQ|0.34809452260620133|
|     OH| 0.2772763262074426|
|     HA| 0.3201864467551094|
|     XE|0.37894364839683864|
|     AS| 0.3392903844477378|
|     CO| 0.5661994507055592|
|     FL|0.30129159264416383|
|     WN| 0.6977586958138942|
|     9E|0.39766025331141835|
+-------+-------------------+



In [None]:
# Compute average delay (fraction of delayed flights) for each (origin, destination) route.
route_mean_delay = avg(col('Delay')).alias('AvgDelay')
route_avg_delay = df.groupBy('AirportFrom', 'AirportTo').agg(route_mean_delay)

In [None]:
# Rank routes by average delay, highest first, and keep the top 5.
# Several routes can tie at the maximum AvgDelay, and sorting on AvgDelay alone
# leaves the order of tied rows unspecified, so the five rows returned could
# differ between runs. The airport codes are added as secondary sort keys to
# make the selection deterministic.
top_5_routes = (
    route_avg_delay
    .orderBy(
        col('AvgDelay').desc(),
        col('AirportFrom').asc(),
        col('AirportTo').asc(),
    )
    .limit(5)
)

# Show the top 5 routes with the highest average delay
top_5_routes.show()

+-----------+---------+--------+
|AirportFrom|AirportTo|AvgDelay|
+-----------+---------+--------+
|        STX|      CLT|     1.0|
|        MTJ|      LAX|     1.0|
|        EWR|      HDN|     1.0|
|        MSP|      PVD|     1.0|
|        BDL|      LAS|     1.0|
+-----------+---------+--------+

