In [1]:
import plotly.express as px
import plotly.graph_objs as go
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from plotly.subplots import make_subplots
import plotly.offline as py


In [2]:
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, TimestampType

# Start the Spark session for this notebook, or reuse one that already exists.
spark = SparkSession.builder.appName("flights").getOrCreate()

# The flights schema is stored as JSON next to the matrix code; parse the file
# first, then rebuild the Spark StructType from the parsed description.
with open("../matrix/schema.json","r") as f:
    schema_json = json.load(f)
schema = StructType.fromJson(schema_json)

22/12/25 10:41:49 WARN Utils: Your hostname, MacBook-Air-di-Teodoro.local resolves to a loopback address: 127.0.0.1; using 192.168.1.156 instead (on interface en0)
22/12/25 10:41:49 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/25 10:41:50 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [3]:
# Cleaned flights data, read with the explicit schema built earlier in the
# notebook; the first line of the file is treated as a header row.
df = spark.read.csv(
    "../../data.nosync/cleaned/cleaned_flights.csv",
    header=True,
    schema=schema,
)

In [4]:
# Airport reference table: read through Spark with inferred column types,
# then converted to pandas so it can be merged with aggregated results.
airports_sp = spark.read.csv(
    "../../preprocessing/airports.csv",
    inferSchema=True,
    header=True,
)
airports = airports_sp.toPandas()

In [5]:
def routes_queries(df,date_start,date_end,origin="BOS",query="NumFlights",limit=100):
    """Aggregate the flights leaving ``origin`` into one row per route.

    Parameters
    ----------
    df : Spark DataFrame
        Flights data. The columns read here are ``Origin``, ``Dest``,
        ``FlightDate``, ``ArrDelay`` and the four ``ORIGIN_*`` / ``DEST_*``
        latitude/longitude columns.
    date_start, date_end
        Bounds compared against ``FlightDate``; both ends are inclusive.
    origin : str
        Value matched against the ``Origin`` column.
    query : str
        Name of the output column to rank by, in descending order
        (``"NumFlights"`` or ``"AverageArrivalDelay"``; any other column of
        the result also works).
    limit : int
        Maximum number of routes returned. Defaults to 100.

    Returns
    -------
    Spark DataFrame
        One row per (origin, destination, coordinates) group with the extra
        columns ``AverageArrivalDelay`` (mean of ``ArrDelay``) and
        ``NumFlights`` (row count), sorted by ``query`` descending and
        truncated to ``limit`` rows.
    """
    # Local import so the aggregates below do not depend on the notebook-wide
    # ``from pyspark.sql.functions import *`` still being intact.
    from pyspark.sql import functions as F

    in_scope = (
        (F.col("Origin") == origin)
        & (F.col("FlightDate") >= date_start)
        & (F.col("FlightDate") <= date_end)
    )
    route_keys = [
        "Origin", "Dest",
        "ORIGIN_LATITUDE", "ORIGIN_LONGITUDE",
        "DEST_LATITUDE", "DEST_LONGITUDE",
    ]

    # Name the aggregates explicitly instead of renaming Spark's generated
    # "avg(ArrDelay)" / "count(1)" columns: withColumnRenamed is a silent no-op
    # when the generated name does not match.
    routes = (
        df.filter(in_scope)
        .groupBy(*route_keys)
        .agg(
            F.avg("ArrDelay").alias("AverageArrivalDelay"),
            F.count("*").alias("NumFlights"),
        )
    )

    # Rank by the requested metric and keep only the top ``limit`` routes.
    return routes.orderBy(routes[query].desc()).limit(limit)

In [8]:
# Analysis window, held as pandas Timestamp objects built from ISO date strings.
date_start = pd.Timestamp("2013-02-01")
date_end = pd.Timestamp("2013-12-28")


In [7]:
# Top routes for the selected window, using the function's default origin and
# ranking metric. The result is still a Spark DataFrame at this point.
df_aggregatedp = routes_queries(df, date_start=date_start, date_end=date_end)


In [106]:
# Collect the aggregated routes into a pandas DataFrame.
# The guard keeps this cell re-runnable: after the first run the name refers to
# a pandas frame, which has no toPandas() method, so an unguarded second run
# would raise AttributeError.
if hasattr(df_aggregatedp, "toPandas"):
    df_aggregatedp = df_aggregatedp.toPandas()

                                                                                

In [107]:
# Attach airport metadata twice: first keyed on the origin code, then on the
# destination code. Columns that collide in the second join receive pandas'
# default suffixes (_x for the origin side, _y for the destination side).
df_aggregatedp = (
    df_aggregatedp
    .merge(airports, left_on="Origin", right_on="IATA")
    .merge(airports, left_on="Dest", right_on="IATA")
)


In [108]:
# Order routes from the largest to the smallest average arrival delay and keep
# at most the first 100 of them.
df_aggregated = (
    df_aggregatedp
    .sort_values("AverageArrivalDelay", ascending=False)
    .iloc[:100]
)

In [112]:
# Shift every delay by the column minimum so that the smallest value becomes 0.
df_aggregated["AverageArrivalDelay"] = (
    df_aggregated["AverageArrivalDelay"] - df_aggregated["AverageArrivalDelay"].min()
)

In [113]:
# Sanity check: once the column has been shifted by its own minimum, the
# smallest remaining value should be 0.
df_aggregated["AverageArrivalDelay"].min()

0.0

In [24]:
def plot_routes(df,date_start,date_to,origin="BOS",query="NumFlights"):
    """Build a map of the top routes leaving ``origin``.

    The routes come from ``routes_queries`` and are joined with the
    module-level ``airports`` pandas frame (matched on its ``IATA`` column) for
    both the origin and the destination. Each route is drawn as a red line
    whose hover text shows the raw ``query`` value; each destination is drawn
    as a blue marker whose size is proportional to the normalised ``query``
    value.

    Parameters
    ----------
    df : Spark DataFrame
        Flights data, passed through to ``routes_queries``.
    date_start, date_to
        Date bounds, passed through to ``routes_queries``.
    origin : str
        Origin airport code, passed through to ``routes_queries``.
    query : str
        Column used for ranking, hover text and marker size
        (``"NumFlights"`` or ``"AverageArrivalDelay"``).

    Returns
    -------
    plotly.graph_objs.Figure
        The figure; call ``.show()`` on it to render.
    """
    routes = routes_queries(df,date_start,date_to,origin,query).toPandas()
    print(len(routes))
    # Inner joins: a route whose origin or destination code has no match in
    # ``airports`` is dropped. Colliding columns from the second join get
    # pandas' default _x (origin) / _y (destination) suffixes.
    routes = routes.merge(airports, left_on="Origin", right_on="IATA")
    routes = routes.merge(airports, left_on="Dest", right_on="IATA")

    fig = go.Figure()

    # One line trace per route so that every line carries its own hover text.
    endpoints = zip(routes["ORIGIN_LATITUDE"], routes["DEST_LATITUDE"],
                    routes["ORIGIN_LONGITUDE"], routes["DEST_LONGITUDE"],
                    routes[query])
    for slat, dlat, slon, dlon, value in endpoints:
        fig.add_trace(go.Scattergeo(
            lat = [slat, dlat],
            lon = [slon, dlon],
            mode = 'lines',
            line = dict(width = 2, color="red"),
            hoverinfo = 'text',
            text = query+" "+str(value),
            textposition = "top center",
        ))

    # Marker scale in [0, 1], computed on a separate Series so the raw values
    # in ``routes`` stay untouched.
    scale = routes[query]
    if query == "AverageArrivalDelay":
        # Delays can be negative; shift them so the smallest becomes 0.
        # The route with the smallest delay therefore gets a zero-size marker.
        scale = scale - scale.min()
    peak = scale.max()
    # Skip the division when there is nothing to normalise by (no rows, all
    # values missing, or a maximum of 0); dividing would only produce NaN sizes.
    if pd.notna(peak) and peak != 0:
        scale = scale / peak
    # Routes without a value (e.g. no recorded arrival delay) get size 0.
    scale = scale.fillna(0)

    # Destination airports as markers sized by the normalised metric.
    fig.add_trace(
        go.Scattergeo(
            lon = routes["DEST_LONGITUDE"].values.tolist(),
            lat = routes["DEST_LATITUDE"].values.tolist(),
            hoverinfo = 'text',
            text = routes["AIRPORT_y"],
            mode = 'markers',
            marker = dict(size = scale*20, color = 'blue', opacity=0.9)),
        )

    ## Update graph layout to improve graph styling.
    fig.update_layout(title_text="Top routes departing from "+str(origin)+" ranked by "+str(query),
                      height=700, width=900,
                      margin={"t":0,"b":0,"l":0, "r":0, "pad":0},
                      showlegend=False,
                      geo= dict(showland = True, landcolor = 'white', countrycolor = 'grey', bgcolor="lightgrey",scope='north america'))

    return fig

In [26]:
# Map of the routes out of Boston with the highest average arrival delay.
plot_routes(
    df,
    date_start,
    date_end,
    origin="BOS",
    query="AverageArrivalDelay",
).show()



58


                                                                                