In [1]:
import plotly.express as px
import plotly.graph_objs as go
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from plotly.subplots import make_subplots
import plotly.offline as py


In [2]:
# create the Spark session for this notebook (application name "flights")
spark = SparkSession.builder.appName("flights").getOrCreate()

22/12/21 23:06:52 WARN Utils: Your hostname, MacBook-Air-di-Teodoro.local resolves to a loopback address: 127.0.0.1; using 192.168.69.184 instead (on interface en0)
22/12/21 23:06:52 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/21 23:06:53 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
22/12/21 23:06:53 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Rebuild the Spark schema of the flights table from its JSON description on disk.
import json
from pyspark.sql.types import (
    StructType,
    StructField,
    StringType,
    IntegerType,
    DoubleType,
    DateType,
    TimestampType,
)

with open("schema.json", "r") as f:
    # Parse the file content and turn the resulting dict into a StructType.
    schema = StructType.fromJson(json.loads(f.read()))

In [4]:
# Read the cleaned flights CSV with the explicit `schema` object (first row is the header).
df = spark.read.csv(
    "../../data.nosync/cleaned/cleaned_flights.csv",
    header=True,
    schema=schema,
)

In [5]:
# Airport reference table: read with Spark (column types inferred), then
# collected into a pandas DataFrame for use by the plotting code.
airports_sp = spark.read.csv(
    "../../preprocessing/airports.csv",
    inferSchema=True,
    header=True,
)
airports = airports_sp.toPandas()

In [5]:
# MATRIX

def week_day_month_agg(df):
    """Average arrival delay for every (DayOfWeek, Month) combination.

    Groups ``df`` by ``DayOfWeek`` and ``Month``, averages ``ArrDelay`` and
    returns the aggregated frame with the average column renamed to
    ``AverageArrivalDelay``.
    """
    grouped = df.groupBy("DayOfWeek", "Month")
    averaged = grouped.agg({"ArrDelay": "avg"})
    return averaged.withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
def week_day_matrix_avg_delay(df):
    """Heatmap of the average arrival delay by day of week (rows) and month (columns).

    Parameters
    ----------
    df : Spark DataFrame
        Flights data; passed unchanged to ``week_day_month_agg``.

    Returns
    -------
    plotly figure
        Image plot of the DayOfWeek x Month matrix of ``AverageArrivalDelay``.
    """
    df_pd = week_day_month_agg(df).toPandas()
    # Keyword arguments are required here: passing index/columns/values
    # positionally is deprecated in pandas 1.5 and rejected by pandas 2.x.
    delay_matrix = df_pd.pivot(
        index="DayOfWeek", columns="Month", values="AverageArrivalDelay"
    )
    # NOTE(review): the fixed tick labels assume all 12 months and all 7 days
    # are present and that their codes sort as Jan..Dec / Mon..Sun - confirm.
    fig = px.imshow(
        delay_matrix,
        labels=dict(x="Month", y="DayOfWeek", color="Average Arrival Delay"),
        x=["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
        y=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    )
    return fig

# Build the day-of-week x month heatmap for the full flights dataset and render it.
fig = week_day_matrix_avg_delay(df)
fig.show()


  df_pd.pivot("DayOfWeek", "Month", "AverageArrivalDelay"),


In [6]:
# count the number of flights for every (origin state, destination state) pair
def flights_by_month(df):
    """Count flights per origin-state / destination-state pair.

    Despite its name, this function does not aggregate by month: it groups
    only by ``ORIGIN_STATE`` and ``DEST_STATE``.

    Parameters
    ----------
    df : Spark DataFrame
        Flights data with ``ORIGIN_STATE`` and ``DEST_STATE`` columns.

    Returns
    -------
    Spark DataFrame
        Columns ``ORIGIN_STATE``, ``DEST_STATE``, ``count`` and ``Origin-Dest``
        (the two state codes joined by a dash), sorted by ``count`` descending.
    """
    df_aggregated = df.groupBy("ORIGIN_STATE", "DEST_STATE").count()
    # label each pair as "<origin state>-<destination state>"
    df_aggregated = df_aggregated.withColumn("Origin-Dest", concat(col("ORIGIN_STATE"), lit("-"), col("DEST_STATE")))
    # An earlier revision called drop("Origin", "Dest") at this point. Those
    # columns do not exist after the groupBy, so the call was a no-op and has
    # been removed; ORIGIN_STATE and DEST_STATE stay in the result as before.
    # most frequent pairs first
    df_aggregated = df_aggregated.orderBy("count",ascending=False)

    return df_aggregated

# bar chart of the 20 most frequent origin-destination state pairs
def flights_by_month_bar(df):
    """Bar chart of the first 20 rows returned by ``flights_by_month``.

    The aggregated Spark result is collected to pandas and only its first
    20 rows are plotted, with ``Origin-Dest`` on x and ``count`` on y.
    """
    top_pairs = flights_by_month(df).toPandas().head(20)
    return px.bar(
        top_pairs,
        x="Origin-Dest",
        y="count",
        title="Number of flights by Origin-Dest",
    )
    
# Render the top-20 state-pair bar chart for the full flights dataset.
flights_by_month_bar(df).show()


                                                                                

In [8]:
# number of flights per day
def flights_by_day(df):
    """Count flights per ``FlightDate`` (grouped together with ``DayOfWeek``).

    The result is ordered by ``FlightDate`` ascending and the numeric
    ``DayOfWeek`` is replaced by its English name (1 -> Monday ... 7 -> Sunday).
    The mapping has no fallback branch, so values outside 1-7 are left
    unmatched.
    """
    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    daily_counts = (
        df.groupBy("FlightDate", "DayOfWeek")
        .count()
        .orderBy("FlightDate", ascending=True)
    )

    # Build the 1 -> Monday ... 7 -> Sunday conditional one branch at a time.
    day_label = when(col("DayOfWeek") == 1, day_names[0])
    for day_number, day_name in enumerate(day_names[1:], start=2):
        day_label = day_label.when(col("DayOfWeek") == day_number, day_name)

    return daily_counts.withColumn("DayOfWeek", day_label)

# line chart of the number of flights per day
def flights_by_day_line(df):
    """Line chart of the daily flight count produced by ``flights_by_day``.

    ``FlightDate`` is on the x axis and ``count`` on the y axis; the hover
    box additionally shows the day of the week.
    """
    daily_counts = flights_by_day(df).toPandas()
    return px.line(
        daily_counts,
        x="FlightDate",
        y="count",
        title="Number of flights by day",
        hover_data=["DayOfWeek", "FlightDate", "count"],
    )


# Render the daily flight-count line chart for the full flights dataset.
flights_by_day_line(df).show()


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead



In [9]:
# average arrival delay per day
def avg_delay_by_day(df):
    """Average ``ArrDelay`` per ``FlightDate`` (grouped together with ``DayOfWeek``).

    The average is exposed as ``AverageArrivalDelay``, rows are ordered by
    ``FlightDate`` ascending and the numeric ``DayOfWeek`` is replaced by its
    English name (1 -> Monday ... 7 -> Sunday). The mapping has no fallback
    branch, so values outside 1-7 are left unmatched.
    """
    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]

    daily_delay = (
        df.groupBy("FlightDate", "DayOfWeek")
        .agg({"ArrDelay": "avg"})
        .withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
        .orderBy("FlightDate", ascending=True)
    )

    # Build the 1 -> Monday ... 7 -> Sunday conditional one branch at a time.
    day_label = when(col("DayOfWeek") == 1, day_names[0])
    for day_number, day_name in enumerate(day_names[1:], start=2):
        day_label = day_label.when(col("DayOfWeek") == day_number, day_name)

    return daily_delay.withColumn("DayOfWeek", day_label)

# line chart of the average arrival delay per day
def avg_delay_by_day_line(df):
    """Line chart of the daily average arrival delay from ``avg_delay_by_day``.

    ``FlightDate`` is on the x axis and ``AverageArrivalDelay`` on the y axis;
    the hover box additionally shows the day of the week.
    """
    daily_delay = avg_delay_by_day(df).toPandas()
    return px.line(
        daily_delay,
        x="FlightDate",
        y="AverageArrivalDelay",
        title="Average Arrival Delay by day",
        hover_data=["DayOfWeek", "FlightDate", "AverageArrivalDelay"],
    )

# Display the daily average-delay line chart (the figure is the cell's last expression).
avg_delay_by_day_line(df)


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead



In [10]:
def avg_num_line(df):
    """Plot the daily flight count and the daily average arrival delay together.

    Both series share the ``FlightDate`` x axis. The flight count is drawn
    against the primary y axis and the average arrival delay against the
    secondary y axis, so the two different scales stay readable.

    Parameters
    ----------
    df : Spark DataFrame
        Flights data; passed unchanged to ``flights_by_day`` and
        ``avg_delay_by_day``.

    Returns
    -------
    plotly figure
        Figure with two ``Scatter`` traces ("num flights" and "avg delay").
    """
    df_num = flights_by_day(df).toPandas()
    df_avg = avg_delay_by_day(df).toPandas()

    # single plot area with a secondary y axis enabled
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Scatter(x=df_num["FlightDate"] , y=df_num["count"], name="num flights"),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=df_avg["FlightDate"], y=df_avg["AverageArrivalDelay"], name="avg delay"),
        secondary_y=True,
    )

    # Descriptive titles replace the placeholder text left over from the
    # plotly template ("Double Y Axis Example", "xaxis title", ...).
    fig.update_layout(
        title_text="Number of flights and average arrival delay by day"
    )

    fig.update_xaxes(title_text="Flight date")

    fig.update_yaxes(title_text="<b>Number of flights</b>", secondary_y=False)
    fig.update_yaxes(title_text="<b>Average arrival delay</b>", secondary_y=True)

    return fig


# Display the combined flight-count / average-delay chart (the figure is the cell's last expression).
avg_num_line(df)
    


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead



In [7]:
# every distinct FlightDate, sorted ascending, collected into a pandas DataFrame
df_dates = (
    df.select("FlightDate")
    .distinct()
    .orderBy("FlightDate", ascending=True)
    .toPandas()
)


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead



In [70]:
def route_between_airports(df,date_start,date_end,origin="BOS",dest="None"):
    """Average arrival delay per day and route for flights leaving ``origin``.

    Parameters
    ----------
    df : Spark DataFrame
        Flights data with ``FlightDate``, ``DayOfWeek``, ``Origin``, ``Dest``,
        ``ArrDelay`` and the origin/destination latitude and longitude columns.
    date_start, date_end
        Inclusive bounds compared against ``FlightDate``.
    origin
        Value matched against the ``Origin`` column.
    dest
        Value matched against the ``Dest`` column. The string ``"None"``
        (the historical default) or a real ``None`` means "any destination".

    Returns
    -------
    Spark DataFrame
        One row per (FlightDate, DayOfWeek, Origin, Dest, coordinates) group
        with the column ``AverageArrivalDelay``, ordered by ``FlightDate``
        ascending. ``DayOfWeek`` holds the English day name (1 -> Monday ...
        7 -> Sunday). No flight count is computed.
    """
    route_filter = col("Origin") == origin
    # "None" (string) is the legacy sentinel. A real None is accepted too:
    # comparing a column with None would otherwise match no rows at all.
    if dest is not None and dest != "None":
        route_filter = route_filter & (col("Dest") == dest)

    date_filter = (col("FlightDate") >= date_start) & (col("FlightDate") <= date_end)

    # Restrict the rows before aggregating. FlightDate is one of the grouping
    # keys, so this yields the same groups as filtering the aggregated result.
    df_aggregated = df.filter(route_filter & date_filter)

    # average the arrival delay per flight date, day of week and route
    df_aggregated = df_aggregated.groupBy("FlightDate","DayOfWeek","Origin","Dest","ORIGIN_LATITUDE","ORIGIN_LONGITUDE","DEST_LATITUDE","DEST_LONGITUDE").agg({"ArrDelay": "avg"}).withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
    df_aggregated = df_aggregated.orderBy("FlightDate", ascending=True)

    # convert DayOfWeek to the corresponding day name
    day_names = ["Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday", "Sunday"]
    day_label = when(col("DayOfWeek") == 1, day_names[0])
    for day_number, day_name in enumerate(day_names[1:], start=2):
        day_label = day_label.when(col("DayOfWeek") == day_number, day_name)
    df_aggregated = df_aggregated.withColumn("DayOfWeek", day_label)
    return df_aggregated


def plot_map_routes(df,date_start,date_end,origin="BOS",dest="None"):
    """Draw the routes returned by ``route_between_airports`` on a map.

    Every row of the aggregated result (one per flight date and route) becomes
    a blue line from the origin to the destination coordinates. All airports
    of the module-level ``airports`` table are drawn as red markers whose size
    is 1 plus the number of aggregated rows arriving at that airport.

    Parameters
    ----------
    df : Spark DataFrame
        Flights data; passed unchanged to ``route_between_airports``.
    date_start, date_end, origin, dest
        Forwarded unchanged to ``route_between_airports``.

    Returns
    -------
    plotly figure
        ``go.Figure`` with one line trace per aggregated row followed by a
        single marker trace for the airports.
    """
    df_flight_paths = route_between_airports(df,date_start,date_end,origin,dest).toPandas()

    fig = go.Figure()

    # One line trace per aggregated row. Iterating the four coordinate columns
    # together avoids label-based lookups such as column[i].
    route_endpoints = zip(
        df_flight_paths['ORIGIN_LONGITUDE'],
        df_flight_paths['DEST_LONGITUDE'],
        df_flight_paths['ORIGIN_LATITUDE'],
        df_flight_paths['DEST_LATITUDE'],
    )
    for origin_lon, dest_lon, origin_lat, dest_lat in route_endpoints:
        fig.add_trace(
            go.Scattergeo(
                locationmode = 'USA-states',
                lon = [origin_lon, dest_lon],
                lat = [origin_lat, dest_lat],
                mode = 'lines',
                line = dict(width = 1,color = 'blue'),
            )
        )

    # Marker size per airport: rows with that airport as destination, plus 1
    # so that airports without arrivals are still visible. The counts are
    # computed once instead of re-scanning the routes for every airport.
    dest_counts = df_flight_paths["Dest"].value_counts()
    sizes = (airports["IATA"].map(dest_counts).fillna(0).astype(int) + 1).tolist()

    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = airports['LONGITUDE'],
        lat = airports['LATITUDE'],
        hoverinfo = 'text',
        text = airports['AIRPORT']+", "+airports['CITY']+", "+airports['STATE']+", "+airports['COUNTRY'],
        mode = 'markers',
        marker = dict(
            size = sizes,
            color = 'rgb(255, 0, 0)',
            line = dict(
                width = 3,
                color = 'rgba(68, 68, 68, 0)'
            )
        )))

    fig.update_layout(
        title_text = "Flights' route",
        showlegend = False,
        geo = dict(
            scope = 'north america',
            projection_type = 'azimuthal equal area',
            showland = True,
            landcolor = 'rgb(243, 243, 243)',
            countrycolor = 'rgb(204, 204, 204)',
        ),
    )

    return fig

# Example: routes departing ABQ between the dates stored at index labels 10 and 15 of df_dates.
plot_map_routes(df,df_dates["FlightDate"][10],df_dates["FlightDate"][15],"ABQ").show()


Passing unit-less datetime64 dtype to .astype is deprecated and will raise in a future version. Pass 'datetime64[ns]' instead



In [55]:
# average arrival delay per origin state
def avg_delay_state_origin(df):
    """Average ``ArrDelay`` per ``ORIGIN_STATE``, largest average first.

    The average is exposed as the column ``AverageArrivalDelay``.
    """
    per_state = df.groupBy("ORIGIN_STATE").agg({"ArrDelay": "avg"})
    renamed = per_state.withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
    return renamed.orderBy("AverageArrivalDelay", ascending=False)

def plot_avg_delay_state_origin(df):
    """Choropleth of the average arrival delay per origin state.

    The per-state averages from ``avg_delay_state_origin`` are collected to
    pandas, the row whose ``ORIGIN_STATE`` is "AS" is removed, and the
    remaining states are coloured by ``AverageArrivalDelay`` on a USA map.
    """
    state_delays = avg_delay_state_origin(df).toPandas()
    # keep every row except the one for "AS"
    state_delays = state_delays[state_delays["ORIGIN_STATE"] != "AS"]
    return px.choropleth(
        locations=state_delays["ORIGIN_STATE"],
        locationmode="USA-states",
        color=state_delays["AverageArrivalDelay"],
        scope="usa",
    )

# Render the origin-state average-delay choropleth for the full flights dataset.
plot_avg_delay_state_origin(df).show()

                                                                                

In [62]:

# average arrival delay per destination state
def avg_delay_state_dest(df):
    """Average ``ArrDelay`` per ``DEST_STATE``, largest average first.

    The average is exposed as the column ``AverageArrivalDelay``.
    """
    per_state = df.groupBy("DEST_STATE").agg({"ArrDelay": "avg"})
    renamed = per_state.withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
    return renamed.orderBy("AverageArrivalDelay", ascending=False)


def plot_avg_delay_state_dest(df):
    """Choropleth of the average arrival delay per destination state.

    The per-state averages from ``avg_delay_state_dest`` are collected to
    pandas, the rows whose ``DEST_STATE`` is "AS" or "GU" are removed, and the
    remaining states are coloured by ``AverageArrivalDelay`` on a USA map.
    """
    state_delays = avg_delay_state_dest(df).toPandas()
    # keep every row except those for "AS" and "GU"
    excluded = state_delays["DEST_STATE"].isin(["AS", "GU"])
    state_delays = state_delays[~excluded]

    return px.choropleth(
        locations=state_delays["DEST_STATE"],
        locationmode="USA-states",
        color=state_delays["AverageArrivalDelay"],
        scope="usa",
    )

# Render the destination-state average-delay choropleth for the full flights dataset.
plot_avg_delay_state_dest(df).show()

                                                                                