In [None]:
import plotly.express as px
import plotly.graph_objs as go
import pandas as pd
import numpy as np
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from plotly.subplots import make_subplots
import plotly.offline as py


In [None]:
# create the Spark session shared by the rest of the notebook
spark = (
    SparkSession.builder
    .appName("flights")
    .getOrCreate()
)

In [None]:
# Build the Spark schema of the flights CSV from its JSON description.
import json
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, DoubleType, DateType, TimestampType

with open("schema.json", "r") as f:
    schema_definition = json.load(f)

# StructType.fromJson takes the already-parsed JSON object, not the raw text.
schema = StructType.fromJson(schema_definition)

In [None]:
# Read the cleaned flights CSV using the explicit schema; the first row is the header.
df = spark.read.csv(
    path="../../data.nosync/cleaned/cleaned_flights.csv",
    schema=schema,
    header=True,
)

In [None]:
# Airport reference table: read through Spark (column types inferred),
# then collected into a pandas DataFrame for the plotting helpers.
airports_sp = spark.read.csv(
    path="../../preprocessing/airports.csv",
    inferSchema=True,
    header=True,
)
airports = airports_sp.toPandas()

In [None]:
# Airline reference table, loaded directly into pandas.
airlines = pd.read_csv(filepath_or_buffer="./airlines.csv")

In [None]:
# MATRIX

def week_day_month_agg(df):
    """Average ``ArrDelay`` for every (DayOfWeek, Month) pair.

    Args:
        df: Spark DataFrame with ``DayOfWeek``, ``Month`` and ``ArrDelay`` columns.

    Returns:
        Spark DataFrame with columns ``DayOfWeek``, ``Month`` and
        ``AverageArrivalDelay``.
    """
    df_aggregated = df.groupBy("DayOfWeek", "Month").agg({"ArrDelay": "avg"}).withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
    return df_aggregated


def week_day_matrix_avg_delay(df):
    """Heatmap of the average arrival delay, day of week (rows) by month (columns).

    Args:
        df: Spark DataFrame accepted by ``week_day_month_agg``.

    Returns:
        The Plotly figure produced by ``px.imshow``.
    """
    df_pd = week_day_month_agg(df).toPandas()
    # pivot() only accepts keyword arguments since pandas 2.0; the positional
    # form used previously raises TypeError there.
    delay_matrix = df_pd.pivot(index="DayOfWeek", columns="Month", values="AverageArrivalDelay")
    # NOTE(review): the fixed tick labels assume the data covers all 7 days and
    # all 12 months and that DayOfWeek 1 is Monday — confirm against the dataset.
    fig = px.imshow(
        delay_matrix,
        labels=dict(x="Month", y="DayOfWeek", color="Average Arrival Delay"),
        x=["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
        y=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    )
    fig.update_layout(title="Average delay by day of week and month")
    return fig


fig = week_day_matrix_avg_delay(df)
fig.show()


In [None]:
def week_day_month_agg_count(df):
    """Number of flights for every (DayOfWeek, Month) pair.

    Args:
        df: Spark DataFrame with ``DayOfWeek`` and ``Month`` columns.

    Returns:
        Spark DataFrame with columns ``DayOfWeek``, ``Month`` and ``num_flights``.
    """
    # group by day of week and month, then count the rows of each group
    df_aggregated = df.groupBy("DayOfWeek", "Month").count().withColumnRenamed("count", "num_flights")
    return df_aggregated


def week_day_matrix_count(df):
    """Heatmap of the number of flights, day of week (rows) by month (columns).

    Args:
        df: Spark DataFrame accepted by ``week_day_month_agg_count``.

    Returns:
        The Plotly figure produced by ``px.imshow``.
    """
    df_pd = week_day_month_agg_count(df).toPandas()
    # pivot() only accepts keyword arguments since pandas 2.0; the positional
    # form used previously raises TypeError there.
    count_matrix = df_pd.pivot(index="DayOfWeek", columns="Month", values="num_flights")
    # NOTE(review): the fixed tick labels assume the data covers all 7 days and
    # all 12 months and that DayOfWeek 1 is Monday — confirm against the dataset.
    fig = px.imshow(
        count_matrix,
        labels=dict(x="Month", y="DayOfWeek", color="num_flights"),
        x=["Jan", "Feb", "Mar", "Apr", "May", "Jun", "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"],
        y=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    )

    fig.update_layout(title_text="Number of flights per day of week and month")

    return fig



fig = week_day_matrix_count(df)
fig.show()

In [None]:
# aggregate by origin state and destination state, and count the number of flights
def flights_by_state_pair(df):
    """Count flights for every (ORIGIN_STATE, DEST_STATE) pair.

    Args:
        df: Spark DataFrame with ``ORIGIN_STATE`` and ``DEST_STATE`` columns.

    Returns:
        Spark DataFrame with columns ``ORIGIN_STATE``, ``DEST_STATE``, ``count``
        and ``Origin-Dest`` (the two states joined by a dash), ordered by
        ``count`` descending.
    """
    df_aggregated = df.groupBy("ORIGIN_STATE", "DEST_STATE").count()
    # label each pair as "<origin state>-<destination state>"
    df_aggregated = df_aggregated.withColumn("Origin-Dest", concat(col("ORIGIN_STATE"), lit("-"), col("DEST_STATE")))
    # The former drop("Origin", "Dest") was removed: the aggregated frame has no
    # such columns, so the call never dropped anything. The state columns are
    # kept, exactly as they were before.
    df_aggregated = df_aggregated.orderBy("count", ascending=False)

    return df_aggregated


def flights_by_month(df):
    """Backward-compatible name for ``flights_by_state_pair``.

    Despite its name this returns per-state-pair counts. A later cell of this
    notebook rebinds ``flights_by_month`` to a per-month aggregation, which is
    why the chart helpers below call ``flights_by_state_pair`` directly.
    """
    return flights_by_state_pair(df)


# visualize flights as pie chart, only show the top 20 state pairs
def flights_by_month_pie(df):
    """Pie chart of the 20 state pairs with the most flights."""
    df_pd = flights_by_state_pair(df).toPandas()
    fig = px.pie(df_pd.head(20), values='count', names='Origin-Dest', title='Number of flights by Origin-Dest')
    return fig


# visualize flights as bar chart, only show the top 20 state pairs
def flights_by_month_bar(df):
    """Bar chart of the 20 state pairs with the most flights."""
    df_pd = flights_by_state_pair(df).toPandas()
    fig = px.bar(df_pd.head(20), x="Origin-Dest", y="count", title="Number of flights by Origin-Dest")
    return fig


flights_by_month_pie(df).show()


In [None]:
# number of flights per calendar day
def flights_by_day(df):
    """Count flights per ``FlightDate``, labelling each date with its day name.

    Returns a Spark DataFrame ordered by ``FlightDate`` with columns
    ``FlightDate``, ``DayOfWeek`` (day name) and ``count``.
    """
    daily_counts = (
        df.groupBy("FlightDate", "DayOfWeek")
        .count()
        .orderBy("FlightDate", ascending=True)
    )

    # Replace the numeric DayOfWeek by the matching day name. The chain has no
    # otherwise(), so any value outside 1-7 becomes null.
    day_names = {
        1: "Monday",
        2: "Tuesday",
        3: "Wednesday",
        4: "Thursday",
        5: "Friday",
        6: "Saturday",
        7: "Sunday",
    }
    day_label = None
    for day_number, day_name in day_names.items():
        is_day = col("DayOfWeek") == day_number
        day_label = when(is_day, day_name) if day_label is None else day_label.when(is_day, day_name)

    return daily_counts.withColumn("DayOfWeek", day_label)


# visualize the daily number of flights as a line chart
def flights_by_day_line(df):
    """Line chart of the number of flights per day."""
    daily_counts_pd = flights_by_day(df).toPandas()
    return px.line(
        daily_counts_pd,
        x="FlightDate",
        y="count",
        title="Number of flights by day",
        hover_data=["DayOfWeek", "FlightDate", "count"],
    )


flights_by_day_line(df).show()

In [None]:
# average arrival delay per calendar day
def avg_delay_by_day(df):
    """Average ``ArrDelay`` per ``FlightDate``, labelling each date with its day name.

    Returns a Spark DataFrame ordered by ``FlightDate`` with columns
    ``FlightDate``, ``DayOfWeek`` (day name) and ``AverageArrivalDelay``.
    """
    daily_delay = (
        df.groupBy("FlightDate", "DayOfWeek")
        .agg({"ArrDelay": "avg"})
        .withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
        .orderBy("FlightDate", ascending=True)
    )

    # Replace the numeric DayOfWeek by the matching day name. The chain has no
    # otherwise(), so any value outside 1-7 becomes null.
    day_names = {
        1: "Monday",
        2: "Tuesday",
        3: "Wednesday",
        4: "Thursday",
        5: "Friday",
        6: "Saturday",
        7: "Sunday",
    }
    day_label = None
    for day_number, day_name in day_names.items():
        is_day = col("DayOfWeek") == day_number
        day_label = when(is_day, day_name) if day_label is None else day_label.when(is_day, day_name)

    return daily_delay.withColumn("DayOfWeek", day_label)


# visualize the daily average arrival delay as a line chart
def avg_delay_by_day_line(df):
    """Line chart of the average arrival delay per day."""
    daily_delay_pd = avg_delay_by_day(df).toPandas()
    return px.line(
        daily_delay_pd,
        x="FlightDate",
        y="AverageArrivalDelay",
        title="Average Arrival Delay by day",
        hover_data=["DayOfWeek", "FlightDate", "AverageArrivalDelay"],
    )


avg_delay_by_day_line(df)

In [None]:
def avg_num_line(df):
    """Daily number of flights and daily average arrival delay on one figure.

    The two series share the x axis; the flight count uses the primary y axis
    and the average delay the secondary one.
    """
    daily_counts = flights_by_day(df).toPandas()
    daily_delay = avg_delay_by_day(df).toPandas()

    flights_trace = go.Scatter(
        x=daily_counts["FlightDate"],
        y=daily_counts["count"],
        name="num flights",
    )
    delay_trace = go.Scatter(
        x=daily_delay["FlightDate"],
        y=daily_delay["AverageArrivalDelay"],
        name="avg delay",
    )

    # a single subplot that owns a secondary y axis
    fig = make_subplots(specs=[[{"secondary_y": True}]])
    fig.add_trace(flights_trace, secondary_y=False)
    fig.add_trace(delay_trace, secondary_y=True)

    # figure and axis titles
    fig.update_layout(title_text="Number of flights and average delay by day")
    fig.update_xaxes(title_text="Day of the year")
    fig.update_yaxes(title_text="Number of flights", secondary_y=False)
    fig.update_yaxes(title_text="Average delay in minutes", secondary_y=True)

    return fig


avg_num_line(df)
    

In [None]:
# every distinct FlightDate, in ascending order, collected into pandas
df_dates = (
    df.select("FlightDate")
    .distinct()
    .orderBy("FlightDate", ascending=True)
    .toPandas()
)

In [None]:
def route_between_airports(df,date_start,date_end,origin="BOS",dest="None"):
    """Daily flight count and average arrival delay on routes leaving ``origin``.

    Args:
        df: Spark DataFrame with ``FlightDate``, ``DayOfWeek``, ``Origin``,
            ``Dest``, ``ArrDelay`` and the origin/destination latitude and
            longitude columns.
        date_start: first ``FlightDate`` to keep (inclusive).
        date_end: last ``FlightDate`` to keep (inclusive).
        origin: value the ``Origin`` column must equal.
        dest: value the ``Dest`` column must equal. The string ``"None"``
            (the default) or a real ``None`` keeps every destination.

    Returns:
        Spark DataFrame ordered by ``FlightDate`` with one row per
        (date, route): the grouping columns plus ``AverageArrivalDelay`` and
        ``NumFlights``; ``DayOfWeek`` holds the day name.
    """
    route_filter = col("Origin") == origin
    # "None" is the historical sentinel; a real None used to be compared with
    # the column and silently matched nothing, so treat it the same way.
    if dest is not None and dest != "None":
        route_filter = route_filter & (col("Dest") == dest)

    # FlightDate is a grouping key, so filtering before the aggregation yields
    # the same rows as filtering after it.
    in_date_range = (col("FlightDate") >= date_start) & (col("FlightDate") <= date_end)
    df_filtered = df.filter(route_filter & in_date_range)

    # aggregate by flight date, day of week, Origin and Dest: count the flights and average the arrival delay
    df_aggregated = (
        df_filtered
        .groupBy("FlightDate","DayOfWeek","Origin","Dest","ORIGIN_LATITUDE","ORIGIN_LONGITUDE","DEST_LATITUDE","DEST_LONGITUDE")
        .agg({"ArrDelay": "avg","*":"count"})
        .withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
        .withColumnRenamed("count(1)", "NumFlights")
        .orderBy("FlightDate", ascending=True)
    )

    # Replace the numeric DayOfWeek by the matching day name. The chain has no
    # otherwise(), so any value outside 1-7 becomes null.
    day_names = {
        1: "Monday",
        2: "Tuesday",
        3: "Wednesday",
        4: "Thursday",
        5: "Friday",
        6: "Saturday",
        7: "Sunday",
    }
    day_label = None
    for day_number, day_name in day_names.items():
        is_day = col("DayOfWeek") == day_number
        day_label = when(is_day, day_name) if day_label is None else day_label.when(is_day, day_name)

    return df_aggregated.withColumn("DayOfWeek", day_label)


def plot_map_routes(df,date_start,date_end,origin="BOS",dest="None"):
    """Map of the routes flown from ``origin`` between two dates.

    Args:
        df, date_start, date_end, origin, dest: forwarded unchanged to
            ``route_between_airports``.

    Returns:
        A Plotly figure with one blue line per row returned by
        ``route_between_airports`` and one red marker per airport of the
        ``airports`` table. A marker's size is 1 plus the number of returned
        rows whose ``Dest`` is that airport.
    """
    df_flight_paths = route_between_airports(df,date_start,date_end,origin,dest).toPandas()

    fig = go.Figure()

    # one line per aggregated (date, route) row, from the origin to the destination
    for path in df_flight_paths.itertuples(index=False):
        fig.add_trace(
            go.Scattergeo(
                locationmode = 'USA-states',
                lon = [path.ORIGIN_LONGITUDE, path.DEST_LONGITUDE],
                lat = [path.ORIGIN_LATITUDE, path.DEST_LATITUDE],
                mode = 'lines',
                line = dict(
                    width = 1,
                    color = 'blue'
                ),
            )
        )

    # Marker size per airport: rows having it as destination, plus 1 so that
    # airports without any matching row still get a visible marker.
    rows_per_destination = df_flight_paths["Dest"].value_counts()
    sizes = (airports["IATA"].map(rows_per_destination).fillna(0).astype(int) + 1).tolist()

    fig.add_trace(go.Scattergeo(
        locationmode = 'USA-states',
        lon = airports['LONGITUDE'],
        lat = airports['LATITUDE'],
        hoverinfo = 'text',
        text = airports['AIRPORT']+", "+airports['CITY']+", "+airports['STATE']+", "+airports['COUNTRY'],
        mode = 'markers',
        marker = dict(
            size = sizes,
            color = 'rgb(255, 0, 0)',
            line = dict(
                width = 3,
                color = 'rgba(68, 68, 68, 0)'
            )
        )))

    fig.update_layout(
        title_text = "Flights' route",
        showlegend = False,
        geo = dict(
            scope = 'north america',
            projection_type = 'azimuthal equal area',
            showland = True,
            landcolor = 'rgb(243, 243, 243)',
            countrycolor = 'rgb(204, 204, 204)',
        ),
    )
    # make the figure bigger
    fig.update_layout(height=800, width=800)

    return fig

# example: routes leaving ABQ between the 11th and the 16th distinct flight date
route_start, route_end = df_dates["FlightDate"][10], df_dates["FlightDate"][15]
plot_map_routes(df, route_start, route_end, "ABQ").show()

In [None]:
# average arrival delay per origin state
def avg_delay_state_origin(df):
    """Average ``ArrDelay`` per ``ORIGIN_STATE``, highest average first."""
    return (
        df.groupBy("ORIGIN_STATE")
        .agg({"ArrDelay": "avg"})
        .withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
        .orderBy("AverageArrivalDelay", ascending=False)
    )


def plot_avg_delay_state_origin(df):
    """USA choropleth of the average arrival delay by origin state."""
    state_delays = avg_delay_state_origin(df).toPandas()
    # leave out the rows whose ORIGIN_STATE is "AS"
    state_delays = state_delays[state_delays["ORIGIN_STATE"] != "AS"]
    fig = px.choropleth(
        locations=state_delays["ORIGIN_STATE"],
        locationmode="USA-states",
        color=state_delays["AverageArrivalDelay"],
        scope="usa",
    )
    fig.update_layout(title_text="Average delay by state origin")
    return fig


plot_avg_delay_state_origin(df).show()

In [None]:

# average arrival delay per destination state
def avg_delay_state_dest(df):
    """Average ``ArrDelay`` per ``DEST_STATE``, highest average first."""
    return (
        df.groupBy("DEST_STATE")
        .agg({"ArrDelay": "avg"})
        .withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
        .orderBy("AverageArrivalDelay", ascending=False)
    )


def plot_avg_delay_state_dest(df):
    """USA choropleth of the average arrival delay by destination state."""
    state_delays = avg_delay_state_dest(df).toPandas()
    # leave out the rows whose DEST_STATE is "AS" or "GU"
    excluded = state_delays["DEST_STATE"].isin(["AS", "GU"])
    state_delays = state_delays[~excluded]

    fig = px.choropleth(
        locations=state_delays["DEST_STATE"],
        locationmode="USA-states",
        color=state_delays["AverageArrivalDelay"],
        scope="usa",
    )
    fig.update_layout(title_text="Average delay by state destination")
    return fig


plot_avg_delay_state_dest(df).show()

In [None]:
# number of flights per Reporting_Airline
def aggregate_carrier(df):
    """Count flights per ``Reporting_Airline``, largest count first."""
    return (
        df.groupBy("Reporting_Airline")
        .count()
        .orderBy("count", ascending=False)
    )


# share of flights per airline, as a pie chart
def plot_carrier(df):
    """Pie chart of the number of flights per airline name."""
    carrier_counts = aggregate_carrier(df).toPandas()
    # look up the airline name: airlines is indexed by its IATA code and
    # matched against Reporting_Airline
    airline_lookup = airlines.set_index("IATA")
    carrier_counts = carrier_counts.join(airline_lookup, on="Reporting_Airline")
    fig = px.pie(carrier_counts, values='count', names='Name')
    fig.update_layout(title_text="Most popular airlines")
    return fig


plot_carrier(df).show()


In [None]:
# group by airline: average ArrDelay and number of flights
def avg_delay_carrier(df):
    """Average arrival delay and number of flights per ``Reporting_Airline``.

    Args:
        df: Spark DataFrame with ``Reporting_Airline`` and ``ArrDelay`` columns.

    Returns:
        Spark DataFrame with columns ``Reporting_Airline``,
        ``AverageArrivalDelay`` and ``NumberOfFlights``, ordered by
        ``AverageArrivalDelay`` descending.
    """
    # The {"*": "count"} aggregate produces a column named "count(1)" (the name
    # the other cells of this notebook rename), not "count(*)"; renaming the
    # latter left the count column under its generated name.
    df_aggregated = df.groupBy("Reporting_Airline").agg({"ArrDelay": "avg", "*": "count"}).withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay").withColumnRenamed("count(1)", "NumberOfFlights")
    df_aggregated = df_aggregated.orderBy("AverageArrivalDelay", ascending=False)
    return df_aggregated


# visualize the average delay by airline as a bar chart
def plot_avg_delay_carrier(df):
    """Bar chart of the average arrival delay per airline name."""
    df_avg = avg_delay_carrier(df).toPandas()
    # look up the airline name through the IATA code
    df_avg = df_avg.join(airlines.set_index("IATA"), on="Reporting_Airline")

    fig = px.bar(df_avg, x='Name', y='AverageArrivalDelay')
    fig.update_layout(title_text="Average delay by airline")
    return fig


plot_avg_delay_carrier(df).show()


In [None]:
def taxiing_congenstion(df):
    """Total ``TaxiIn`` time and number of flights per ``ORIGIN`` value."""
    # NOTE(review): this sums TaxiIn while grouping by the origin airport —
    # confirm that this pairing is the intended one.
    per_airport = df.groupBy("ORIGIN").agg({"TaxiIn": "sum", "*": "count"})
    per_airport = per_airport.withColumnRenamed("sum(TaxiIn)", "TotalTaxiInTime")
    per_airport = per_airport.withColumnRenamed("count(1)", "NumberOfFlights")
    return per_airport


# scatter of the number of flights against the total taxi-in time; the hover shows the airport name
def plot_taxiing_congenstion(df):
    """Scatter plot of ``NumberOfFlights`` against ``TotalTaxiInTime`` per airport."""
    per_airport = taxiing_congenstion(df).toPandas()
    # attach the airport attributes, matching ORIGIN against the IATA code
    airport_lookup = airports.set_index("IATA")
    per_airport = per_airport.join(airport_lookup, on="ORIGIN")
    fig = px.scatter(
        per_airport,
        x="NumberOfFlights",
        y="TotalTaxiInTime",
        hover_data=["AIRPORT"],
    )
    fig.update_layout(title_text="Taxi in Congestion")
    return fig


plot_taxiing_congenstion(df).show()
    

In [None]:
# NOTE: this cell rebinds taxiing_congenstion / plot_taxiing_congenstion, which
# the previous cell defines for the ORIGIN column.
def taxiing_congenstion(df):
    """Total ``TaxiIn`` time and number of flights per ``DEST`` value.

    Args:
        df: Spark DataFrame with ``DEST`` and ``TaxiIn`` columns.

    Returns:
        Spark DataFrame with columns ``DEST``, ``TotalTaxiInTime`` and
        ``NumberOfFlights``.
    """
    # group by destination, sum the taxi-in time, count the number of flights
    df_aggregated = df.groupBy("DEST").agg({"TaxiIn": "sum", "*": "count"}).withColumnRenamed("sum(TaxiIn)", "TotalTaxiInTime").withColumnRenamed("count(1)", "NumberOfFlights")
    return df_aggregated


# scatter of the number of flights against the total taxi-in time; the hover shows the airport name
def plot_taxiing_congenstion(df):
    """Scatter plot of ``NumberOfFlights`` against ``TotalTaxiInTime`` per destination."""
    df_aggregated = taxiing_congenstion(df).toPandas()
    # attach the airport attributes, matching DEST against the IATA code
    df_aggregated = df_aggregated.join(airports.set_index("IATA"), on="DEST")
    fig = px.scatter(df_aggregated, x="NumberOfFlights", y="TotalTaxiInTime", hover_data=["AIRPORT"])
    # The plotted quantity is the summed TaxiIn column, so the former
    # "Taxi out Congestion" title mislabelled the chart.
    fig.update_layout(title_text="Taxi in Congestion by destination airport")
    return fig


plot_taxiing_congenstion(df).show()

In [None]:
# group by DayOfWeek and DepTimeBlk and average ArrDelay
def avg_delay_day_time(df):
    """Average ``ArrDelay`` for every (DayOfWeek, DepTimeBlk) pair.

    Args:
        df: Spark DataFrame with ``DayOfWeek``, ``DepTimeBlk`` and ``ArrDelay``
            columns.

    Returns:
        Spark DataFrame with columns ``DayOfWeek``, ``DepTimeBlk`` and
        ``AverageArrivalDelay``, ordered by ``AverageArrivalDelay`` descending.
    """
    df_aggregated = df.groupBy("DayOfWeek", "DepTimeBlk").agg({"ArrDelay": "avg"}).withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
    df_aggregated = df_aggregated.orderBy("AverageArrivalDelay", ascending=False)
    return df_aggregated


def week_time_blk_matrix_avg_delay(df):
    """Heatmap of the average arrival delay, day of week (rows) by departure time block (columns).

    Returns:
        The Plotly figure produced by ``px.imshow``.
    """
    df_pd = avg_delay_day_time(df).toPandas()
    # pivot() only accepts keyword arguments since pandas 2.0; the positional
    # form used previously raises TypeError there.
    delay_matrix = df_pd.pivot(index="DayOfWeek", columns="DepTimeBlk", values="AverageArrivalDelay")
    # NOTE(review): the fixed row labels assume all 7 days are present and that
    # DayOfWeek 1 is Monday — confirm against the dataset.
    fig = px.imshow(
        delay_matrix,
        labels=dict(x="DepTimeBlk", y="DayOfWeek", color="Average Arrival Delay"),
        y=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    )
    fig.update_layout(title="Average delay by day of week and block time")
    return fig


# This cell used to call week_day_matrix_avg_delay (the day/month heatmap), so
# the function defined above was never displayed.
week_time_blk_matrix_avg_delay(df).show()



In [None]:
# group by DayOfWeek and DepTimeBlk and count the number of flights
def num_flights_day_time(df):
    """Number of flights for every (DayOfWeek, DepTimeBlk) pair.

    Args:
        df: Spark DataFrame with ``DayOfWeek`` and ``DepTimeBlk`` columns.

    Returns:
        Spark DataFrame with columns ``DayOfWeek``, ``DepTimeBlk`` and
        ``NumFlights``, ordered by ``DepTimeBlk`` descending.
    """
    df_aggregated = df.groupBy("DayOfWeek", "DepTimeBlk").agg({"*": "count"}).withColumnRenamed("count(1)", "NumFlights")
    df_aggregated = df_aggregated.orderBy("DepTimeBlk", ascending=False)
    return df_aggregated


def week_time_blk_matrix_num_flights(df):
    """Heatmap of the number of flights, day of week (rows) by departure time block (columns).

    Returns:
        The Plotly figure produced by ``px.imshow``.
    """
    df_pd = num_flights_day_time(df).toPandas()
    # pivot() only accepts keyword arguments since pandas 2.0; the positional
    # form used previously raises TypeError there.
    count_matrix = df_pd.pivot(index="DayOfWeek", columns="DepTimeBlk", values="NumFlights")
    # NOTE(review): the fixed row labels assume all 7 days are present and that
    # DayOfWeek 1 is Monday — confirm against the dataset.
    fig = px.imshow(
        count_matrix,
        labels=dict(x="DepTimeBlk", y="DayOfWeek", color="NumFlights"),
        y=["Mon", "Tue", "Wed", "Thu", "Fri", "Sat", "Sun"]
    )
    # The title was copied from the average-delay heatmap; this chart shows counts.
    fig.update_layout(title="Number of flights by day of week and block time")
    return fig


week_time_blk_matrix_num_flights(df).show()

In [None]:
# number of flights per month
# NOTE: an earlier cell defines a different flights_by_month (counts per state
# pair); running this cell rebinds the name to the per-month version below.
def flights_by_month(df):
    """Count flights per ``Month``, with the month number replaced by its name.

    Returns a Spark DataFrame ordered by month number with columns ``Month``
    (month name, or ``"0"`` when the value matches none of 1-12) and ``count``.
    """
    monthly = df.groupBy("Month").count().orderBy("Month", ascending=True)

    # compare as strings, then map "1".."12" to the month names
    monthly = monthly.withColumn("Month", monthly["Month"].cast("string"))
    month_names = {
        "1": "January",
        "2": "February",
        "3": "March",
        "4": "April",
        "5": "May",
        "6": "June",
        "7": "July",
        "8": "August",
        "9": "September",
        "10": "October",
        "11": "November",
        "12": "December",
    }
    month_label = None
    for month_number, month_name in month_names.items():
        is_month = monthly["Month"] == month_number
        month_label = when(is_month, month_name) if month_label is None else month_label.when(is_month, month_name)
    month_label = month_label.otherwise("0")

    return monthly.withColumn("Month", month_label)


# visualize the monthly number of flights as a smoothed line chart
def flights_by_month_line(df):
    """Spline-smoothed line chart of the number of flights per month."""
    monthly_pd = flights_by_month(df).toPandas()
    fig = px.line(
        monthly_pd,
        x="Month",
        y="count",
        title="Number of Flights by Month",
    )
    fig.update_traces(line_shape="spline")
    return fig


flights_by_month_line(df).show()

In [None]:

def delay_by_month(df):
    """Average ``ArrDelay`` per ``Month``, with the month number replaced by its name.

    Returns a Spark DataFrame ordered by month number with columns ``Month``
    (month name, or ``"0"`` when the value matches none of 1-12) and
    ``AverageArrivalDelay``.
    """
    monthly = (
        df.groupBy("Month")
        .agg({"ArrDelay": "avg"})
        .withColumnRenamed("avg(ArrDelay)", "AverageArrivalDelay")
        .orderBy("Month", ascending=True)
    )

    # compare as strings, then map "1".."12" to the month names
    monthly = monthly.withColumn("Month", monthly["Month"].cast("string"))
    month_names = {
        "1": "January",
        "2": "February",
        "3": "March",
        "4": "April",
        "5": "May",
        "6": "June",
        "7": "July",
        "8": "August",
        "9": "September",
        "10": "October",
        "11": "November",
        "12": "December",
    }
    month_label = None
    for month_number, month_name in month_names.items():
        is_month = monthly["Month"] == month_number
        month_label = when(is_month, month_name) if month_label is None else month_label.when(is_month, month_name)
    month_label = month_label.otherwise("0")

    return monthly.withColumn("Month", month_label)


# visualize the monthly average delay as a smoothed line chart
def delay_by_month_line(df):
    """Spline-smoothed line chart of the average arrival delay per month."""
    monthly_pd = delay_by_month(df).toPandas()
    fig = px.line(
        monthly_pd,
        x="Month",
        y="AverageArrivalDelay",
        title="Avg delay by Month",
    )
    fig.update_traces(line_shape="spline")
    return fig


delay_by_month_line(df).show()

In [None]:
def avg_num_line_month(df):
    """Monthly number of flights and monthly average arrival delay on one figure.

    Args:
        df: Spark DataFrame accepted by ``flights_by_month`` (the per-month
            version) and ``delay_by_month``.

    Returns:
        A Plotly figure with the flight count on the primary y axis and the
        average delay on the secondary y axis, both plotted against the month.
    """
    df_num = flights_by_month(df).toPandas()
    df_avg = delay_by_month(df).toPandas()

    # a single subplot that owns a secondary y axis, so that the two series
    # can use different scales
    fig = make_subplots(specs=[[{"secondary_y": True}]])

    fig.add_trace(
        go.Scatter(x=df_num["Month"], y=df_num["count"], name="num flights"),
        secondary_y=False,
    )

    fig.add_trace(
        go.Scatter(x=df_avg["Month"], y=df_avg["AverageArrivalDelay"], name="avg delay"),
        secondary_y=True,
    )

    # The data is aggregated per month; the title used to say "by day".
    fig.update_layout(
        title_text="Number of flights and average delay by month"
    )

    fig.update_xaxes(title_text="Month of the year")

    fig.update_yaxes(title_text="Number of flights", secondary_y=False)
    fig.update_yaxes(title_text="Average delay in minutes", secondary_y=True)

    return fig


avg_num_line_month(df).show()