# Bar plots

This notebook contains the code to create bar plots and stacked bar plots in `lets-plot`, using the ["Airlines Delays from 2003-2016"](https://www.kaggle.com/datasets/giovamata/airlinedelaycauses) dataset by [Priank Ravichandar](https://www.kaggle.com/priankravichandar), licensed under [CC0 1.0](https://creativecommons.org/publicdomain/zero/1.0/). The dataset contains information on flight delays and cancellations at US airports for the period 2003-2016.

In [1]:
import pandas as pd
from lets_plot import *

# Configure lets-plot's HTML output before any plot is rendered in this notebook.
LetsPlot.setup_html()

## Import and process the data
* Create a date/time variable from the month/year column
* Remove the first and last years, as they only contain partial records for the year

In [2]:
# Load the airline-delay records from the local CSV file.
airlines = pd.read_csv("data/airlines.csv")
# Build a datetime column from the month/year label. `infer_datetime_format` is
# deprecated since pandas 2.0 (format inference is now the default behaviour),
# so it is no longer passed.
airlines["Time"] = pd.to_datetime(airlines["TimeLabel"])
# Drop 2003 and 2016: they only contain partial records for the year.
airlines = airlines[~airlines["TimeYear"].isin([2003, 2016])]

## Create a summary DataFrame that gives the proportion of flights delayed by airport

In [3]:
# Per-airport totals of delayed and scheduled flights, plus the delayed share,
# ordered from the highest share of delayed flights to the lowest.
flights_delayed_by_airport = (
    airlines
    .groupby("AirportCode")[["FlightsDelayed", "FlightsTotal"]]
    .sum()
    .reset_index()
    .assign(PropFlightsDelayed=lambda totals: totals["FlightsDelayed"].div(totals["FlightsTotal"]))
    .sort_values(by="PropFlightsDelayed", ascending=False)
)

## Bar plot to show proportion of flights delayed per airport

In [12]:
# Horizontal bar chart (bars flipped via coord_flip) of the proportion of
# flights delayed at each airport; stat="identity" plots the values as given.
(
    ggplot(flights_delayed_by_airport, aes(x="AirportCode", y="PropFlightsDelayed"))
    + geom_bar(stat="identity", fill="#b3cde3")
    + coord_flip()
    + labs(
        x="Airport Code",
        y="Flights delayed (proportion)",
        title="Proportion of flights delayed in US airports, 2004-2015",
    )
)

## Create a summary DataFrame to show proportion of flights delayed by cause and airport

In [5]:
# Long-format summary with one row per (airport, cause of delay):
#   NumberDelays       - delays attributed to that cause at the airport
#   PropFlightsDelayed - NumberDelays as a share of all flights at the airport
#   PropTypeOfDelay    - NumberDelays as a share of the airport's delays across
#                        the four causes listed here
delays_by_airport_and_cause = (
    airlines
    .groupby("AirportCode")[
        ["NumDelaysLateAircraft", "NumDelaysWeather", "NumDelaysSecurity", "NumDelaysCarrier", "FlightsTotal"]
    ]
    .sum()
    .reset_index()
    # Wide -> long: one row per airport and delay cause.
    .melt(
        id_vars=["AirportCode", "FlightsTotal"],
        value_vars=["NumDelaysLateAircraft", "NumDelaysWeather", "NumDelaysSecurity", "NumDelaysCarrier"],
        var_name="TypeOfDelay",
        value_name="NumberDelays",
    )
    .assign(
        # Strip the common column prefix so only the cause name remains.
        TypeOfDelay=lambda long_df: long_df["TypeOfDelay"].str.replace("NumDelays", ""),
        PropFlightsDelayed=lambda long_df: long_df["NumberDelays"] / long_df["FlightsTotal"],
        PropTypeOfDelay=lambda long_df: (
            long_df["NumberDelays"] / long_df.groupby("AirportCode")["NumberDelays"].transform("sum")
        ),
    )
)

## Bar plot showing proportion of flights delayed by cause

In [10]:
# Grouped (dodged) bar chart comparing the proportion of flights delayed by each
# cause at three selected airports.
(
    ggplot(
        delays_by_airport_and_cause[delays_by_airport_and_cause["AirportCode"].isin(["EWR", "SLC", "DEN"])],
        aes(x="AirportCode", y="PropFlightsDelayed", fill="TypeOfDelay"),
    )
    + geom_bar(stat="identity", position="dodge")
    + xlab("Airport Code")
    + ylab("Flights delayed (proportion)")
    + ggtitle("Proportion of flights delayed by cause in US airports, 2004-2015")
    # The fill encodes the cause of delay, so the legend is titled accordingly
    # (it was previously mislabelled "Year"). `breaks` pins each label to its
    # category explicitly instead of relying on the row order of the data.
    + scale_fill_brewer(
        type="qual",
        palette="Pastel1",
        name="Cause of delay",
        breaks=["LateAircraft", "Weather", "Security", "Carrier"],
        labels=["Late aircraft", "Weather", "Security", "Carrier"],
    )
)

## Stacked bar plot showing proportion of delayed flights by cause

In [11]:
# Stacked bar chart: for every airport, how its delays (across the four causes
# in the summary table) divide up by cause.
(
    ggplot(delays_by_airport_and_cause, aes(x="AirportCode", y="PropTypeOfDelay", fill="TypeOfDelay"))
    + geom_bar(stat="identity")
    + xlab("Airport Code")
    + ylab("Proportion of delayed flights")
    + ggtitle("Division of delayed US flights by cause, 2004-2015")
    # The fill encodes the cause of delay, so the legend is titled accordingly
    # (it was previously mislabelled "Year"). `breaks` pins each label to its
    # category explicitly instead of relying on the row order of the data.
    + scale_fill_brewer(
        type="qual",
        palette="Pastel1",
        name="Cause of delay",
        breaks=["LateAircraft", "Weather", "Security", "Carrier"],
        labels=["Late aircraft", "Weather", "Security", "Carrier"],
    )
)