## 1. Dask DataFrames

### Set up the Dask Client

[Dask Client API Reference](https://distributed.dask.org/en/stable/api.html#distributed.Client)  
[Dask Dataframe Tutorial](https://tutorial.dask.org/04_dataframe.html)

* These settings took a bit of trial and error to avoid a mountain of memory leak messages. 
* My machine (Mac OSX) has 16 cores and 16 GB of RAM.
* See the 5. Dask Client Notebook for more details

In [None]:
from dask.distributed import Client

# Local cluster sizing: 4 workers with 2 threads each; memory_limit is the
# cap passed for each worker.
cluster_settings = {
    "n_workers": 4,
    "threads_per_worker": 2,
    "memory_limit": "4 GiB",
}
client = Client(**cluster_settings)

# Last expression in the cell, so the notebook displays the client summary.
client

#### Making the flights.csv dataframe 20 x bigger (Pandas)

In [None]:
%%time

import pandas as pd

# Columns to keep: the date parts, the flight number and the five
# delay-cause columns used by the speed tests later in the notebook.
usecols = [
    "YEAR", "MONTH", "DAY", "FLIGHT_NUMBER", "AIR_SYSTEM_DELAY", "SECURITY_DELAY", "AIRLINE_DELAY",
    "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY"
]

# Read the data in from csv
df = pd.read_csv("./data/flights/flights.csv", usecols=usecols)

# Stack 20 copies of the frame with a single concat call. Growing the frame
# inside a loop re-copies all accumulated rows on every pass; one concat
# yields the same rows in the same order (with the same repeated index)
# while copying the data only once.
n_copies = 20
df_20 = pd.concat([df] * n_copies)

df_20.to_csv("./data/flights/flights_limited_x20.csv", index=False)

#### Making the flights.csv dataframe 20 x bigger (Dask)

In [None]:
%%time

import dask.dataframe as dd

# Columns to keep: the date parts, the flight number and the five
# delay-cause columns used by the speed tests later in the notebook.
usecols = [
    "YEAR", "MONTH", "DAY", "FLIGHT_NUMBER", "AIR_SYSTEM_DELAY", "SECURITY_DELAY", "AIRLINE_DELAY",
    "LATE_AIRCRAFT_DELAY", "WEATHER_DELAY"
]

# Read the data in from csv
df = dd.read_csv("./data/flights/flights.csv", usecols=usecols)

# Stack 20 copies with one concat call instead of concatenating in a loop:
# the loop nests 19 concat layers in the task graph, whereas a single call
# over a list of 20 frames produces the same partitions in the same order
# with one layer.
n_copies = 20
df_20 = dd.concat([df] * n_copies)

# Write to CSV
df_20.to_csv("./data/flights/flights_limited_x20_csv", index=False)

# Write to JSON
df_20.to_json(
    filename="./data/flights/flights_limited_x20_json",
    orient="records"
)

#### Pandas Speed Test

In [None]:
%%time

import pandas as pd
import numpy as np

# Load the 20x flights extract and replace missing values with 0 so the
# delay columns can be summed.
df_20 = pd.read_csv("./data/flights/flights_limited_x20.csv").fillna(0)

date_parts = ["YEAR", "MONTH", "DAY"]
delay_columns = [
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]

# Build a single DATE column from the three date parts
df_20["DATE"] = pd.to_datetime(df_20[date_parts])

# Row-wise total across the five delay causes
df_20["TOTAL_DELAY"] = df_20[delay_columns].sum(axis=1)

# The date parts are redundant once DATE exists
df_20 = df_20.drop(columns=date_parts)

# Boolean indexing (where/filter): keep rows strictly after the cutoff date
cutoff = pd.to_datetime("2015-01-01")
df_20 = df_20[df_20["DATE"] > cutoff]

print(df_20.shape)

# Aggregate: sum the remaining columns per (DATE, FLIGHT_NUMBER)
tab = df_20.groupby(["DATE", "FLIGHT_NUMBER"]).sum()

# Show the table
tab

#### Dask Speed Test

In [None]:
%%time

import dask.dataframe as dd
import pandas as pd

# Same pipeline as the pandas speed test, expressed on a Dask DataFrame.
# `tab` stays un-computed until the explicit .compute() at the end.
df_20 = dd.read_csv("./data/flights/flights_limited_x20.csv").fillna(0)

date_parts = ["YEAR", "MONTH", "DAY"]
delay_columns = [
    'AIR_SYSTEM_DELAY',
    'SECURITY_DELAY',
    'AIRLINE_DELAY',
    'LATE_AIRCRAFT_DELAY',
    'WEATHER_DELAY',
]

# Build a single DATE column from the three date parts
df_20["DATE"] = dd.to_datetime(df_20[date_parts])

# Row-wise total across the five delay causes
df_20["TOTAL_DELAY"] = df_20[delay_columns].sum(axis=1)

# The date parts are redundant once DATE exists
df_20 = df_20.drop(date_parts, axis=1)

# Boolean indexing (where/filter): keep rows strictly after the cutoff date
cutoff = pd.to_datetime("2015-01-01")
df_20 = df_20[df_20["DATE"] > cutoff]

# NOTE(review): on a Dask DataFrame the row count in .shape is presumably
# lazy, so this likely prints a placeholder rather than a number, unlike the
# pandas cell. Forcing it would add an extra pass to the timed cell - confirm
# whether that is wanted.
print(df_20.shape)

# Aggregate: sum the remaining columns per (DATE, FLIGHT_NUMBER)
tab = df_20.groupby(["DATE", "FLIGHT_NUMBER"]).sum()

# Trigger the computation and show the materialised table
tab_out = tab.compute()
tab_out

In [None]:
# Display `tab` as-is: no .compute() is called here, so this is the
# un-computed Dask result, not the values held in `tab_out`.
tab

#### Compute two ways...

In [None]:
from dask.dataframe import compute

# compute() accepts any number of collections and always returns a tuple of
# results, even for a single argument. Unpack the one element so `tab_out`
# is the computed DataFrame itself, the same object type tab.compute() gives.
(tab_out,) = compute(tab)

In [None]:
# Method form: compute this one collection and keep the materialised result.
tab_out = tab.compute()

In [None]:
# Display the un-computed DF: `tab` was never reassigned, so it is still lazy
tab

In [None]:
# Display the computed DF produced by the compute call above
tab_out

In [None]:
# head(5) computes and returns only the first 5 rows of the still-lazy `tab`
tab.head(5)

#### Delay Binner in Dask DF (used for a comparison to Dask Bags)

In [None]:
def delay_binner(row):
    """Map a row's ``TOTAL_DELAY`` to a coarse size label.

    Args:
        row: Any mapping-like object supporting ``row["TOTAL_DELAY"]``
            (e.g. a DataFrame row passed by ``apply(..., axis=1)``).

    Returns:
        ``"Huge"`` above 10000, ``"Large"`` above 6000, ``"Medium"`` above
        3000, ``"Low"`` above 1000, and the string ``"None"`` for everything
        else (including values between 0 and 1000, zero, negatives and NaN).
    """
    total = row["TOTAL_DELAY"]

    # Strict lower bounds, checked from the largest bin down.
    bins = (
        (10000, "Huge"),
        (6000, "Large"),
        (3000, "Medium"),
        (1000, "Low"),
    )
    for lower_bound, label in bins:
        if total > lower_bound:
            return label

    # Anything at or below 1000 is labelled with the string "None".
    return "None"

In [None]:
%%time

# Label every row with its delay bin. apply(axis=1) calls delay_binner once
# per row; meta declares the result as an unnamed object-dtype series.
delay_categories = df_20.apply(delay_binner, axis=1, meta=(None, 'object'))
df_20["DELAY_CAT"] = delay_categories

# NOTE(review): this path has a doubled "flights/flights/" segment, unlike the
# other outputs under ./data/flights/ - confirm it is intentional before
# changing it, since another notebook may read from this location.
df_20.to_csv("./data/flights/flights/flights_limited_x20_agg_cat")