## 2. Dask Bags

#### Set Up the Dask Client

[Bag Tutorial](https://tutorial.dask.org/02_bag.html)

In [None]:
from dask.distributed import Client

# Create the Dask distributed client used by the rest of the notebook:
# 4 workers, 2 threads per worker, and a "4 GiB" memory limit.
# NOTE(review): no scheduler address is passed, so this presumably starts a
# local cluster and the memory limit applies per worker — confirm against
# the dask.distributed documentation.
client = Client(
    n_workers=4,
    threads_per_worker=2,
    memory_limit="4 GiB"
)

# Bare expression so the notebook displays the client summary as cell output.
client

#### Import JSON data into a Dask Bag

In [None]:
import dask.bag as db
import json

# Read every line of the part files as text, then parse each line from JSON
# into a Python object.
b = db.read_text("./data/flights/flights_limited_x20_json/*.part").map(json.loads)

# Show the first parsed record.
b.take(1)

#### Convert Dask DataFrame to a Bag

In [None]:
import dask.dataframe as dd

# Load the flights CSV as a Dask DataFrame.
df_20 = dd.read_csv("./data/flights/flights_limited_x20.csv")
# Turn the DataFrame into a bag of per-row dicts, leaving the index out.
b = df_20.to_bag(index=False, format='dict')

# Show the first record of the bag.
b.take(1)

#### Filter

In [None]:
# Keep only the records whose MONTH field equals 2.
b_feb = b.filter(lambda rec: rec["MONTH"] == 2)

#### Map

In [None]:
import pandas as pd

def convert_to_date(row):
    """Add a ``DATE`` entry built from the record's YEAR, MONTH and DAY.

    The three fields are joined as ``"YEAR-MONTH-DAY"`` and parsed with
    ``pd.to_datetime``. The record is modified in place and also returned,
    so the function can be used directly with ``bag.map``.
    """
    date_parts = (row["YEAR"], row["MONTH"], row["DAY"])
    row["DATE"] = pd.to_datetime("{}-{}-{}".format(*date_parts))
    return row

# Apply convert_to_date to every record of the bag, adding a DATE field.
new_bag = b.map(convert_to_date)

#### Making a DataFrame

In [None]:
# Convert the bag of records into a DataFrame and preview its first rows.
df = new_bag.to_dataframe()
df.head()

#### Iterating DataFrames vs Bags

#### Dask Bags Iteration

In [None]:
def delay_binner_bag(row):
    """Label a flight record with a delay category based on ``TOTAL_DELAY``.

    The category is stored under the ``"DELAY_CAT"`` key of ``row``:

    * ``> 10000`` -> ``"Huge"``
    * ``> 6000``  -> ``"Large"``
    * ``> 3000``  -> ``"Medium"``
    * ``> 1000``  -> ``"Low"``
    * otherwise   -> ``"None"`` (the string, not the ``None`` object)

    Args:
        row: A dict-like record containing a ``"TOTAL_DELAY"`` value.

    Returns:
        The same ``row`` object, modified in place, so the function can be
        passed directly to ``bag.map``.
    """
    delay = row["TOTAL_DELAY"]

    if delay > 10000:
        category = "Huge"
    elif delay > 6000:
        # This band must be "Large" (not a second "Huge") so the bag version
        # produces the same labels as delay_binner_df.
        category = "Large"
    elif delay > 3000:
        category = "Medium"
    elif delay > 1000:
        category = "Low"
    else:
        # Small positive, zero and negative delays all share one label.
        category = "None"

    row["DELAY_CAT"] = category
    return row

In [None]:
%%time

import dask.dataframe as dd
import json

# Read the aggregated part files as a DataFrame and convert it to a bag of
# per-row dicts (index left out).
df_20 = dd.read_csv(
    "./data/flights/flights_limited_x20_agg/*.part"
)
b = df_20.to_bag(index=False, format='dict')

# Add a DELAY_CAT field to every record, serialize each record to a JSON
# string, and write the strings out as text files.
# Note: the *.json output goes into the same directory the *.part input
# files are read from.
(
    b.map(delay_binner_bag)
    .map(json.dumps)
    .to_textfiles('./data/flights/flights_limited_x20_agg/*.json')
)


#### Dask DataFrames Iteration

In [None]:
def delay_binner_df(row):
    """Return the delay category label for one row, based on ``TOTAL_DELAY``.

    Thresholds are checked from largest to smallest and the first one the
    delay strictly exceeds decides the label: ``"Huge"`` (> 10000),
    ``"Large"`` (> 6000), ``"Medium"`` (> 3000), ``"Low"`` (> 1000).
    Anything else, including zero and negative delays, yields the string
    ``"None"``.
    """
    total_delay = row["TOTAL_DELAY"]

    # Ordered from the highest threshold down so the first match wins.
    bands = (
        (10000, "Huge"),
        (6000, "Large"),
        (3000, "Medium"),
        (1000, "Low"),
    )
    for lower_bound, label in bands:
        if total_delay > lower_bound:
            return label

    return "None"

In [None]:
%%time

import dask.dataframe as dd

# Read the aggregated part files as a DataFrame.
df = dd.read_csv(
    "./data/flights/flights_limited_x20_agg/*.part"
)
# Compute the delay category row by row (axis=1); meta declares the result
# as an unnamed series of dtype 'object'.
df["DELAY_CAT"] = df.apply(
    delay_binner_df, meta=(None, 'object'), axis=1
)
# Write the result out as CSV files.
# NOTE(review): this path contains "flights/flights/", unlike every other
# path in this notebook ("./data/flights/...") — confirm the doubled
# directory is intended and not a typo.
df.to_csv(
    "./data/flights/flights/flights_limited_x20_agg_cat/*.csv"
)