## Dask ML

In [None]:
from dask.distributed import Client

# Start a Dask distributed client configured with 4 workers, 2 threads per
# worker and a memory_limit of "4 GiB".
client = Client(n_workers=4, threads_per_worker=2, memory_limit="4 GiB")

# Bare last expression so the notebook renders the client summary.
client

## Prep some Flights Data for ML

In [None]:
%%time

import dask.dataframe as dd

# Only the columns needed for modelling are read from the CSV.
usecols = [
    "YEAR",
    "MONTH",
    "DAY_OF_WEEK",
    "DISTANCE",
    "AIR_TIME",
    "ARRIVAL_DELAY",
    "CANCELLED",
]

# Explicit, narrow dtypes for every selected column.
# NOTE(review): float16 has limited precision and range -- confirm it is
# adequate for AIR_TIME and ARRIVAL_DELAY.
df = dd.read_csv(
    "./data/flights/flights.csv",
    usecols=usecols,
    dtype={
        "YEAR": "int16",
        "MONTH": "int16",
        "DAY_OF_WEEK": "int16",
        "DISTANCE": "int16",
        "AIR_TIME": "float16",
        "ARRIVAL_DELAY": "float16",
        "CANCELLED": "bool",
    },
)

# Keep only flights that were not cancelled, drop the CANCELLED column (it is
# constant after the filter), then discard rows with any missing value.
# No one-hot encoding is performed in this cell.
df = (
    df[~df["CANCELLED"]]  # boolean column: negate it rather than compare to False
    .drop(["CANCELLED"], axis=1)
    .dropna(how="any")
)

## Option 1: Using Dask as a backend for SKL

First we create a train / test split using SKL.

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Materialise the Dask DataFrame in memory, then close the client before the
# scikit-learn steps that follow.
df_out = df.compute()
client.close()

# Split into features and a 1-D target array
X = df_out.drop("ARRIVAL_DELAY", axis=1)
y = df_out["ARRIVAL_DELAY"].to_numpy()

# A fixed seed keeps the 50/50 split identical on every run, so the timings of
# the models fitted below are comparable between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Displayed as the cell output: (rows, columns) of the cleaned data.
df_out.shape

## Parallelised SKL

Running SKL in parallel...

In [None]:
%%time

# n_jobs=-1 asks scikit-learn to use all available cores; verbose=True prints
# progress while fitting. The fixed random_state makes the fitted forest
# reproducible across runs.
model = RandomForestRegressor(
    n_jobs=-1,
    verbose=True,
    random_state=42,
)

model.fit(X_train, y_train)

## Dask ML

First, create a fresh client (the previous one was closed before the scikit-learn run)...

In [None]:
# Close the existing client before a new one is created in the next cell.
client.close()

In [None]:
from dask.distributed import Client

# Recreate the Dask distributed client: 4 workers, 2 threads per worker and a
# memory_limit of "4 GiB".
client = Client(n_workers=4, threads_per_worker=2, memory_limit="4 GiB")

# Bare last expression so the notebook renders the client summary.
client

#### Running via a Dask Backend

It's easy but actually a little slower than Parallelised SKL =(

In [None]:
%%time

import joblib

# The fixed random_state makes the fitted forest reproducible across runs.
model = RandomForestRegressor(verbose=True, random_state=42)

# Inside this context, joblib-based parallelism in scikit-learn is routed
# through the "dask" backend instead of joblib's default one.
with joblib.parallel_backend("dask"):
    model.fit(X_train, y_train)

## Option 2: Using dask-ml

We can also use the estimators that Dask-ML itself provides, which work directly on Dask collections (here `df` is a Dask DataFrame):

#### With Dask-ML

In [None]:
%%time

from dask_ml.xgboost import XGBRegressor
from dask_ml.model_selection import train_test_split

# Init the model
model = XGBRegressor()

# Split the Dask DataFrame into features and target (both remain Dask objects)
X = df.drop("ARRIVAL_DELAY", axis=1)
y = df[["ARRIVAL_DELAY"]]

# A fixed seed keeps the 50/50 split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Pass the Dask collections straight to the estimator. Calling .compute()
# first would pull the entire training set into the notebook process as
# pandas objects, which defeats the purpose of the Dask-ML estimator.
model.fit(X_train, y_train)
model.predict(X_test)

Compared with the plain XGBoost package (running in parallel, shown below), the Dask-ML version is about 2x as quick.

In [None]:
# Materialise the Dask DataFrame in memory, then close the client before the
# plain XGBoost run in the next cell.
df_out = df.compute()
client.close()

In [None]:
%%time

from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

# Split the in-memory data into features and target
X = df_out.drop("ARRIVAL_DELAY", axis=1)
y = df_out[["ARRIVAL_DELAY"]]

# A fixed seed keeps the 50/50 split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# NOTE(review): the thread count is hard-coded to 16 -- confirm it matches the
# machine this notebook runs on.
model = XGBRegressor(n_jobs=16)
model.fit(X_train, y_train)
model.predict(X_test)

## Using a dask-ml Incremental wrapper

In [None]:
%%time

from sklearn.linear_model import PassiveAggressiveRegressor
from dask_ml.wrappers import Incremental
from dask_ml.model_selection import train_test_split

# Split the Dask DataFrame into features and target
X = df.drop("ARRIVAL_DELAY", axis=1)
y = df[["ARRIVAL_DELAY"]]

# A fixed seed keeps the 50/50 split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# Wrap the scikit-learn estimator in Dask-ML's Incremental wrapper and fit it
# on the Dask collections.
# NOTE(review): the client was closed in an earlier cell and is not recreated,
# so this presumably runs on Dask's default scheduler -- confirm that is intended.
model = Incremental(PassiveAggressiveRegressor())
model.fit(X_train, y_train)

In [None]:
# X_test is materialised with .compute() before prediction; the predictions
# are displayed as the cell output.
model.predict(X_test.compute())

## Using SKL with Dask Objects

In [None]:
%%time

from sklearn.linear_model import PassiveAggressiveRegressor
from dask_ml.model_selection import train_test_split

# Split the Dask DataFrame into features and target
X = df.drop("ARRIVAL_DELAY", axis=1)
y = df[["ARRIVAL_DELAY"]]

# A fixed seed keeps the 50/50 split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42
)

# A plain scikit-learn estimator is fitted on the Dask DataFrames directly.
# NOTE(review): scikit-learn presumably converts these to in-memory arrays
# during fit -- confirm the training half fits in memory.
model = PassiveAggressiveRegressor()
model.fit(X_train, y_train)
model.predict(X_test.compute())