<img src="http://dask.readthedocs.io/en/latest/_images/dask_horizontal.svg"
     align="right"
     width="30%"
     alt="Dask logo">

DataFrames on a Cluster
=======================

<img src="http://www.numfocus.org/uploads/6/0/6/9/60696727/6893890_orig.png"
     align="left"
     width="30%"
     alt="Pandas logo">


### We have a lot of CSV data on S3

In [None]:
from s3fs import S3FileSystem

# Connect without AWS credentials (anonymous access)
s3 = S3FileSystem(anon=True)

# List the objects stored under the 2015 NYC taxi prefix
taxi_prefix = 'dask-data/nyc-taxi/2015/'
s3.ls(taxi_prefix)

### But it's too large to load into Pandas :(

In [None]:
import pandas as pd

# Peek at just the first five rows of one month's CSV, read through the
# `s3` filesystem object, instead of loading the whole file.
with s3.open('dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv') as remote_csv:
    df = pd.read_csv(remote_csv, nrows=5)

df

### Dask DataFrames coordinate many Pandas DataFrames

*  Coordinate many Pandas DataFrames across a cluster
*  Faithfully implement a subset of the Pandas API
*  Use Pandas under the hood (for speed and maturity)

In [None]:
# `Executor` is the former name of `Client` in dask.distributed. Prefer the
# current name and fall back to the old one only on releases that predate
# the rename, so this cell runs on both.
try:
    from dask.distributed import Client, progress
except ImportError:
    from dask.distributed import Executor as Client, progress

# Connect to the scheduler at schedulers:9000 and register this client as the
# default for subsequent Dask computations. The variable keeps the name `e`
# because later cells call `e.persist(...)`.
e = Client('schedulers:9000', set_as_default=True)
e

In [None]:
import dask.dataframe as dd

# Describe the January 2015 CSV on S3 as a Dask DataFrame, parsing the pickup
# and dropoff columns as datetimes and accessing the bucket anonymously.
df = dd.read_csv(
    's3://dask-data/nyc-taxi/2015/yellow_tripdata_2015-01.csv',
    parse_dates=['tpep_pickup_datetime', 'tpep_dropoff_datetime'],
    storage_options={'anon': True},
)

# Ask the cluster to persist the dataframe and show a progress bar meanwhile
df = e.persist(df)
progress(df)

### Dask.dataframe looks *almost* identical to Pandas


In [None]:
# Render the Dask DataFrame's own repr as the cell output
df

In [None]:
# Show the first few rows of the Dask DataFrame as the cell output
df.head()

In [None]:
# Total number of passengers across all rows; .compute() returns the concrete value
df['passenger_count'].sum().compute()

In [None]:
# Mean trip distance for each distinct passenger count
df.groupby('passenger_count')['trip_distance'].mean().compute()

### Dask.dataframe looks *almost* identical to Pandas



(because it really *is* just lots of Pandas)

<img src="https://deadlinescotland.files.wordpress.com/2008/12/01-giant-panda-group-eating-bamboo.jpg"
     alt="A group of giant pandas eating bamboo">

### Tip Fraction grouped by Day of week and Hour of day

In [None]:
# Keep only rides with both a positive tip and a positive fare, then express
# the tip as a fraction of the fare.
has_tip_and_fare = (df['tip_amount'] > 0) & (df['fare_amount'] > 0)
df2 = df[has_tip_and_fare]
df2 = df2.assign(tip_fraction=df2['tip_amount'] / df2['fare_amount'])

# Average tip fraction grouped by pickup day of week and by pickup hour
pickup_time = df2['tpep_pickup_datetime']
dayofweek = df2.groupby(pickup_time.dt.dayofweek)['tip_fraction'].mean()
hour = df2.groupby(pickup_time.dt.hour)['tip_fraction'].mean()

# Submit both aggregations to the cluster together and watch their progress
dayofweek, hour = e.persist([dayofweek, hour])
progress(dayofweek, hour)

### Plot results

In [None]:
from bokeh.plotting import figure, output_notebook, show
output_notebook()

# Bring the aggregated result to the local process once, so the x and y values
# come from the same object instead of two separate .compute() calls.
hour_result = hour.compute()

fig = figure(title='Tip Fraction',
             x_axis_label='Hour of day',
             y_axis_label='Tip Fraction',
             width=600,
             height=300)
fig.line(x=hour_result.index, y=hour_result, line_width=3)
# Start the y-axis at zero
fig.y_range.start = 0

show(fig)