# Python 3.6 kernel

In [3]:
%%capture
%%bash
# Install the libraries benchmarked in the Python 3.6 kernel (vaex, koalas, datatable, dask).
python -m pip install -U pip
python -m pip install vaex-core==2.0.0a5
python -m pip install vaex-hdf5==0.6.0a1
python -m pip install -U numpy
python -m pip install -U ipython ipykernel
python -m pip install -U datatable
# The version specifier must be quoted: unquoted, bash parses ">" as an output
# redirect, writes pip's output to a file named "=0.3.3" and installs an
# unconstrained fsspec.
python -m pip install dask "fsspec>=0.3.3" tqdm pyarrow koalas fastparquet
# aws s3 cp --recursive s3://xdss-public-datasets/demos/taxi_1B.sf datasets/taxi_1B.sf

In [3]:
import pandas as pd
import time
import gc

def benchmark(f, df, name, **kwargs):
    """Time two consecutive calls of ``f(df, **kwargs)`` and print each duration.

    Parameters
    ----------
    f : callable
        Function under test; called as ``f(df, **kwargs)``. Its return value
        is discarded.
    df : object
        Data object handed to ``f`` unchanged as the first positional argument.
    name : str
        Label used as the prefix of each printed line (``"<name>-<run>"``).
    **kwargs
        Extra keyword arguments forwarded to ``f``.

    Returns
    -------
    float
        Duration, in seconds, of the last (second) run.
    """
    duration = None
    for run in range(1, 3):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be distorted by wall-clock adjustments the way
        # time.time() can.
        started = time.perf_counter()
        f(df, **kwargs)
        duration = time.perf_counter() - started
        print(f"{name}-{run}: duration {duration}")
    return duration


In [4]:
# Vaex
import vaex
import numpy as np

# Open the taxi dataset from its HDF5 file with vaex and report len(data)
# together with the number of columns.
data = vaex.open('datasets/taxi_1B.hdf5')
print(f"size: {len(data)} with {len(data.columns)} columns")

def vaex_lazy_mean(df):
    """Store ``fare_amount * trip_distance`` as column ``'lazy'`` and return its mean.

    The input ``df`` is modified: the ``'lazy'`` column is (re)assigned on
    every call. Presumably vaex keeps the product as an unevaluated
    expression until the mean is requested -- inferred from the name, not
    verifiable from this cell.
    """
    product = df.fare_amount * df.trip_distance
    df['lazy'] = product
    lazy_column = df['lazy']
    return lazy_column.mean()
    
# Time the vaex implementation (benchmark prints one line per run).
benchmark(vaex_lazy_mean, df=data, name='vaex')  

# Drop the reference to the dataset and force a garbage-collection pass before
# the next library is measured. gc.collect() is the last expression of the
# cell, so its return value is what the cell displays.
del data
gc.collect()

size: 1173057927 with 18 columns
vaex-1: duration 1.5117080211639404
vaex-2: duration 1.4436674118041992


1095

In [5]:
# Koalas
import pandas as pd
import numpy as np
import databricks.koalas as ks
from pyspark.sql import SparkSession

# Read the taxi dataset from the parquet directory with Koalas and report
# len(data) together with the number of columns.
data = ks.read_parquet('datasets/taxi_parquet/')
print(f"size: {len(data)} with {len(data.columns)} columns")

def koalas_lazy_mean(df):
    """Assign ``fare_amount * trip_distance`` to column ``'lazy'`` and return its mean.

    The input ``df`` is modified: ``'lazy'`` is (re)assigned on every call.
    """
    fares, distances = df.fare_amount, df.trip_distance
    df['lazy'] = fares * distances
    return df['lazy'].mean()
    
# Time the Koalas implementation (benchmark prints one line per run).
benchmark(koalas_lazy_mean, df=data, name='koalas')   

# Drop the reference to the dataset and force a garbage-collection pass before
# the next library is measured. gc.collect() is the last expression of the
# cell, so its return value is what the cell displays.
del data
gc.collect()

INFO:MainThread:spark:Patching spark automatically. You can disable it by setting SPARK_KOALAS_AUTOPATCH=false in your environment


size: 1173057928 with 18 columns
koalas-1: duration 14.04140853881836
koalas-2: duration 4.717753887176514


438

In [6]:
# datatable
import vaex
import numpy as np
import datatable as dt
from datatable import f, math

# This is a hack to let datatable read hdf5, it currently can't read parquet, multiple files, or a huge csv
def read_file(data=None):
    """Build a datatable ``Frame`` from the numeric columns of the taxi HDF5 file.

    The file ``datasets/taxi_1B.hdf5`` is opened with vaex and its columns are
    visited by name. Columns whose dtype compares equal to ``str`` are skipped,
    columns of dtype kind ``'f'`` are passed through ``.view(np.float32)``,
    columns of dtype kind ``'i'`` are used as they are, and everything else is
    ignored.

    Parameters
    ----------
    data : object, optional
        Never read by this function; kept only so that existing calls such as
        ``read_file(data=None)`` keep working.

    Returns
    -------
    datatable.Frame
        Frame built from the selected columns, keyed by column name.
    """
    source = vaex.open('datasets/taxi_1B.hdf5')
    selected = {}
    for column_name in source.get_column_names():
        column = source.columns[column_name]
        if column.dtype == str:
            continue  # skip strings
        kind = column.dtype.kind
        if kind == 'f':
            # datatable is picky about <f4 format
            # NOTE(review): if these are NumPy arrays, .view() reinterprets the
            # existing bytes rather than converting values -- this assumes
            # every float column is already 32-bit; confirm.
            selected[column_name] = column.view(np.float32)
        elif kind == 'i':
            selected[column_name] = column
        # anything that is neither int nor float is ignored
    return dt.Frame(**selected)

# Load data: build the datatable Frame through read_file (defined above in
# this cell); its data argument is passed explicitly as None.
data = read_file(data=None)

def datatable_lazy_mean(df):
    """Assign ``fare_amount * trip_distance`` to column ``'lazy'`` and select its mean.

    The input ``df`` is modified: ``'lazy'`` is (re)assigned on every call.
    The value returned is whatever ``df[:, dt.mean(dt.f.lazy)]`` evaluates to
    (presumably a small datatable Frame holding the mean rather than a plain
    number -- confirm against the datatable documentation).
    """
    product_expr = f.fare_amount * f.trip_distance
    df['lazy'] = df[:, product_expr]
    mean_expr = dt.mean(dt.f.lazy)
    return df[:, mean_expr]
    
# Time the datatable implementation (benchmark prints one line per run).
benchmark(datatable_lazy_mean, df=data, name='datatable')  

# Drop the reference to the dataset and force a garbage-collection pass before
# the next library is measured. gc.collect() is the last expression of the
# cell, so its return value is what the cell displays.
del data
gc.collect()

datatable-1: duration 15.700656652450562
datatable-2: duration 15.407875061035156


107

# Move to the conda_tensorflow_p36 kernel

In [2]:
%%capture
%%bash
# Install the libraries benchmarked in the conda_tensorflow_p36 kernel (turicreate, dask).
python -m pip install turicreate
# Quote the extras specifier so the shell cannot glob-expand the square brackets.
python -m pip install "dask[complete]"
python -m pip install pyarrow==0.14

In [3]:
import pandas as pd
import time
import gc

def benchmark(f, df, name, **kwargs):
    """Time two consecutive calls of ``f(df, **kwargs)`` and print each duration.

    Parameters
    ----------
    f : callable
        Function under test; called as ``f(df, **kwargs)``. Its return value
        is discarded.
    df : object
        Data object handed to ``f`` unchanged as the first positional argument.
    name : str
        Label used as the prefix of each printed line (``"<name>-<run>"``).
    **kwargs
        Extra keyword arguments forwarded to ``f``.

    Returns
    -------
    float
        Duration, in seconds, of the last (second) run.
    """
    duration = None
    for run in range(1, 3):
        # perf_counter() is monotonic and high-resolution, so the measured
        # interval cannot be distorted by wall-clock adjustments the way
        # time.time() can.
        started = time.perf_counter()
        f(df, **kwargs)
        duration = time.perf_counter() - started
        print(f"{name}-{run}: duration {duration}")
    return duration


In [4]:
import dask.dataframe as dd
import numpy as np

# Load data: read the parquet directory with dask using the pyarrow engine.
data = dd.read_parquet('datasets/taxi_parquet/', engine='pyarrow')
# The reported size is the length of the vendor_id column, not of the frame itself.
print(f"size: {len(data.vendor_id)} with {len(data.columns)} columns")

def dask_lazy_mean(df):
    """Assign ``fare_amount * trip_distance`` to column ``'lazy'`` and return its computed mean.

    The input ``df`` is modified: ``'lazy'`` is (re)assigned on every call.
    ``.compute()`` is called on the mean before returning, so the value handed
    back is the result of that call rather than the pending mean object.
    """
    df['lazy'] = df.fare_amount * df.trip_distance
    pending_mean = df['lazy'].mean()
    return pending_mean.compute()
    
# Time the dask implementation (benchmark prints one line per run).
benchmark(dask_lazy_mean, df=data, name='dask')  

# Drop the reference to the dataset and force a garbage-collection pass before
# the next library is measured. gc.collect() is the last expression of the
# cell, so its return value is what the cell displays.
del data
gc.collect()

dask-1: duration 940.780773639679
dask-2: duration 933.2596433162689


3688

In [None]:
import turicreate as tc
# Load the taxi dataset from the SFrame directory with turicreate.
data = tc.SFrame('datasets/taxi_1B.sf')

# Turicreate
def turi_lazy_mean(df):
    """Assign ``fare_amount * trip_distance`` to column ``'lazy'`` and return its mean.

    Columns are accessed by key (``df['fare_amount']``) rather than by
    attribute. The input ``df`` is modified: ``'lazy'`` is (re)assigned on
    every call.
    """
    fare = df['fare_amount']
    distance = df['trip_distance']
    df['lazy'] = fare * distance
    return df['lazy'].mean()
    
# Time the turicreate implementation (benchmark prints one line per run).
benchmark(turi_lazy_mean, df=data, name='turicreate')    
# Drop the reference to the dataset and force a garbage-collection pass.
# gc.collect() is the last expression of the cell, so its return value is what
# the cell displays.
del data
gc.collect()