In [None]:
import vaex
import numpy as np
%matplotlib inline

In [None]:
# Load the dataset used by the rest of this notebook into `df`.
# Alternative (disabled): the standard example dataset.
# df = vaex.example()

# Active choice: download a slightly larger version of the example dataset.
df = vaex.datasets.helmi_de_zeeuw.fetch()

In [None]:
# Show the DataFrame as this cell's output.
df

# Expressions
Expressions are only evaluated when needed by vaex, and save you memory.

In [None]:
# Build an expression from the x and y columns; it is shown as the cell's output.
np.sqrt(df.x**2 + df.y**2)

# Virtual columns
An expression can be added to a DataFrame to create a virtual column. A virtual column can be treated the same as a normal column, except it does not use up RAM.

In [None]:
# Register the expression on the DataFrame as the virtual column 'r'.
df['r'] = np.sqrt(df.x**2 + df.y**2)
# Display 'r' next to the two columns it is derived from.
df[['x', 'y', 'r']]

In [None]:
# Aggregate over the virtual column 'r' added above.
df.r.mean()

# JIT (Just in time) compilation
If an expression becomes too slow, try optimizing it with Numba or Pythran.

In [None]:
# The same expression is added twice: once as-is ('r_normal') and once passed
# through .jit_numba() ('r_jit'), so the two timing cells below can be compared.
df['r_normal'] = np.sqrt(df.x**2 + df.y**2)
df['r_jit'] = np.sqrt(df.x**2 + df.y**2).jit_numba()

In [None]:
%%timeit -n3 -r10
# Baseline timing: mean of the non-JIT column (-n3: 3 loops per run, -r10: 10 runs).
df.mean(df.r_normal)

In [None]:
%%timeit -n3 -r10
# Same measurement for the JIT column (-n3: 3 loops per run, -r10: 10 runs).
df.mean(df.r_jit)

# Materialize
Or, if you have plenty of RAM, materialize the column.

In [None]:
# Materialize the column 'r'; the result is bound to a new name, df_m.
df_m = df.materialize('r')

In [None]:
%%timeit -n3 -r10
df_m.mean(df.r)

# Filtering
Filtering makes no copy of the data, ideal when exploring your 1TB dataset.

In [None]:
# Indexing with a boolean expression gives a filtered DataFrame (rows with x > 0).
df_filtered = df[df.x > 0]
# Preview a few columns of the filtered result.
df_filtered[['x', 'y', 'r']]

# Selections
All statistical functions can take 1 or more selections as arguments. Multiple selections allow for multiple computations in 1 pass over the data.

In [None]:
# One call with two selections (x < 0 and x > 0): the mean of x is computed for each.
df.mean(df.x, selection=[df.x < 0, df.x > 0])

# Data cleansing
Even fillna does not use memory; try different values without wasting time or RAM.

In [None]:
# Three variants of df, with missing values in 'x' filled by 0, 3 and 5 respectively.
df_fillna_0, df_fillna_3, df_fillna_5 = (
    df.fillna(value=fill, column_names=['x']) for fill in (0, 3, 5)
)

# N-d statistics
All statistical methods can be computed on N-dimensional regular grids.

In [None]:
# Mean of x on a 1-d grid: binned by y into 20 bins with limits [-10, 10].
df.mean(df.x, binby=df.y, limits=[-10, 10], shape=20)

# Visualization
The N-d statistics are the basis for many of the built-in visualizations.

In [None]:
# 1-d plot of x with limits [-10, 10]; the trailing ';' hides the return-value repr.
df.plot1d(df.x, limits=[-10, 10]);

In [None]:
# 2-d plot of x vs y with limits=[-10, 10]; the trailing ';' hides the return-value repr.
df.plot(df.x, df.y, limits=[-10, 10]);

# Interactive viz
Based on ipywidgets / bqplot, you can even do interactive visualization

*Note that (since we are on mybinder) we only use 100,000 rows, instead of 150,000,000 or >1,000,000,000 rows. Download the full dataset from https://docs.vaex.io/en/latest/datasets.html if you want to try it out on your local computer.*

In [None]:
# Open the local Arrow file holding the first 100,000 rows of the NYC taxi data.
df_taxi = vaex.open('./nyc_taxi_2015_100k.arrow')

In [None]:
# Interactive widget plotting drop-off longitude against drop-off latitude,
# with the selection controls switched on.
df_taxi.plot_widget(
    df_taxi.dropoff_longitude,
    df_taxi.dropoff_latitude,
    shape=400,
    f='log1p',
    controls_selection=True,
)