Skip to content

Commit

Permalink
Make extending vaex with new functions more convenient
Browse files Browse the repository at this point in the history
  • Loading branch information
maartenbreddels committed Mar 18, 2019
1 parent f425b29 commit 08b340d
Show file tree
Hide file tree
Showing 8 changed files with 181 additions and 147 deletions.
25 changes: 23 additions & 2 deletions docs/source/api.rst
Expand Up @@ -55,7 +55,7 @@ vaex-core
---------

.. automodule:: vaex
:members: open, from_arrays, from_dict, from_items, from_arrow_table, from_csv, from_ascii, from_pandas, from_astropy_table, from_samp, open_many, server, example, app, delayed
:members: open, from_arrays, from_dict, from_items, from_arrow_table, from_csv, from_ascii, from_pandas, from_astropy_table, from_samp, open_many, register_function, server, example, app, delayed
:undoc-members:
:show-inheritance:

Expand All @@ -76,6 +76,27 @@ DataFrameLocal class
:special-members:


Expression class
~~~~~~~~~~~~~~~~

.. autoclass:: vaex.expression.Expression
:members:
:special-members:



String operations
~~~~~~~~~~~~~~~~~

.. autoclass:: vaex.expression.StringOperations
:members:
:special-members:

.. autoclass:: vaex.expression.StringOperationsPandas
:members:
:special-members:


vaex.stat module
~~~~~~~~~~~~~~~~

Expand Down Expand Up @@ -250,7 +271,7 @@ Boosted trees
.. autoclass:: vaex.ml.lightgbm.LightGBMClassifier
:members:

.. autoclass:: vaex.ml.xgboost.XGBModel
.. autoclass:: vaex.ml.xgboost.XGBoostModel
:members:

.. PyGBM support is in the incubator phase, which means support may disappear in future versions
Expand Down
4 changes: 2 additions & 2 deletions docs/source/conf.py
Expand Up @@ -69,9 +69,9 @@
if 1:
import vaex
# The short X.Y version.
version = vaex.version
version = vaex.__version__
# The full version, including alpha/beta/rc tags.
release = vaex.version
release = vaex.__version__
else:
print("failed finding vaex module, try finding version")
import sys
Expand Down
1 change: 1 addition & 0 deletions packages/vaex-core/vaex/__init__.py
Expand Up @@ -36,6 +36,7 @@
import glob
import vaex.dataframe
import vaex.dataset
from vaex.functions import register_function
from . import stat
# import vaex.file
# import vaex.export
Expand Down
64 changes: 32 additions & 32 deletions packages/vaex-core/vaex/dataframe.py
Expand Up @@ -31,7 +31,7 @@
import logging
import vaex.kld
from . import selections, tasks, scopes
from .functions import expression_namespace
from .expression import expression_namespace
from .delayed import delayed, delayed_args, delayed_list
from .column import Column, ColumnIndexed, ColumnSparse, ColumnString, ColumnConcatenatedLazy, str_type
import vaex.events
Expand Down Expand Up @@ -209,6 +209,37 @@ def __getattr__(self, name):
else:
return object.__getattribute__(self, name)

@property
def func(self):
    """Return a namespace object whose attributes build lazy function-call expressions.

    For every entry in ``expression_namespace`` an attribute of the same name
    is created; calling it formats its arguments into the source text
    ``name(arg, ..., key=value)`` and returns a ``vaex.expression.Expression``
    bound to this DataFrame. Entries of ``self.functions`` are attached
    afterwards, so they take precedence over namespace entries with the
    same name.
    """
    class Functions(object):
        pass

    def as_source(item):
        # Expressions are embedded via str(); any other value via repr().
        return str(item) if isinstance(item, Expression) else repr(item)

    def closure(function_name):
        def wrap(*args, **kwargs):
            positional = [as_source(arg) for arg in args]
            keywords = ['{}={}'.format(key, as_source(val)) for key, val in kwargs.items()]
            call_source = "{}({})".format(function_name, ", ".join(positional + keywords))
            return vaex.expression.Expression(self, call_source)
        return wrap

    functions = Functions()
    for name, value in expression_namespace.items():
        builder = closure(name)
        try:
            # Copy the metadata (name, docstring, ...) of the real function.
            builder = functools.wraps(value)(builder)
        except AttributeError:
            # Python 2 quirk: wraps() can fail when ``value`` lacks the
            # attributes it copies; keep the undecorated builder then.
            pass
        setattr(functions, name, builder)
    for name, value in self.functions.items():
        setattr(functions, name, value)
    return functions

@_hidden
@vaex.utils.deprecated('use is_category')
def iscategory(self, column):
Expand Down Expand Up @@ -4452,37 +4483,6 @@ class Datas(object):
setattr(datas, name, array)
return datas

@property
def func(self):
    """Return a namespace object whose attributes build lazy function-call expressions.

    For every entry in ``expression_namespace`` an attribute of the same name
    is created; calling it formats its arguments into the source text
    ``name(arg, ..., key=value)`` and returns a ``vaex.expression.Expression``
    bound to this DataFrame. Entries of ``self.functions`` are attached
    afterwards, so they take precedence over namespace entries with the
    same name.
    """
    class Functions(object):
        pass

    def as_source(item):
        # Expressions are embedded via str(); any other value via repr().
        return str(item) if isinstance(item, Expression) else repr(item)

    def closure(function_name):
        def wrap(*args, **kwargs):
            positional = [as_source(arg) for arg in args]
            keywords = ['{}={}'.format(key, as_source(val)) for key, val in kwargs.items()]
            call_source = "{}({})".format(function_name, ", ".join(positional + keywords))
            return vaex.expression.Expression(self, call_source)
        return wrap

    functions = Functions()
    for name, value in expression_namespace.items():
        builder = closure(name)
        try:
            # Copy the metadata (name, docstring, ...) of the real function.
            builder = functools.wraps(value)(builder)
        except AttributeError:
            # Python 2 quirk: wraps() can fail when ``value`` lacks the
            # attributes it copies; keep the undecorated builder then.
            pass
        setattr(functions, name, builder)
    for name, value in self.functions.items():
        setattr(functions, name, value)
    return functions

def copy(self, column_names=None, virtual=True):
df = DataFrameArrays()
df._length_unfiltered = self._length_unfiltered
Expand Down
63 changes: 14 additions & 49 deletions packages/vaex-core/vaex/expression.py
Expand Up @@ -2,7 +2,6 @@
import six
import functools
from future.utils import with_metaclass
from vaex.functions import expression_namespace, _scopes
from vaex.utils import _ensure_strings_from_expressions, _ensure_string_from_expression
from vaex.column import ColumnString
import numpy as np
Expand All @@ -18,6 +17,10 @@
# TODO: repeated from dataframe.py
default_shape = 128

expression_namespace = {}
expression_namespace['nan'] = np.nan


_binary_ops = [
dict(code="+", name='add', op=operator.add),
dict(code="in", name='contains', op=operator.contains),
Expand Down Expand Up @@ -94,33 +97,11 @@ def f(a):
return Expression(self.ds, expression=expression)
attrs['__%s__' % op['name']] = f
wrap(op)
for name, func_real in expression_namespace.items():
def wrap(name=name):
def f(*args, **kwargs):
self = args[0]

def to_expression(expression):
if isinstance(expression, str):
expression = repr(expression)
if isinstance(expression, Expression):
assert expression.ds == self.ds
expression = expression.expression
return expression
expressions = [to_expression(e) for e in args]
# print(name, expressions)
expression = '{0}({1})'.format(name, ", ".join(expressions))
return Expression(self.ds, expression=expression)
try:
f = functools.wraps(func_real)(f)
except AttributeError:
pass # numpy ufuncs don't have a __module__, which may choke wraps

attrs['%s' % name] = f
if name not in attrs:
wrap(name)
return type(future_class_name, future_class_parents, attrs)


class DateTime(object):
"""DateTime operations"""
def __init__(self, expression):
self.expression = expression

Expand All @@ -144,37 +125,21 @@ def hour(self):
def weekofyear(self):
return self.expression.ds.func.dt_weekofyear(self.expression)


class StringOperations(object):
"""String operations"""
def __init__(self, expression):
self.expression = expression

class PandasStringOperations(object):

class StringOperationsPandas(object):
"""String operations using Pandas Series"""
def __init__(self, expression):
self.expression = expression

# Give StringOperations one forwarding method per function registered in the
# 'str' scope: calling ``accessor.<name>(...)`` looks up ``str_<name>`` on the
# DataFrame's ``func`` namespace and calls it with the wrapped expression as
# the first argument.
for name, function in _scopes['str'].items():
    full_name = 'str_' + name

    # Defaults bind the current loop values so each wrapper keeps its own name.
    def closure(name=name, full_name=full_name, function=function):
        def wrapper(self, *args, **kwargs):
            lazy_func = getattr(self.expression.ds.func, full_name)
            return lazy_func(self.expression, *args, **kwargs)
        return wrapper

    setattr(StringOperations, name, closure())

# Give PandasStringOperations one forwarding method per function registered in
# the 'str_pandas' scope: calling ``accessor.<name>(...)`` looks up
# ``str_pandas_<name>`` on the DataFrame's ``func`` namespace and calls it with
# the wrapped expression as the first argument.
for name, function in _scopes['str_pandas'].items():
    full_name = 'str_pandas_' + name

    # Defaults bind the current loop values so each wrapper keeps its own name.
    def closure(name=name, full_name=full_name, function=function):
        def wrapper(self, *args, **kwargs):
            lazy_func = getattr(self.expression.ds.func, full_name)
            return lazy_func(self.expression, *args, **kwargs)
        return wrapper

    setattr(PandasStringOperations, name, closure())

class Expression(with_metaclass(Meta)):
"""Expression class"""
def __init__(self, ds, expression):
self.ds = ds
if isinstance(expression, Expression):
Expand All @@ -193,7 +158,7 @@ def str(self):
@property
def str_pandas(self):
"""Gives access to string operations (using Pandas Series)"""
return PandasStringOperations(self)
return StringOperationsPandas(self)

@property
def values(self):
Expand Down Expand Up @@ -430,7 +395,7 @@ def jit_numba(self, verbose=False):
import imp
import hashlib
names = []
funcs = set(vaex.dataset.expression_namespace.keys())
funcs = set(expression_namespace.keys())
# if it's a virtual column, we probably want to optimize that
# TODO: fully extract the virtual columns, i.e. depending ones?
expression = self.expression
Expand Down Expand Up @@ -459,7 +424,7 @@ def jit_pythran(self, verbose=False):
import hashlib
# self._import_all(module)
names = []
funcs = set(vaex.dataset.expression_namespace.keys())
funcs = set(expression_namespace.keys())
expression = self.expression
if expression in self.ds.virtual_columns:
expression = self.ds.virtual_columns[self.expression]
Expand All @@ -484,7 +449,7 @@ def f({0}):

module = imp.load_dynamic(module_name, module_path)
function_name = "f_" + m.hexdigest()
vaex.dataset.expression_namespace[function_name] = module.f
expression_namespace[function_name] = module.f

return Expression(self.ds, "{0}({1})".format(function_name, argstring))
finally:
Expand Down

0 comments on commit 08b340d

Please sign in to comment.