From a6f04ec23e86134e890e9b3ddb4cb6da85bb33f9 Mon Sep 17 00:00:00 2001 From: Maarten Breddels Date: Tue, 19 Mar 2019 09:12:37 +0100 Subject: [PATCH] Make extending with new functions vaex more convenient (#188) * Make extending with new functions vaex more convenient --- docs/source/api.rst | 25 +++- docs/source/conf.py | 4 +- packages/vaex-core/vaex/__init__.py | 1 + packages/vaex-core/vaex/dataframe.py | 64 +++++----- packages/vaex-core/vaex/expression.py | 63 +++------- packages/vaex-core/vaex/functions.py | 174 +++++++++++++++++--------- packages/vaex-core/vaex/scopes.py | 2 +- packages/vaex-core/vaex/stat.py | 2 +- 8 files changed, 186 insertions(+), 149 deletions(-) diff --git a/docs/source/api.rst b/docs/source/api.rst index 56762ecd71..ab5fc8ddff 100644 --- a/docs/source/api.rst +++ b/docs/source/api.rst @@ -55,7 +55,7 @@ vaex-core --------- .. automodule:: vaex - :members: open, from_arrays, from_dict, from_items, from_arrow_table, from_csv, from_ascii, from_pandas, from_astropy_table, from_samp, open_many, server, example, app, delayed + :members: open, from_arrays, from_dict, from_items, from_arrow_table, from_csv, from_ascii, from_pandas, from_astropy_table, from_samp, open_many, register_function, server, example, app, delayed :undoc-members: :show-inheritance: @@ -76,6 +76,27 @@ DataFrameLocal class :special-members: +Expression class +~~~~~~~~~~~~~~~~ + +.. autoclass:: vaex.expression.Expression + :members: + :special-members: + + + +String operations +~~~~~~~~~~~~~~~~ + +.. autoclass:: vaex.expression.StringOperations + :members: + :special-members: + +.. autoclass:: vaex.expression.StringOperationsPandas + :members: + :special-members: + + vaex.stat module ~~~~~~~~~~~~~~~~ @@ -250,7 +271,7 @@ Boosted trees .. autoclass:: vaex.ml.lightgbm.LightGBMClassifier :members: -.. autoclass:: vaex.ml.xgboost.XGBModel +.. autoclass:: vaex.ml.xgboost.XGBoostModel :members: .. PyGBM support is in the incubator phase, which means support may disappear in future versions diff --git a/docs/source/conf.py b/docs/source/conf.py index caf9666710..aedf1fd3c8 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -69,9 +69,9 @@ if 1: import vaex # The short X.Y version. - version = vaex.version + version = vaex.__version__ # The full version, including alpha/beta/rc tags. - release = vaex.version + release = vaex.__version__ else: print("failed finding vaex module, try finding version") import sys diff --git a/packages/vaex-core/vaex/__init__.py b/packages/vaex-core/vaex/__init__.py index 78d766823d..b06720a07c 100644 --- a/packages/vaex-core/vaex/__init__.py +++ b/packages/vaex-core/vaex/__init__.py @@ -36,6 +36,7 @@ import glob import vaex.dataframe import vaex.dataset +from vaex.functions import register_function from . import stat # import vaex.file # import vaex.export diff --git a/packages/vaex-core/vaex/dataframe.py b/packages/vaex-core/vaex/dataframe.py index deeb9b25a2..0a0104dc6d 100644 --- a/packages/vaex-core/vaex/dataframe.py +++ b/packages/vaex-core/vaex/dataframe.py @@ -31,7 +31,7 @@ import logging import vaex.kld from . import selections, tasks, scopes -from .functions import expression_namespace +from .expression import expression_namespace from .delayed import delayed, delayed_args, delayed_list from .column import Column, ColumnIndexed, ColumnSparse, ColumnString, ColumnConcatenatedLazy, str_type import vaex.events @@ -209,6 +209,37 @@ def __getattr__(self, name): else: return object.__getattribute__(self, name) + @property + def func(self): + class Functions(object): + pass + + functions = Functions() + for name, value in expression_namespace.items(): + # f = vaex.expression.FunctionBuiltin(self, name) + def closure(name=name, value=value): + local_name = name + def wrap(*args, **kwargs): + def myrepr(k): + if isinstance(k, Expression): + return str(k) + else: + return repr(k) + arg_string = ", ".join([myrepr(k) for k in args] + ['{}={}'.format(name, myrepr(value)) for name, value in kwargs.items()]) + expression = "{}({})".format(local_name, arg_string) + return vaex.expression.Expression(self, expression) + return wrap + f = closure() + try: + f = functools.wraps(value)(f) + except AttributeError: + pass # python2 quicks.. ? + setattr(functions, name, f) + for name, value in self.functions.items(): + setattr(functions, name, value) + + return functions + @_hidden @vaex.utils.deprecated('use is_category') def iscategory(self, column): @@ -4452,37 +4483,6 @@ class Datas(object): setattr(datas, name, array) return datas - @property - def func(self): - class Functions(object): - pass - - functions = Functions() - for name, value in expression_namespace.items(): - # f = vaex.expression.FunctionBuiltin(self, name) - def closure(name=name, value=value): - local_name = name - def wrap(*args, **kwargs): - def myrepr(k): - if isinstance(k, Expression): - return str(k) - else: - return repr(k) - arg_string = ", ".join([myrepr(k) for k in args] + ['{}={}'.format(name, myrepr(value)) for name, value in kwargs.items()]) - expression = "{}({})".format(local_name, arg_string) - return vaex.expression.Expression(self, expression) - return wrap - f = closure() - try: - f = functools.wraps(value)(f) - except AttributeError: - pass # python2 quicks.. ? - setattr(functions, name, f) - for name, value in self.functions.items(): - setattr(functions, name, value) - - return functions - def copy(self, column_names=None, virtual=True): df = DataFrameArrays() df._length_unfiltered = self._length_unfiltered diff --git a/packages/vaex-core/vaex/expression.py b/packages/vaex-core/vaex/expression.py index 5b17c4feb1..b1811d8562 100644 --- a/packages/vaex-core/vaex/expression.py +++ b/packages/vaex-core/vaex/expression.py @@ -8,7 +8,6 @@ import numpy as np import tabulate -from vaex.functions import expression_namespace, _scopes from vaex.utils import _ensure_strings_from_expressions, _ensure_string_from_expression from vaex.column import ColumnString import vaex.serialize @@ -24,6 +23,10 @@ default_shape = 128 PRINT_MAX_COUNT = 10 +expression_namespace = {} +expression_namespace['nan'] = np.nan + + _binary_ops = [ dict(code="+", name='add', op=operator.add), dict(code="in", name='contains', op=operator.contains), @@ -100,33 +103,11 @@ def f(a): return Expression(self.ds, expression=expression) attrs['__%s__' % op['name']] = f wrap(op) - for name, func_real in expression_namespace.items(): - def wrap(name=name): - def f(*args, **kwargs): - self = args[0] - - def to_expression(expression): - if isinstance(expression, str): - expression = repr(expression) - if isinstance(expression, Expression): - assert expression.ds == self.ds - expression = expression.expression - return expression - expressions = [to_expression(e) for e in args] - # print(name, expressions) - expression = '{0}({1})'.format(name, ", ".join(expressions)) - return Expression(self.ds, expression=expression) - try: - f = functools.wraps(func_real)(f) - except AttributeError: - pass # numpy ufuncs don't have a __module__, which may choke wraps - - attrs['%s' % name] = f - if name not in attrs: - wrap(name) return type(future_class_name, future_class_parents, attrs) + class DateTime(object): + """DateTime operations""" def __init__(self, expression): self.expression = expression @@ -150,37 +131,21 @@ def hour(self): def weekofyear(self): return self.expression.ds.func.dt_weekofyear(self.expression) + class StringOperations(object): """String operations""" def __init__(self, expression): self.expression = expression -class PandasStringOperations(object): + +class StringOperationsPandas(object): """String operations using Pandas Series""" def __init__(self, expression): self.expression = expression -for name, function in _scopes['str'].items(): - full_name = 'str_' + name - def closure(name=name, full_name=full_name, function=function): - def wrapper(self, *args, **kwargs): - lazy_func = getattr(self.expression.ds.func, full_name) - args = (self.expression, ) + args - return lazy_func(*args, **kwargs) - return wrapper - setattr(StringOperations, name, closure()) - -for name, function in _scopes['str_pandas'].items(): - full_name = 'str_pandas_' + name - def closure(name=name, full_name=full_name, function=function): - def wrapper(self, *args, **kwargs): - lazy_func = getattr(self.expression.ds.func, full_name) - args = (self.expression, ) + args - return lazy_func(*args, **kwargs) - return wrapper - setattr(PandasStringOperations, name, closure()) class Expression(with_metaclass(Meta)): + """Expression class""" def __init__(self, ds, expression): self.ds = ds if isinstance(expression, Expression): @@ -199,7 +164,7 @@ def str(self): @property def str_pandas(self): """Gives access to string operations (using Pandas Series)""" - return PandasStringOperations(self) + return StringOperationsPandas(self) @property def values(self): @@ -475,7 +440,7 @@ def jit_numba(self, verbose=False): import imp import hashlib names = [] - funcs = set(vaex.dataset.expression_namespace.keys()) + funcs = set(expression_namespace.keys()) # if it's a virtual column, we probably want to optimize that # TODO: fully extract the virtual columns, i.e. depending ones? expression = self.expression @@ -504,7 +469,7 @@ def jit_pythran(self, verbose=False): import hashlib # self._import_all(module) names = [] - funcs = set(vaex.dataset.expression_namespace.keys()) + funcs = set(expression_namespace.keys()) expression = self.expression if expression in self.ds.virtual_columns: expression = self.ds.virtual_columns[self.expression] @@ -529,7 +494,7 @@ def f({0}): module = imp.load_dynamic(module_name, module_path) function_name = "f_" + m.hexdigest() - vaex.dataset.expression_namespace[function_name] = module.f + expression_namespace[function_name] = module.f return Expression(self.ds, "{0}({1})".format(function_name, argstring)) finally: diff --git a/packages/vaex-core/vaex/functions.py b/packages/vaex-core/vaex/functions.py index 39791e80a8..ee4d92f769 100644 --- a/packages/vaex-core/vaex/functions.py +++ b/packages/vaex-core/vaex/functions.py @@ -4,13 +4,75 @@ from vaex import column from vaex.column import _to_string_sequence import re +import vaex.expression +import functools -# @vaex.serialize.register +# @vaex.serialize.register_function # class Function(FunctionSerializable): +scopes = { + 'str': vaex.expression.StringOperations, + 'str_pandas': vaex.expression.StringOperationsPandas, + 'dt': vaex.expression.DateTime +} + +def register_function(scope=None, as_property=False, name=None): + """Decorator to register a new function with vaex. + + Example: + + >>> import vaex + >>> df = vaex.example() + >>> @vaex.register_function() + >>> def invert(x): + >>> return 1/x + >>> df.x.invert() + + + >>> import numpy as np + >>> df = vaex.from_arrays(departure=np.arange('2015-01-01', '2015-12-05', dtype='datetime64')) + >>> @vaex.register_function(as_property=True, scope='dt') + >>> def dt_relative_day(x): + >>> return vaex.functions.dt_dayofyear(x)/365. + >>> df.departure.dt.relative_day + """ + prefix = '' + if scope: + prefix = scope + "_" + if scope not in scopes: + raise KeyError("unknown scope") + def wrapper(f, name=name): + name = name or f.__name__ + # remove possible prefix + if name.startswith(prefix): + name = name[len(prefix):] + full_name = prefix + name + if scope: + def closure(name=name, full_name=full_name, function=f): + def wrapper(self, *args, **kwargs): + lazy_func = getattr(self.expression.ds.func, full_name) + args = (self.expression, ) + args + return lazy_func(*args, **kwargs) + return functools.wraps(function)(wrapper) + if as_property: + setattr(scopes[scope], name, property(closure())) + else: + setattr(scopes[scope], name, closure()) + else: + def closure(name=name, full_name=full_name, function=f): + def wrapper(self, *args, **kwargs): + lazy_func = getattr(self.ds.func, full_name) + args = (self, ) + args + return lazy_func(*args, **kwargs) + return functools.wraps(function)(wrapper) + setattr(vaex.expression.Expression, name, closure()) + vaex.expression.expression_namespace[prefix + name] = f + return f # we leave the original function as is + return wrapper + # name maps to numpy function # : -function_mapping = [name.strip().split(":") if ":" in name else (name, name) for name in """ +numpy_function_mapping = [name.strip().split(":") if ":" in name else (name, name) for name in """ sinc sin cos @@ -38,17 +100,25 @@ minimum maximum clip -nan searchsorted """.strip().split()] -expression_namespace = {} -for name, numpy_name in function_mapping: +for name, numpy_name in numpy_function_mapping: if not hasattr(np, numpy_name): raise SystemError("numpy does not have: %s" % numpy_name) else: - expression_namespace[name] = getattr(np, numpy_name) - - + function = getattr(np, numpy_name) + def f(function=function): + def wrapper(*args, **kwargs): + return function(*args, **kwargs) + return wrapper + try: + function = functools.wraps(function)(f()) + except AttributeError: + function = f() # python 2 case + register_function(name=name)(function) + + +@register_function() def fillna(ar, value, fill_nan=True, fill_masked=True): '''Returns an array where missing values are replaced by value. @@ -74,67 +144,47 @@ def fillna(ar, value, fill_nan=True, fill_masked=True): return ar -expression_namespace['fillna'] = fillna - +########## datetime operations ########## +@register_function(scope='dt', as_property=True) def dt_dayofweek(x): import pandas as pd # x = x.astype("