Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Exporting out of memory dataframe to parquet error #2421

Open
Intijir opened this issue Apr 20, 2024 · 0 comments
Open

Exporting out of memory dataframe to parquet error #2421

Intijir opened this issue Apr 20, 2024 · 0 comments

Comments

@Intijir
Copy link

Intijir commented Apr 20, 2024

I am trying to export an out-of-memory dataframe to Parquet, as in the following code, but I keep getting the following error.

Code:

'''
import numpy as np
from matplotlib import pyplot as plt
import vaex as vd

def custom_shift(df, column):
    """Add a ``<column>_shifted`` column holding ``column`` lagged by one row.

    The shifted data is built by prepending a single ``None`` to the
    column's values and dropping the last value, so the first row of the new
    column is ``None`` and every later row holds the previous row's value.

    Args:
        df: DataFrame to extend; ``add_column`` is called on it directly.
        column: Name of the column to shift.

    Returns:
        The same ``df`` object that was passed in.
    """
    # Extract the values of the column
    # NOTE(review): `.values` presumably evaluates the whole column into
    # memory, which may matter for larger-than-memory data - confirm.
    values = vd.from_arrays(column=df[f'{column}'].values)
    # Create a new shifted column with None as the first value
    d = vd.from_arrays(column=[None])
    shifted_values = vd.concat([d, values[:-1]])
    # The keyword here only names a temporary column that is read straight
    # back out; the column actually added to `df` is named from `column`.
    # NOTE(review): this passes a DataFrame where from_arrays is given
    # arrays elsewhere - verify vaex accepts it.
    shifted_values = vd.from_arrays(ClosePrice_shifted=shifted_values).ClosePrice_shifted.values
    # Add the shifted values as a new column
    df.add_column(f'{column}_shifted', shifted_values)
    return df

def get_threshold(daily_returns, lookback=40):
    """Return one tenth of the mean exponentiated rolling standard deviation.

    Args:
        daily_returns: Object exposing a pandas-style
            ``rolling(window=...).std()`` API (e.g. a ``pandas.Series``).
        lookback: Number of observations in each rolling window.

    Returns:
        ``exp(abs(rolling_std)).mean() * 0.1``.
    """
    # This is a plain rolling-window standard deviation, not an
    # exponentially weighted one.
    rolling_std = np.abs(daily_returns.rolling(window=lookback).std())
    threshold = np.exp(rolling_std)
    return threshold.mean() * 0.1

# Add the one-row-lagged close price, then drop rows with missing values
# (custom_shift puts None in the first row of the shifted column).
ddf = custom_shift(ddf, 'ClosePrice')
ddf = ddf.dropna()

# Return relative to the previous row's close.
price_ratio = ddf['ClosePrice'] / ddf['ClosePrice_shifted']
ddf['daily_returns'] = price_ratio - 1

# Derive the threshold column by applying get_threshold to the returns.
ddf['threshold'] = ddf['daily_returns'].apply(get_threshold)

# Write the resulting dataframe to a Parquet file.
ddf.export_parquet('dollar_bars_threshold.parquet', engine='pyarrow')
'''

Error:
Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 113, in evaluate
result = self[expression]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 198, in __getitem__
raise KeyError("Unknown variables or column: %r" % (variable,))
KeyError: "Unknown variables or column: '((ClosePrice / ClosePrice_shifted) - 1)'"

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2273, in data_type
data = self.evaluate(expression, 0, 1, filtered=False, array_type=array_type, parallel=False)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 3095, in evaluate
return self._evaluate_implementation(expression, i1=i1, i2=i2, out=out, selection=selection, filtered=filtered, array_type=array_type, parallel=parallel, chunk_size=chunk_size, progress=progress)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6562, in _evaluate_implementation
value = block_scope.evaluate(expression)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 113, in evaluate
result = self[expression]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 188, in __getitem__
values = self.evaluate(expression)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\scopes.py", line 119, in evaluate
result = eval(expression, expression_namespace, self)
File "<string>", line 1, in <module>
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\numpy_dispatch.py", line 74, in operator
result_data = a.add_missing(result_data)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\numpy_dispatch.py", line 27, in add_missing
ar = vaex.array_types.to_arrow(ar)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\array_types.py", line 184, in to_arrow
return pa.array(x)
File "pyarrow\array.pxi", line 340, in pyarrow.lib.array
File "pyarrow\array.pxi", line 86, in pyarrow.lib._ndarray_to_array
File "pyarrow\error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowInvalid: only handle 1-dimensional arrays

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
File "C:\Users\Intijir\PycharmProjects\quantStrategy\Data\Labeling\cumulative_sum.py", line 89, in
ddf.export_parquet('dollar_bars_threshold.parquet', engine='pyarrow')
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6823, in export_parquet
schema = self.schema_arrow(reduce_large=True)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2335, in schema_arrow
return pa.schema({name: reduce(dtype.arrow) for name, dtype in self.schema().items()})
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2323, in schema
return {column_name:self.data_type(column_name) for column_name in self.get_column_names()}
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2323, in
return {column_name:self.data_type(column_name) for column_name in self.get_column_names()}
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 2275, in data_type
data = self.evaluate(expression, 0, 1, filtered=True, array_type=array_type, parallel=False)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 3095, in evaluate
return self._evaluate_implementation(expression, i1=i1, i2=i2, out=out, selection=selection, filtered=filtered, array_type=array_type, parallel=parallel, chunk_size=chunk_size, progress=progress)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 6402, in _evaluate_implementation
max_stop = (len(self) if (self.filtered and filtered) else self.length_unfiltered())
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 4326, in __len__
self._cached_filtered_length = int(self.count())
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 967, in count
return self._compute_agg('count', expression, binby, limits, shape, selection, delay, edges, progress, array_type=array_type)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 941, in _compute_agg
return self._delay(delay, progressbar.exit_on(var))
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 1780, in _delay
self.execute()
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataframe.py", line 421, in execute
self.executor.execute()
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\execution.py", line 308, in execute
for _ in self.execute_generator():
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\execution.py", line 432, in execute_generator
yield from self.thread_pool.map(self.process_part, dataset.chunk_iterator(run.dataset_deps, chunk_size),
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\multithreading.py", line 100, in map
iterator = super(ThreadPoolIndex, self).map(wrapped, cancellable_iter())
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 598, in map
fs = [self.submit(fn, *args) for args in zip(*iterables)]
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 598, in
fs = [self.submit(fn, *args) for args in zip(*iterables)]
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\multithreading.py", line 86, in cancellable_iter
for value in chunk_iterator:
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\dataset.py", line 1257, in chunk_iterator
for (i1, i2, ichunks), (j1, j2, jchunks) in zip(
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\dataset.py", line 182, in chunk_iterator
chunks = chunks_future.result()
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 446, in result
return self.__get_result()
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures_base.py", line 391, in __get_result
raise self._exception
File "C:\Users\Intijir\AppData\Local\Programs\Python\Python39\lib\concurrent\futures\thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "C:\Users\Intijir\PycharmProjects\quantStrategy\venv\lib\site-packages\vaex\arrow\dataset.py", line 114, in reader
table = fragment.to_table(columns=list(columns_physical), use_threads=False)
File "pyarrow_dataset.pyx", line 1613, in pyarrow._dataset.Fragment.to_table
File "pyarrow_dataset.pyx", line 3713, in pyarrow._dataset.Scanner.to_table
File "pyarrow\error.pxi", line 154, in pyarrow.lib.pyarrow_internal_check_status
File "pyarrow\error.pxi", line 91, in pyarrow.lib.check_status
pyarrow.lib.ArrowMemoryError: malloc of size 8388608 failed

Thanks!

@Intijir Intijir changed the title from "Exporting out of memory dataframe to parquet out of memory issue" to "Exporting out of memory dataframe to parquet error" on Apr 20, 2024
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
None yet
Projects
None yet
Development

No branches or pull requests

1 participant