In [2]:
import fletcher as fr
import pandas as pd
import pyarrow as pa
import numpy as np

In [3]:
# Seed NumPy's global generator so the sampled strings -- and every result
# derived from them -- are identical on each Restart & Run All.
# (Outputs recorded before the seed was added will change once on re-run.)
np.random.seed(0)
# 10,000 strings drawn from three candidates; two of them start with the
# non-ASCII character "É", so the slicing below is exercised on multi-byte text.
string_array = np.random.choice(['Ésdfasdfasdf', 'Ézxcvzxcvzxcv', 'qawef'], size=10000)
string_array

array(['qawef', 'qawef', 'Ézxcvzxcvzxcv', ..., 'Ézxcvzxcvzxcv', 'qawef',
       'qawef'], dtype='<U13')

In [4]:
# Wrap the sampled strings in fletcher's continuous extension array so the
# `.text` accessor can be used on the resulting Series.
fr_cont_str = pd.Series(fr.FletcherContinuousArray(string_array))

In [5]:
# Same strings, but stored in fletcher's chunked extension array, so both
# fletcher variants can be compared side by side.
fr_chunk_str = pd.Series(fr.FletcherChunkedArray(string_array))

In [6]:
# Plain pandas Series over the same strings; used as the reference in the
# equality check and as the baseline in the timing cells.
pd_str = pd.Series(string_array)

In [7]:
# Slice every string with start=0, end=2, step=1 on the continuous array.
# In the recorded output "É" counts as one element of the slice ("Éz", "És").
fr_cont_str.text.slice(0, 2, 1)

0       qa
1       qa
2       Éz
3       qa
4       Éz
        ..
9995    És
9996    És
9997    Éz
9998    qa
9999    qa
Length: 10000, dtype: fletcher_continuous[string]

In [8]:
# Same slice (start=0, end=2, step=1) on the chunked array; the result keeps
# the fletcher_chunked[string] dtype.
fr_chunk_str.text.slice(0, 2, 1)

0       qa
1       qa
2       Éz
3       qa
4       Éz
        ..
9995    És
9996    És
9997    Éz
9998    qa
9999    qa
Length: 10000, dtype: fletcher_chunked[string]

In [7]:
# Sanity check: fletcher's slice on the continuous array must agree
# element-wise with pandas' own .str.slice.
# NOTE(review): only the continuous variant is asserted here; the chunked
# variant (fr_chunk_str) is not checked against pandas.
assert (pd_str.str.slice(0, 2, 1) == fr_cont_str.text.slice(0, 2, 1)).all()

In [11]:
%%timeit
# Baseline timing: pandas' built-in .str.slice on the plain Series.
pd_str.str.slice(0, 2, 1)

2.15 ms ± 71.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [12]:
%%timeit
# Same slice through fletcher's .text accessor on the continuous array.
fr_cont_str.text.slice(0, 2, 1)

1.8 ms ± 149 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [13]:
%%timeit
# Same slice through fletcher's .text accessor on the chunked array.
fr_chunk_str.text.slice(0, 2, 1)

1.72 ms ± 16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
# Load the line_profiler extension; it provides the %lprun magic used to
# profile the slice call.
%load_ext line_profiler

In [11]:
# Line-by-line profile of one slice call; -f names the function to instrument
# (the accessor's slice method), the remainder is the statement to run.
%lprun -f fr_cont_str.text.slice fr_cont_str.text.slice(0, 2, 1)

Timer unit: 1e-06 s

Total time: 0.00229 s
File: /Users/marc_qco/repos/fletcher/fletcher/string_array.py
Function: slice at line 385

Line #      Hits         Time  Per Hit   % Time  Line Contents
   385                                               def slice(self, start, end, step):
   386         1         40.0     40.0      1.7          offsets, data = _extract_string_buffers(self.data)
   387         2        283.0    141.5     12.4          return self._series_like(
   388         1       1967.0   1967.0     85.9              finalize_string_array(_slice(offsets, data, start, end, step), pa.string())
   389                                                   )

In [12]:
# Tiny three-element example (one of them the empty string) whose underlying
# buffers are small enough to read by hand.
fr_str = pd.Series(fr.FletcherContinuousArray(["a", "awefawef", ""]))

In [18]:
# Reinterpret buffers()[1] as int32. The recorded values 0, 1, 9, 9 line up
# with the cumulative lengths of "a" (1), "awefawef" (8) and "" (0).
np.asanyarray(fr_str.text.data.buffers()[1]).view(np.int32)

array([0, 1, 9, 9], dtype=int32)

In [56]:
# Reinterpret buffers()[2] as uint8: the nine character bytes of all values
# stored back to back (97 = "a", 119 = "w", 101 = "e", 102 = "f").
np.asanyarray(fr_str.text.data.buffers()[2]).view(np.uint8)

array([ 97,  97, 119, 101, 102,  97, 119, 101, 102], dtype=uint8)

In [19]:
from fletcher.algorithms.string_builder import StringArrayBuilder

In [45]:
# Create a fresh builder to experiment with.
# NOTE(review): 100 is presumably an initial capacity -- confirm against
# StringArrayBuilder's constructor.
builder = StringArrayBuilder(100)

In [31]:
# Append one entry via append_null (presumably a missing value -- confirm
# against StringArrayBuilder).
builder.append_null()

In [72]:
# Append the three bytes 1, 2, 3 as one value; the second argument (3) matches
# the array length -- presumably the value's byte length, TODO confirm.
builder.append_value(np.array([1, 2, 3], dtype=np.uint8), 3)

In [69]:
# Append with no data and a length of 0.
# NOTE(review): presumably adds an empty, non-null value -- confirm.
builder.append_value(None, 0)

In [75]:
# Inspect the raw storage behind the builder's validity bits; it displays as a
# list of integers.
builder.valid_bits.buf

[15]

In [76]:
# Binary rendering of 15, the value shown when inspecting
# builder.valid_bits.buf: the four lowest bits are set.
format(15, "#b")

'0b1111'

In [78]:
# Inspect the raw value bytes accumulated by the builder so far.
builder.data.buf

[1, 2, 3, 1, 2, 3]

In [70]:
# Inspect the builder's running offset counter.
builder.current_offset

3

In [73]:
# Inspect the raw offsets storage. It is displayed one byte per element; read
# in groups of four (little-endian) the recorded output is 0, 3, 3, 3, 6.
builder.value_offsets.buf

[0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0, 6, 0, 0, 0]

In [68]:
# The offsets are held as a list of single bytes. Loading that list with
# NumPy's default integer dtype and then calling .view() reinterprets the
# widened elements rather than the bytes themselves, so the list is loaded as
# uint8 first and then viewed as 4-byte (int32) offsets.
np.asanyarray(builder.value_offsets.buf, dtype=np.uint8).view(np.int32)

array([0, 3, 3], dtype=int32)

In [50]:
# Load the byte list as uint8 before reinterpreting it. With NumPy's default
# integer dtype each byte becomes a wider element, and viewing that as uint32
# splits every element in two instead of grouping four bytes per offset.
np.asanyarray(builder.value_offsets.buf, dtype=np.uint8).view(np.uint32)

array([0], dtype=uint32)

In [39]:
# Inspect the raw offsets storage again, one byte per element; read in groups
# of four (little-endian) the recorded output is 0, 0, 3, 3.
builder.value_offsets.buf

[0, 0, 0, 0, 0, 0, 0, 0, 3, 0, 0, 0, 3, 0, 0, 0]