In [31]:
import os
import pandas as pd
import numpy as np
from uuid import uuid4
import pickle

In [32]:
# Benchmark fixture shared by every timing cell below: an n-by-m frame of
# standard-normal floats whose column labels are random UUID strings.
SEED = 42  # fixed so the frame's values are identical on every Restart & Run All
n, m = 1000, 20
# A local RandomState keeps the draw reproducible without touching NumPy's
# global random state. The UUID column labels still differ from run to run.
frame = pd.DataFrame(
    data=np.random.RandomState(SEED).randn(n, m),
    columns=[str(uuid4()) for _ in range(m)],
)

## Attempts without Mongo

In [33]:
%%timeit
# Round trip through a Parquet file in the working directory: write `frame`
# with the pyarrow engine, then read the same file back.
# Both statements are in the timed body, so the figure covers write + read.
frame.to_parquet("test.parquet", engine="pyarrow")
pd.read_parquet("test.parquet", engine="pyarrow")

10.5 ms ± 1.39 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [34]:
%%timeit
# Round trip through a pickle file in the working directory: write `frame`,
# then read the same file back.
# Both statements are in the timed body, so the figure covers write + read.
frame.to_pickle("test.pickle")
pd.read_pickle("test.pickle")

1.69 ms ± 215 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


## Connecting to Mongo

In [36]:
# Keyword arguments for mongoengine.connect: database 'mongo' on the host
# named 'mongo'.
MONGODB_SETTINGS = {'db': 'mongo', 'host': 'mongo'}

# Import only the names the notebook uses rather than `from mongoengine import *`,
# so it stays clear where `Document` and `connect` come from and nothing else
# leaks into the notebook namespace.
from mongoengine import Document, connect

# Left as the last expression so the cell displays the returned client.
connect(**MONGODB_SETTINGS)

MongoClient(host=['mongo:27017'], document_class=dict, tz_aware=False, connect=True, read_preference=Primary())

## Using Parquet: convert the frame to Parquet and back

In [37]:
from antarctic.PandasFields import ParquetFrameField

class Maffay(Document):
    """Mongo document with a single DataFrame-valued field, used for the Parquet benchmark."""

    # Field type imported from antarctic.PandasFields; per this section's heading
    # it is expected to convert the frame to Parquet and back (implementation not shown here).
    frame = ParquetFrameField()

In [38]:
%%timeit
m=Maffay(frame=frame).save()
m.frame

10.7 ms ± 1.41 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Using Pickle: convert the frame to pickle and back

In [39]:
from antarctic.PandasFields import PickleFrameField

class Falco(Document):
    """Mongo document with a single DataFrame-valued field, used for the pickle benchmark."""

    # Field type imported from antarctic.PandasFields; per this section's heading
    # it is expected to convert the frame via pickle and back (implementation not shown here).
    frame = PickleFrameField()

In [40]:
%%timeit
f = Falco(frame=frame).save()
f.frame

2.12 ms ± 471 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


## Using Arctic

In [None]:
from arctic import Arctic 

# Point Arctic at the MongoDB host named 'mongo' (the same host string used in
# MONGODB_SETTINGS for mongoengine).
store = Arctic('mongo')

# Create the library named 'test'; no library type is passed, so Arctic's
# default applies (VersionStore, per the original note).
store.initialize_library('test')

# Look the library up by name; the timing cells below write to and read from it.
library = store['test']

In [None]:
%%timeit

# Store the data in the library
library.write('test', frame)

# Reading the data
item = library.read('test').data

Library created, but couldn't enable sharding: no such command: 'enablesharding'. This is OK if you're not 'admin'


In [42]:
%%timeit

# Store the data in the library
library.write('test', frame)

# Reading the data
item = library.read('test').data

15.2 ms ± 1.68 ms per loop (mean ± std. dev. of 7 runs, 100 loops each)
