In [13]:
import string
import time
import pandas as pd
import pickle
import random
import dill
import pyarrow.feather as feather
from io import BytesIO
from pyarrow import plasma

# Generate DataFrame

In [2]:
def random_string(length): 
    letters = string.ascii_letters 
    return ''.join(random.choice(letters) for _ in range(length))

def random_strings(string_length, list_length): 
    return [random_string(string_length) for _ in range(list_length)]

In [3]:
string_length = 16
list_length = 5*10**6
df = pd.DataFrame(dict(a=random_strings(string_length, list_length), b=random_strings(string_length, list_length)))

In [4]:
df2 = df
df2['c'] = df.a
df2['d'] = df.b

df4 = df2
df2['e'] = df2.a
df2['f'] = df2.b

In [61]:
df.memory_usage(deep=True)/2**20

Index      0.000076
a        348.091125
b        348.091125
c        348.091125
d        348.091125
dtype: float64

In [62]:
df2.memory_usage(deep=True)/2**20

Index      0.000076
a        348.091125
b        348.091125
c        348.091125
d        348.091125
dtype: float64

In [5]:
df4.memory_usage(deep=True)/2**20

Index      0.000076
a        348.091125
b        348.091125
c        348.091125
d        348.091125
e        348.091125
f        348.091125
dtype: float64

# Serialization

In [40]:
%%time
pickled = pickle.dumps(df)
depickled_df = pickle.loads(pickled)

CPU times: user 5.17 s, sys: 769 ms, total: 5.94 s
Wall time: 5.9 s


In [67]:
del(pickled)
del(depickled_df)

NameError: name 'pickled' is not defined

In [42]:
%%time
dilled = dill.dumps(df)
dedilled_df = pickle.loads(dilled)

CPU times: user 35.5 s, sys: 1.12 s, total: 36.6 s
Wall time: 36.6 s


In [68]:
del(dilled)
del(dedilled_df)

NameError: name 'dilled' is not defined

In [44]:
%%time
feathered = BytesIO()
feather.write_feather(df, feathered)
defeathered_df = feather.read_feather(feathered)

CPU times: user 4.42 s, sys: 1.58 s, total: 6 s
Wall time: 5.97 s


In [66]:
del(feathered)
del(defeathered_df)

NameError: name 'feathered' is not defined

# Pickled then dilled

In [8]:
%%time
pickled_then_dilled = dill.dumps(pickle.dumps(df4))
dedilled_then_depickled_df = pickle.loads(dill.loads(pickled_then_dilled))

CPU times: user 10.9 s, sys: 1.34 s, total: 12.2 s
Wall time: 12.1 s


In [9]:
del(pickled_then_dilled)
del(dedilled_then_depickled_df)

# Feathered then dilled

In [6]:
%%time
feathered = BytesIO()
feather.write_feather(df4, feathered)
feathered_then_dilled = dill.dumps(feathered)

dedilled = dill.loads(feathered_then_dilled)
dedilled_then_defeathered = feather.read_feather(dedilled)

CPU times: user 13.9 s, sys: 6.46 s, total: 20.4 s
Wall time: 20.4 s


In [7]:
del(feathered)
del(feathered_then_dilled)
del(dedilled)
del(dedilled_then_defeathered)

# Feathered then in ram

In [10]:
%%time
feather_file = '/dev/shm/df.feather' 
feather.write_feather(df2, feather_file)
defeathered = feather.read_feather(feather_file)

CPU times: user 12.5 s, sys: 4.64 s, total: 17.1 s
Wall time: 17 s


In [11]:
del(defeathered)

# Plasma

In [18]:
plasma_client = plasma.connect('/tmp/plasma')
object_id = plasma_client.put(df2)
deplasmaed = plasma_client.get(object_id)

CPU times: user 682 µs, sys: 0 ns, total: 682 µs
Wall time: 804 µs


In [None]:
del(plasma_)