In [None]:
import pandas as pd
import random
import itertools
from contextlib import contextmanager
import time
import json
import re

In [None]:
# random_range iterator
class random_range:
    """Iterable of strictly increasing numbers separated by random gaps.

    Every pass begins at ``start`` and repeatedly advances by
    ``random.randint(1, step)``, yielding each value reached while it is still
    below ``stop``. ``start`` itself is never yielded.

    Args:
        start: value the walk begins from (exclusive).
        stop: exclusive upper bound; the pass ends once it is reached or passed.
        step: largest possible gap between consecutive values; it is passed to
            ``random.randint(1, step)``, so it needs to be an integer >= 1.
    """

    def __init__(self, start, stop, step):
        self.start = start
        self.stop = stop
        self.step = step

    def __iter__(self):
        # Walk with a local cursor instead of advancing self.start: mutating
        # the attribute left the object exhausted after the first pass, so any
        # later iteration silently produced nothing.
        current = self.start
        while True:
            current += random.randint(1, self.step)
            if current >= self.stop:
                break
            yield current

# random bucket iterator - simulate API bucket walker
class random_bucket:
    """Iterable of synthetic ``[sub, ott, mtr_f, mtr_o]`` rows.

    Subscriber and OTT labels are generated once, at construction time, from
    two ``random_range`` walks. Iterating yields one row for every
    (sub, ott) combination, each with two freshly drawn 9-digit counters.

    Args:
        max_sub: exclusive upper bound of the subscriber number walk.
        max_ott: exclusive upper bound of the OTT number walk.
        step: largest random gap used by both walks.
    """

    def __init__(self, max_sub, max_ott, step):
        # Materialise both label sequences up front. itertools.product copies
        # its inputs into tuples before producing anything, so this holds the
        # same data the original one-shot product object held.
        self._subs = tuple(
            '%+88693{:07.0f}'.format(x) for x in random_range(0, max_sub, step)
        )
        self._otts = tuple(
            '%ott-{:04.0f}.com.tw'.format(x) for x in random_range(0, max_ott, step)
        )

    def __iter__(self):
        # Build a new product on every pass: storing a single product iterator
        # in __init__ made the object usable only once (a second iteration
        # yielded no rows at all).
        for sub, ott in itertools.product(self._subs, self._otts):
            mtr_f = random.randint(100000000, 999999999)
            mtr_o = random.randint(100000000, 999999999)
            yield [sub, ott, mtr_f, mtr_o]

In [None]:
# Materialise every generated row in a single Python list.
# Recorded memory use for this step: 14GB.
data = list(random_bucket(5000000, 1000, 20))
# Display the number of rows produced.
len(data)

In [None]:
# Serialise the whole row list as one JSON document and encode it to bytes.
text = json.dumps(data).encode()
# Display the size of the encoded payload in bytes.
len(text)

In [None]:
# Load the in-memory row list into a DataFrame, naming the four row fields.
df = pd.DataFrame(data, columns=['user', 'ott', 'forward', 'opposite'])
# Display (rows, columns) of the resulting frame.
df.shape

In [None]:
# Per-column memory footprint in bytes; deep=True asks pandas to also measure
# the Python objects held in object-dtype columns.
df.memory_usage(deep=True)

In [None]:
@contextmanager
def mytimer():
    """Context manager that prints how long its ``with`` body took.

    Yields the ``time.time()`` reading taken on entry. When the body finishes,
    whether normally or by raising, the elapsed wall-clock seconds are printed.
    """
    started = time.time()
    try:
        yield started
    finally:
        # Runs on both normal exit and exceptions, so a failed body is still timed.
        elapsed = time.time() - started
        print(elapsed)

### Line JSON

In [None]:
# Build the DataFrame straight from the bucket iterable (no intermediate list).
# Recorded run: it took 2m 34.9s, the result is 5.2GB (compressed?);
# memory went up to 17GB in the process.
columns = ['sub','ott','forward','opposite']
with mytimer():
    # The iterable is named `buckets` rather than `iter` so the builtin iter()
    # is not shadowed for the rest of the kernel session.
    # buckets = random_bucket(100, 100, 20) # roughly 10 sub, each with roughly 10 ott
    buckets = random_bucket(10000000, 1000, 20) # roughly 1,000,000 sub, each with roughly 100 ott
    df = pd.DataFrame(buckets, columns=columns)
print(df.shape)

In [None]:
# Per-column memory footprint in bytes of the current frame.
# NOTE(review): the recorded output below is labelled 0-3, whereas the frame
# built in the previous cell has named columns ('sub', 'ott', ...); the figures
# may have been recorded on a different frame — confirm.
df.memory_usage(deep=True)
# Index           128
# 0        6620168226
# 1        6806651838
# 2         745934448
# 3         745934448

In [None]:
# Re-encode the two string columns as pandas categoricals, then show the
# per-column memory of the converted copy for comparison.
df2 = df.astype(dict.fromkeys(('sub', 'ott'), 'category'))
df2.memory_usage(deep=True)

In [None]:
# Summary (dtypes, non-null counts, memory) of the original frame.
df.info()

In [None]:
# Same summary for the category-encoded copy, for side-by-side comparison.
df2.info()

In [None]:
# Column dtypes of the original frame.
df.dtypes

### JSON

In [None]:
# instantiate random buckets and translate them to JSON
# it took 3m 35.5s
with mytimer():
    # random_bucket requires (max_sub, max_ott, step); calling it with no
    # arguments raises TypeError. The values mirror the direct DataFrame
    # benchmark, whose size is consistent with the row count recorded for this
    # section (93,241,806 rows).
    data = json.dumps(list(random_bucket(10000000, 1000, 20)))

In [None]:
# Length of the JSON string in characters.
# Recorded: the result is 5.8GB, but memory went up to 23GB in the process.
len(data)

In [None]:
# Parse the JSON string back into a DataFrame.
# Recorded: it took 10m 46.5s and ended with 36GB memory usage.
# NOTE(review): newer pandas releases deprecate passing a literal JSON string
# to read_json and expect a file-like object (e.g. io.StringIO) instead;
# wrapping would add another copy of the payload — confirm against the
# installed pandas version before changing this benchmark.
with mytimer():
    df = pd.read_json(data, orient='values')

In [None]:
# Per-column memory footprint in bytes of the frame parsed from JSON.
# Recorded output:
df.memory_usage(deep=True)
# Index           128
# 0        6620168226
# 1        6806651838
# 2         745934448
# 3         745934448

In [None]:
# Preview the first rows of the parsed frame.
df.head()

In [None]:
# (rows, columns) of the parsed frame; recorded output below.
df.shape
# (93241806, 4)