In [1]:
import pandas as pd
import random
from contextlib import contextmanager
import time
import json
import re

In [20]:
# random_step iterator
class random_step:
    def __init__(self, size, step):
        self.size = size
        self.step = step

    def __iter__(self):
        idx = 0
        cnt = 0
        while cnt < self.size:
            idx += random.randint(1, self.step)
            cnt += 1
            yield idx

# random bucket iterator - simulate API bucket walker
class random_bucket:
    def __init__(self, nsub, nott, step):
        self.nsub = nsub
        self.nott = nott
        self.step = step

    def __iter__(self):
        iter_sub = map(lambda x: '%+88693{:07.0f}'.format(x), random_step(self.nsub, self.step))
        for sub in iter_sub:
            iter_ott = map(lambda x: '%ott-{:04.0f}.com.tw'.format(x), random_step(self.nott, self.step))
            for ott in iter_ott:
                mtr_f = random.randint(100000000, 999999999)
                mtr_o = random.randint(100000000, 999999999)
                row = [sub, ott, mtr_f, mtr_o]
                yield row

In [22]:
# the generated ott is not randomized
data = list(random_bucket(3, 4, 5))
data

[['%+886930000004', '%ott-0005.com.tw', 412941572, 668304142],
 ['%+886930000004', '%ott-0009.com.tw', 387444304, 793767575],
 ['%+886930000004', '%ott-0013.com.tw', 323989728, 807475506],
 ['%+886930000004', '%ott-0015.com.tw', 297546486, 680729174],
 ['%+886930000006', '%ott-0003.com.tw', 578292103, 723723210],
 ['%+886930000006', '%ott-0007.com.tw', 110997896, 854545933],
 ['%+886930000006', '%ott-0008.com.tw', 699525597, 205737696],
 ['%+886930000006', '%ott-0013.com.tw', 641986948, 373141876],
 ['%+886930000008', '%ott-0004.com.tw', 326832810, 789878284],
 ['%+886930000008', '%ott-0008.com.tw', 230464311, 553839994],
 ['%+886930000008', '%ott-0010.com.tw', 402647507, 320826626],
 ['%+886930000008', '%ott-0015.com.tw', 130178497, 863519153]]

In [None]:
text = json.dumps(data).encode()
len(text)

In [None]:
df = pd.DataFrame(data, columns=['user', 'ott', 'forward', 'opposite'])
# df = pd.DataFrame(data, columns=['user', 'ott', 'forward', 'opposite'])
df.shape

In [None]:
df.memory_usage(deep=True)

In [None]:
@contextmanager
def mytimer():
    t0 = time.time()
    try:
        yield t0
    finally:
        t1 = time.time()
        print(t1-t0)

### Line JSON

In [None]:
# it took 2m 34.9s, the result is 5.2GB (compressed?)
# the memory go up to 17GB in the process
columns = ['sub','ott','forward','opposite']
with mytimer():
    # iter = random_bucket(100, 100, 20) # 10 sub, each with 10 ott
    iter = random_bucket(10000000, 1000, 20) # 1,000,000 sub, each with 100 ott
    df = pd.DataFrame(iter, columns=columns)
print(df.shape)

In [None]:
df.memory_usage(deep=True)
# Index           128
# 0        6620168226
# 1        6806651838
# 2         745934448
# 3         745934448

In [None]:
df2 = df.astype({'sub': 'category', 'ott': 'category'})
df2.memory_usage(deep=True)

In [None]:
df.info()

In [None]:
df2.info()

In [None]:
df.dtypes

### JSON

In [None]:
# instantiate random buckets and tranlate them to JSON
# it took 3m 35.5s
with mytimer():
    data = json.dumps(list(random_bucket()))

In [None]:
# the result is 5.8GB, but the memory go up to 23GB in the process
len(data)

In [None]:
# convert it to frame, ends with 36GB usage
# it took 10m 46.5
with mytimer():
    df = pd.read_json(data, orient='values')

In [None]:
df.memory_usage(deep=True)
# Index           128
# 0        6620168226
# 1        6806651838
# 2         745934448
# 3         745934448

In [None]:
df.head()

In [None]:
df.shape
# (93241806, 4)