In [None]:
#default_exp s3

# S3Cache

## Deals with everything that reads from and writes to the S3 cache for the database

In [None]:
#export
from s3bz.s3bz import S3
from nicHelper.wrappers import add_method, add_class_method, add_static_method
from nicHelper.dictUtil import stripDict, printDict, hashDict, saveStringToFile, loadStringFromFile, saveDictToFile, loadDictFromFile
from nicHelper.exception import errorString
from dict_hash import dict_hash, sha256
from base64 import b64encode, b64decode
import os, logging

In [None]:
#hide
import pickle, os

# Development-only overrides so the cells below talk to the manual dev stack.
os.environ['DATABASE_TABLE_NAME'] = 'product-table-dev-manual'
os.environ['REGION'] = 'ap-southeast-1'
os.environ['INVENTORY_BUCKET_NAME'] = 'product-bucket-dev-manual'
os.environ['INPUT_BUCKET_NAME'] = 'input-product-bucket-dev-manual'
os.environ['DAX_ENDPOINT'] = 'longtermcluster.vuu7lr.clustercfg.dax.apse1.cache.amazonaws.com:8111'
# SECURITY: the LINE token must never be committed to the notebook. It is taken
# from the caller's environment; an empty placeholder keeps later lookups working.
# NOTE(review): the token that used to be hardcoded here should be rotated.
os.environ.setdefault('LINEKEY', '')
REGION = 'ap-southeast-1'

In [None]:
# ProductDatabase is the base class that the Tester model below combines with S3Cache.
from villaProductDatabase.database import ProductDatabase
import logging
# Emit INFO-level log records in the notebook output.
logging.basicConfig(level=logging.INFO)

In [None]:
#export
import os
# Local files used for the cached database and for the hash of that cache.
DBHASHLOCATION = '/tmp/database.hash'
DBCACHELOCATION = '/tmp/database.cache'

# Settings read from the environment; each is None when the variable is unset.
DATABASE_TABLE_NAME = os.environ.get('DATABASE_TABLE_NAME')
INVENTORY_BUCKET_NAME = os.environ.get('INVENTORY_BUCKET_NAME')
INPUT_BUCKET_NAME = os.environ.get('INPUT_BUCKET_NAME')
REGION = os.environ.get('REGION') or 'ap-southeast-1'
# NOTE(review): USER is normally the OS login name on Unix systems; confirm that
# this variable is really meant to carry the access key id.
ACCESS_KEY_ID = os.environ.get('USER') or None
SECRET_ACCESS_KEY = os.environ.get('PW') or None
LINEKEY= os.environ.get('LINEKEY')

# Always bind DAX_ENDPOINT (None when the variable is missing) so that code
# testing `if DAX_ENDPOINT` does not raise NameError.
DAX_ENDPOINT = os.environ.get('DAX_ENDPOINT')
if DAX_ENDPOINT is None:
  print("dax endpoint missing 'DAX_ENDPOINT'")
else:
  print(DAX_ENDPOINT)
  

longtermcluster.vuu7lr.clustercfg.dax.apse1.cache.amazonaws.com:8111


In [None]:
#export
class S3Cache:
  '''
  Empty holder class for the S3 cache helpers.
  Its methods are attached in the cells below with add_class_method.
  '''

In [None]:
class Tester(S3Cache, ProductDatabase):
  '''Notebook-only class combining S3Cache with ProductDatabase for the checks below.'''

  class Meta:
    # Table name and region come from the environment variables set earlier in this notebook.
    table_name = os.environ['DATABASE_TABLE_NAME']
    region = os.environ['REGION']
    billing_mode = 'PAY_PER_REQUEST'
    # DAX endpoints are only listed when an endpoint value is available.
    dax_read_endpoints = [DAX_ENDPOINT] if DAX_ENDPOINT else None
    dax_write_endpoints = [DAX_ENDPOINT] if DAX_ENDPOINT else None
  

## Save and load hash

In [None]:
#export
@add_class_method(S3Cache)
def saveHash(cls , data:dict, key='allData', bucket=INVENTORY_BUCKET_NAME, 
             cachePath=DBCACHELOCATION, hashPath = DBHASHLOCATION):
  '''
  Store `data` and its hash locally, then publish the hash to S3.

  data: dictionary to cache; hashed with hashDict
  key: base key, the hash object is stored under '<key>-hash'
  bucket: bucket receiving the hash object
  cachePath: local file that receives `data` (via saveDictToFile)
  hashPath: local file that receives the hash string (via saveStringToFile)
  returns None
  '''
  remoteKey = f'{key}-hash'
  digest = hashDict(data)
  print(f'hashKey is {remoteKey}')

  # local copies first: the data itself, then its hash
  print('saving cache file')
  saveDictToFile(data, path = cachePath)
  print('saving hash file')
  saveStringToFile(digest, path=hashPath)

  # finally the remote hash object
  print('saving hash to s3')
  S3.save(key=remoteKey, objectToSave={'hash': digest}, bucket=bucket)
  print(f'saved hash {digest}')
@add_class_method(S3Cache)
def loadHash(cls,key='allData', bucket=INVENTORY_BUCKET_NAME):
  '''
  Fetch the object stored under '<key>-hash' in `bucket` and return its
  'hash' entry (None when the entry is absent).
  '''
  remoteKey = f'{key}-hash'
  print(f'loading hashkey {remoteKey}')
  stored = S3.load(remoteKey, bucket=bucket)
  digest = stored.get('hash')
  print(f'loaded hash is{digest}')
  return digest

In [None]:
testKey = 'testHash'
testData = {'test':'test'}
# Use scratch files: the default cachePath/hashPath are the real local database
# cache, which this fixture would otherwise overwrite.
S3Cache.saveHash(testData, key=testKey,
                 cachePath='/tmp/test-database.cache', hashPath='/tmp/test-database.hash')
loadedHash = S3Cache.loadHash(key=testKey)
# The hash read back from S3 must be the hash of what was just saved.
assert loadedHash == hashDict(testData)
loadedHash

INFO:root:using accelerate endpoint
INFO:root:data was saved to s3
INFO:root:using accelerate endpoint


hashKey is testHash-hash
saving cache file
saving hash file
saving hash to s3
saved hash 9Sx/tDiQ9yHHG6sI8HBk1huYVbk=
loading hashkey testHash-hash


INFO:root:object exists, loading
INFO:root:using accelerate endpoint


loaded hash is9Sx/tDiQ9yHHG6sI8HBk1huYVbk=


'9Sx/tDiQ9yHHG6sI8HBk1huYVbk='

In [None]:
#export
@add_class_method(S3Cache)
def loadFromS3(cls, bucketName= INVENTORY_BUCKET_NAME, key = 'allData',
               hashPath=DBHASHLOCATION, cachePath = DBCACHELOCATION,**kwargs):
  '''
  Return the database stored under `key`, preferring the local cache.

  this is not a real time function, there may be a delay of sync between
  the main dynamodb database and the cache

  bucketName: bucket holding both the data ('<key>-pklzl') and its hash ('<key>-hash')
  key: base key of the stored database
  hashPath / cachePath: local files holding the cached hash and the cached data
  kwargs: forwarded to S3.loadPklZl
  returns the cached dictionary when its hash matches the one in S3, otherwise
  the object downloaded from S3 (which is then written to the local cache)
  '''
  if os.path.exists(hashPath) and os.path.exists(cachePath):
    print('cache exist')
    # compare against the hash of the SAME bucket/key that is being loaded
    if cls.loadHash(key=key, bucket=bucketName) == loadStringFromFile(hashPath):
      db = loadDictFromFile(cachePath)
      print('found a valid cache, using cache')
      return db
    print('cache has different hash than s3')
  else:
    print('cache doesnt exist')

  logging.info(f'loading from {bucketName}')
  logging.info(f'user is {kwargs.get("user")}')
  database =  S3.loadPklZl(key=f'{key}-pklzl', bucket = bucketName,  **kwargs)
  # Refresh the cache for this key/bucket/paths. Relying on saveHash's defaults
  # would overwrite the 'allData' hash with the hash of whatever key was loaded.
  cls.saveHash(database, key=key, bucket=bucketName, cachePath=cachePath, hashPath=hashPath)
  return database

In [None]:
# Time the remote hash lookup against reading the local cache file.
%time Tester.loadHash(key='allData')
%time db = loadDictFromFile(DBCACHELOCATION)

INFO:root:using accelerate endpoint


loading hashkey allData-hash


INFO:root:object exists, loading
INFO:root:using accelerate endpoint


loaded hash isVwSSjMiQzlnwM4SuIQumrYVPJTk=
CPU times: user 51.4 ms, sys: 0 ns, total: 51.4 ms
Wall time: 489 ms
CPU times: user 306 µs, sys: 0 ns, total: 306 µs
Wall time: 234 µs


In [None]:
key = 'allData'
tempPath = '/tmp/test.csv'
print('load from s3')
%time database =  S3.loadPklZl(key=f'{key}-pklzl', bucket=INVENTORY_BUCKET_NAME)
print(len(database))
print('filter db for pandas')
%time filteredDb = [list(item.values())[0] for item in database.values()]
print('gen pandas df')
%time df = pd.DataFrame(filteredDb).astype(str)

print('csv')
%time df.to_csv(tempPath)
%time pd.read_csv(tempPath, dtype=str)
print(os.path.getsize(tempPath)/1e6)
print('parquet')
%time df.to_parquet(tempPath)
%time pd.read_parquet(tempPath)
print(os.path.getsize(tempPath)/1e6)
print('pickle')
%time df.to_pickle(tempPath)
%time pd.read_pickle(tempPath)
print(os.path.getsize(tempPath)/1e6)
print('feather')
%time df.to_feather(tempPath)
%time pd.read_feather(tempPath)
print(os.path.getsize(tempPath)/1e6)
print('hdf')
os.remove(tempPath)
%time df.to_hdf(tempPath,key='df')
%time pd.read_hdf(tempPath, key='df')
print(os.path.getsize(tempPath)/1e6)

INFO:root:using accelerate endpoint


load from s3


INFO:root:object exists, loading
INFO:root:using accelerate endpoint


CPU times: user 632 ms, sys: 33.8 ms, total: 666 ms
Wall time: 1.16 s
45149
filter db for pandas
CPU times: user 65.9 ms, sys: 0 ns, total: 65.9 ms
Wall time: 65.6 ms
gen pandas df
CPU times: user 1.36 s, sys: 0 ns, total: 1.36 s
Wall time: 1.36 s
csv
CPU times: user 947 ms, sys: 2.53 ms, total: 950 ms
Wall time: 962 ms
CPU times: user 747 ms, sys: 30.6 ms, total: 778 ms
Wall time: 777 ms
31.603618
parquet
CPU times: user 637 ms, sys: 27.9 ms, total: 664 ms
Wall time: 663 ms
CPU times: user 341 ms, sys: 92.4 ms, total: 433 ms
Wall time: 349 ms
17.107349
pickle
CPU times: user 663 ms, sys: 87.9 ms, total: 751 ms
Wall time: 762 ms
CPU times: user 227 ms, sys: 103 ms, total: 330 ms
Wall time: 328 ms
37.249584
feather
CPU times: user 397 ms, sys: 55.2 ms, total: 452 ms
Wall time: 405 ms
CPU times: user 200 ms, sys: 67.1 ms, total: 267 ms
Wall time: 254 ms
23.96149
hdf
CPU times: user 727 ms, sys: 136 ms, total: 864 ms
Wall time: 880 ms
CPU times: user 615 ms, sys: 219 ms, total: 834 ms
Wal

In [None]:
import pandas as pd
import json
# Tiny two-column frame of string values used for the experiments below.
# (Exporting it with df.to_dict(orient='list') + json.dumps was tried here.)
df = pd.DataFrame(
  {
    'hello': ['1', '2', '2'],
    'world': ['1', '2', '3'],
  }
)
df

Unnamed: 0,hello,world
0,1,1
1,2,2
2,2,3


## Save to s3 with different options

In [None]:
#export
@add_class_method(S3Cache)
def saveAllS3(cls, objectToSave:dict, bucketName= INVENTORY_BUCKET_NAME, key = 'allData', 
              hashPath = DBHASHLOCATION, cachePath = DBCACHELOCATION, **kwargs):
  '''
  Save `objectToSave` to S3 in three encodings and refresh the hash/cache.

  objectToSave: dictionary to store
  bucketName: destination bucket for the data and for its hash
  key: base key; data goes to '<key>', '<key>-pklzl' and '<key>-zl',
       the hash to '<key>-hash'
  hashPath / cachePath: local files refreshed through saveHash
  kwargs: accepted for call compatibility, not used here
  returns None; nothing is written when the hash stored in S3 already
  equals the hash of `objectToSave`
  '''
  # The skip decision must depend on the object being saved. Comparing the
  # local hash file with S3 (as before) skipped every save once the two were
  # in sync, even when the data had changed.
  try:
    remoteHash = cls.loadHash(key=key, bucket=bucketName)
  except Exception as e:
    # NOTE(review): how S3.load behaves for a key that does not exist yet is
    # not visible here; a failed lookup is treated as "no hash stored" so the
    # save still proceeds (a real connectivity problem will surface below).
    logging.warning(f'could not load remote hash for {key}: {e}')
    remoteHash = None
  if remoteHash is not None and remoteHash == hashDict(objectToSave):
    print('the object did not change, skip saving')
    return
  S3.save(key=key, bucket=bucketName, objectToSave=objectToSave)
  S3.savePklZl(key=f'{key}-pklzl',bucket=bucketName, objectToSave=objectToSave)
  S3.saveZl(key=f'{key}-zl',bucket=bucketName, objectToSave=objectToSave)
  print(f'saving hash with key {key}')
  # write the hash next to the data and honour the caller's cache paths
  cls.saveHash(objectToSave, key=key, bucket=bucketName, cachePath=cachePath, hashPath=hashPath)
  

In [None]:
%%time
# Save a small fixture under a dedicated key, then read it back through the cache path.
key = 'testKey'
Tester.saveAllS3(objectToSave={'test':'test'}, key = key)
Tester.loadFromS3(key=key)

INFO:root:using accelerate endpoint


loading hashkey testKey-hash


INFO:root:object exists, loading
INFO:root:using accelerate endpoint
INFO:root:using accelerate endpoint


loaded hash isx8KAJ3w/W83jHPrsnTFSxm3egUc=
the object did not change, skip saving
cache exist
loading hashkey testKey-hash


INFO:root:object exists, loading
INFO:root:using accelerate endpoint


loaded hash isx8KAJ3w/W83jHPrsnTFSxm3egUc=
found a valid cache, using cache
CPU times: user 97.1 ms, sys: 1.5 ms, total: 98.6 ms
Wall time: 854 ms


{'test': 'test'}

In [None]:
%%time
import sys
# Load with the default key/bucket; served from the local cache when its hash matches S3.
database = Tester.loadFromS3()
# sys.getsizeof reports only the container's own size, not the objects it references.
sys.getsizeof(database)

INFO:root:using accelerate endpoint


cache exist
hashKey is allData-hash


INFO:root:object exists, loading
INFO:root:using accelerate endpoint


found a valid cache, using cache
CPU times: user 59.3 ms, sys: 3.86 ms, total: 63.2 ms
Wall time: 488 ms


232

In [None]:
%%time
import msgpack
# Time packing `database` with msgpack and writing the bytes to a scratch file.
with open ('/tmp/testfile.test', 'wb') as f:
  data = msgpack.packb(database)
  f.write(data)
  

CPU times: user 2.5 ms, sys: 0 ns, total: 2.5 ms
Wall time: 2.79 ms


In [None]:
%%time
# Time reading the scratch file back and unpacking it; the unpacked value is discarded.
with open ('/tmp/testfile.test', 'rb') as f:
  data = f.read()
  msgpack.unpackb(data)
  

CPU times: user 823 µs, sys: 0 ns, total: 823 µs
Wall time: 498 µs


In [None]:
%%time
# Time pickling `database` to a scratch file, for comparison with msgpack above.
with open ('/tmp/testfilep.test', 'wb') as f:
  pickle.dump(database,f)
  

CPU times: user 875 µs, sys: 161 µs, total: 1.04 ms
Wall time: 701 µs


In [None]:
%%time
# Time unpickling the scratch file written by this notebook; the result is discarded.
with open ('/tmp/testfilep.test', 'rb') as f:
  pickle.load(f)

CPU times: user 754 µs, sys: 0 ns, total: 754 µs
Wall time: 548 µs


In [None]:
# Build one row per database entry from the first value of each entry.
# NOTE(review): the recorded output is "AttributeError: 'str' object has no
# attribute 'values'", so the values of `database` were strings when this ran —
# presumably the {'test': 'test'} fixture was loaded from the local cache; confirm.
filteredDb = [list(item.values())[0] for item in database.values()]
df:pd.DataFrame = pd.DataFrame(filteredDb)
df.head()
# %time df.set_index('iprcode')

AttributeError: 'str' object has no attribute 'values'

In [None]:
%time res = df.to_json()
%time res = df.to_csv('/tmp/test.csv')
%time df.

In [None]:
import sys
# Shallow size of the container only; referenced objects are not counted.
sys.getsizeof(database )

In [None]:
from nicHelper.dictUtil import hashDict
# Time hashing the loaded database.
%time hashDict(database)

In [None]:
# Scratch call: stores the hash of `database` under the key 'test' in the inventory bucket.
# NOTE(review): saveHash passes a dict as objectToSave, whereas this passes the
# bare hashDict result — confirm S3.save accepts that.
S3.save(key='test',objectToSave=hashDict(database), bucket=INVENTORY_BUCKET_NAME)