In [None]:
#default_exp pdUtils

# pd Utils
Utility functions for pandas DataFrames and Series.

In [None]:
#export
import os
from hashlib import sha1
from io import BytesIO

import pandas as pd
from nicHelper.dictUtil import saveStringToFile, loadStringFromFile
from pynamodb.attributes import Attribute, UnicodeAttribute
from pynamodb.constants import BINARY
from pynamodb.models import Model
from s3bz.s3bz import S3

In [None]:
# Small sample frame with a single integer column, used by the demo cells in this notebook.
df = pd.DataFrame({'hello':[1,2,3,4,5,5]})
df

Unnamed: 0,hello
0,1
1,2
2,3
3,4
4,5
5,5


In [None]:
#export
def getDfHash(df:pd.DataFrame):
  '''
  Return the SHA-1 hex digest of a DataFrame's feather serialization.

  Parameters:
    df: the DataFrame to hash; it must be serializable with `to_feather`.

  Returns:
    str: hexadecimal SHA-1 digest of the feather bytes.
  '''
  # Serialize into an in-memory buffer instead of a fixed path such as
  # '/tmp/feather': a shared file can be overwritten by a concurrent caller
  # between the write and the read, and it is left behind on disk afterwards.
  buffer = BytesIO()
  df.to_feather(buffer)
  return sha1(buffer.getvalue()).hexdigest()

In [None]:
%%time
# Time a single hash computation on the sample frame.
getDfHash(df)

CPU times: user 14.7 ms, sys: 7.83 ms, total: 22.5 ms
Wall time: 37.7 ms


'068df0811eb710aa82148159e389157000f0b023'

## Local cache and hash

In [None]:
#export
def saveLocalCache( data:pd.DataFrame, path = '/tmp/cache', hashPath = None):
  '''
  Write a DataFrame to a local feather file, optionally saving its hash too.

  Parameters:
    data: the DataFrame to cache.
    path: destination of the feather file.
    hashPath: optional destination for the hash of `data` (written with
      saveLocalHash). It must differ from `path`, otherwise the hash would
      overwrite the cache. When None (default) no hash file is written.

  Returns:
    Whatever `data.to_feather(path)` returns.
  '''
  # The hash used to be written to `path` itself and was then immediately
  # overwritten by the feather data, so it never survived the call. It now
  # goes to its own file, and only when the caller asks for one.
  result = data.to_feather(path)
  if hashPath is not None:
    saveLocalHash(data, path=hashPath)
  return result
def saveLocalHash( data:pd.DataFrame, path = '/tmp/hash'):
  '''
  Hash `data` with getDfHash and store the resulting string at `path`.
  '''
  saveStringToFile(getDfHash(data), path)
def loadLocalCache( path = '/tmp/cache'):
  '''
  Read the feather file at `path` into a DataFrame.

  Raises:
    Exception: when nothing exists at `path`.
  '''
  cacheFound = os.path.exists(path)
  if cacheFound:
    return pd.read_feather(path)
  raise Exception('cache doesnt exist')
def loadLocalHash( path = '/tmp/hash'):
  '''
  Return the string stored at `path`, read with loadStringFromFile.

  Raises:
    Exception: when nothing exists at `path`.
  '''
  hashFound = os.path.exists(path)
  if hashFound:
    return loadStringFromFile(path)
  raise Exception('hash doesnt exist')

In [None]:
# Round trip through the default local paths: write the cache and the hash,
# then read both back, timing each call.
%time saveLocalCache(df)
%time saveLocalHash(df)
%time print(loadLocalHash())
%time loadLocalCache()

CPU times: user 3.19 ms, sys: 0 ns, total: 3.19 ms
Wall time: 3.6 ms
CPU times: user 1.72 ms, sys: 0 ns, total: 1.72 ms
Wall time: 1.71 ms
068df0811eb710aa82148159e389157000f0b023
CPU times: user 764 µs, sys: 0 ns, total: 764 µs
Wall time: 539 µs
CPU times: user 3.54 ms, sys: 0 ns, total: 3.54 ms
Wall time: 3.29 ms


Unnamed: 0,hello
0,1
1,2
2,3
3,4
4,5
5,5


## Remote cache and hash

In [None]:
#export
def saveRemoteHash(data:pd.DataFrame, key='', bucket='', **kwargs):
  '''
  Hash `data` and store the hash in S3 under the key `<key>-hash`.

  The stored object is a dict with the single entry 'hash'. Extra keyword
  arguments are passed through to S3.save.
  '''
  remoteKey = f'{key}-hash'
  digest = getDfHash(data)
  payload = {'hash': digest}
  print(f'hashKey is {remoteKey}')
  print('saving hash to s3')
  S3.save(key=remoteKey, objectToSave=payload, bucket=bucket, **kwargs)
  print(f'saved hash {digest}')
  

def saveRemoteCache(data:pd.DataFrame, key = '', 
                    bucket = '', localCachePath='/tmp/cache', localHashPath='/tmp/hash', **kwargs):
  '''
  Cache a DataFrame locally and in S3, together with its hash.

  Parameters:
    data: the DataFrame to cache.
    key: S3 key for the feather file; the hash is stored under `<key>-hash`
      by saveRemoteHash.
    bucket: S3 bucket name.
    localCachePath: local path of the feather file that is uploaded.
    localHashPath: local path where the hash string is written.
    **kwargs: forwarded to both S3.saveFile and saveRemoteHash.
  '''
  saveLocalCache(data=data, path = localCachePath)
  saveLocalHash(data=data, path = localHashPath)
  # Upload the data before the hash: if the data upload fails, the remote
  # hash must not already claim that the new data is in place.
  S3.saveFile(key=key, path=localCachePath, bucket=bucket, **kwargs)
  # kwargs are forwarded here as well so both uploads use the same S3 options.
  saveRemoteHash(data=data, key = key, bucket=bucket, **kwargs)
  
def loadRemoteCache(key='', bucket='', **kwargs):
  '''
  Download the object stored under `key` in `bucket` and read it as feather.

  The file is downloaded to '/tmp/tmpPath' with S3.loadFile (extra keyword
  arguments are passed through) and the resulting DataFrame is returned.
  '''
  downloadPath = '/tmp/tmpPath'
  S3.loadFile(key, path=downloadPath, bucket=bucket, **kwargs)
  return pd.read_feather(downloadPath)

def loadRemoteHash(key='', bucket='', **kwargs):
  '''
  Load the hash stored in S3 under `<key>-hash`.

  Returns the value of the 'hash' entry of the loaded object, or None when
  the entry is missing. Extra keyword arguments are passed to S3.load.
  '''
  remoteKey = f'{key}-hash'
  print(f'loading hashkey {remoteKey}')
  payload = S3.load(remoteKey, bucket=bucket, **kwargs)
  storedHash = payload.get('hash')
  print(f'loaded hash is {storedHash}')
  return storedHash

In [None]:
#hide
# Remote round trip: save the sample frame and its hash, read the hash back,
# then time repeated downloads of the cached frame.
# NOTE(review): presumably needs credentials with access to this bucket — confirm.
testKey = 'testKey'
testBucket = 'villa-clipboard'
%time saveRemoteCache(df, key = testKey, bucket = testBucket)
print(loadRemoteHash(testKey, testBucket))
%timeit loadRemoteCache(key = testKey, bucket = testBucket)

hashKey is testKey-hash
saving hash to s3
saved hash 068df0811eb710aa82148159e389157000f0b023
CPU times: user 91.6 ms, sys: 23.1 ms, total: 115 ms
Wall time: 255 ms
loading hashkey testKey-hash
loaded hash is 068df0811eb710aa82148159e389157000f0b023
068df0811eb710aa82148159e389157000f0b023
123 ms ± 20.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


## PynamoAttributes

In [None]:
#export
class PandasDataFrameAttribute(Attribute):
  '''
  PynamoDB attribute that stores a pandas DataFrame as feather-encoded bytes.
  '''
  attr_type = BINARY
  def serialize(self, value: pd.DataFrame)->bytes:
    '''
    Encode `value` in feather format and return the raw bytes.
    '''
    bio = BytesIO()
    value.to_feather(bio)
    return bio.getvalue()
  def deserialize(self, value: bytes)->pd.DataFrame:
    '''
    Decode feather bytes produced by `serialize` back into a DataFrame.
    '''
    # Wrap the received bytes (not the builtin `bin`) so read_feather can
    # treat them as a file-like object.
    bio = BytesIO(value)
    return pd.read_feather(bio)

In [None]:
class Database(Model):
  '''
  Example PynamoDB model holding a DataFrame in a PandasDataFrameAttribute.
  '''
  class Meta:
    # table_name and region are left empty in this example
    table_name = ''
    region = ''
    billing_mode='PAY_PER_REQUEST'

  # hash key of the table; defaults to an empty string
  brcode = UnicodeAttribute(hash_key=True, default = '')
  # DataFrame payload handled by the custom attribute
  data = PandasDataFrameAttribute()
  
# NOTE(review): sys is imported here but not used in this cell.
import sys
# Build a model instance around a small frame and display the stored attribute value.
df = pd.DataFrame({'cprcode':['1234', '12345'], 'quantity':[123, 345]})
db = Database(brcode='1234', data = df)
db.data

Unnamed: 0,cprcode,quantity
0,1234,123
1,12345,345


In [None]:
#export
# class PandasSeriesAttribute(Attribute):
#   attr_type = BINARY
#   def serialize(self, value: pd.Series)->bin:
#     bio = BytesIO()
#     df = s.to_frame()
#     value.to_feather(bio)
#     data:bin = bio.getvalue()
#     return data
#   def deserialize(self, value: bin)->pd.DataFrame:
#     bio = BytesIo(bin)
#     df: pd.DataFrame = pd.read_feather(bio)
#     return df

In [None]:
#export
from nicHelper.schema import getTypes

In [None]:
# Location of the inventory schema passed to getTypes.
url = 'https://raw.githubusercontent.com/thanakijwanavit/villaMasterSchema/dev-manual/inventory/inventory.yaml'


# Sample inventory record; all values except onlineflag are strings.
inv = {
                  'iprcode': '0000009',
                  'brcode': '1000',
                  'ib_cf_qty': '50',
                  'new_ib_vs_stock_cv': '27',
                  'onlineflag': True
                }
# Display the types that getTypes returns for the schema at url.
getTypes(url)

{'iprcode': int,
 'brcode': int,
 'ib_cf_qty': int,
 'new_ib_vs_stock_cv': int,
 'onlineflag': bool}

In [None]:
#export
def forceType(url:str, df:pd.DataFrame, defaultType=str)->pd.DataFrame:
  '''
  Cast the columns of `df` to the types declared by the schema at `url`.

  Parameters:
    url: schema location passed to getTypes, which returns a dict mapping
      column names to types.
    df: the DataFrame to convert; it is not modified.
    defaultType: type used for columns the schema does not declare.

  Returns:
    pd.DataFrame: a new frame with the converted columns.
  '''
  typeDict = getTypes(url)
  # Fall back to defaultType for undeclared columns: a None dtype would make
  # astype treat the column as float64.
  typeList = {col: typeDict.get(col) or defaultType for col in df.columns}
  print(typeList)
  df = df.astype(typeList)
  print(df.dtypes)
  return df

In [None]:
# Apply the schema types to a one-row frame built from the sample record.
forceType(url, pd.DataFrame([inv]))
# df.astype()

{'iprcode': <class 'int'>, 'brcode': <class 'int'>, 'ib_cf_qty': <class 'int'>, 'new_ib_vs_stock_cv': <class 'int'>, 'onlineflag': <class 'bool'>}
iprcode               int64
brcode                int64
ib_cf_qty             int64
new_ib_vs_stock_cv    int64
onlineflag             bool
dtype: object
