In [None]:
#default_exp pdUtils

# pd Utils
Utility functions for pandas DataFrames and Series.

In [None]:
#export
import os
import tempfile
from hashlib import sha1
import pandas as pd
from nicHelper.dictUtil import saveStringToFile, loadStringFromFile
from s3bz.s3bz import S3

In [None]:
# small sample dataframe (one integer column) reused by the example cells below
df = pd.DataFrame({'hello':[1,2,3,4,5,5]})
df

Unnamed: 0,hello
0,1
1,2
2,3
3,4
4,5
5,5


In [None]:
#export
def getDfHash(df:pd.DataFrame):
  '''
  Return the SHA-1 hex digest of a dataframe's feather serialization.

  The dataframe is written with ``df.to_feather`` into a private temporary
  directory, the resulting bytes are hashed, and the directory is removed
  again. A unique directory is used instead of a fixed shared file so that
  concurrent callers cannot overwrite each other's data and no file is left
  behind after the call.

  Args:
    df: dataframe to hash; it has to be serializable by ``to_feather``.
  Returns:
    the hexadecimal SHA-1 digest of the serialized bytes, as a string.
  '''
  with tempfile.TemporaryDirectory() as tmpDir:
    featherPath = os.path.join(tmpDir, 'feather')
    df.to_feather(featherPath)
    with open(featherPath, 'rb') as f:
      # read before leaving the outer ``with``: the directory is deleted on exit
      return sha1(f.read()).hexdigest()

In [None]:
%%time
# time a single hash computation of the sample dataframe
getDfHash(df)

CPU times: user 1.8 ms, sys: 0 ns, total: 1.8 ms
Wall time: 1.28 ms


'068df0811eb710aa82148159e389157000f0b023'

## Local cache and hash

In [None]:
#export
def saveLocalCache( data:pd.DataFrame, path = '/tmp/cache', hashPath = None):
  '''
  Write a dataframe to a local feather file, optionally storing its hash too.

  The hash used to be written to ``path`` itself and was then immediately
  overwritten by the feather file, so it was never actually persisted. The
  hash is now written only when a separate ``hashPath`` is supplied.

  Args:
    data: dataframe to cache.
    path: destination of the feather file.
    hashPath: optional destination for the hash string; nothing is written
      for the hash when this is None.
  Returns:
    the result of ``data.to_feather(path)``.
  '''
  if hashPath is not None:
    saveLocalHash(data, path=hashPath)
  return data.to_feather(path)
def saveLocalHash( data:pd.DataFrame, path = '/tmp/hash'):
  '''
  Compute the hash of ``data`` with getDfHash and store it as a string at ``path``.

  Args:
    data: dataframe whose hash is saved.
    path: file the hash string is written to via saveStringToFile.
  '''
  saveStringToFile(getDfHash(data), path)
def loadLocalCache( path = '/tmp/cache'):
  '''
  Load a dataframe from a local feather file.

  Args:
    path: location of the feather file.
  Returns:
    the dataframe read by ``pd.read_feather``.
  Raises:
    FileNotFoundError: when nothing exists at ``path``. It is a subclass of
      Exception, so callers that catch Exception keep working while a cache
      miss can now be told apart from other failures.
  '''
  if not os.path.exists(path):
    raise FileNotFoundError('cache doesnt exist')
  return pd.read_feather(path)
def loadLocalHash( path = '/tmp/hash'):
  '''
  Load a previously saved hash string from a local file.

  Args:
    path: location of the hash file.
  Returns:
    whatever ``loadStringFromFile`` returns for ``path``.
  Raises:
    FileNotFoundError: when nothing exists at ``path``. It is a subclass of
      Exception, so callers that catch Exception keep working while a missing
      hash can now be told apart from other failures.
  '''
  if not os.path.exists(path):
    raise FileNotFoundError('hash doesnt exist')
  return loadStringFromFile(path)

In [None]:
# round trip with the default paths: write the cache and the hash, then read both back
%time saveLocalCache(df)
%time saveLocalHash(df)
%time print(loadLocalHash())
%time loadLocalCache()

CPU times: user 2.02 ms, sys: 0 ns, total: 2.02 ms
Wall time: 3.19 ms
CPU times: user 630 µs, sys: 0 ns, total: 630 µs
Wall time: 638 µs
068df0811eb710aa82148159e389157000f0b023
CPU times: user 247 µs, sys: 0 ns, total: 247 µs
Wall time: 252 µs
CPU times: user 1.53 ms, sys: 0 ns, total: 1.53 ms
Wall time: 1.54 ms


Unnamed: 0,hello
0,1
1,2
2,3
3,4
4,5
5,5


## Remote cache and hash

In [None]:
#export
def saveRemoteHash(data:pd.DataFrame, key='', bucket='', **kwargs):
  '''
  Hash ``data`` and store the digest in S3 under ``<key>-hash``.

  The digest is wrapped in a dict with the single key ``'hash'`` before being
  handed to ``S3.save``. Progress messages are printed along the way.

  Args:
    data: dataframe to hash with getDfHash.
    key: base key; the hash is stored under this key with ``-hash`` appended.
    bucket: target bucket name.
    **kwargs: forwarded unchanged to ``S3.save``.
  '''
  remoteKey = f'{key}-hash'
  digest = getDfHash(data)
  print(f'hashKey is {remoteKey}')
  print('saving hash to s3')
  S3.save(key=remoteKey, objectToSave={'hash': digest}, bucket=bucket, **kwargs)
  print(f'saved hash {digest}')
  

def saveRemoteCache(data:pd.DataFrame, key = '',
                    bucket = '', localCachePath='/tmp/cache', localHashPath='/tmp/hash', **kwargs):
  '''
  Cache a dataframe locally and upload both the data and its hash to S3.

  Steps, in order: write the feather file to ``localCachePath``, write the
  hash to ``localHashPath``, upload the hash under ``<key>-hash`` and finally
  upload the feather file under ``key``.

  Args:
    data: dataframe to cache.
    key: S3 key for the feather file; the hash uses the same key plus ``-hash``.
    bucket: target bucket name.
    localCachePath: local path of the feather file that gets uploaded.
    localHashPath: local path the hash string is written to.
    **kwargs: forwarded to both uploads (``saveRemoteHash`` and ``S3.saveFile``).
  '''
  saveLocalCache(data=data, path = localCachePath)
  saveLocalHash(data=data, path = localHashPath)
  # forward kwargs here as well; previously only the data upload received the
  # caller's options, so the hash upload could run with different settings
  saveRemoteHash(data=data, key = key, bucket=bucket, **kwargs)
  S3.saveFile(key=key, path=localCachePath, bucket=bucket, **kwargs)
  
def loadRemoteCache(key='', bucket='', **kwargs):
  '''
  Download a cached feather file from S3 and return it as a dataframe.

  The object is downloaded into a private temporary directory that is removed
  once the dataframe has been read. A unique directory is used instead of a
  fixed shared file so that concurrent callers cannot overwrite each other's
  download and no file is left behind.

  Args:
    key: S3 key of the feather file.
    bucket: source bucket name.
    **kwargs: forwarded unchanged to ``S3.loadFile``.
  Returns:
    the dataframe read by ``pd.read_feather`` from the downloaded file.
  '''
  with tempfile.TemporaryDirectory() as tmpDir:
    path = os.path.join(tmpDir, 'tmpPath')
    S3.loadFile(key,path=path ,bucket=bucket, **kwargs)
    # read before leaving the ``with``: the directory is deleted on exit
    return pd.read_feather(path)

def loadRemoteHash(key='', bucket='', **kwargs):
  '''
  Fetch the hash stored in S3 under ``<key>-hash``.

  The object returned by ``S3.load`` is queried with ``.get('hash')``.
  Progress messages are printed along the way.

  Args:
    key: base key; ``-hash`` is appended to find the hash object.
    bucket: source bucket name.
    **kwargs: forwarded unchanged to ``S3.load``.
  Returns:
    the value stored under ``'hash'`` in the loaded object.
  '''
  remoteKey = f'{key}-hash'
  print(f'loading hashkey {remoteKey}')
  remoteObject = S3.load(remoteKey,bucket=bucket, **kwargs)
  storedHash = remoteObject.get('hash')
  print(f'loaded hash is {storedHash}')
  return storedHash

In [None]:
#hide
# round trip against S3: upload the cache and its hash, then read both back
# NOTE(review): presumably needs credentials with access to this bucket — confirm
testKey = 'testKey'
testBucket = 'villa-clipboard'
saveRemoteCache(df, key = testKey, bucket = testBucket)
print(loadRemoteHash(testKey, testBucket))
loadRemoteCache(key = testKey, bucket = testBucket)

hashKey is testKey-hash
saving hash to s3
saved hash 068df0811eb710aa82148159e389157000f0b023
loading hashkey testKey-hash
loaded hash is 068df0811eb710aa82148159e389157000f0b023
068df0811eb710aa82148159e389157000f0b023


Unnamed: 0,hello
0,1
1,2
2,3
3,4
4,5
5,5
