In [34]:
import numpy as np
import pandas as pd

In [35]:
# HDF5(PyTables)
pd.set_option('io.hdf.default_format', 'table')

store = pd.HDFStore("hdf.h5")
# 기존에 hdf.h5라는 file이 존재 한다면, error발생
# in advance, to create a empty file.
print(store)

<class 'pandas.io.pytables.HDFStore'>
File path: hdf.h5



In [36]:
index = pd.date_range('01/01/2023', periods=8)
s = pd.Series(np.random.randn(5), index=['a', 'b', 'c', 'd', 'e'])
df = pd.DataFrame(np.random.randn(8, 3), index = index, columns=['A', 'B', 'C'])

store.put('s', s)
store.put('df', df)
# "store['s'] = s" -> dictionary key value assigning과 같은 형태.

store.keys()
# hdf file에 들어 있는 내용물을 확인     🌸

['/df', '/s', '/foo/bar/bah']

In [37]:
store['df']
# equivalent to "store.df"

Unnamed: 0,A,B,C
2023-01-01,1.702872,0.308839,-0.365917
2023-01-02,0.851215,0.44881,0.16534
2023-01-03,-1.498664,-0.233697,-0.462251
2023-01-04,-0.259856,1.271397,-0.256739
2023-01-05,-1.4743,1.368424,-0.037265
2023-01-06,-0.054277,1.684516,1.270398
2023-01-07,-0.274158,0.875411,-0.446735
2023-01-08,-1.383003,1.533008,-2.200317


In [38]:
del store['df']
# deletion of the object specified by key

In [39]:
# closing a store 
store.close()
store

<class 'pandas.io.pytables.HDFStore'>
File path: hdf.h5

In [40]:
print('is_open' in dir(store))
# to check 'is_open' in store's attributes
store.is_open

True


False

In [41]:
with pd.HDFStore('hdf.h5') as store:
    print(store.keys())

['/s', '/foo/bar/bah']


In [42]:
# Read/write API
df_tl = pd.DataFrame({'A': list(range(5)), 'B': list(range(5))})

In [43]:
df_tl.to_hdf('store_tl.h5', 'table', append=True)
pd.read_hdf('store_tl.h5', 'table', where=['index>2'])

Unnamed: 0,A,B
3,3,3
4,4,4


In [44]:
df_with_missing = pd.DataFrame(
    {'col1': [0, np.nan, 2], 
     'col2': [1, np.nan, np.nan] 
    }
)
df_with_missing

Unnamed: 0,col1,col2
0,0.0,1.0
1,,
2,2.0,


In [45]:
df_with_missing.to_hdf('file.h5', 'df_with_missing', format='table', mode='w')
pd.read_hdf('file.h5', 'df_with_missing')

Unnamed: 0,col1,col2
0,0.0,1.0
1,,
2,2.0,


In [46]:
# "dropna = True"
df_with_missing.to_hdf(
    'file.h5', 'df_with_missing', mode='w', dropna=True
    )
pd.read_hdf('file.h5', 'df_with_missing')

Unnamed: 0,col1,col2
0,0.0,1.0
2,2.0,


In [47]:
# Fixed format

In [48]:
# Table format
store = pd.HDFStore('hdf.h5')
df1 = df[0:4]
df2 = df[4:]

In [49]:
store.keys()

['/s', '/foo/bar/bah']

In [50]:
store.append('df', df1, format='table')
store.append('df', df2, format='table')
store

<class 'pandas.io.pytables.HDFStore'>
File path: hdf.h5

In [51]:
# select the entire object
store.select('df')

Unnamed: 0,A,B,C
2023-01-01,1.702872,0.308839,-0.365917
2023-01-02,0.851215,0.44881,0.16534
2023-01-03,-1.498664,-0.233697,-0.462251
2023-01-04,-0.259856,1.271397,-0.256739
2023-01-05,-1.4743,1.368424,-0.037265
2023-01-06,-0.054277,1.684516,1.270398
2023-01-07,-0.274158,0.875411,-0.446735
2023-01-08,-1.383003,1.533008,-2.200317


In [52]:
store.root.df._v_attrs.pandas_type

'frame_table'

In [53]:
# Hierarchical keys

In [54]:
store.put("foo/bar/bah", df)
store.append("food/orange", df)
store.append("food/apple", df)
store

<class 'pandas.io.pytables.HDFStore'>
File path: hdf.h5

In [55]:
store.keys()

['/df', '/s', '/food/apple', '/food/orange', '/foo/bar/bah']

In [56]:
store.remove("food")
# "/"없이 지정할 수 있고, 지정된 지점을 포함하여 이하 모든 것을 삭제한다.(.remove(...))
store.keys()

['/df', '/s', '/foo/bar/bah']

In [57]:
# store['/foo/bar']
# 해당 지점에 data가 없으면 짜증(?, 혹시 Error?)을 낸다. 

In [58]:
# Walk through     🌸🪷🏵️💮😵‍💫
for (path, subgroups, subkeys) in store.walk():
    for subgroup in subgroups:
      print(f"Group:{path}/{subgroup}")
    for subkey in subkeys:
      key = "/".join([path, subkey])
      print(f"KEY:{key}")
      print(store.get(key))

Group:/foo
KEY:/s
a    0.266915
b   -0.573617
c    0.686586
d   -1.429270
e   -0.088349
dtype: float64
KEY:/df
                   A         B         C
2023-01-01  1.702872  0.308839 -0.365917
2023-01-02  0.851215  0.448810  0.165340
2023-01-03 -1.498664 -0.233697 -0.462251
2023-01-04 -0.259856  1.271397 -0.256739
2023-01-05 -1.474300  1.368424 -0.037265
2023-01-06 -0.054277  1.684516  1.270398
2023-01-07 -0.274158  0.875411 -0.446735
2023-01-08 -1.383003  1.533008 -2.200317
Group:/foo/bar
KEY:/foo/bar/bah
                   A         B         C
2023-01-01  1.702872  0.308839 -0.365917
2023-01-02  0.851215  0.448810  0.165340
2023-01-03 -1.498664 -0.233697 -0.462251
2023-01-04 -0.259856  1.271397 -0.256739
2023-01-05 -1.474300  1.368424 -0.037265
2023-01-06 -0.054277  1.684516  1.270398
2023-01-07 -0.274158  0.875411 -0.446735
2023-01-08 -1.383003  1.533008 -2.200317


In [59]:
store.keys()


['/df', '/s', '/foo/bar/bah']

In [60]:
store.groups()

[/df (Group) ''
   children := ['table' (Table)],
 /s (Group) ''
   children := ['table' (Table)],
 /foo/bar/bah (Group) ''
   children := ['table' (Table)]]

In [61]:
# 

for key in store.keys():
    print(f"KEY:{key}")
    print(store.get(key))

KEY:/df
                   A         B         C
2023-01-01  1.702872  0.308839 -0.365917
2023-01-02  0.851215  0.448810  0.165340
2023-01-03 -1.498664 -0.233697 -0.462251
2023-01-04 -0.259856  1.271397 -0.256739
2023-01-05 -1.474300  1.368424 -0.037265
2023-01-06 -0.054277  1.684516  1.270398
2023-01-07 -0.274158  0.875411 -0.446735
2023-01-08 -1.383003  1.533008 -2.200317
KEY:/s
a    0.266915
b   -0.573617
c    0.686586
d   -1.429270
e   -0.088349
dtype: float64
KEY:/foo/bar/bah
                   A         B         C
2023-01-01  1.702872  0.308839 -0.365917
2023-01-02  0.851215  0.448810  0.165340
2023-01-03 -1.498664 -0.233697 -0.462251
2023-01-04 -0.259856  1.271397 -0.256739
2023-01-05 -1.474300  1.368424 -0.037265
2023-01-06 -0.054277  1.684516  1.270398
2023-01-07 -0.274158  0.875411 -0.446735
2023-01-08 -1.383003  1.533008 -2.200317


In [62]:
store.get('/df')

Unnamed: 0,A,B,C
2023-01-01,1.702872,0.308839,-0.365917
2023-01-02,0.851215,0.44881,0.16534
2023-01-03,-1.498664,-0.233697,-0.462251
2023-01-04,-0.259856,1.271397,-0.256739
2023-01-05,-1.4743,1.368424,-0.037265
2023-01-06,-0.054277,1.684516,1.270398
2023-01-07,-0.274158,0.875411,-0.446735
2023-01-08,-1.383003,1.533008,-2.200317


In [64]:
del store['df_mixed']

KeyError: 'No object named df_mixed in the file'

In [65]:
# Storing types 
# storing mixed types in a table

df_mixed = pd.DataFrame(
    {
        'A': np.random.randn(8),
        'B': np.random.randn(8),
        'C': np.array(np.random.randn(8), dtype='float32'),
        'string': 'string',
        'int': 1,
        'bool': True,
        'datetime64':pd.Timestamp('20230102')
    },
    index=list(range(8)),
)

In [66]:
df_mixed.loc[df_mixed.index[3:5], ['A', 'B', 'string', 'datetime64']] = np.nan

In [67]:
store.append("df_mixed", df_mixed, min_itemsize={'values':50})

In [68]:
df_mixed1 = store.select('df_mixed')
df_mixed1

Unnamed: 0,A,B,C,string,int,bool,datetime64
0,0.289198,-1.149931,-0.698533,string,1,True,2023-01-02
1,1.972578,0.985096,2.040026,string,1,True,2023-01-02
2,-0.032942,0.347309,1.29251,string,1,True,2023-01-02
3,,,0.827232,,1,True,NaT
4,,,-1.028917,,1,True,NaT
5,-2.334176,-0.696779,0.40977,string,1,True,2023-01-02
6,-1.239585,1.307702,-0.424616,string,1,True,2023-01-02
7,-1.714415,0.327557,0.118833,string,1,True,2023-01-02


In [69]:
df_mixed1.dtypes.value_counts()

float64           2
float32           1
object            1
int64             1
bool              1
datetime64[ns]    1
Name: count, dtype: int64

In [70]:
# We have provided a minimun string column size
store.root.df_mixed.table

/df_mixed/table (Table(8,)) ''
  description := {
  "index": Int64Col(shape=(), dflt=0, pos=0),
  "values_block_0": Float64Col(shape=(2,), dflt=0.0, pos=1),
  "values_block_1": Float32Col(shape=(1,), dflt=0.0, pos=2),
  "values_block_2": StringCol(itemsize=50, shape=(1,), dflt=b'', pos=3),
  "values_block_3": Int64Col(shape=(1,), dflt=0, pos=4),
  "values_block_4": BoolCol(shape=(1,), dflt=False, pos=5),
  "values_block_5": Int64Col(shape=(1,), dflt=0, pos=6)}
  byteorder := 'little'
  chunkshape := (689,)
  autoindex := True
  colindexes := {
    "index": Index(6, mediumshuffle, zlib(1)).is_csi=False}

In [71]:
# Storing MultiIndex DataFrames
index = pd.MultiIndex(
    levels=[['foo', 'bar', 'baz', 'qux'], ['one', 'two', 'three']],
    codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
    names=['foo', 'bar'],
)
df_mi = pd.DataFrame(np.random.randn(10,3), index = index, columns=['A', 'B', 'C'])
df_mi

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
foo,bar,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
foo,one,-0.446079,-0.925904,-1.931547
foo,two,-0.110306,-0.090154,0.166987
foo,three,-0.424137,-0.220232,-0.012431
bar,one,-0.592825,0.385448,-0.214493
bar,two,1.099544,0.364129,0.121549
baz,two,-1.79917,0.811012,-0.199961
baz,three,-1.503668,-0.087434,-1.053881
qux,one,-0.253864,-1.291565,0.468244
qux,two,0.255753,1.135225,0.314302
qux,three,1.10085,0.539631,-0.358576


In [72]:
store.append('df_mi', df_mi)
store.select('df_mi')

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
foo,bar,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
foo,one,-0.446079,-0.925904,-1.931547
foo,two,-0.110306,-0.090154,0.166987
foo,three,-0.424137,-0.220232,-0.012431
bar,one,-0.592825,0.385448,-0.214493
bar,two,1.099544,0.364129,0.121549
baz,two,-1.79917,0.811012,-0.199961
baz,three,-1.503668,-0.087434,-1.053881
qux,one,-0.253864,-1.291565,0.468244
qux,two,0.255753,1.135225,0.314302
qux,three,1.10085,0.539631,-0.358576


In [73]:
# the Levels are automatically included as data columns
store.select('df_mi', "foo=bar")

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
foo,bar,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
bar,one,-0.592825,0.385448,-0.214493
bar,two,1.099544,0.364129,0.121549


In [74]:
store.keys()

['/df', '/df_mi', '/df_mixed', '/s', '/foo/bar/bah']

In [75]:
store['df']

Unnamed: 0,A,B,C
2023-01-01,1.702872,0.308839,-0.365917
2023-01-02,0.851215,0.44881,0.16534
2023-01-03,-1.498664,-0.233697,-0.462251
2023-01-04,-0.259856,1.271397,-0.256739
2023-01-05,-1.4743,1.368424,-0.037265
2023-01-06,-0.054277,1.684516,1.270398
2023-01-07,-0.274158,0.875411,-0.446735
2023-01-08,-1.383003,1.533008,-2.200317


In [76]:
# Querying
string = "HolyMoly"
store.select('df', 'index == string')

DateParseError: Unknown datetime string format, unable to parse: HolyMoly

In [None]:
# Querying
string = "HolyMoly"
store.select('df', f'index == {string}')

In [None]:
store.select('df', 'index == %r' % string)

In [77]:
dfq = pd.DataFrame(
    np.random.randn(10, 4),
    columns=list("ABCD"),
    index=pd.date_range("20130101", periods=10),
)

store.append("dfq", dfq, format="table", data_columns=True)

In [78]:
store.select("dfq", "index>pd.Timestamp('20130104') & columns=['A', 'B']")

Unnamed: 0,A,B
2013-01-05,0.011684,-0.368062
2013-01-06,1.528019,1.326785
2013-01-07,0.788208,0.705659
2013-01-08,0.64219,0.625035
2013-01-09,-0.298636,-0.941026
2013-01-10,-0.224245,-0.240643


In [79]:
store.select("dfq", where="A>0 or C>0")

Unnamed: 0,A,B,C,D
2013-01-01,-1.144049,0.069177,1.46995,0.250053
2013-01-04,0.001095,1.435721,-0.50863,-1.454968
2013-01-05,0.011684,-0.368062,-1.399531,0.366433
2013-01-06,1.528019,1.326785,-0.061721,0.820316
2013-01-07,0.788208,0.705659,0.184611,-1.790671
2013-01-08,0.64219,0.625035,1.09824,0.976372
2013-01-10,-0.224245,-0.240643,0.66924,-0.601939


In [80]:
store.select("df", "columns=['A', 'B']")

Unnamed: 0,A,B
2023-01-01,1.702872,0.308839
2023-01-02,0.851215,0.44881
2023-01-03,-1.498664,-0.233697
2023-01-04,-0.259856,1.271397
2023-01-05,-1.4743,1.368424
2023-01-06,-0.054277,1.684516
2023-01-07,-0.274158,0.875411
2023-01-08,-1.383003,1.533008


In [81]:
# Query timedelta64[ns]
from datetime import timedelta

dftd = pd.DataFrame(
    {
        "A": pd.Timestamp("20130101"),
        "B": [
            pd.Timestamp("20230101") + timedelta(days=i, seconds=10) for i in range(10)
            ]
    }
)

In [82]:
dftd

Unnamed: 0,A,B
0,2013-01-01,2023-01-01 00:00:10
1,2013-01-01,2023-01-02 00:00:10
2,2013-01-01,2023-01-03 00:00:10
3,2013-01-01,2023-01-04 00:00:10
4,2013-01-01,2023-01-05 00:00:10
5,2013-01-01,2023-01-06 00:00:10
6,2013-01-01,2023-01-07 00:00:10
7,2013-01-01,2023-01-08 00:00:10
8,2013-01-01,2023-01-09 00:00:10
9,2013-01-01,2023-01-10 00:00:10


In [83]:
dftd['C'] = dftd['B'] - dftd['A']
dftd

Unnamed: 0,A,B,C
0,2013-01-01,2023-01-01 00:00:10,3652 days 00:00:10
1,2013-01-01,2023-01-02 00:00:10,3653 days 00:00:10
2,2013-01-01,2023-01-03 00:00:10,3654 days 00:00:10
3,2013-01-01,2023-01-04 00:00:10,3655 days 00:00:10
4,2013-01-01,2023-01-05 00:00:10,3656 days 00:00:10
5,2013-01-01,2023-01-06 00:00:10,3657 days 00:00:10
6,2013-01-01,2023-01-07 00:00:10,3658 days 00:00:10
7,2013-01-01,2023-01-08 00:00:10,3659 days 00:00:10
8,2013-01-01,2023-01-09 00:00:10,3660 days 00:00:10
9,2013-01-01,2023-01-10 00:00:10,3661 days 00:00:10


In [84]:
# Query MultiIndex
df_mi.index.names

FrozenList(['foo', 'bar'])

In [85]:
store.select("df_mi", "foo=baz and bar=two")

Unnamed: 0_level_0,Unnamed: 1_level_0,A,B,C
foo,bar,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
baz,two,-1.79917,0.811012,-0.199961


In [86]:
index = pd.MultiIndex(
    levels=[["foo", "bar", "baz", "qux"], ["one", "two", "three"]],
    codes=[[0, 0, 0, 1, 1, 2, 2, 3, 3, 3], [0, 1, 2, 0, 1, 1, 2, 0, 1, 2]],
)


df_mi_2 = pd.DataFrame(np.random.randn(10, 3), index=index, columns=["A", "B", "C"])

df_mi_2

Unnamed: 0,Unnamed: 1,A,B,C
foo,one,-0.329835,-0.292254,-0.052864
foo,two,0.218845,0.809367,-0.813748
foo,three,-0.434197,-0.553327,0.324452
bar,one,-0.838166,0.572503,0.494084
bar,two,1.493548,0.587029,-0.716085
baz,two,0.7412,0.410625,-0.487601
baz,three,-0.016964,-0.418947,0.713699
qux,one,1.012518,1.757079,-0.573581
qux,two,-1.087826,-0.067309,0.052423
qux,three,-1.670873,-0.79368,0.51992


In [87]:
store.append('df_mi_2', df_mi_2)
# the levels are automatically included as data columns with keyword level_n
store.select('df_mi_2', 'level_0 = foo and level_1 = two')

Unnamed: 0,Unnamed: 1,A,B,C
foo,two,0.218845,0.809367,-0.813748


In [88]:
# indexing
# indexes are automatically created on the indexables and any data columns you specify. 
# This behavior can be turned off by passing index=False to append().

In [89]:
i = store.root.df.table.cols.index.index
i.optlevel, i.kind

(6, 'medium')

In [90]:
# changing an index by passing new parameters
store.create_table_index('df', optlevel=9, kind='full')
i = store.root.df.table.cols.index.index

i.optlevel, i.kind


(9, 'full')

In [91]:
df_1 = pd.DataFrame(np.random.randn(10,2), columns=list("AB"))
df_2 = pd.DataFrame(np.random.randn(10,2), columns=list("AB"))
st = pd.HDFStore('appends.ht', mode='w')
st.append('df', df_1, data_columns=["B"], index=False)
st.append('df', df_2, data_columns=["B"], index=False)
st.get_storer('df').table

/df/table (Table(20,)) ''
  description := {
  "index": Int64Col(shape=(), dflt=0, pos=0),
  "values_block_0": Float64Col(shape=(1,), dflt=0.0, pos=1),
  "B": Float64Col(shape=(), dflt=0.0, pos=2)}
  byteorder := 'little'
  chunkshape := (2730,)

In [92]:
# then create the index when finished appending.
st.create_table_index('df', columns=['B'], optlevel=9, kind='full')
st.get_storer('df').table

/df/table (Table(20,)) ''
  description := {
  "index": Int64Col(shape=(), dflt=0, pos=0),
  "values_block_0": Float64Col(shape=(1,), dflt=0.0, pos=1),
  "B": Float64Col(shape=(), dflt=0.0, pos=2)}
  byteorder := 'little'
  chunkshape := (2730,)
  autoindex := True
  colindexes := {
    "B": Index(9, fullshuffle, zlib(1)).is_csi=True}

In [93]:
st.close()

In [94]:
# Query via data columns
df_dc = df.copy()
df_dc['string'] = 'foo'
df_dc.loc[df_dc.index[4:6], 'string'] = np.nan
df_dc.loc[df_dc.index[7:9], 'string'] = 'bar'
df_dc['string2'] = 'cool'
df_dc.loc[df_dc.index[1:3], ['B', 'C']] = 1.0
df_dc

Unnamed: 0,A,B,C,string,string2
2023-01-01,1.702872,0.308839,-0.365917,foo,cool
2023-01-02,0.851215,1.0,1.0,foo,cool
2023-01-03,-1.498664,1.0,1.0,foo,cool
2023-01-04,-0.259856,1.271397,-0.256739,foo,cool
2023-01-05,-1.4743,1.368424,-0.037265,,cool
2023-01-06,-0.054277,1.684516,1.270398,,cool
2023-01-07,-0.274158,0.875411,-0.446735,foo,cool
2023-01-08,-1.383003,1.533008,-2.200317,bar,cool


In [95]:
# on-disk operation
store.append('df_dc', df_dc, data_columns=['B', 'C', 'string', 'string2'])
store.select('df_dc', where="B > 0")

Unnamed: 0,A,B,C,string,string2
2023-01-01,1.702872,0.308839,-0.365917,foo,cool
2023-01-02,0.851215,1.0,1.0,foo,cool
2023-01-03,-1.498664,1.0,1.0,foo,cool
2023-01-04,-0.259856,1.271397,-0.256739,foo,cool
2023-01-05,-1.4743,1.368424,-0.037265,,cool
2023-01-06,-0.054277,1.684516,1.270398,,cool
2023-01-07,-0.274158,0.875411,-0.446735,foo,cool
2023-01-08,-1.383003,1.533008,-2.200317,bar,cool


In [96]:
# getting creative
store.select('df_dc', "B > 0 & C > 0 & string==foo")

Unnamed: 0,A,B,C,string,string2
2023-01-02,0.851215,1.0,1.0,foo,cool
2023-01-03,-1.498664,1.0,1.0,foo,cool


In [97]:
# in-memory version
df_dc[(df_dc.B > 0) & (df_dc.C > 0) & (df_dc.string == 'foo')]

Unnamed: 0,A,B,C,string,string2
2023-01-02,0.851215,1.0,1.0,foo,cool
2023-01-03,-1.498664,1.0,1.0,foo,cool


In [98]:
store.root.df_dc.table

/df_dc/table (Table(8,)) ''
  description := {
  "index": Int64Col(shape=(), dflt=0, pos=0),
  "values_block_0": Float64Col(shape=(1,), dflt=0.0, pos=1),
  "B": Float64Col(shape=(), dflt=0.0, pos=2),
  "C": Float64Col(shape=(), dflt=0.0, pos=3),
  "string": StringCol(itemsize=3, shape=(), dflt=b'', pos=4),
  "string2": StringCol(itemsize=4, shape=(), dflt=b'', pos=5)}
  byteorder := 'little'
  chunkshape := (1680,)
  autoindex := True
  colindexes := {
    "index": Index(6, mediumshuffle, zlib(1)).is_csi=False,
    "B": Index(6, mediumshuffle, zlib(1)).is_csi=False,
    "C": Index(6, mediumshuffle, zlib(1)).is_csi=False,
    "string": Index(6, mediumshuffle, zlib(1)).is_csi=False,
    "string2": Index(6, mediumshuffle, zlib(1)).is_csi=False}

In [99]:
store['df_dc']

Unnamed: 0,A,B,C,string,string2
2023-01-01,1.702872,0.308839,-0.365917,foo,cool
2023-01-02,0.851215,1.0,1.0,foo,cool
2023-01-03,-1.498664,1.0,1.0,foo,cool
2023-01-04,-0.259856,1.271397,-0.256739,foo,cool
2023-01-05,-1.4743,1.368424,-0.037265,,cool
2023-01-06,-0.054277,1.684516,1.270398,,cool
2023-01-07,-0.274158,0.875411,-0.446735,foo,cool
2023-01-08,-1.383003,1.533008,-2.200317,bar,cool


In [100]:
# Iterator
for df in store.select('df', chunksize=3):
    print(df)

                   A         B         C
2023-01-01  1.702872  0.308839 -0.365917
2023-01-02  0.851215  0.448810  0.165340
2023-01-03 -1.498664 -0.233697 -0.462251
                   A         B         C
2023-01-04 -0.259856  1.271397 -0.256739
2023-01-05 -1.474300  1.368424 -0.037265
2023-01-06 -0.054277  1.684516  1.270398
                   A         B         C
2023-01-07 -0.274158  0.875411 -0.446735
2023-01-08 -1.383003  1.533008 -2.200317


In [101]:
store.close()

In [103]:
# You can also use the iterator with read_hdf which will open, 
# then "automatically close" the store when finished iterating.
for df in pd.read_hdf('hdf.h5', 'df', chunksize=3):
    print(df)


ValueError: The file 'hdf.h5' is already opened, but not in read-only mode (as requested).

In [104]:
dfeq = pd.DataFrame({'number': np.arange(1, 11)})
dfeq

Unnamed: 0,number
0,1
1,2
2,3
3,4
4,5
5,6
6,7
7,8
8,9
9,10


In [105]:
store.open()
store.append('dfeq', dfeq, data_columns=['number'])

In [106]:
# list를 argument로 받아서, 일부를 slicing하고 이것들로 list를 만든다.   🌺㊙️ 
def chunks(l, n):
    return[l[i:i+n] for i in range(0, len(l), n)]

In [107]:
evens = [2, 4, 6, 8, 10]
coordinates = store.select_as_coordinates('dfeq', 'number=evens')
# 😵‍💫

for c in chunks(coordinates, 2):
    print(store.select('dfeq', where=c))

   number
1       2
3       4
   number
5       6
7       8
   number
9      10


In [108]:
# Advanced queries
store.select_column('df_dc', 'index')


0   2023-01-01
1   2023-01-02
2   2023-01-03
3   2023-01-04
4   2023-01-05
5   2023-01-06
6   2023-01-07
7   2023-01-08
Name: index, dtype: datetime64[ns]

In [109]:
store.select_column('df_dc', 'string')

0    foo
1    foo
2    foo
3    foo
4    NaN
5    NaN
6    foo
7    bar
Name: string, dtype: object

In [110]:
df_coord = pd.DataFrame(
    np.random.randn(1000, 2), index=pd.date_range("20000101", periods=1000)
)

In [111]:
store.append('df_coord', df_coord)
c = store.select_as_coordinates('df_coord', 'index > 20020101')
c

Index([732, 733, 734, 735, 736, 737, 738, 739, 740, 741,
       ...
       990, 991, 992, 993, 994, 995, 996, 997, 998, 999],
      dtype='int64', length=268)

In [112]:
store.select('df_coord', where=c)

Unnamed: 0,0,1
2002-01-02,-1.428768,-0.690780
2002-01-03,0.622003,-0.307886
2002-01-04,1.250637,0.673392
2002-01-05,0.759347,0.207316
2002-01-06,-1.016900,1.504435
...,...,...
2002-09-22,1.173989,2.830376
2002-09-23,1.538382,1.382405
2002-09-24,1.351019,-0.429503
2002-09-25,0.543693,-0.062825


In [115]:
# Selecting using a where mask   😵‍💫
df_mask = pd.DataFrame(
    np.random.randn(1000,2), index = pd.date_range('20230101', periods=1000)
)

store.append('df_mask', df_mask)
c = store.select_column('df_mask', 'index')
where = c[pd.DatetimeIndex(c).month == 5].index
store.select("df_mask", where = where)

Unnamed: 0,0,1
2023-05-01,-1.346260,0.221828
2023-05-02,0.753195,-0.593957
2023-05-03,0.710422,0.341018
2023-05-04,1.628852,0.667433
2023-05-05,-0.240896,-2.300389
...,...,...
2025-05-27,-2.508820,-1.811162
2025-05-28,1.005386,-0.019249
2025-05-29,-1.221461,0.491445
2025-05-30,0.163551,0.532799


In [116]:
# Storer object
# If you want to inspect the stored object, retrieve via get_storer. 
# You could use this programmatically to say get the number of rows in an object.
store.get_storer("df_dc").nrows

8

In [123]:
# Multiple table queries
# .append_to_multiple()
# .select_as_multiple()
df_mt = pd.DataFrame(
    np.random.randn(8,6),
    index=pd.date_range("1/1/2023", periods=8),
    columns=['A', 'B', 'C', 'D', 'E', 'F'],
)
df_mt['foo'] = 'bar'
df_mt.loc[df_mt.index[1], ["A", "B"]] = np.nan

In [124]:
df_mt

Unnamed: 0,A,B,C,D,E,F,foo
2023-01-01,-0.699808,0.403539,1.896242,0.094363,-0.417406,-1.87646,bar
2023-01-02,,,-0.487058,-0.147007,1.270127,-0.718027,bar
2023-01-03,2.003796,-1.712992,-1.963351,0.183264,-0.934968,0.192752,bar
2023-01-04,0.971725,0.473517,-0.300583,1.309548,-1.862831,1.177397,bar
2023-01-05,-0.372445,-0.878931,-0.546347,0.51144,0.574112,0.917205,bar
2023-01-06,0.259749,1.08443,0.975621,1.368922,0.608446,-0.60448,bar
2023-01-07,0.799187,2.388504,-1.023292,-0.324905,0.76747,0.062319,bar
2023-01-08,0.383035,0.044912,0.566036,-0.509358,-0.279289,-1.870118,bar


In [134]:
# You can also create the tables individually
# selector & None ㊙️
store.append_to_multiple(
    {"df1_mt":["A", "B"], "df2_mt":None}, df_mt, selector = "df1_mt"
)
store

<class 'pandas.io.pytables.HDFStore'>
File path: hdf.h5

In [135]:
# Individual tables were created
store.select("df1_mt")

Unnamed: 0,A,B
2023-01-01,-0.699808,0.403539
2023-01-02,,
2023-01-03,2.003796,-1.712992
2023-01-04,0.971725,0.473517
2023-01-05,-0.372445,-0.878931
2023-01-06,0.259749,1.08443
2023-01-07,0.799187,2.388504
2023-01-08,0.383035,0.044912


In [136]:
store.select("df2_mt")

Unnamed: 0,C,D,E,F,foo
2023-01-01,1.896242,0.094363,-0.417406,-1.87646,bar
2023-01-02,-0.487058,-0.147007,1.270127,-0.718027,bar
2023-01-03,-1.963351,0.183264,-0.934968,0.192752,bar
2023-01-04,-0.300583,1.309548,-1.862831,1.177397,bar
2023-01-05,-0.546347,0.51144,0.574112,0.917205,bar
2023-01-06,0.975621,1.368922,0.608446,-0.60448,bar
2023-01-07,-1.023292,-0.324905,0.76747,0.062319,bar
2023-01-08,0.566036,-0.509358,-0.279289,-1.870118,bar


In [138]:
store.select_as_multiple(
    ["df1_mt", "df2_mt"],
    where= ["A>0", "B>0"],
    selector="df1_mt",
)

Unnamed: 0,A,B,C,D,E,F,foo
2023-01-04,0.971725,0.473517,-0.300583,1.309548,-1.862831,1.177397,bar
2023-01-06,0.259749,1.08443,0.975621,1.368922,0.608446,-0.60448,bar
2023-01-07,0.799187,2.388504,-1.023292,-0.324905,0.76747,0.062319,bar
2023-01-08,0.383035,0.044912,0.566036,-0.509358,-0.279289,-1.870118,bar


In [None]:
# Delete from a table


In [None]:
# Notes & Caveats
# compression
# Enable compression for all objects within the fiel:
store_compresstion = pd.HDFStore(
    "store_compression.h5", complevel=9, complib="blosc:blosclz"
)

In [140]:
# Categorical data
dfcat = pd.DataFrame(
    {"A":pd.Series(list("aabbcdba")).astype("category"), "B":np.random.randn(8)}
)
dfcat

Unnamed: 0,A,B
0,a,-1.389342
1,a,-0.830936
2,b,0.068541
3,b,1.299523
4,c,-1.174021
5,d,-0.330031
6,b,-0.394783
7,a,-0.172721


In [141]:
dfcat.dtypes

A    category
B     float64
dtype: object

In [142]:
cstore = pd.HDFStore('cats.h5', mode='w')
cstore.append("dfcat", dfcat, format="table", data_columns=["A"])

In [143]:
result = cstore.select("dfcat", where="A in ['b', 'c']")
result

Unnamed: 0,A,B
2,b,0.068541
3,b,1.299523
4,c,-1.174021
6,b,-0.394783


In [144]:
# String columns
dfs = pd.DataFrame({"A":"food", "B":"bar"}, index=list(range(5)))
dfs

Unnamed: 0,A,B
0,food,bar
1,food,bar
2,food,bar
3,food,bar
4,food,bar


In [146]:
# A and B have a size of 30
store.append("dfs", dfs, min_itemsize=30)
store.get_storer("dfs").table

/dfs/table (Table(5,)) ''
  description := {
  "index": Int64Col(shape=(), dflt=0, pos=0),
  "values_block_0": StringCol(itemsize=30, shape=(2,), dflt=b'', pos=1)}
  byteorder := 'little'
  chunkshape := (963,)
  autoindex := True
  colindexes := {
    "index": Index(6, mediumshuffle, zlib(1)).is_csi=False}

In [147]:
store.append("dfs2", dfs, min_itemsize={"A":30})
store.get_storer("dfs2").table

/dfs2/table (Table(5,)) ''
  description := {
  "index": Int64Col(shape=(), dflt=0, pos=0),
  "values_block_0": StringCol(itemsize=3, shape=(1,), dflt=b'', pos=1),
  "A": StringCol(itemsize=30, shape=(), dflt=b'', pos=2)}
  byteorder := 'little'
  chunkshape := (1598,)
  autoindex := True
  colindexes := {
    "index": Index(6, mediumshuffle, zlib(1)).is_csi=False,
    "A": Index(6, mediumshuffle, zlib(1)).is_csi=False}

In [148]:
# nan_rep
dfss = pd.DataFrame({"A":["foo", "bar", "nan"]})
dfss

Unnamed: 0,A
0,foo
1,bar
2,


In [149]:
store.append("dfss", dfss)
store.select("dfss")

Unnamed: 0,A
0,foo
1,bar
2,


In [151]:
store.append("dfss2", dfss, nan_rep="_nan_")
store.select("dfss2")

Unnamed: 0,A
0,foo
1,bar
2,


In [155]:
dfss5 = pd.DataFrame({"A":["foo", "bar", "_nan_"]})
store.append("dfss5", dfss5, nan_rep="_nan_")
store.select("dfss5")

Unnamed: 0,A
0,foo
1,bar
2,
