# More on Data I/O

## Imports

In [1]:
import numpy as np
np.random.seed(0)

import pandas as pd
import csv
import json
import h5py
import tables
import pickle
import msgpack

# CSV

In [2]:
!head -n 5 players-top30.csv

# 2013-2014 / Regular Season / All Skaters / Summary / Points
Rank,Player,Team,Pos,GP,G,A,P,+/-,PIM,PPG,PPP,SHG,SHP,GW,OT,S,S%,TOI/GP,Shift/GP,FO%
1,Sidney Crosby,PIT,C,80,36,68,104,+18,46,11,38,0,0,5,1,259,13.9,21:58,24.0,52.5
2,Ryan Getzlaf,ANA,C,77,31,56,87,+28,31,5,23,0,0,7,1,204,15.2,21:17,25.2,49.0
3,Claude Giroux,PHI,C,82,28,58,86,+7,46,7,37,0,0,7,1,223,12.6,20:26,25.1,52.9


In [3]:
rows = []

In [4]:
# Read every record of players.csv into a list of field lists.
# newline="" hands newline handling to the csv module, as the csv
# documentation asks for any file object passed to csv.reader.
with open("players.csv", newline="") as f:
    csvreader = csv.reader(f)
    rows = list(csvreader)

In [5]:
rows[1][1:6]

['Player', 'Team', 'Pos', 'GP', 'G']

In [6]:
rows[2][1:6]

['Sidney Crosby', 'PIT', 'C', '80', '36']

In [7]:
data = np.random.randn(100, 3)

In [8]:
# `comments` is the prefix savetxt writes in front of the header. It is used
# here to emit a leading "# ..." description line, followed by the bare
# "x, y, z" column-name line.
np.savetxt(
    "data.csv",
    data,
    delimiter=",",
    header="x, y, z",
    comments="# Random x, y, z coordinates\n",
)

In [9]:
!head -n 5 data.csv

# Random x, y, z coordinates
x, y, z
1.764052345967664026e+00,4.001572083672232938e-01,9.787379841057392005e-01
2.240893199201457797e+00,1.867557990149967484e+00,-9.772778798764110153e-01
9.500884175255893682e-01,-1.513572082976978872e-01,-1.032188517935578448e-01


In [10]:
data_load = np.loadtxt("data.csv", skiprows=2, delimiter=",")

In [11]:
data_load[1,:]

array([ 2.2408932 ,  1.86755799, -0.97727788])

In [12]:
data_load.dtype

dtype('float64')

In [13]:
# Round-trip check: array_equal compares shape as well as every element,
# so a shape mismatch gives False instead of being silently broadcast.
np.array_equal(data, data_load)

True

In [14]:
data = np.loadtxt("players.csv", skiprows=2, delimiter=",", dtype=bytes)

In [15]:
data[0][1:6]

array([b'Sidney Crosby', b'PIT', b'C', b'80', b'36'], dtype='|S13')

In [16]:
np.loadtxt("players.csv", skiprows=2, delimiter=",", usecols=[6,7,8])

array([[ 68., 104.,  18.],
       [ 56.,  87.,  28.],
       [ 58.,  86.,   7.],
       [ 47.,  84.,  16.],
       [ 39.,  82.,  32.]])

In [17]:
df = pd.read_csv("players.csv", skiprows=1)

In [18]:
df = df.set_index("Rank")

In [19]:
df[["Player", "GP", "G", "A", "P"]]

Unnamed: 0_level_0,Player,GP,G,A,P
Rank,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,Sidney Crosby,80,36,68,104
2,Ryan Getzlaf,77,31,56,87
3,Claude Giroux,82,28,58,86
4,Tyler Seguin,80,37,47,84
5,Corey Perry,81,43,39,82


In [20]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5 entries, 1 to 5
Data columns (total 20 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Player    5 non-null      object 
 1   Team      5 non-null      object 
 2   Pos       5 non-null      object 
 3   GP        5 non-null      int64  
 4   G         5 non-null      int64  
 5   A         5 non-null      int64  
 6   P         5 non-null      int64  
 7   +/-       5 non-null      int64  
 8   PIM       5 non-null      int64  
 9   PPG       5 non-null      int64  
 10  PPP       5 non-null      int64  
 11  SHG       5 non-null      int64  
 12  SHP       5 non-null      int64  
 13  GW        5 non-null      int64  
 14  OT        5 non-null      int64  
 15  S         5 non-null      int64  
 16  S%        5 non-null      float64
 17  TOI/GP    5 non-null      object 
 18  Shift/GP  5 non-null      float64
 19  FO%       5 non-null      float64
dtypes: float64(3), int64(13), object(4)


In [21]:
df[["Player", "GP", "G", "A", "P"]].to_csv("players-subset.csv")

In [22]:
!head -n 5 players-subset.csv

Rank,Player,GP,G,A,P
1,Sidney Crosby,80,36,68,104
2,Ryan Getzlaf,77,31,56,87
3,Claude Giroux,82,28,58,86
4,Tyler Seguin,80,37,47,84


# HDF5

## h5py

In [23]:
import h5py

In [24]:
# mode = "w", "r", "w-", "r+", "a"
# Open data.h5 with mode "w". Note that the File object's .mode attribute,
# inspected in the following cell, reports 'r+' rather than 'w'.
f = h5py.File("data.h5", "w")

In [25]:
f.mode

'r+'

In [26]:
f.flush()

In [27]:
f.close()

In [28]:
f = h5py.File("data.h5", "w")

In [29]:
f.name

'/'

In [30]:
grp1 = f.create_group("experiment1")

In [31]:
grp1.name

'/experiment1'

In [32]:
grp2_meas = f.create_group("experiment2/measurement")

In [33]:
grp2_meas.name

'/experiment2/measurement'

In [34]:
grp2_sim = f.create_group("experiment2/simulation")

In [35]:
grp2_sim.name

'/experiment2/simulation'

In [36]:
f["/experiment1"]

<HDF5 group "/experiment1" (0 members)>

In [37]:
f["/experiment2/simulation"]

<HDF5 group "/experiment2/simulation" (0 members)>

In [38]:
grp_expr2 = f["/experiment2"]

In [39]:
grp_expr2['simulation']

<HDF5 group "/experiment2/simulation" (0 members)>

In [40]:
list(f.keys())

['experiment1', 'experiment2']

In [41]:
list(f.items())

[('experiment1', <HDF5 group "/experiment1" (0 members)>),
 ('experiment2', <HDF5 group "/experiment2" (2 members)>)]

In [42]:
# Walk the whole file and print each object's name; print is passed directly
# as the visitor callback.
f.visit(print)

experiment1
experiment2
experiment2/measurement
experiment2/simulation


In [43]:
# Walk the whole file and print each (name, object) pair; print is passed
# directly as the visitor callback.
f.visititems(print)

experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (0 members)>
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>


In [44]:
"experiment1" in f

True

In [45]:
"simulation" in f["experiment2"]

True

In [46]:
"experiment3" in f

False

In [47]:
f.flush()

In [48]:
# NOTE(review): the recorded output of this cell is "unable to open file" --
# presumably because data.h5 is still open for writing in this kernel (it has
# only been flushed, not closed). Confirm, and close the file first if the
# listing is wanted.
!h5ls -r data.h5

data.h5: unable to open file


In [49]:
data1 = np.arange(10)

In [50]:
data2 = np.random.randn(100, 100)

In [51]:
f["array1"] = data1

In [52]:
f["/experiment2/measurement/meas1"] = data2

In [53]:
# List every group and dataset now in the file; print is passed directly as
# the visitor callback.
f.visititems(print)

array1 <HDF5 dataset "array1": shape (10,), type "<i8">
experiment1 <HDF5 group "/experiment1" (0 members)>
experiment2 <HDF5 group "/experiment2" (2 members)>
experiment2/measurement <HDF5 group "/experiment2/measurement" (1 members)>
experiment2/measurement/meas1 <HDF5 dataset "meas1": shape (100, 100), type "<f8">
experiment2/simulation <HDF5 group "/experiment2/simulation" (0 members)>


In [54]:
ds = f["array1"]

In [55]:
ds

<HDF5 dataset "array1": shape (10,), type "<i8">

In [56]:
ds.name

'/array1'

In [57]:
ds.dtype

dtype('int64')

In [58]:
ds.shape

(10,)

In [59]:
ds.len()

10

In [60]:
ds[:]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [61]:
ds = f["/experiment2/measurement/meas1"]

In [62]:
ds

<HDF5 dataset "meas1": shape (100, 100), type "<f8">

In [63]:
ds.dtype

dtype('<f8')

In [64]:
ds.shape

(100, 100)

In [65]:
data_full = ds[...]

In [66]:
type(data_full)

numpy.ndarray

In [67]:
data_full.shape

(100, 100)

In [68]:
data_col = ds[:, 0]

In [69]:
data_col.shape

(100,)

In [70]:
ds[10:20:3, 10:20:3]

array([[ 0.60270766, -0.34804638, -0.813596  , -1.29737966],
       [ 0.91320192, -1.06343294,  0.22734595,  0.52759738],
       [ 1.25774422, -0.32775492,  1.4849256 ,  0.28005786],
       [-0.84907287, -0.30000358,  1.79691852, -0.19871506]])

In [71]:
ds[[1,2,3], :].shape

(3, 100)

In [72]:
ds[[1,2,3], :].shape

(3, 100)

In [73]:
mask = ds[:, 0] > 2.0

In [74]:
mask.shape, mask.dtype

((100,), dtype('bool'))

In [75]:
ds[mask, 0]

array([2.04253623, 2.1041854 , 2.05689385])

In [76]:
ds[mask, :5]

array([[ 2.04253623, -0.91946118,  0.11467003, -0.1374237 ,  1.36552692],
       [ 2.1041854 ,  0.22725706, -1.1291663 , -0.28133197, -0.7394167 ],
       [ 2.05689385,  0.18041971, -0.06670925, -0.02835398,  0.48480475]])

In [77]:
# create empty data sets, assign and update datasets

In [78]:
ds = f.create_dataset("array2", data=np.random.randint(10, size=10))

In [79]:
ds

<HDF5 dataset "array2": shape (10,), type "<i8">

In [80]:
ds[:]

array([0, 2, 2, 4, 7, 3, 7, 2, 4, 1])

In [81]:
ds = f.create_dataset("/experiment2/simulation/data1", shape=(5, 5), fillvalue=-1)

In [82]:
ds

<HDF5 dataset "data1": shape (5, 5), type "<f4">

In [83]:
ds[:]

array([[-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.],
       [-1., -1., -1., -1., -1.]], dtype=float32)

In [84]:
# Declare a very large gzip-compressed dataset. Only the shape, fill value and
# compression filter are given; no data array is passed in.
ds = f.create_dataset(
    "/experiment1/simulation/data1",
    shape=(5000, 5000, 5000),
    fillvalue=0,
    compression="gzip",
)

In [85]:
ds

<HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">

In [86]:
ds[:, 0, 0] = np.random.rand(5000)

In [87]:
ds[1, :, 0] += np.random.rand(5000)

In [88]:
ds[:2, :5, 0]

array([[0.6939344 , 0.        , 0.        , 0.        , 0.        ],
       [1.4819994 , 0.01639538, 0.54387355, 0.11130908, 0.9928771 ]],
      dtype=float32)

In [89]:
ds.fillvalue

0.0

In [90]:
# Print each (name, object) pair below the experiment1 group; print is passed
# directly as the visitor callback.
f["experiment1"].visititems(print)

simulation <HDF5 group "/experiment1/simulation" (1 members)>
simulation/data1 <HDF5 dataset "data1": shape (5000, 5000, 5000), type "<f4">


In [91]:
# Logical (uncompressed) size of the dataset in GiB, computed from metadata
# only: element count times bytes per element. Plain Python ints are used so
# the 5000**3 element count cannot overflow a fixed-width integer, and no
# element has to be read from the file just to learn its byte size.
ds.size * ds.dtype.itemsize / 1024**3  # GiB

465.66128730773926

In [92]:
f.flush()

In [93]:
f.filename

'data.h5'

In [94]:
!ls -lh data.h5

-rw-r--r-- 1 schuang schuang 357K Aug 15 11:35 data.h5


In [95]:
del f["/experiment1/simulation/data1"]

In [96]:
# Print what is left below the experiment1 group; print is passed directly as
# the visitor callback.
f["experiment1"].visititems(print)

simulation <HDF5 group "/experiment1/simulation" (0 members)>


In [97]:
f.close()

In [98]:
# attributes

In [99]:
f = h5py.File("data.h5", "r+")  # read/write mode

In [100]:
f.attrs

<Attributes of HDF5 object at 140288431591344>

In [101]:
f.attrs["desc"] = "Result sets from experiments and simulations"

In [102]:
f["experiment1"].attrs["date"] = "2015-1-1"

In [103]:
f["experiment2"].attrs["date"] = "2015-1-2"

In [104]:
f["experiment2/simulation/data1"].attrs["k"] = 1.5

In [105]:
f["experiment2/simulation/data1"].attrs["T"] = 1000

In [106]:
list(f["experiment1"].attrs.keys())

['date']

In [107]:
list(f["experiment2/simulation/data1"].attrs.items())

[('T', 1000), ('k', 1.5)]

In [108]:
"T" in f["experiment2/simulation/data1"].attrs

True

In [109]:
del f["experiment2/simulation/data1"].attrs["T"]

In [110]:
"T" in f["experiment2/simulation/data1"].attrs

False

In [111]:
f["experiment2/simulation/data1"].attrs["t"] = np.array([1, 2, 3])

In [112]:
f["experiment2/simulation/data1"].attrs["t"]

array([1, 2, 3])

In [113]:
f.close()

# JSON

In [114]:
data = ["string", 1.0, 2, None]

In [115]:
data_json = json.dumps(data)

In [116]:
data_json

'["string", 1.0, 2, null]'

In [117]:
data2 = json.loads(data_json)

In [118]:
# Show the list decoded from the JSON string (data2), not the original input.
data2

['string', 1.0, 2, None]

In [119]:
# First element of the list decoded from the JSON string.
data2[0]

'string'

In [120]:
data = {"one": 1, "two": 2.0, "three": "three"}

In [121]:
data_json = json.dumps(data)

In [122]:
print(data_json)

{"one": 1, "two": 2.0, "three": "three"}


In [123]:
data = json.loads(data_json)

In [124]:
data["two"]

2.0

In [125]:
data["three"]

'three'

In [126]:
# Three lists of increasing length, keyed by name.
data = dict(one=[1], two=[1, 2], three=[1, 2, 3])

In [127]:
# indent expects an integer (or string). The previous indent=True only worked
# because bool is an int subclass equal to 1; indent=1 gives the same layout.
data_json = json.dumps(data, indent=1)

In [128]:
print(data_json)

{
 "one": [
  1
 ],
 "two": [
  1,
  2
 ],
 "three": [
  1,
  2,
  3
 ]
}


In [129]:
# Nested sample document: a list, a dict, a list of tuples and a string.
data = dict(
    one=[1],
    two=dict(one=1, two=2),
    three=[(1,), (1, 2), (1, 2, 3)],
    four="a text string",
)

In [130]:
# Serialize `data` to data.json. The encoding is given explicitly so the file
# is written as UTF-8 regardless of the platform's default locale encoding.
with open("data.json", "w", encoding="utf-8") as f:
    json.dump(data, f)

In [131]:
!cat data.json

{"one": [1], "two": {"one": 1, "two": 2}, "three": [[1], [1, 2], [1, 2, 3]], "four": "a text string"}

In [132]:
# Read data.json back in. The file is decoded explicitly as UTF-8 instead of
# the platform's default locale encoding.
with open("data.json", "r", encoding="utf-8") as f:
    data_from_file = json.load(f)

In [133]:
data_from_file["two"]

{'one': 1, 'two': 2}

In [134]:
data_from_file["three"]

[[1], [1, 2], [1, 2, 3]]

In [135]:
!head -n 20 stations.json

{
    "C": {
        "color": "#149848", 
        "transfers": [
            [
                "C3", 
                "F15"
            ], 
            [
                "C4", 
                "Z2"
            ], 
            [
                "C4", 
                "G2"
            ], 
            [
                "C7", 
                "M14"
            ], 


In [136]:
!wc stations.json

 1471  1508 27638 stations.json


In [137]:
# Load stations.json. The file is decoded explicitly as UTF-8 so the result
# does not depend on the platform's default locale encoding.
with open("stations.json", "r", encoding="utf-8") as f:
    data = json.load(f)

In [138]:
data.keys()

dict_keys(['C', 'G', 'F', 'H', 'M', 'N', 'T', 'Y', 'Z'])

In [139]:
data["C"].keys()

dict_keys(['color', 'transfers', 'travel_times'])

In [140]:
data["C"]["color"]

'#149848'

In [141]:
data["C"]["transfers"]

[['C3', 'F15'],
 ['C4', 'Z2'],
 ['C4', 'G2'],
 ['C7', 'M14'],
 ['C7', 'N6'],
 ['C7', 'G6'],
 ['C8', 'M15'],
 ['C8', 'H6'],
 ['C9', 'H7'],
 ['C9', 'Y18'],
 ['C11', 'T9'],
 ['C11', 'M18'],
 ['C11', 'Z8'],
 ['C12', 'M19'],
 ['C18', 'H21']]

In [142]:
# Entries of data["C"]["travel_times"] whose third field equals 1, as tuples.
[
    (start, end, travel_time)
    for start, end, travel_time in data["C"]["travel_times"]
    if travel_time == 1
]

[('C3', 'C4', 1), ('C7', 'C8', 1), ('C9', 'C10', 1)]

In [143]:
#data