# Reading Binary Files

In [None]:
# imports required but not shown in the video lecture.
from numpy import array, dtype, int32, memmap

In [None]:
# Create the sample binary files used throughout this notebook.
# Layout: an 8-byte header holding two little-endian int32 values
# (rows=6, cols=5), followed by 30 little-endian float64 values 0.0 .. 29.0.
# The payload must be a bytes literal (b'...'): the files are opened in
# binary mode, and on Python 3 writing a str to such a file raises TypeError.
content = (b'\x06\x00\x00\x00\x05\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xf0?'
           b'\x00\x00\x00\x00\x00\x00\x00@\x00\x00\x00\x00\x00\x00\x08@\x00\x00\x00\x00\x00\x00\x10@\x00'
           b'\x00\x00\x00\x00\x00\x14@\x00\x00\x00\x00\x00\x00\x18@\x00\x00\x00\x00\x00\x00\x1c@\x00\x00'
           b'\x00\x00\x00\x00 @\x00\x00\x00\x00\x00\x00"@\x00\x00\x00\x00\x00\x00$@\x00\x00\x00\x00\x00\x00&@'
           b'\x00\x00\x00\x00\x00\x00(@\x00\x00\x00\x00\x00\x00*@\x00\x00\x00\x00\x00\x00,@\x00\x00\x00\x00\x00'
           b'\x00.@\x00\x00\x00\x00\x00\x000@\x00\x00\x00\x00\x00\x001@\x00\x00\x00\x00\x00\x002@\x00\x00\x00'
           b'\x00\x00\x003@\x00\x00\x00\x00\x00\x004@\x00\x00\x00\x00\x00\x005@\x00\x00\x00\x00\x00\x006@\x00'
           b'\x00\x00\x00\x00\x007@\x00\x00\x00\x00\x00\x008@\x00\x00\x00\x00\x00\x009@\x00\x00\x00\x00\x00\x00:@'
           b'\x00\x00\x00\x00\x00\x00;@\x00\x00\x00\x00\x00\x00<@\x00\x00\x00\x00\x00\x00=@'
          )
# data.bin is the file mapped in the first section; writable.bin is the one
# opened for read/write in the mmap section.
for filename in ('data.bin', 'writable.bin'):
    with open(filename, 'wb') as f:
        f.write(content)

## Working With File Header

Create a dtype to represent the header.

In [None]:
# Header record: two int32 fields, `rows` followed by `cols`.
header_dtype = dtype({'names': ['rows', 'cols'],
                      'formats': [int32, int32]})

Create a memory mapped array using this dtype. Note the shape is empty.

In [None]:
# Map just the header, read-only. The empty shape `()` requests a single
# 0-d record of `header_dtype` rather than an array of records.
header = memmap(
    'data.bin',
    dtype=header_dtype,
    mode='r',
    shape=(),
)

Read the row and column sizes from this structured array.

In [None]:
# Pull both dimensions out of the header record by field name,
# then echo them as the cell output.
rows, cols = header['rows'], header['cols']
rows, cols

Create a memory map to the data segment, using rows, cols for shape information and the header size to determine the correct offset.

In [None]:
# Map the float64 payload that follows the header. `offset` skips the
# header bytes; mode "r+" opens the existing file for reading and writing.
data = memmap(
    'data.bin',
    dtype='float64',
    mode="r+",
    offset=header_dtype.itemsize,
    shape=(rows, cols),
)

In [None]:
# Display the memory-mapped (rows, cols) float64 array.
data

## Memory Maps with ndarray

`mmap` is a standard Python module for working with memory maps.

In [None]:
import mmap 
import numpy

Create a dtype to represent the header.

In [None]:
# Header record: two int32 fields, `rows` followed by `cols`.
# Field types are qualified with `numpy.` so this section relies only on
# its own `import numpy`, not on the `from numpy import ...` cell at the top.
header_dtype = numpy.dtype([('rows', numpy.int32), ('cols', numpy.int32)])

Open a file for read/write access in binary mode.

In [None]:
# Binary read/write handle. It is deliberately left open: the following
# cells pass f.fileno() to mmap. Call f.close() once the maps are done with.
f = open('writable.bin', mode='r+b')

Create a read-only memory map from the opened file with the correct size to read the header of the file.

In [None]:
# Read-only map covering exactly the header bytes at the start of the file.
mm = mmap.mmap(
    f.fileno(),
    header_dtype.itemsize,
    access=mmap.ACCESS_READ,
)

Create a new array using the ndarray constructor. The first argument is the shape, and we pass in the data type and the memory buffer to use (mm) as keyword arguments.

In [None]:
# Interpret the mapped header bytes as a single 0-d record of header_dtype.
header = numpy.ndarray((), dtype=header_dtype, buffer=mm)
# Convert the int32 fields to plain Python ints. Left as 0-d int32 arrays,
# a later size computation such as 8*rows*cols may be evaluated in int32
# and overflow for large files; Python ints have arbitrary precision.
rows = int(header['rows'])
cols = int(header['cols'])

In [None]:
# Display the dimensions read from the header.
rows, cols

Create a writable memory map to use for the data array. The map starts at the beginning of the file, so its size in bytes is the size of the header plus the size of a float64 (8) x rows x columns.

In [None]:
# Writable map spanning the header plus the payload:
# header bytes + 8 bytes per float64 * rows * cols.
mm = mmap.mmap(
    f.fileno(),
    header_dtype.itemsize + 8*rows*cols,
    access=mmap.ACCESS_WRITE,
)

Create our data array using this new memory map. Start the array's data at the memory location directly after the header using offset.

In [None]:
# View the mapped bytes as a (rows, cols) float64 array. `offset` skips the
# header so element [0, 0] is the first value after it.
data = numpy.ndarray(
    shape=(rows, cols),
    dtype='float64',
    buffer=mm,
    offset=header_dtype.itemsize,
)

In [None]:
# Display the array backed by the writable memory map.
data

## Structured Arrays

| Name         |      Time |       Value |
|:-------------|----------:|------------:|
| __char[12]__ | __int64__ | __float32__ |
| MSFT_profit  |        10 |        6.20 |
| GOOG_profit  |        12 |       -1.08 |
| MSFT_profit  |        18 |        8.40 |
| INTC_profit  |        25 |       -0.20 |
| ...          |       ... |         ... |
| ...          |       ... |         ... |
| GOOG_profit  |   1000325 |        3.20 |
| GOOG_profit  |   1000350 |        4.50 |
| INTC_profit  |   1000385 |       -1.05 |
| MSFT_profit  |   1000390 |        5.60 |


### memmap single array
Elements of an array can be any fixed-size data structure!

In [None]:
import numpy as np

# Record layout matching the table above: a 12-byte string name,
# an int64 timestamp and a float32 value.
fmt = np.dtype({
    'names': ['name', 'time', 'value'],
    'formats': ['S12', np.int64, np.float32],
})

# Sample records as (name, time, value) tuples.
v = [
    ('MSFT_profit', 10, 6.20),
    ('GOOG_profit', 12, -1.08),
    ('INTC_profit', 1000385, -1.05),
    ('MSFT_profit', 1000390, 5.60),
]

In [None]:
# Build a structured array with one `fmt` record per tuple in `v`,
# then show it as the cell output.
arr = np.array(
    v,
    dtype=fmt,
)
arr

Save the data to disk.

In [None]:
# Write the records to db.dat as raw bytes. The file carries no dtype or
# shape metadata, so readers must supply `fmt` themselves.
arr.tofile('db.dat')

And read it back with:

In [None]:
# Read every record from db.dat into a new in-memory array, interpreting
# the bytes with the same record layout, then display it.
arr2 = np.fromfile(
    'db.dat',
    dtype=fmt,
)
arr2

In [None]:
# Alternatively, memory-map the file instead of loading it. Mode 'c' is
# copy-on-write: assignments change the in-memory data but are not saved
# back to db.dat.
arr3 = np.memmap(
    'db.dat',
    mode='c',
    dtype=fmt,
)
arr3