<img src="http://hilpisch.com/tpq_logo.png" alt="The Python Quants" width="35%" align="right" border="0"><br>

# Python for Finance (2nd ed.)

**Mastering Data-Driven Finance**

&copy; Dr. Yves J. Hilpisch | The Python Quants GmbH

<img src="http://hilpisch.com/images/py4fi_2nd_shadow.png" width="300px" align="left">

# Numerical Computing with NumPy

## Arrays with Python Lists

In [154]:
v = [0.5, 0.75, 1.0, 1.5, 2.0]  

In [155]:
m = [v, v, v]  
m  

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [156]:
m[1]

[0.5, 0.75, 1.0, 1.5, 2.0]

In [157]:
m[1][0]

0.5

In [158]:
v1 = [0.5, 1.5]
v2 = [1, 2]
m = [v1, v2]
c = [m, m]  
c

[[[0.5, 1.5], [1, 2]], [[0.5, 1.5], [1, 2]]]

In [159]:
c[1][1][0]

1

In [160]:
v = [0.5, 0.75, 1.0, 1.5, 2.0]
m = [v, v, v]
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [161]:
v[0] = 'Python'
m

[['Python', 0.75, 1.0, 1.5, 2.0],
 ['Python', 0.75, 1.0, 1.5, 2.0],
 ['Python', 0.75, 1.0, 1.5, 2.0]]

In [162]:
from copy import deepcopy
v = [0.5, 0.75, 1.0, 1.5, 2.0]
m = 3 * [deepcopy(v), ]  
m

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

In [163]:
v[0] = 'Python'  
m  

[[0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0],
 [0.5, 0.75, 1.0, 1.5, 2.0]]

## Interlude: Python Array Class

In [164]:
v = [0.5, 0.75, 1.0, 1.5, 2.0]

In [165]:
import array

In [166]:
a = array.array('f', v)  
a

array('f', [0.5, 0.75, 1.0, 1.5, 2.0])

In [167]:
a.append(0.5)  
a

array('f', [0.5, 0.75, 1.0, 1.5, 2.0, 0.5])

In [168]:
a.extend([5.0, 6.75])  
a

array('f', [0.5, 0.75, 1.0, 1.5, 2.0, 0.5, 5.0, 6.75])

In [169]:
2 * a  

array('f', [0.5, 0.75, 1.0, 1.5, 2.0, 0.5, 5.0, 6.75, 0.5, 0.75, 1.0, 1.5, 2.0, 0.5, 5.0, 6.75])

In [170]:
# causes intentional error
# a.append('string')  

In [171]:
a.tolist()  

[0.5, 0.75, 1.0, 1.5, 2.0, 0.5, 5.0, 6.75]

In [172]:
f = open('array.apy', 'wb')  
a.tofile(f)  
f.close()  

In [173]:
with open('array.apy', 'wb') as f:  
    a.tofile(f)  

In [174]:
!ls -n arr*  

'ls' �����ڲ����ⲿ���Ҳ���ǿ����еĳ���
���������ļ���


In [175]:
b = array.array('f')  

In [176]:
with open('array.apy', 'rb') as f:  
    b.fromfile(f, 5)  

In [177]:
b  

array('f', [0.5, 0.75, 1.0, 1.5, 2.0])

In [178]:
b = array.array('d')  

In [179]:
with open('array.apy', 'rb') as f:
    b.fromfile(f, 2)  

In [180]:
b  

array('d', [0.0004882813645963324, 0.12500002956949174])

## Regular NumPy Arrays

### The Basics

In [181]:
import numpy as np  

In [182]:
a = np.array([0, 0.5, 1.0, 1.5, 2.0])  
a

array([0. , 0.5, 1. , 1.5, 2. ])

In [183]:
type(a)  

numpy.ndarray

In [184]:
a = np.array(['a', 'b', 'c'])  
a

array(['a', 'b', 'c'], dtype='<U1')

In [185]:
a = np.arange(2, 20, 2)  
a

array([ 2,  4,  6,  8, 10, 12, 14, 16, 18])

In [186]:
a = np.arange(8, dtype=np.float)  
a

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [187]:
a[5:]  

array([5., 6., 7.])

In [188]:
a[:2]  

array([0., 1.])

In [189]:
a.sum()  

28.0

In [190]:
a.std()  

2.29128784747792

In [191]:
a.cumsum()  

array([ 0.,  1.,  3.,  6., 10., 15., 21., 28.])

In [192]:
l = [0., 0.5, 1.5, 3., 5.]
2 * l  

[0.0, 0.5, 1.5, 3.0, 5.0, 0.0, 0.5, 1.5, 3.0, 5.0]

In [193]:
a

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [194]:
2 * a  

array([ 0.,  2.,  4.,  6.,  8., 10., 12., 14.])

In [195]:
a ** 2  

array([ 0.,  1.,  4.,  9., 16., 25., 36., 49.])

In [196]:
2 ** a  

array([  1.,   2.,   4.,   8.,  16.,  32.,  64., 128.])

In [197]:
a ** a  

array([1.00000e+00, 1.00000e+00, 4.00000e+00, 2.70000e+01, 2.56000e+02,
       3.12500e+03, 4.66560e+04, 8.23543e+05])

In [198]:
np.exp(a)  

array([1.00000000e+00, 2.71828183e+00, 7.38905610e+00, 2.00855369e+01,
       5.45981500e+01, 1.48413159e+02, 4.03428793e+02, 1.09663316e+03])

In [199]:
np.sqrt(a)  

array([0.        , 1.        , 1.41421356, 1.73205081, 2.        ,
       2.23606798, 2.44948974, 2.64575131])

In [200]:
np.sqrt(2.5)  

1.5811388300841898

In [201]:
import math  

In [202]:
math.sqrt(2.5)  

1.5811388300841898

In [203]:
# causes intentional error
# math.sqrt(a)  

In [204]:
%timeit np.sqrt(2.5)  

736 ns ± 30 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)


In [205]:
%timeit math.sqrt(2.5)  

115 ns ± 3.92 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)


### Multiple Dimensions

In [206]:
b = np.array([a, a * 2])  
b

array([[ 0.,  1.,  2.,  3.,  4.,  5.,  6.,  7.],
       [ 0.,  2.,  4.,  6.,  8., 10., 12., 14.]])

In [207]:
b[0]  

array([0., 1., 2., 3., 4., 5., 6., 7.])

In [208]:
b[0, 2]  

2.0

In [209]:
b[:, 1]  

array([1., 2.])

In [210]:
b.sum()  

84.0

In [211]:
b.sum(axis=0)  

array([ 0.,  3.,  6.,  9., 12., 15., 18., 21.])

In [212]:
b.sum(axis=1)  

array([28., 56.])

In [213]:
c = np.zeros((2, 3), dtype='i', order='C')  
c

array([[0, 0, 0],
       [0, 0, 0]], dtype=int32)

In [214]:
c = np.ones((2, 3, 4), dtype='i', order='C')  
c

array([[[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]],

       [[1, 1, 1, 1],
        [1, 1, 1, 1],
        [1, 1, 1, 1]]], dtype=int32)

In [265]:
d = np.zeros_like(c, dtype='float', order='C')  
d

array([[[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]],

       [[0., 0., 0., 0.],
        [0., 0., 0., 0.],
        [0., 0., 0., 0.]]])

In [266]:
d = np.ones_like(c, dtype='float', order='C')  
d

array([[[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]],

       [[1., 1., 1., 1.],
        [1., 1., 1., 1.],
        [1., 1., 1., 1.]]])

In [217]:
e = np.empty((2, 3, 2))  
e

array([[[0. , 0.5],
        [1. , 1.5],
        [2. , 2.5]],

       [[3. , 3.5],
        [4. , 4.5],
        [5. , 5.5]]])

In [218]:
f = np.empty_like(c)  
f

array([[[         0,          0,          0, 1071644672],
        [         0, 1072693248,          0, 1073217536],
        [         0, 1073741824,          0, 1074003968]],

       [[         0, 1074266112,          0, 1074528256],
        [         0, 1074790400,          0, 1074921472],
        [         0, 1075052544,          0, 1075183616]]], dtype=int32)

In [219]:
np.eye(5)  

array([[1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1.]])

In [220]:
g = np.linspace(5, 15, 12) 
g

array([ 5.        ,  5.90909091,  6.81818182,  7.72727273,  8.63636364,
        9.54545455, 10.45454545, 11.36363636, 12.27272727, 13.18181818,
       14.09090909, 15.        ])

### Meta-Information

In [221]:
g.size  

12

In [222]:
g.itemsize  

8

In [223]:
g.ndim  

1

In [224]:
g.shape  

(12,)

In [225]:
g.dtype  

dtype('float64')

In [226]:
g.nbytes  

96

### Reshaping, Resizing, Stacking, Flattening

In [227]:
g = np.arange(15)

In [228]:
g

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [229]:
g.shape  

(15,)

In [230]:
np.shape(g) 

(15,)

In [231]:
g.reshape((3, 5))  

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [232]:
g.reshape((-1, 5))

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14]])

In [233]:
h = g.reshape((5, 3))  
h

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [234]:
h.T  

array([[ 0,  3,  6,  9, 12],
       [ 1,  4,  7, 10, 13],
       [ 2,  5,  8, 11, 14]])

In [235]:
h.transpose()  

array([[ 0,  3,  6,  9, 12],
       [ 1,  4,  7, 10, 13],
       [ 2,  5,  8, 11, 14]])

In [236]:
g

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [237]:
np.resize(g, (3, 1))  

array([[0],
       [1],
       [2]])

In [238]:
np.resize(g, (1, 5))  

array([[0, 1, 2, 3, 4]])

In [239]:
np.resize(g, (2, 5))  

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

In [240]:
n = np.resize(g, (5, 4))  
n

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14,  0],
       [ 1,  2,  3,  4]])

In [241]:
h

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [242]:
np.hstack((h, 2 * h))  

array([[ 0,  1,  2,  0,  2,  4],
       [ 3,  4,  5,  6,  8, 10],
       [ 6,  7,  8, 12, 14, 16],
       [ 9, 10, 11, 18, 20, 22],
       [12, 13, 14, 24, 26, 28]])

In [243]:
np.vstack((h, 0.5 * h))  

array([[ 0. ,  1. ,  2. ],
       [ 3. ,  4. ,  5. ],
       [ 6. ,  7. ,  8. ],
       [ 9. , 10. , 11. ],
       [12. , 13. , 14. ],
       [ 0. ,  0.5,  1. ],
       [ 1.5,  2. ,  2.5],
       [ 3. ,  3.5,  4. ],
       [ 4.5,  5. ,  5.5],
       [ 6. ,  6.5,  7. ]])

In [244]:
h

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [245]:
h.flatten()  

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [246]:
h.flatten(order='C')  

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [247]:
h.flatten(order='F')  

array([ 0,  3,  6,  9, 12,  1,  4,  7, 10, 13,  2,  5,  8, 11, 14])

In [248]:
for i in h.flat:  
    print(i, end=',')

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,

In [249]:
for i in h.ravel(order='C'):  
    print(i, end=',')

0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,

In [250]:
for i in h.ravel(order='F'):  
    print(i, end=',')

0,3,6,9,12,1,4,7,10,13,2,5,8,11,14,

### Boolean Arrays

In [251]:
h

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [252]:
h > 8  

array([[False, False, False],
       [False, False, False],
       [False, False, False],
       [ True,  True,  True],
       [ True,  True,  True]])

In [253]:
h <= 7  

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True, False],
       [False, False, False],
       [False, False, False]])

In [254]:
h == 5  

array([[False, False, False],
       [False, False,  True],
       [False, False, False],
       [False, False, False],
       [False, False, False]])

In [255]:
(h == 5).astype(int)  

array([[0, 0, 0],
       [0, 0, 1],
       [0, 0, 0],
       [0, 0, 0],
       [0, 0, 0]])

In [256]:
(h > 4) & (h <= 12)  

array([[False, False, False],
       [False, False,  True],
       [ True,  True,  True],
       [ True,  True,  True],
       [ True, False, False]])

In [257]:
h[h > 8]  

array([ 9, 10, 11, 12, 13, 14])

In [258]:
h[(h > 4) & (h <= 12)]  

array([ 5,  6,  7,  8,  9, 10, 11, 12])

In [259]:
h[(h < 4) | (h >= 12)]  

array([ 0,  1,  2,  3, 12, 13, 14])

In [260]:
np.where(h > 7, 1, 0)  

array([[0, 0, 0],
       [0, 0, 0],
       [0, 0, 1],
       [1, 1, 1],
       [1, 1, 1]])

In [261]:
np.where(h % 2 == 0, 'even', 'odd')  

array([['even', 'odd', 'even'],
       ['odd', 'even', 'odd'],
       ['even', 'odd', 'even'],
       ['odd', 'even', 'odd'],
       ['even', 'odd', 'even']], dtype='<U4')

In [262]:
np.where(h <= 7, h * 2, h / 2)  

array([[ 0. ,  2. ,  4. ],
       [ 6. ,  8. , 10. ],
       [12. , 14. ,  4. ],
       [ 4.5,  5. ,  5.5],
       [ 6. ,  6.5,  7. ]])

### Speed Comparison

In [263]:
import random
I = 5000

In [268]:
%%timeit
mat = [[random.gauss(0, 1) for j in range(I)] for i in range(I)]  

15.9 s ± 100 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [269]:
mat[0][:5]  

array([ 0.50960172, -2.37661639,  1.270615  , -0.48007585, -0.63673628])

In [270]:
%%timeit
sum([sum(l) for l in mat])  

3.69 s ± 28.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [271]:
import sys
print("%d MB" % (sum([sys.getsizeof(l) for l in mat])/1024/1024))

0 MB


In [272]:
%%timeit
mat = np.random.standard_normal((I, I))  

643 ms ± 2.04 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [273]:
mat = np.random.standard_normal((I, I)) 

In [274]:
%%timeit 
mat.sum()  

26.5 ms ± 158 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [275]:
print(mat.nbytes)
print(mat.nbytes / 1024 / 1024)  

200000000
190.73486328125


In [276]:
sys.getsizeof(mat)  

200000112

### Structured Arrays

In [277]:
dt = np.dtype([('Name', 'S10'), ('Age', 'i4'),
               ('Height', 'f'), ('Children/Pets', 'i4', 2)])  

In [278]:
dt  

dtype([('Name', 'S10'), ('Age', '<i4'), ('Height', '<f4'), ('Children/Pets', '<i4', (2,))])

In [279]:
dt = np.dtype({'names': ['Name', 'Age', 'Height', 'Children/Pets'],
             'formats':'O int float int,int'.split()})  

In [280]:
dt  

dtype([('Name', 'O'), ('Age', '<i4'), ('Height', '<f8'), ('Children/Pets', [('f0', '<i4'), ('f1', '<i4')])])

In [281]:
s = np.array([('Smith', 45, 1.83, (0, 1)),
              ('Jones', 53, 1.72, (2, 2))], dtype=dt)  

In [282]:
s  

array([('Smith', 45, 1.83, (0, 1)), ('Jones', 53, 1.72, (2, 2))],
      dtype=[('Name', 'O'), ('Age', '<i4'), ('Height', '<f8'), ('Children/Pets', [('f0', '<i4'), ('f1', '<i4')])])

In [283]:
type(s)  

numpy.ndarray

In [284]:
s['Name']  

array(['Smith', 'Jones'], dtype=object)

In [285]:
s['Height'].mean()  

1.775

In [286]:
s[0]  

('Smith', 45, 1.83, (0, 1))

In [287]:
s[1]['Age']  

53

## Vectorization of Code

In [288]:
np.random.seed(100)
r = np.arange(12).reshape((4, 3))  
s = np.arange(12).reshape((4, 3)) * 0.5  

In [289]:
r  

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [290]:
print(s)
print(np.arange(0, 6, 0.5).reshape(4, -1))

[[0.  0.5 1. ]
 [1.5 2.  2.5]
 [3.  3.5 4. ]
 [4.5 5.  5.5]]
[[0.  0.5 1. ]
 [1.5 2.  2.5]
 [3.  3.5 4. ]
 [4.5 5.  5.5]]


In [291]:
r + s  

array([[ 0. ,  1.5,  3. ],
       [ 4.5,  6. ,  7.5],
       [ 9. , 10.5, 12. ],
       [13.5, 15. , 16.5]])

In [292]:
r + 3  

array([[ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11],
       [12, 13, 14]])

In [293]:
2 * r  

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16],
       [18, 20, 22]])

In [294]:
2 * r + 3  

array([[ 3,  5,  7],
       [ 9, 11, 13],
       [15, 17, 19],
       [21, 23, 25]])

In [295]:
r

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [296]:
r.shape

(4, 3)

In [297]:
s = np.arange(0, 12, 4)  
s  

array([0, 4, 8])

In [298]:
r + s  

array([[ 0,  5, 10],
       [ 3,  8, 13],
       [ 6, 11, 16],
       [ 9, 14, 19]])

In [299]:
s = np.arange(0, 12, 3)  
s  

array([0, 3, 6, 9])

In [300]:
# causes intentional error
# r + s  

In [301]:
r.transpose() + s  

array([[ 0,  6, 12, 18],
       [ 1,  7, 13, 19],
       [ 2,  8, 14, 20]])

In [302]:
sr = s.reshape(-1, 1)  
sr

array([[0],
       [3],
       [6],
       [9]])

In [303]:
sr.shape  

(4, 1)

In [304]:
r + s.reshape(-1, 1)  

array([[ 0,  1,  2],
       [ 6,  7,  8],
       [12, 13, 14],
       [18, 19, 20]])

In [305]:
def f(x):
    return 3 * x + 5  

In [306]:
f(0.5)  

6.5

In [307]:
f(r)  

array([[ 5,  8, 11],
       [14, 17, 20],
       [23, 26, 29],
       [32, 35, 38]])

## Memory Layout

Cf. http://eli.thegreenplace.net/2015/memory-layout-of-multi-dimensional-arrays/

使用小型数组时，这对数组操作性能没有任何可测的影响。但是，当数组很大时情况就有所不同，这取决于具体在数组上实施的操作。这时内存布局就派上用场了

In [321]:
x = np.random.standard_normal((1000000, 5))  

In [322]:
y = 2 * x + 3  

In [310]:
C = np.array((x, y), order='C')  

In [311]:
F = np.array((x, y), order='F')  

In [312]:
x = 0.0; y = 0.0  

In [313]:
C[:2].round(2)  

array([[[-1.75,  0.34,  1.15, -0.25,  0.98],
        [ 0.51,  0.22, -1.07, -0.19,  0.26],
        [-0.46,  0.44, -0.58,  0.82,  0.67],
        ...,
        [-0.05,  0.14,  0.17,  0.33,  1.39],
        [ 1.02,  0.3 , -1.23, -0.68, -0.87],
        [ 0.83, -0.73,  1.03,  0.34, -0.46]],

       [[-0.5 ,  3.69,  5.31,  2.5 ,  4.96],
        [ 4.03,  3.44,  0.86,  2.62,  3.51],
        [ 2.08,  3.87,  1.83,  4.63,  4.35],
        ...,
        [ 2.9 ,  3.28,  3.33,  3.67,  5.78],
        [ 5.04,  3.6 ,  0.54,  1.65,  1.26],
        [ 4.67,  1.54,  5.06,  3.69,  2.07]]])

In [314]:
%timeit C.sum()  

10.4 ms ± 21.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [315]:
%timeit F.sum()  

10.4 ms ± 38.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [316]:
%timeit C.sum(axis=0)  

27.8 ms ± 218 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [317]:
%timeit C.sum(axis=1)  

31 ms ± 2.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [318]:
%timeit F.sum(axis=0)  

65.5 ms ± 1.16 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [319]:
%timeit F.sum(axis=1)  

66.4 ms ± 771 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [323]:
F = 0.0; C = 0.0  

计算所有元素总和时，内存布局不重要；

加总C顺序的ndarray对象不管按行和按列都更快（绝对的速度优势）；对于C顺序（行优先）的ndarray对象，按行加总相对快于按列加总；

对于F顺序（列优先）的ndarray对象，按列行加总相对快于按行加总。