# Numpy

<img src="figures/python-antigravity.jpg" width="70%">


In [1]:
import numpy as np

*numpy* is a module for operating on vectors (arrays), similar to Matlab or IDL. In combination with *ipython* and *matplotlib* you have a full interactive environment.

- **Why use numpy? Because it makes things much faster!**

In [2]:
def suma(N=10000000):
    """Add up the integers 0, 1, ..., N-1 with a plain Python loop.

    Serves as the slow, pure-Python baseline for the timing comparison
    with the NumPy version.
    """
    total = 0
    for value in range(N):
        total = total + value
    return total

print( suma())
%timeit suma()

49999995000000
573 ms ± 21.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [41]:
def suma_numpy(N=10000000):
    """Add up the integers 0, 1, ..., N-1 using a NumPy array.

    The array is created explicitly as 64-bit integers: on platforms where
    NumPy's default integer is 32-bit the sum for the default N
    (49999995000000) would silently overflow.
    """
    s = np.arange(N, dtype=np.int64).sum()
    return s

print(suma_numpy())
%timeit suma_numpy()

49999995000000
22.1 ms ± 1.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [4]:
573/21.

27.285714285714285

In [5]:
%%timeit
a = []
for i in range(1000000):
    a.append(i**2)

315 ms ± 1.82 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [6]:
%%timeit
i = np.arange(1000000)
a = i**2

1.06 ms ± 39.3 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [49]:
# 10 evenly spaced values from 1 to 100 (both ends included)
a = np.linspace(1, 100, num=10)

# 2 x 3 array filled with zeros
b = np.zeros(shape=(2, 3))

# 5 uninitialised elements (contents are arbitrary)
c = np.empty(shape=5)

# arrays of ones / zeros with the same shape and dtype as b
d = np.ones_like(b)

g = np.zeros_like(b)

# integers 0, 1, ..., 299
e = np.arange(300)

# 10 random floats drawn uniformly from [0, 1)
r = np.random.random(size=10)

In [48]:
b

array([[0., 0., 0.],
       [0., 0., 0.]])

In [16]:
r

array([0.14869381, 0.01695894, 0.65160381, 0.47226778, 0.8486556 ,
       0.05446601, 0.40535801, 0.3652012 , 0.57949566, 0.49584773])

In [17]:
r.shape

(10,)

In [18]:
r.reshape(2,5)

array([[0.14869381, 0.01695894, 0.65160381, 0.47226778, 0.8486556 ],
       [0.05446601, 0.40535801, 0.3652012 , 0.57949566, 0.49584773]])

In [19]:
r.reshape(2,5).shape

(2, 5)

In [21]:
np.array(  [4,5,6]  )

array([4, 5, 6])

In [22]:
np.array( [3,"ahoj", 14.4] )
# the values  must be of the same type,
# it's the price for speed

array(['3', 'ahoj', '14.4'], dtype='<U21')

In [25]:
x = np.array([4,5,6])
y = np.array([10,1,2])
x + y

array([14,  6,  8])

In [26]:
x * y

array([40,  5, 12])

In [27]:
x + 100

array([104, 105, 106])

In [28]:
import math
math.sin(1.)

0.8414709848078965

In [30]:
for i in x:
    print(math.sin(i))

-0.7568024953079282
-0.9589242746631385
-0.27941549819892586


In [34]:
np.cos(x)

array([-0.65364362,  0.28366219,  0.96017029])

In [36]:
print(x)
print(y)
np.dot(x,y)

[4 5 6]
[10  1  2]


57

In [37]:
np.matmul(x, y.transpose())

57

In [39]:
my_homemade_vectorized_sine = np.vectorize(math.sin)
my_homemade_vectorized_sine(x)

array([-0.7568025 , -0.95892427, -0.2794155 ])

## Vectorize whenever possible: the more for loops you replace, the faster your code will run!

In [42]:
%%timeit
x = np.random.random(size=(1000,1000))

for i in range(1000):
    for j in range(1000):
        x[i, j] = x[i, j] ** 2

518 ms ± 7.01 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [43]:
%%timeit
x = np.random.random(size=(1000,1000))
x = x**2

8.32 ms ± 235 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [51]:
moje_dlouha_promenna = 1

In [52]:
moje_dlouha_promenna

1

### Exercise:
use or import *numpy.linalg*, explore its capabilities and try to calculate the determinant and the eigenvalues of a matrix

In [53]:
import numpy as np

m = np.array( [   [1,2], [3,10]   ])
m

array([[ 1,  2],
       [ 3, 10]])

In [55]:
g = np.sin

In [56]:
g(3.14)

0.0015926529164868282

In [54]:
for f in [ np.linalg.eigvals, np.linalg.det ]:
    print(f(m))

[ 0.37652462 10.62347538]
3.999999999999999


In [72]:
a = np.array([[3,1], [1,2]])

b = np.array([[3.00000000000001,1], [1,2]])

np.allclose(a, b)

True

In [None]:
# Solve the linear system a @ x = b, i.e.
#   3*x0 + 1*x1 = 9
#   1*x0 + 2*x1 = 8
a = np.array([[3,1], [1,2]])

b = np.array([9,8])

x = np.linalg.solve(a, b)
print(x)

# Check that the solution is correct. The solution is computed in floating
# point, so compare within a tolerance instead of testing exact equality.
np.allclose(np.dot(a, x), b)



## Indexing


In [73]:
a = np.array( [3, 3.5, 99.1, 1., -12] )

In [74]:
a

array([  3. ,   3.5,  99.1,   1. , -12. ])

In [79]:
a[0]

3.0

In [82]:
a[2:4]

array([99.1,  1. ])

In [86]:
a[-1]

-12.0

In [88]:
a[3: ]

array([  1., -12.])

In [89]:
a[:2]

array([3. , 3.5])

In [94]:
b = np.arange(10)
b[2:8:2]

array([2, 4, 6])

In [95]:
b [:]

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [97]:
# Reverse the order of the elements by slicing with a negative step.
# This is very fast because it only changes the way the data
# in memory is accessed; the elements themselves are not moved.

b[::-1]

array([9, 8, 7, 6, 5, 4, 3, 2, 1, 0])

In [98]:
# exercise
def derivative():
    """Differences between neighbouring elements of 0..999, one pair at a time.

    Explicit-loop reference implementation, used for the timing comparison
    with the slicing-based version.
    """
    values = np.arange(1000)
    diffs = np.zeros(999, int)
    for k in range(len(values) - 1):
        # difference between element k+1 and element k
        diffs[k] = values[k + 1] - values[k]
    return diffs
%timeit x=derivative()

402 µs ± 6.27 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [100]:
def derivative_numpy():
    """Differences between neighbouring elements of 0..999 via array slicing."""
    values = np.arange(1000)
    # values[1:] is the array without its first element, values[:-1] the array
    # without its last one; subtracting them pairs element k+1 with element k.
    shifted, base = values[1:], values[:-1]
    return shifted - base
%timeit x=derivative_numpy()

2.33 µs ± 9.3 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)


In [101]:
m = np.array([[1,2],[3,4]])

In [103]:
m

array([[1, 2],
       [3, 4]])

In [108]:
m.sum(axis=0)

array([4, 6])

In [109]:
m.sum(axis=1)

array([3, 7])

In [110]:
m.shape

(2, 2)

In [111]:
m.ndim

2

In [112]:
m

array([[1, 2],
       [3, 4]])

In [113]:
m.transpose()

array([[1, 3],
       [2, 4]])

In [114]:
#Naive matrix-matrix multiplication: 1264 s (1000x1000 doubles)

def dot_naive(a,b,N):
    """Multiply the N x N matrices a and b with three nested Python loops.

    Returns a new N x N float64 array holding the matrix product.
    """
    product = np.zeros((N, N), dtype='f8')
    # np.ndindex walks over all (row, col) pairs in row-major order
    for r, s in np.ndindex(N, N):
        for k in range(N):
            product[r, s] += a[r, k] * b[k, s]
    return product
    
N = 100
a = np.random.rand(N,N)
b = np.random.rand(N,N)
%timeit dot_naive(a,b,N)

678 ms ± 24.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [115]:
#Vectorized matrix-matrix multiplication: 20 s (64x faster)
def dot(a,b, N=100):
    """Multiply the N x N matrices a and b, vectorising the innermost loop.

    Each output element is the sum of the element-wise product of one row
    of a with one column of b. Returns a new N x N float64 array.
    """
    product = np.empty((N, N), dtype='f8')
    # np.ndindex walks over all (row, col) pairs in row-major order
    for r, s in np.ndindex(N, N):
        row_times_col = a[r] * b[:, s]
        product[r, s] = np.sum(row_times_col)
    return product

%timeit dot(a,b)

54.6 ms ± 303 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Fancy indexing

In [None]:
x = np.random.random(10)
x

In [None]:
x[ 2 ]

In [None]:
x > 0.5

In [None]:
x [ x > 0.5 ]

In [None]:
np.savetxt("datafile.dat",b.transpose()[:,0:2])
x, y = np.loadtxt("datafile.dat", unpack = True)

import matplotlib.pyplot as plt
plt.plot(np.sort(x),y**3,"o:")

In [None]:
x, y = np.loadtxt("datafile.dat", unpack = True)
ind = np.argsort(x)
x = x[ind]
y = y[ind]

In [None]:
import matplotlib.pyplot as plt
plt.plot(np.sort(x),y**3,"o:")

In [None]:
# indexing one array with an array of integers ("fancy indexing"):
# y[x] picks the elements of y at the positions listed in x

x = np.random.randint(10, size=10)
y = np.arange(10)

print(y[ x ])

In [None]:
x = np.random.random(10)
y = np.arange(10)

ind = np.argsort(x)

x

In [None]:
ind

In [None]:
print(x[ind])
print(y[ind])

## Access by reference (copy)

In [None]:
x =  np.arange(10.)
y = x
print(x)
print(y)

In [None]:
x[4] =  -1.5
print(x)

In [None]:
print(y) # huh?

- idea of *pointers* to the beginning of a memory block (like in C)
- c[2] - offset from the beginning
- c[i,j] - offset = ( i $\times$ ncols + j ) $\times$ itemsize (for the default row-major, C-style layout)
- that's why accessing the array and some operations like *transpose()* or *reshape* are so fast: they just change the way the desired offset is calculated

**"strides" - bytes to jump to get to the next element in each dimension**

<img src="figures/strides.png" width="80%">

In [None]:
x = np.arange(12).reshape((3,4))
x

In [None]:
x.shape, x.dtype, x.dtype.itemsize, x.strides

In [None]:
# Back to the original question: how do we duplicate the whole array
# instead of just creating a second name for the same data?

x = np.arange(10, dtype=float)
y = np.copy(x)   # y gets its own data, independent of x
x[5] = -1.5      # modifies x only
print(x, y, sep="\n")

In [None]:
mat1 = np.array([[1, 2, 3], [4, 5, 6]])
mat2 = np.array([[7, 8, 9], [10, 11, 12]])
mat1.shape, mat2.shape

In [None]:
np.concatenate([mat1, mat2])

In [None]:
np.concatenate([mat1, mat2], axis=1)

## Broadcasting

In [None]:
a = np.arange(10).reshape(5,2)
b = np.arange(2)
# b = np.arange(5).reshape(5,1)

In [None]:
a

In [None]:
b

In [None]:
a + b

In [None]:
a = np.arange(3)
b = np.arange(3)[:, np.newaxis]

print(a)
print(b)

In [None]:
a+b

### Rules of Broadcasting
Broadcasting in NumPy follows a strict set of rules to determine the interaction between the two arrays:

- Rule 1: If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.
- Rule 2: If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.
- Rule 3: If in any dimension the sizes disagree and neither is equal to 1, an error is raised.

The broadcasting rules are straightforward—mostly:

1. Compare dimensions, starting from the last.
2. Match when either dimension is one or None, or if dimensions are equal

<img src="figures/broadcasting1d.png" width="50%">

In [None]:
np.array([ 0, 1, 2, 3]) + 3

<img src="figures/broadcasting.png" width="80%">

The safest approach is to compute (matrix + scalar), or compute with matrices of the same size

but sometimes it can be useful for creating a grid

In [None]:
a = np.arange(3)
print(a)
a.shape

In [None]:
a[:, np.newaxis].shape

In [None]:
# Plotting a two-dimensional function

# x and y have 50 steps from 0 to 5
x = np.linspace(0, 5, 50)
y = np.linspace(0, 5, 50)[:, np.newaxis]

z = np.sin(x) ** 10 + np.cos(10 + y * x) * np.cos(x)

z.shape

In [None]:
plt.imshow(z, origin='lower', extent=[0, 5, 0, 5],
           cmap='viridis')
plt.colorbar();

## Mandelbrot

We construct the Mandelbrot set, i.e. all the black points shown above. The set is calculated as follows:

Given a complex number z, make a copy of the number (call it c), and then perform the following operation recursively:

$$z = z^2 + c$$

If we repeat this an infinite number of times (not very practical!), the magnitude of the result will either blow up to infinity or stay bounded. All the points c for which the magnitude stays bounded are part of the Mandelbrot set; the points whose magnitudes go to infinity are outside of it.

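We clearly cannot compute an infinite number of iterations, so we make a compromise. We fix a number of iterations and an escape threshold (50 iterations and a magnitude of 100 in the code below) and say that any point whose magnitude has not exceeded the threshold after all the iterations belongs to the Mandelbrot set. For the points that do escape, we record the iteration at which it happened and use it as the colour.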



In [None]:

re = np.linspace(-2, 1, 1000)
im = np.linspace(-1.5, 1.5, 1000)

x, y = np.meshgrid(re, im)

In [None]:
np.meshgrid( np.array([1,2,3]) , np.array([1,2]) )

In [None]:
z = x + 1j*y
z.shape

In [None]:
fractal = np.zeros(z.shape)

In [None]:
ITERATIONS = 50       # number of times z -> z**2 + c is applied
DENSITY = 200         # grid points per axis
ESCAPE_RADIUS = 100   # a point counts as escaped once |z| exceeds this

x_min, x_max = -2, 1
y_min, y_max = -1.5, 1.5

x, y = np.meshgrid(np.linspace(x_min, x_max, DENSITY),
                   np.linspace(y_min, y_max, DENSITY))

c = x + 1j*y # complex grid
z = c.copy()

# 255 marks "never escaped"; escaped points get a value proportional to the
# iteration at which they first left the escape radius.
fractal = np.full(z.shape, 255.0)

# True for the points that have not escaped yet. Escaped points are no longer
# iterated, so their values cannot overflow to inf/nan.
active = np.ones(z.shape, dtype=bool)

for n in range(ITERATIONS):

    # --- Uncomment to see different sets ---

    # Tricorn
    # z = z.conj()

    # Burning ship
    #z = abs(z.real) + 1j*abs(z.imag)

    z[active] = z[active]**2 + c[active]

    # Record the escape iteration only the first time a point escapes;
    # otherwise later iterations would overwrite it.
    escaped = active & (np.abs(z) > ESCAPE_RADIUS)
    fractal[escaped] = 254 * n / float(ITERATIONS)
    active &= ~escaped

In [None]:
import matplotlib.pyplot as plt

plt.imshow(np.log(fractal), cmap=plt.cm.hot,
           extent=(x_min, x_max, y_min, y_max))
plt.title('Mandelbrot Set')
plt.xlabel('Re(z)')
plt.ylabel('Im(z)');

## Monte Carlo calculating Pi

<img src="figures/calc-pi.png" width="50%">

In [None]:
# %%timeit
N = 1000000
M = 0
for i in range(N):
    x = np.random.random()
    y = np.random.random()
    if x**2 + y**2 <= 1:
        M += 1

In [None]:
print(4*M/N)

In [None]:
%%timeit
h = np.random.random(size=(N,2))
M = (h[:,0]**2 + h[:,1]**2 <= 1 ).sum()
pi = 4*M/N

In [None]:
print(pi)

In [None]:
%%timeit
h = np.random.random(size=(N,2))
# M = len(np.argwhere(h[:,0]**2 + h[:,1]**2 <= 1 ))
M = len(np.argwhere(np.linalg.norm(h, axis=1) <= 1 ))
pi = 4*M/N

In [None]:
print(pi)

## Exercise
Calculate by Monte Carlo an integral of a function $$f=\sin(30 x)/(x+0.1)^2$$
on the interval [0,1]

In [None]:
def f(x):
    """Integrand of the exercise, shifted upwards by 20.

    Evaluates sin(30 x) / (x + 0.1)**2 + 20. The constant 20 is not part of
    the function given in the exercise text (presumably it is added so that
    the curve stays positive for the hit-or-miss estimate), so integrals of
    this function over [0, 1] are larger by 20 than those of the unshifted one.
    """
    oscillation = np.sin(30*x)
    damping = (x+0.1)**2
    return oscillation/damping + 20

In [None]:
x = np.linspace(0,1,100)
plt.plot(x, f(x))

In [None]:
from scipy.integrate import quad

In [None]:
quad(f,0,1)

In [None]:
N = 10000000
x = np.random.random(size=(N))
y = 70*np.random.random(size=(N))
M = (y <= f(x) ).sum()
print(70*M/N)

They say this one does not have a closed-form analytic solution

$$\int_0^4 \sqrt[4]{15x^3+21x^2+41x+3}e^{-0.5x}dx$$

## Monte Carlo integration (even faster version)

$$E_f[h(X)] = \int_a^b h(x) f(x) dx$$ 

$$f(x) = \frac{1}{b - a}, \rm{\ i.e. \ } x \sim Unif(a,b)$$

$$E_f[h(X)] = \frac{1}{b - a} \int_a^b h(x) dx$$

$$(b-a) \frac{1}{N} \sum_{i=1}^N h(x_i) \approx \int_a^b h(x) dx$$

In [None]:
# integral_5^{20} \frac{x}{(x+1)^3}dx = 0.10629

def h(x):
    """Test integrand x / (1 + x)**3; works for scalars and NumPy arrays."""
    denominator = (1.+x)**3
    return x/denominator

In [None]:
quad(h,5,20)

In [None]:
N = 100000
a, b = 5., 20.
x = np.random.uniform(a,b,N)
h_x = h(x)
integ = np.mean(h_x)*(b-a)
print(integ)

In [None]:
def mc_int(f=f, a=0., b=1., N= 100000, verbose=True):
    """Estimate the integral of f over [a, b] by Monte Carlo sampling.

    Draws N points uniformly from [a, b], averages f over them and multiplies
    the mean by the interval length (b - a).

    Parameters
    ----------
    f : callable
        Integrand, called once with the whole array of sample points.
        Defaults to the function `f` that exists when this cell is run.
    a, b : float
        Integration limits.
    N : int
        Number of random sample points.
    verbose : bool
        If True, print the estimate.

    Returns
    -------
    The Monte Carlo estimate of the integral.
    """
    samples = np.random.uniform(a,b,N)
    values = f(samples)
    width = b-a
    estimate = np.mean(values)*width
    if verbose:
        print(estimate)
    return estimate

In [None]:
n = 1000*np.arange(1,200)
y = []
for i in n:
    y.append(mc_int(f=h,N=i, verbose=False))

true_int = quad(h,0,1)[0]
plt.plot(n,y)
plt.axhline(y=true_int,color="red", linestyle=":")
plt.xlabel("#iterations")

In [None]:
true_int

In [None]:
n = 1000*(np.arange(200)+1)
y = np.empty(200)
for j,i in enumerate(n):
    y[j] = (mc_int(f=h,N=i, verbose=False))

true_int = quad(h,0,1)[0]
plt.plot(n,y)
plt.axhline(y=true_int,color="red", linestyle=":")
plt.xlabel("#iterations");

## list or np.array?

- If you don't know the length of the array in advance and if you need to append and the length is not too high, use list

- otherwise create a big enough array beforehand

```
y = np.empty(N)
```
- numpy can also "append" an array, but it's slow, you need to allocate a new array and copy the data to it and delete the old one 

```
y = np.append(y, value)
```

In [None]:
# dict {}

d = {}
d['jmeno'] = 'Martin'
d['vyska'] = 1.74
d['tloustka'] = 85.5
d['pocet deti'] = 1
d[2] = "dvojka"
d[True] = "nevim"
d



In [None]:
# The same dictionary written as a single literal.
other_d = {
    'jmeno': 'Martin',
    'vyska': 1.74,
    'tloustka': 85.5,
    'pocet deti': 1,
    2: 'dvojka',
    True: 'nevim',
}

other_d.keys()

In [None]:
for k,v in other_d.items():
    print(k,"->", v)

In [None]:
labels = ["Ia", "IIb", "Ia", "Ib/c", "Ia", "IIp", "Ib/c","IIb","Ia"]

# Count how many times each label occurs.
freq = {}
for label in labels:
    if label not in freq:
        # first time this label is seen
        freq[label] = 1
    else:
        freq[label] += 1
    # freq[label] = freq.get(label, 0) + 1
    # is a more general way of writing this: .get() is not vulnerable to
    # accessing a non-existing key; if the key does not exist, it returns 0

freq

