This numpy tutorial is based on a freely available version from freeCodeCamp.

### Load in NumPy (remember to pip install numpy first)

In [2]:
import numpy as np

### Some Basics

In [None]:
# np.array(.) creates a numpy array
a = np.array([1,2,3], dtype='int32')
print(a)

In [None]:
# a 2D array
b = np.array([[9.0,8.0,7.0],[6.0,5.0,4.0]])
print(b)

In [None]:
# a 3D array
c = np.array([[[9,8],[6,5]],
              [[1,2],[4,5]]])
print(b)

In [None]:
# Get Dimension
a.ndim

In [None]:
# Get Shape
b.shape

In [None]:
# Get datatype of array
a.dtype

In [None]:
# Get array size on disk
a.nbytes

In [None]:
# Get number of elements
a.size

In [None]:
 # create a range with a (start, stop, stepsize)
d = np.arange(0,10,2)
print(d)

In [None]:
 # create a range with a (start, stop, number_of_steps)
d = np.linspace(0,10,2)
print(d)

### Tasks

1. Create a 2D np.array with shape 2x3 (2 elements in one dimension and 3 in the other) 

1.1 Print your array

1.2 Print the np.shape of your array

In [None]:
a = np.array([[2,3],[4,5]])
print(a)
print(a.shape)

2. Suppose you want to create the x-axis values for a dataset in order to plot them. Your data were collected on every second day (starting with day 1 and ending with day 11). Create an array containing the x-axis values.

In [None]:
x = np.arange(1,12,2)

### Accessing/Changing specific elements, rows, columns, etc
remember: indexing starts at 0 !

In [6]:
a = np.array([[1,2,3,4,5,6,7],[8,9,10,11,12,13,14]])
print(a)

[[ 1  2  3  4  5  6  7]
 [ 8  9 10 11 12 13 14]]


In [7]:
# Get a specific element [row, column]
a[1, 5]

13

In [8]:
# Get a specific row (the colon (:) indicates "all elements in the dimension")
a[0, :]

array([1, 2, 3, 4, 5, 6, 7])

In [9]:
# Get a part of a specific row
a[0, 3:]

array([4, 5, 6, 7])

In [None]:
# Get a specific column
a[:, 2]

In [None]:
# Get part of a row defined by [startindex : endindex : stepsize]
a[0, 1:-1:2]

In [10]:
a[1,5] = 20

a[:,2] = [1,2]
print(a)

[[ 1  2  1  4  5  6  7]
 [ 8  9  2 11 12 20 14]]


*3-d example

In [15]:
b = np.array([[[1,2],[3,4]],[[5,6],[7,8]]])
print(b)

[[[1 2]
  [3 4]]

 [[5 6]
  [7 8]]]


In [None]:
# Get specific element (work outside in)
b[0,1,1]

In [16]:
# replace 
b[:,1,:] = [[9,9],[8,8]]
print(b)

[[[1 2]
  [9 9]]

 [[5 6]
  [8 8]]]


some advanced functions

In [19]:
# np.nonzero(): returns an array containing the indices of non-zero elements
c = np.array([[0,0,0,1,0,1],
              [0,1,0,0,0,1],
              [1,0,0,1,0,1]])
d = np.nonzero(c[0])
print(d)
e = np.nonzero(c[2])
print(e)

(array([3, 5], dtype=int64),)
(array([0, 3, 5], dtype=int64),)


In [4]:
# np.diff():  get the differences between array elements
np.diff(e)

array([[3, 2]], dtype=int64)

combining numpy with python lists to write results

In [None]:
# use a loop to go through the rows of array c and make a computation for each row, which you append to a python list
list = []     #initialize an empty python list
for i in range(len(c)):
    print(i)    # just to make sure the index is going through the rows
    list.extend(np.nonzero(c[i]))
print(list)


### Tasks

1. Print array a, then use indexing to print the 3rd element of the second row (should be a 2). 

In [12]:
print(a)
a[1,2]

[[ 1  2  1  4  5  6  7]
 [ 8  9  2 11 12 20 14]]


2

2. Print the first column of a

In [13]:
a[:,0]

array([1, 8])

3. Print array b. Determine the indexes of the "6" and print it.

In [18]:
b
b[1,0,1]

6

4. Write a for-loop to go through the rows of array c, then use np.nonzero and np.diff to get the intervals between ones. Use a list to record your results.

In [24]:
intervals = []
for i in c:
    intervals.append(np.diff(np.nonzero(i)))


[[2]]


### Initializing Different Types of Arrays

"empty" arrays:
This is something you may need when e.g. later filling an array in a for-loop.
An 'empty' arry needs to be 'initialized', i.e. you need to predetermine the size of the array and create it in memory.

random number arrays:
np.random.. provides an easy way to create arrays of random numbers following various distributions e.g. normal, uniform, ....


In [25]:
# All 0s matrix
a = np.zeros((2,3))

In [26]:
# All 1s matrix
np.ones((4,2,2), dtype='int32')

array([[[1, 1],
        [1, 1]],

       [[1, 1],
        [1, 1]],

       [[1, 1],
        [1, 1]],

       [[1, 1],
        [1, 1]]])

In [27]:
# Any other number
np.full((2,2), 99)

array([[99, 99],
       [99, 99]])

In [28]:
# Any other number with shape like a (full_like)
np.full_like(a, 4)

array([[4., 4., 4.],
       [4., 4., 4.]])

Pseudo-random numbers: Pseudo means that they are not really random, but can e.g. be reproduced by setting np.random.seed(seed=1)

In [29]:
# pseudo-random numbers between 0 and 1 (uniform probability)
c = np.random.rand(4,2)
print(c)

[[0.82168219 0.53428451]
 [0.26349947 0.5550383 ]
 [0.35960065 0.10734795]
 [0.46845408 0.72073121]]


In [30]:
# Random Integer values
np.random.randint(-4,8, size=(3,3))

array([[ 3, -4,  1],
       [-1, -1,  3],
       [ 3,  3,  3]])

In [31]:
# pseudo-random numbers from normal distribution (mean, standard dev, array_size)
d = np.random.normal(2,1,50)
print(d)

[1.52251934 0.99261446 2.04402405 1.34998055 4.24099776 2.57849941
 0.33584055 1.22248767 2.06948166 2.72807455 2.62406559 1.427819
 1.39361659 1.5257364  0.37949441 1.02466979 0.1254741  3.32478248
 1.47574715 1.47686434 2.43557409 2.53167151 1.11740861 1.21173751
 1.16664485 2.94882185 2.67362377 2.590009   1.90378287 2.4431925
 2.73170267 2.33976077 1.98897225 0.75640216 3.16747824 3.35798543
 2.54645816 1.8851147  2.38466287 1.08552531 1.89704128 0.72634011
 2.97954632 2.03776917 3.21666747 1.36453607 1.93598309 2.50099309
 2.25052463 2.93620175]


In [32]:
# Choose 10 random entries from array a(e.g. for shuffling analyses)
e = np.random.choice(d,10)
print(e)

[1.427819   1.89704128 1.22248767 2.73170267 0.75640216 1.89704128
 2.62406559 1.08552531 2.97954632 2.03776917]


In [34]:
# Repeat an array
arr = np.array([[1,2,3]])
r1 = np.repeat(arr,3, axis=0)
print(r1)

[[1 2 3]
 [1 2 3]
 [1 2 3]]


##### Be careful when copying arrays!!!
a simple equation (e.g. b=a) acts as a pointer, meaning changing b will also change a

In [35]:
a = np.array([1,2,3])
b = a
print(a)

b[0] = 100
print(a)

[1 2 3]
[100   2   3]


In [36]:

a = np.array([1,2,3])
b = a.copy()
print(a)

b[0] = 100
print(a)


[1 2 3]
[1 2 3]


### Tasks

1. Create an array of length 50 filled with random numbers from a normal distribution with mean=2 and standard deviation=1

In [40]:
a = np.random.normal(2,1,50)
a

array([0.7702314 , 2.69879055, 2.00078273, 2.04971133, 2.86597662,
       1.0122366 , 2.56220986, 2.58200494, 1.79188619, 2.1210321 ,
       1.26411857, 1.8773244 , 0.57608585, 2.17333298, 2.40132679,
       1.12251013, 1.25223052, 0.40094504, 2.98044771, 1.00350566,
       1.25250341, 1.27692919, 2.27112455, 1.78159883, 1.69288281,
       2.94137585, 2.0965407 , 1.99449826, 0.92009299, 1.62587348,
       1.7656709 , 3.22449195, 3.490684  , 3.26878908, 0.29825883,
       1.45552126, 1.53696281, 1.84866592, 2.37455815, 1.42968876,
       2.64854423, 0.0082529 , 2.26347507, 3.15414263, 0.80145293,
       1.01153527, 1.73967253, 0.95517949, 3.18667131, 1.97344174])

2. Create 5 arrays of length 10 containing random integers between 0 and 100

In [None]:
a = np.random.rand()*100
b = np.random.rand()*100
c = np.random.rand()*100
d = np.random.rand()*100
e = np.random.rand()*100

3. Find the online documentation for np.random.choice (simply google it). Create an integer array from 0 to 10, draw random samples from it, and determine what the "replace" argument does.

In [44]:
a = np.arange(11)
sample = np.random.choice(a,5, replace=True)
# replace=True means that elements can be chosen again, even after they have been chosen once
print(sample)

[0 9]


### Mathematics

In [None]:
a = np.array([1,2,3,4])
print(a)

In [None]:
a + 2

In [None]:
a - 2

In [None]:
a * 2

In [None]:
a / 2

In [None]:
b = np.array([1,0,1,0])
a + b

In [None]:
a ** 2

In [None]:
# Take the cosinus
np.cos(a)



In [None]:
# For a lot more (https://docs.scipy.org/doc/numpy/reference/routines.math.html)

##### Linear Algebra
We're assuming you're not that interested in linear algebra. However, just so you've seen numpy can do this kind of stuff...

In [None]:
# matrix multiplication
a = np.ones((2,3))
print(a)

b = np.full((3,2), 2)
print(b)

np.matmul(a,b)

In [None]:
# Find the determinant of a matrix
c = np.identity(3)
np.linalg.det(c)

In [None]:
## Reference docs (https://docs.scipy.org/doc/numpy/reference/routines.linalg.html)

# Determinant
# Trace
# Singular Vector Decomposition
# Eigenvalues
# Matrix Norm
# Inverse
# Etc...

##### Statistics

In [45]:
a = np.array([[1,2,3],[4,5,6]])
a

array([[1, 2, 3],
       [4, 5, 6]])

In [46]:
np.min(a)

1

In [47]:
np.max(a, axis=1)

array([3, 6])

In [48]:
np.sum(a, axis=0)

array([5, 7, 9])

In [49]:
# mean of entire array
np.mean(a)

3.5

In [50]:
#row means
np.mean(a, axis=1)

array([2., 5.])

In [51]:
# standard deviation within rows
np.std(a,axis=1)

array([0.81649658, 0.81649658])

In [52]:
# generate noisy data and compute pearson correlation coefficient using np.corrcoef()
# note that corrcoef returns the whole matrix of correllations, where the entries in the main diagonal represent data correlated with itself (correlation =1)

x_data = np.linspace(0,10,20)
slope = 0.5
y_data = x_data * slope + np.random.normal(0,1,20)

np.corrcoef(x_data,y_data)


array([[1.        , 0.79772056],
       [0.79772056, 1.        ]])

### Tasks

1. Get the column means of a.

In [56]:
a.mean(axis=0)


array([2.5, 3.5, 4.5])

2. For the correlation analysis, vary the the degree of noise (i.e. the standard deviation of normally distributed noise, say 2).
Try to do this in a for-loop, where you first create an array with 5 values (say 0.5 to 2.5). You then loop through these values print out the correlation resulting from each value using indexing within the array given back by np.corrcoef.

In [58]:
noise = np.linspace(0.5,2.5,5)
x_data = np.linspace(0,10,20)
slope = 0.5

for n in noise:
    y_data = x_data * slope + np.random.normal(0,n,20)
    result = np.corrcoef(x_data,y_data)
    print(result[1,0])
    

0.9503552413814107
0.820141281086161
0.7504694103025611
0.7672018051364452
0.5089730585441093


### Reorganizing Arrays

In [59]:
before = np.array([[1,2,3,4],[5,6,7,8]])
print(before)

after = before.reshape((4,2))
print(after)

[[1 2 3 4]
 [5 6 7 8]]
[[1 2]
 [3 4]
 [5 6]
 [7 8]]


In [60]:
# Vertically stacking vectors
v1 = np.array([1,2,3,4])
v2 = np.array([5,6,7,8])

np.vstack([v1,v2,v1,v2])

array([[1, 2, 3, 4],
       [5, 6, 7, 8],
       [1, 2, 3, 4],
       [5, 6, 7, 8]])

In [61]:
# Horizontal  stack
h1 = np.ones((2,4))
h2 = np.zeros((2,2))

h = np.hstack((h1,h2))
h

array([[1., 1., 1., 1., 0., 0.],
       [1., 1., 1., 1., 0., 0.]])

In [62]:
# dealing with missing data (NaNs)
h[1,1]= np.NaN
h

array([[ 1.,  1.,  1.,  1.,  0.,  0.],
       [ 1., nan,  1.,  1.,  0.,  0.]])

In [63]:
# standard statistics like mean are not defined for NaNs, but you can use np.nanmean() to simply ignore nans
means = np.mean(h,axis=1)
print(means)

nan_means = np.nanmean(h,axis=1)
print(nan_means)

[0.66666667        nan]
[0.66666667 0.6       ]


Now we loop through different combinations of slopes and noise values (say 10 values each), in a nested for-loop to see how their combination affects the correlation coefficient.

In [64]:
# example of a nested for loop
# the enumerate function returns the elements of an array (say s) preceded by indexes (say i_s)

slopes = np.linspace(0,5,10)
noises = np.linspace(0,5,10)
x_data = np.linspace(0,10,20)
results = np.zeros([10,10])

for i_s,s in enumerate(slopes):
    for i_n, n in enumerate(noises):
        y_data = x_data * s + np.random.normal(0,n,20)
        results[i_s,i_n] = np.corrcoef(x_data,y_data)[1,0]

results

  c /= stddev[:, None]
  c /= stddev[None, :]


array([[        nan, -0.06949493, -0.34300908,  0.59446194, -0.12462711,
        -0.22563949, -0.07089255, -0.05423858, -0.19879673, -0.03615629],
       [ 1.        ,  0.96522079,  0.85104613,  0.75374575,  0.61810279,
         0.41004857,  0.46742942,  0.46332524,  0.2686808 ,  0.69991763],
       [ 1.        ,  0.98476596,  0.96572282,  0.90904165,  0.78513486,
         0.65229185,  0.51267103,  0.6706932 ,  0.72207124,  0.57613876],
       [ 1.        ,  0.99590382,  0.97848811,  0.9333918 ,  0.9263547 ,
         0.91603942,  0.86160833,  0.75233224,  0.72967994,  0.62799845],
       [ 1.        ,  0.99692366,  0.99008686,  0.98388879,  0.93617856,
         0.9068069 ,  0.92966864,  0.92273088,  0.86591973,  0.84214494],
       [ 1.        ,  0.99884802,  0.99348194,  0.98500269,  0.9657021 ,
         0.94555631,  0.94898495,  0.94473735,  0.87444819,  0.93029982],
       [ 1.        ,  0.99821188,  0.99434337,  0.98514227,  0.97443654,
         0.98109328,  0.94990387,  0.94910012

Go through the code carefully until you understand it. Then write some code where you vary slopes also into the negative, keeping noise constant at 1, but changing the bias (i.e. the mean of the gaussian noise function).

In [67]:
slopes = np.linspace(-5,0,10)
bias = np.linspace(0,5,10)
x_data = np.linspace(0,10,20)
results = np.zeros([10,10])
n = 1

for i_s,s in enumerate(slopes):
    for i_n, b in enumerate(bias):
        y_data = x_data * s + np.random.normal(b,n,20)
        results[i_s,i_n] = np.corrcoef(x_data,y_data)[1,0]

results

''' Carefully inspecting the values, we see that changing the bias doesn't really do anything. 
The values within the 2nd dimension (the rows) are all about the same.
Meanwhile, the slopes seem to not much affect the correlation coefficient until they get close to zero. 
But when the slope is close to zero, then the corr_coeficient also drops to zero.

This all makes perfect sense, as the corr_coefficient should normalize away the slope in general. 
However, if the slope gets close to zero, then the noise will overshadow it, and the corrcoefficent can't really detect anything sensible.
The bias, i.e. the y-axis shift, should also be normalized away.
To see why, check the definition of pearsons correlation coefficient: It normalizes the mean by subtraction
 and the data range (slope) by division throught the variance.

array([[-0.99597467, -0.99755373, -0.9980175 , -0.99763142, -0.99775432,
        -0.99825163, -0.99905632, -0.99809058, -0.99800186, -0.99766249],
       [-0.99782942, -0.99669729, -0.99757186, -0.99654637, -0.99824866,
        -0.99667795, -0.99711654, -0.99834251, -0.99645168, -0.99691368],
       [-0.99655899, -0.99720085, -0.99484745, -0.99613046, -0.99641018,
        -0.99608282, -0.99623407, -0.99761978, -0.99767204, -0.99761775],
       [-0.99406501, -0.99405693, -0.99549266, -0.9962968 , -0.99463536,
        -0.99774623, -0.99503432, -0.99476925, -0.99451281, -0.99561599],
       [-0.99008741, -0.99329425, -0.99506292, -0.99564115, -0.99651939,
        -0.98910594, -0.99589257, -0.99514582, -0.99322737, -0.9917785 ],
       [-0.98976235, -0.99191623, -0.98965435, -0.99283007, -0.98978363,
        -0.9880502 , -0.99127249, -0.987559  , -0.99079845, -0.99018904],
       [-0.9726467 , -0.97972909, -0.9821516 , -0.97242379, -0.9886432 ,
        -0.99183134, -0.98645063, -0.97908932