<a href="https://colab.research.google.com/github/xbsd/imperial_ml/blob/master/numpy.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [0]:
%matplotlib inline

In [0]:
# The same values 0..999,999 in two containers for the timing cells below:
# a NumPy array, and the equivalent plain Python list of ints.
test_nump = np.arange(1000000)
test_list = test_nump.tolist()

In [5]:
%%timeit -n 3 -r 1
sum(test_list)

3 loops, best of 1: 8.22 ms per loop


In [6]:
%%timeit -n 3 -r 1
np.sum(test_nump)

3 loops, best of 1: 2.33 ms per loop


In [7]:
# But don't use non-numpy methods with base python operations
# sum(test_nump) will take 10x longer !

%%timeit -n 3 -r 1
sum(test_nump)

3 loops, best of 1: 97.1 ms per loop


In [8]:
np.sum(test_nump)

499999500000

In [0]:
# Deliberately narrow 8-bit signed integers; the next cell squares these
# values to show what happens when a result does not fit.
a = np.array([-1, 0, 1, 100], dtype=np.int8)

In [35]:
np.square(a)
# Integer overflow: int8 can only hold -128..127, so 100**2 = 10000 does not
# fit. The result wraps around modulo 256 (10000 - 39*256 = 16) and NumPy
# returns 16 without raising an error.


array([ 1,  0,  1, 16], dtype=int8)

In [0]:
b = a.astype('float32')

In [37]:
# NaN ("not a number") stands for an undefined value, so it is defined to
# compare unequal to everything -- including another NaN. An equality test
# therefore can never be used to detect it.

np.nan == np.nan

False

In [38]:
np.isnan(np.nan)

True

In [39]:
# Constant-filled 1-D arrays of length 10, printed one per line.
print(np.zeros(10), np.ones(10), sep="\n")

[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
[1. 1. 1. 1. 1. 1. 1. 1. 1. 1.]


In [61]:
# np.empty only allocates the array; it does not initialise it. The values
# displayed are arbitrary leftovers and must be overwritten before use.
np.empty((2,2))

array([[-1.,  0.],
       [ 0.,  1.]])

In [62]:
# NumPy indexing is zero-based
a[0] # First item

-1

In [43]:
a[-1] # Last Item

100

In [44]:
a[0:2] # Inclusive at the lower end and exclusive on the upper end

array([-1,  0], dtype=int8)

In [45]:
a[:2]

array([-1,  0], dtype=int8)

In [46]:
a[::2] # Step size of 2

array([-1,  1], dtype=int8)

In [47]:
a[-1] = 5 # Numpy arrays are mutable
a

array([-1,  0,  1,  5], dtype=int8)

In [50]:
b = np.arange(12).reshape(4,3)
b

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [55]:
np.arange(24).reshape(2,2,6)

array([[[ 0,  1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10, 11]],

       [[12, 13, 14, 15, 16, 17],
        [18, 19, 20, 21, 22, 23]]])

In [53]:
np.arange(24).reshape(2,2,2,3)

array([[[[ 0,  1,  2],
         [ 3,  4,  5]],

        [[ 6,  7,  8],
         [ 9, 10, 11]]],


       [[[12, 13, 14],
         [15, 16, 17]],

        [[18, 19, 20],
         [21, 22, 23]]]])

In [59]:
b = np.arange(12).reshape(4,3)
print (b.shape)
b

(4, 3)


array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [57]:
b[2,2]

8

In [63]:
b[:2,:2]

array([[0, 1],
       [3, 4]])

In [64]:
b[1:3,-1] # An integer index (no colon) drops that axis, so the result is a 1-D array of 2 elements

array([5, 8])

In [66]:
b[1:3,-1:] # If instead we did -1: then it maintains the shape

array([[5],
       [8]])

In [0]:
# (2,) is a one-element tuple: the trailing comma is what makes it a tuple, since (2) alone is just the integer 2 in parentheses

In [68]:
b[:1,:1]

array([[0]])

In [0]:
?np.loadtxt

In [156]:
c = np.arange(24).reshape(2,3,4)
c

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [157]:
c[1,1,1] # -> Index 1 of Index 1 of Index 1 

17

In [158]:
c[1,:,:]

array([[12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23]])

In [159]:
c[1,0,:]

array([12, 13, 14, 15])

In [160]:
c[:,:,0].ravel()

array([ 0,  4,  8, 12, 16, 20])

In [161]:
a = np.arange(25).reshape(5,5)
# Exercises (answers in the next cell):
# 1. Get the last row
# 2. Get the 2nd and 4th columns
# 3. Get the 2x2 block holding 5,15 and 7,17
a

array([[ 0,  1,  2,  3,  4],
       [ 5,  6,  7,  8,  9],
       [10, 11, 12, 13, 14],
       [15, 16, 17, 18, 19],
       [20, 21, 22, 23, 24]])

In [162]:
print(1)
print(a[-1,:]) # Last row
print(2)
print(a[:,1::2]) # All rows; every other column starting at index 1 (the 2nd and 4th columns)
print(3)
print(a[1:4:2,0:4:2]) # Rows 1 and 3, columns 0 and 2 -> 5,15 and 7,17

1
[20 21 22 23 24]
2
[[ 1  3]
 [ 6  8]
 [11 13]
 [16 18]
 [21 23]]
3
[[ 5  7]
 [15 17]]


In [163]:
# Fancy indexing => pass a list of indices inside the brackets, e.g. a[[2, 3]]

# Fancy indexing is slower than standard slicing

# When index lists are given for several axes they are paired element by
# element (like zip), not combined into a grid -- see the examples below

a = np.arange(4)
a

array([0, 1, 2, 3])

In [164]:
a[[2,3]]

array([2, 3])

In [165]:
b

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [166]:
b[[2,3],[2,0]] # --> the (2,2) and (3,0) values 1-1

array([8, 9])

In [167]:
c

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7],
        [ 8,  9, 10, 11]],

       [[12, 13, 14, 15],
        [16, 17, 18, 19],
        [20, 21, 22, 23]]])

In [168]:
# The length-1 index lists are broadcast against [0, 1], so this selects
# c[0,1,1] and c[1,1,1]
c[[0,1],[1],[1]]

array([ 5, 17])

In [169]:
c > 17

array([[[False, False, False, False],
        [False, False, False, False],
        [False, False, False, False]],

       [[False, False, False, False],
        [False, False,  True,  True],
        [ True,  True,  True,  True]]])

In [170]:
e = c[c>17]
e

array([18, 19, 20, 21, 22, 23])

In [0]:
d = c[:,1:2,1:3]

In [172]:
d

array([[[ 5,  6]],

       [[17, 18]]])

In [173]:
# Is d a new array?
# No: d was produced by basic slicing, so it is a view that shares c's data.
d.flags

# "OWNDATA : False" => d does not own its memory; it is not a standalone array

  C_CONTIGUOUS : False
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [174]:
# As a side-effect, if I change something in d, it will show up in c

print(d[0,0,0])
d[0,0,0] = 9999

c # There will be a 9999 in c now

5


array([[[   0,    1,    2,    3],
        [   4, 9999,    6,    7],
        [   8,    9,   10,   11]],

       [[  12,   13,   14,   15],
        [  16,   17,   18,   19],
        [  20,   21,   22,   23]]])

In [175]:
# e was built with a boolean mask (c[c>17]). Unlike basic slicing, that kind
# of selection is not returned as a view: NumPy copies the chosen elements
# into a new array that owns its own data.

e.flags # "OWNDATA : True" !

  C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [176]:
e.strides

(8,)

In [177]:
# An interesting observation:

c.flags

# c -> "OWNDATA : False": c itself is only a view.
# It was created as np.arange(24).reshape(2,3,4); reshape returned a view onto
# the array produced by arange, and that arange array is what owns the data.

  C_CONTIGUOUS : True
  F_CONTIGUOUS : False
  OWNDATA : False
  WRITEABLE : True
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [178]:
e

array([18, 19, 20, 21, 22, 23])

In [0]:
f = e[0]

In [183]:
print(f)
f.flags

18


  C_CONTIGUOUS : True
  F_CONTIGUOUS : True
  OWNDATA : True
  WRITEABLE : False
  ALIGNED : True
  WRITEBACKIFCOPY : False
  UPDATEIFCOPY : False

In [186]:
# Called with a single argument, np.where returns the indices of the non-zero
# (truthy) elements -- not just those > 0 -- as a tuple holding one index
# array per dimension.
a = [0,1,2,0,3]
np.where(a)

(array([1, 2, 4]),)

In [185]:
np.where(e)

(array([0, 1, 2, 3, 4, 5]),)

In [187]:
b

array([[ 0,  1,  2],
       [ 3,  4,  5],
       [ 6,  7,  8],
       [ 9, 10, 11]])

In [188]:
b + 5 # Array + Scalar

array([[ 5,  6,  7],
       [ 8,  9, 10],
       [11, 12, 13],
       [14, 15, 16]])

In [190]:
b + b # Array + Same-size array or one that can be broadcasted

array([[ 0,  2,  4],
       [ 6,  8, 10],
       [12, 14, 16],
       [18, 20, 22]])

In [191]:
# Cannot be broadcast: b has shape (4, 3) while b[2:,2] has shape (2,).
# The trailing dimensions (3 vs 2) differ and neither is 1, so NumPy raises
# ValueError. The error is the point of this cell, so catch and show it;
# otherwise "Restart & Run All" would stop here.
try:
    b + b[2:,2]
except ValueError as err:
    print("ValueError:", err)

ValueError: ignored

In [208]:
b + np.array((100,200,300)) # Can be "broadcast" -> row vector

array([[100, 201, 302],
       [103, 204, 305],
       [106, 207, 308],
       [109, 210, 311]])

In [209]:
b + np.array((100,200,300,400)).reshape(4,1) # Can be "broadcast" -> reshaped to a column vector

array([[100, 101, 102],
       [203, 204, 205],
       [306, 307, 308],
       [409, 410, 411]])

In [220]:
# A (10, 1) column plus a (10,) row broadcasts to a full (10, 10) grid whose
# entry [i, j] is i + j. (Sizes match the 10x10 output recorded below.)
np.arange(10).reshape(10,1) + np.arange(10)

array([[ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9],
       [ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [ 2,  3,  4,  5,  6,  7,  8,  9, 10, 11],
       [ 3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [ 4,  5,  6,  7,  8,  9, 10, 11, 12, 13],
       [ 5,  6,  7,  8,  9, 10, 11, 12, 13, 14],
       [ 6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       [ 7,  8,  9, 10, 11, 12, 13, 14, 15, 16],
       [ 8,  9, 10, 11, 12, 13, 14, 15, 16, 17],
       [ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18]])

In [221]:
b.dtype

dtype('int64')

In [222]:
b.astype('float64') # To change type

array([[ 0.,  1.,  2.],
       [ 3.,  4.,  5.],
       [ 6.,  7.,  8.],
       [ 9., 10., 11.]])

In [234]:
c

array([[[   0,    1,    2,    3],
        [   4, 9999,    6,    7],
        [   8,    9,   10,   11]],

       [[  12,   13,   14,   15],
        [  16,   17,   18,   19],
        [  20,   21,   22,   23]]])

In [226]:
# `axis` is the position in b.shape to collapse. b is (4, 3), so axis=1 adds
# up the 3 columns of each row and leaves one total per row.
b.sum(axis=1)


array([ 3, 12, 21, 30])

In [233]:
print(c.shape) # c has 3 dimensions; axis=2 refers to the 3rd entry of the shape tuple (length 4)

print ("Collapse Dimension 2")

np.sum(c, axis = 2) # Summing over the last axis collapses (2, 3, 4) to (2, 3)

(2, 3, 4)
Collapse Dimension 2


array([[    6, 10016,    38],
       [   54,    70,    86]])

In [235]:
np.mean(c)

427.9166666666667

In [236]:
np.mean(c, axis = 2)

array([[1.500e+00, 2.504e+03, 9.500e+00],
       [1.350e+01, 1.750e+01, 2.150e+01]])

In [237]:
np.std(c)

1995.7203778056905

In [238]:
np.var(c)

3982899.8263888885

In [240]:
# You can also do these across axes ...

np.max(c, axis = 1)

array([[   8, 9999,   10,   11],
       [  20,   21,   22,   23]])

In [241]:
np.max(c)

9999