### NumPy Basics for Data Science

In [1]:
import numpy as np
import pandas as pd
import time

#### List vs NumPy

In [2]:
mylist = [10,20,30,40,50]
# A plain Python list has no element-wise arithmetic, so dividing it by a
# number raises TypeError. The error is the point of this cell, so catch it
# and show it instead of letting it abort "Restart & Run All".
try:
    mylist / 5
except TypeError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [3]:
# Convert the list into a NumPy array; the division below is then applied
# to every element and yields a new array.
myarray = np.array(mylist)
myarray / 5

array([ 2.,  4.,  6.,  8., 10.])

#### Speed of NumPy Arrays

In [4]:
# Number of elements added together by each benchmark below (one million).
size_of_vec = 10 ** 6

def pure_python_version(size=None):
    """Time an element-wise addition of two integer sequences in pure Python.

    Parameters
    ----------
    size : int, optional
        Number of elements to add. When omitted, the notebook-level
        ``size_of_vec`` is used, so existing calls without arguments
        behave as before.

    Returns
    -------
    float
        Elapsed time in seconds.
    """
    if size is None:
        size = size_of_vec
    # perf_counter() is monotonic and high-resolution. time.time() can tick
    # so coarsely that a short run is measured as 0.0 s, and it may jump
    # when the system clock is adjusted.
    start = time.perf_counter()
    my_list1 = range(size)
    my_list2 = range(size)
    # The result is deliberately discarded: only the elapsed time matters.
    sum_list = [a + b for a, b in zip(my_list1, my_list2)]
    return time.perf_counter() - start

def numpy_version(size=None):
    """Time an element-wise addition of two integer arrays with NumPy.

    Parameters
    ----------
    size : int, optional
        Number of elements to add. When omitted, the notebook-level
        ``size_of_vec`` is used, so existing calls without arguments
        behave as before.

    Returns
    -------
    float
        Elapsed time in seconds.
    """
    if size is None:
        size = size_of_vec
    # perf_counter() is monotonic and high-resolution. The vectorized add is
    # fast enough that time.time() can report 0.0 s for it on a coarse clock.
    start = time.perf_counter()
    my_arr1 = np.arange(size)
    my_arr2 = np.arange(size)
    # The result is deliberately discarded: only the elapsed time matters.
    sum_array = my_arr1 + my_arr2
    return time.perf_counter() - start


# Run each benchmark once and report the elapsed times.
python_time = pure_python_version()
numpy_time = numpy_version()
print("Pure Python version {:0.4f}".format(python_time))
print("Numpy version {:0.4f}".format(numpy_time))
# On a coarse system clock the NumPy run can be measured as 0.0 s; guard the
# ratio so the cell does not die with ZeroDivisionError.
if numpy_time > 0:
    print("Numpy is in this example {:0.4f} times faster!".format(python_time/numpy_time))
else:
    print("Numpy finished too quickly to measure; increase size_of_vec to get a ratio.")

Pure Python version 0.2890
Numpy version 0.0030
Numpy is in this example 96.4017 times faster!


#### Pandas Series and NumPy arrays

In [5]:
# A pandas Series pairs each value with a label; here the labels are A..E.
myseries = pd.Series(
    data=[10, 20, 30, 40, 50],
    index=list("ABCDE"),
)
myseries

A    10
B    20
C    30
D    40
E    50
dtype: int64

In [6]:
# to_numpy() hands back the Series' values as a NumPy array; the index
# labels are not part of the result.
pd_to_np = myseries.to_numpy()
pd_to_np

array([10, 20, 30, 40, 50], dtype=int64)

#### NumPy array initialization

In [7]:
# np.array accepts one sequence argument -- here, a list.
list_to_np = np.array([1, 2, 3, 4, 5])
list_to_np

array([1, 2, 3, 4, 5])

In [8]:
# A tuple works just as well as a list for the sequence argument.
tuple_to_np = np.array((1, 2, 3, 4, 5))
tuple_to_np

array([1, 2, 3, 4, 5])

In [9]:
# This raises TypeError: the values must be passed as ONE sequence
# (list or tuple), not as separate positional arguments.
# The error is the demonstration, so catch it and show it instead of letting
# it abort "Restart & Run All". wrong_np is never assigned.
try:
    wrong_np = np.array(1,2,3,4,5)
except TypeError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

TypeError: array() takes from 1 to 2 positional arguments but 5 were given

#### Multi-dimensional arrays

In [10]:
# Two rows of equal length give a regular 2-D array with shape (2, 5).
array_2d = np.array(
    [
        [1, 2, 3, 4, 5],
        [11, 12, 13, 14, 15],
    ]
)
array_2d

array([[ 1,  2,  3,  4,  5],
       [11, 12, 13, 14, 15]])

In [11]:
# Ragged array: the rows have different lengths (5 and 6), so NumPy cannot
# build a rectangular 2-D array. The result is a 1-D array whose two
# elements are the Python lists themselves.
# dtype=object must be requested explicitly: without it NumPy >= 1.24 raises
# ValueError, and earlier releases emitted a VisibleDeprecationWarning.
array_ragged = np.array(
    [
        [1, 2, 3, 4, 5],
        [11, 12, 13, 14, 15, 16],
    ],
    dtype=object,
)
array_ragged


  array_ragged = np.array([


array([list([1, 2, 3, 4, 5]), list([11, 12, 13, 14, 15, 16])],
      dtype=object)

#### Descriptive Attributes

```ndim``` : Number of dimensions (axes) of an array

In [12]:
myarray

array([10, 20, 30, 40, 50])

In [13]:
myarray.ndim

1

In [14]:
array_2d

array([[ 1,  2,  3,  4,  5],
       [11, 12, 13, 14, 15]])

In [15]:
array_2d.ndim

2

```shape``` : A tuple giving the number of elements along each dimension

In [16]:
myarray.shape

(5,)

In [17]:
array_2d.shape

(2, 5)

```size``` : The number of elements in the array

In [18]:
myarray.size

5

In [19]:
array_2d.size

10

```dtype``` : datatype of the array elements

In [20]:
myarray

array([10, 20, 30, 40, 50])

In [21]:
myarray.dtype

dtype('int32')

In [22]:
myarray / 5

array([ 2.,  4.,  6.,  8., 10.])

In [23]:
(myarray / 5).dtype

dtype('float64')

#### Special Arrays in NumPy

In [24]:
# 3 rows x 7 columns, every entry 1.0 (float by default).
ones_ar = np.ones(shape=(3, 7))
ones_ar

array([[1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1., 1., 1.]])

In [25]:
# 6 rows x 4 columns, every entry 0.0 (float by default).
zeros_ar = np.zeros(shape=(6, 4))
zeros_ar

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [26]:
# 6 x 4 array with ones on the diagonal shifted one step above the main
# diagonal (k=1) and zeros everywhere else.
eye_ar = np.eye(N=6, M=4, k=1)
eye_ar

array([[0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.]])

In [27]:
# A negative k moves the diagonal of ones below the main diagonal:
# here it starts two rows down (k=-2).
eye_ar2 = np.eye(N=6, M=4, k=-2)
eye_ar2

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

```arange``` 

In [28]:
# Start at 1, step by 3, stop before 20.
arange_ar = np.arange(1, 20, 3)
arange_ar

array([ 1,  4,  7, 10, 13, 16, 19])

```linspace```

In [29]:
# 5 evenly spaced values from 1 to 12, both endpoints included.
linspace_ar = np.linspace(start=1, stop=12, num=5)
linspace_ar

array([ 1.  ,  3.75,  6.5 ,  9.25, 12.  ])

#### Reshaping Arrays

In [30]:
# The integers 1 through 60 (the stop value 61 is excluded).
ar60 = np.arange(1, 61)
ar60

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60])

In [31]:
# View the 60 elements as 6 rows of 10; 6 * 10 must equal the element count.
ar_6_10 = ar60.reshape((6, 10))
ar_6_10

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
       [31, 32, 33, 34, 35, 36, 37, 38, 39, 40],
       [41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
       [51, 52, 53, 54, 55, 56, 57, 58, 59, 60]])

In [32]:
ar_6_10.shape

(6, 10)

In [33]:
# Three dimensions work the same way: 4 blocks of 3 rows of 5 (4 * 3 * 5 = 60).
ar_4_3_5 = ar60.reshape((4, 3, 5))
ar_4_3_5

array([[[ 1,  2,  3,  4,  5],
        [ 6,  7,  8,  9, 10],
        [11, 12, 13, 14, 15]],

       [[16, 17, 18, 19, 20],
        [21, 22, 23, 24, 25],
        [26, 27, 28, 29, 30]],

       [[31, 32, 33, 34, 35],
        [36, 37, 38, 39, 40],
        [41, 42, 43, 44, 45]],

       [[46, 47, 48, 49, 50],
        [51, 52, 53, 54, 55],
        [56, 57, 58, 59, 60]]])

In [34]:
ar_4_3_5.shape

(4, 3, 5)

- This will raise an error: 60 elements cannot fill a 7 × 10 array (70 slots)

In [35]:
# 7 * 10 = 70 does not match the 60 elements, so reshape raises ValueError.
# The error is the demonstration, so catch it and show it instead of letting
# it abort "Restart & Run All".
try:
    ar60.reshape(7,10)
except ValueError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

ValueError: cannot reshape array of size 60 into shape (7,10)

- Letting NumPy calculate a dimension (pass `-1` for the size NumPy should infer)

In [36]:
ar60.reshape(6,-1)

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10],
       [11, 12, 13, 14, 15, 16, 17, 18, 19, 20],
       [21, 22, 23, 24, 25, 26, 27, 28, 29, 30],
       [31, 32, 33, 34, 35, 36, 37, 38, 39, 40],
       [41, 42, 43, 44, 45, 46, 47, 48, 49, 50],
       [51, 52, 53, 54, 55, 56, 57, 58, 59, 60]])

In [37]:
# This raises ValueError: 60 is not divisible by 7, so no size can be
# inferred for the -1 dimension. Catch the error and show it so the rest of
# the notebook still runs.
try:
    ar60.reshape(7,-1)
except ValueError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

ValueError: cannot reshape array of size 60 into shape (7,newaxis)

In [38]:
# This raises ValueError as well: only one dimension may be given as -1.
# Catch the error and show it so the rest of the notebook still runs.
try:
    ar60.reshape(6,-1,-1)
except ValueError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

ValueError: can only specify one unknown dimension

#### Indexing and Slicing

In [39]:
ar60

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60])

- Accessing the item at index 5 (note: NumPy is zero-indexed, so this is the sixth element)

In [40]:
ar60[5]

6

- Accessing the first 10 elements

In [41]:
ar60[:10]

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10])

- Indexing 2d arrays

In [42]:
# 5 rows of 12 columns, used for the 2-D indexing examples that follow.
ar5_12 = ar60.reshape((5, 12))
ar5_12

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36],
       [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48],
       [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]])

In [43]:
ar5_12[2,5]

30

In [44]:
ar5_12[2][5]

30

- NumPy indexing is very similar to the ```iloc``` indexer in pandas

In [45]:
ar5_12[:, 3]

array([ 4, 16, 28, 40, 52])

In [46]:
ar5_12[2, :]

array([25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36])

#### Boolean Masking

In [47]:
ar60

array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
       52, 53, 54, 55, 56, 57, 58, 59, 60])

In [48]:
ar60[ar60 % 3 == 0]

array([ 3,  6,  9, 12, 15, 18, 21, 24, 27, 30, 33, 36, 39, 42, 45, 48, 51,
       54, 57, 60])

In [49]:
ar60[ar60 // 12 == 3]

array([36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47])

In [50]:
ar60[(ar60 % 3 == 0) | (ar60 % 5 == 0)]

array([ 3,  5,  6,  9, 10, 12, 15, 18, 20, 21, 24, 25, 27, 30, 33, 35, 36,
       39, 40, 42, 45, 48, 50, 51, 54, 55, 57, 60])

In [51]:
# This raises ValueError: Python's `or` needs a single True/False value from
# each operand, but each comparison yields a whole boolean array. Use the
# element-wise operator `|` instead.
# The error is the demonstration, so catch it and show it instead of letting
# it abort "Restart & Run All".
try:
    ar60[(ar60 % 3 == 0) or (ar60 % 5 == 0)]
except ValueError as exc:
    print("{}: {}".format(type(exc).__name__, exc))

ValueError: The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()

#### Functions in NumPy

In [52]:
ar5_12

array([[ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12],
       [13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24],
       [25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36],
       [37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48],
       [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60]])

In [53]:
ar5_12.sum()

1830

In [54]:
ar5_12.shape

(5, 12)

- Axis 0 is the outermost axis; axis -1 is the innermost (last) axis

In [55]:
ar5_12.sum(axis = 0)

array([125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180])

In [56]:
ar5_12.sum(axis = 1)

array([ 78, 222, 366, 510, 654])

##### Negative axis

In [57]:
ar5_12.sum(axis = -1)

array([ 78, 222, 366, 510, 654])

In [58]:
ar5_12.sum(axis = -2)

array([125, 130, 135, 140, 145, 150, 155, 160, 165, 170, 175, 180])

#### Vectorized Operations in NumPy

In [59]:
# Two plain Python lists of four consecutive integers each.
list01 = list(range(10, 14))
list02 = list(range(20, 24))

In [60]:
np.array(list01) + np.array(list02)

array([30, 32, 34, 36])

In [61]:
list01 + list02

[10, 11, 12, 13, 20, 21, 22, 23]