# Week 1 - Part 1
### Numpy Intro

Key points:
* numpy offers very efficient multi-dim. arrays
* main class is ndarray
* arrays contain homogeneous types (one dtype per array), not dynamic like base python - more efficient
* array dims are called axes
  - axis 0 runs down the rows - so operating along it gives one result per column
  - axis 1 runs across the columns - so operating along it gives one result per row

In [43]:
import numpy as np

### Init Data

In [44]:
# seed numpy's global random generator so the draws below are reproducible
np.random.seed(1)

In [45]:
# draw 10,000 samples from a 2-D Gaussian (means 1 and 0.5, identity
# covariance) - gives a 2D array with one row per sample
arr = np.random.multivariate_normal(
    mean=[1, 0.5],
    cov=[[1, 0], [0, 1]],
    size=10000,
)

In [46]:
arr.shape

(10000, 2)

In [42]:
# Note: a 3D array is AKA a tensor
np.random.rand(3,2,2)  # just giving array dims here

array([[[0.48382042, 0.30519443],
        [0.39997738, 0.09613724]],

       [[0.70862236, 0.72072961],
        [0.28462039, 0.05752638]],

       [[0.79270584, 0.34496008],
        [0.68896627, 0.26391377]]])

# Indexing
* Specifies locations in arrays
* Can make use of regular Python slice notation
```
          arr[start:stop:step]
          e.g.  a[0:3:1]
          starts at 0, stops at 3 (not including), steps by 1
```



In [6]:
# top left element
arr[0,0]

2.6243453636632417

In [None]:
# bottom right element - negative indices count back from the end
arr[-1,-1]

In [7]:
arr.dtype

dtype('float64')

In [8]:
# astype returns a new array (a copy by default); arr itself keeps its dtype
# note the float -> int conversion truncates toward zero rather than rounding
arr.astype(int)

array([[ 2,  0],
       [ 0,  0],
       [ 1, -1],
       ...,
       [ 2,  0],
       [ 1,  2],
       [ 1,  0]])

In [9]:
arr

array([[ 2.62434536, -0.11175641],
       [ 0.47182825, -0.57296862],
       [ 1.86540763, -1.8015387 ],
       ...,
       [ 2.10957003, -0.44320839],
       [ 1.78221575,  2.9084338 ],
       [ 1.88278555,  0.40040369]])

In [10]:
# index into first row; take all columns (a bare : is a full Python slice)
arr[0,:]

array([ 2.62434536, -0.11175641])

In [12]:
# same idea - index into just the first column, but take all rows 
arr[:,0].shape

(10000,)

In [14]:
# select even-indexed rows, then odd-indexed rows - again this just uses
#   regular Python slice notation: start:stop:step
# note - prof. likes to work in Python tuples like this
arr[0::2,:].shape, arr[1::2,:].shape

((5000, 2), (5000, 2))

In [15]:
# reverse the row order (last row comes first, first row comes last)
#   that is a negative step so the rows are selected bkwds
arr[::-1,:]

array([[ 1.88278555,  0.40040369],
       [ 1.78221575,  2.9084338 ],
       [ 2.10957003, -0.44320839],
       ...,
       [ 1.86540763, -1.8015387 ],
       [ 0.47182825, -0.57296862],
       [ 2.62434536, -0.11175641]])

# Filtering
### Extracting subsets of arrays

In [47]:
# numpy operations apply to each element
arr > 0

array([[ True, False],
       [ True, False],
       [ True, False],
       ...,
       [ True, False],
       [ True,  True],
       [ True,  True]])

In [49]:
# indexing with a boolean mask of the same shape keeps only the elements where
#    the mask is True; the matches can't form a rectangular 2-D array, so
#    numpy extracts them into a single flat (1-D) output array
arr[arr > 0].shape

(15382,)

In [50]:
# We can specify explicit indices to extract (integer-list "fancy" indexing)
#   the row and column lists are paired up element by element; a one-element
#   list such as wanted_cols is broadcast across every entry of wanted_rows
wanted_rows=[0,1,5,99]
wanted_cols=[0]

In [51]:
arr[wanted_rows, wanted_cols]

array([2.62434536, 0.47182825, 2.46210794, 1.81095167])

In [54]:
# all rows w/ at least one positive value

# run the mask by itself to understand it:
#   (arr>0).any(axis=1)
#   any(axis=1) collapses axis 1, i.e. it looks ACROSS the columns within each
#   row, so it yields one boolean per row (a 1-dim array) - True when that row
#   has at least one val > 0 - which makes a good row filter
arr[(arr>0).any(axis=1),:]

array([[ 2.62434536, -0.11175641],
       [ 0.47182825, -0.57296862],
       [ 1.86540763, -1.8015387 ],
       ...,
       [ 2.10957003, -0.44320839],
       [ 1.78221575,  2.9084338 ],
       [ 1.88278555,  0.40040369]])

In [26]:
# comparing any and all (these are the Python built-ins, but same idea)
any([True, False]), all([True,False])

(True, False)

In [27]:
# same filter, just a little tighter - requires all row entries to meet the test
arr[(arr>0).all(axis=1),:]

array([[1.3190391 , 0.25062962],
       [0.6775828 , 0.11594565],
       [1.04221375, 1.08281521],
       ...,
       [1.41951494, 0.62272589],
       [1.78221575, 2.9084338 ],
       [1.88278555, 0.40040369]])

In [30]:
# tells where in the array the test is successful, by index
#   returns one index array per axis: (row indices, column indices)
np.where(arr>0)

(array([   0,    1,    2, ..., 9998, 9999, 9999]),
 array([0, 0, 0, ..., 1, 0, 1]))

In [31]:
# applies mult conditions
#   via the bitwise & operator - recall this is applied element-wise
#   each comparison needs its own parentheses because & binds more tightly
#   than > and <
arr[((arr>0) & (arr<2)).all(axis=1),:]

array([[1.3190391 , 0.25062962],
       [0.6775828 , 0.11594565],
       [1.04221375, 1.08281521],
       ...,
       [1.78974846, 0.28892712],
       [1.41951494, 0.62272589],
       [1.88278555, 0.40040369]])

# Sorting

In [56]:
# argsort returns the indices that would put the elements in ascending order
#  e.g. sorting on first column
#np.argsort(arr[:,0])

# - sorting on second column
#np.argsort(arr[:,1])

# index the rows with that order to get the array sorted by the first column
arr[np.argsort(arr[:,0]),:]

array([[-2.6564401 , -0.74300894],
       [-2.43592581,  1.08961331],
       [-2.29485841,  0.59083978],
       ...,
       [ 4.61327701,  1.39894868],
       [ 4.83438102,  1.53704785],
       [ 5.16811768, -0.61761377]])

In [34]:
# descending order, this time sorting on the second column
#   argsort only sorts ascending, so the reversal is applied as a second
#   step via [::-1] on the index array
arr[np.argsort(arr[:,1])[::-1],:]

array([[ 0.83966508,  4.52684904],
       [-0.63744959,  4.4586027 ],
       [ 0.51572248,  4.2402489 ],
       ...,
       [ 0.69710098, -2.7803276 ],
       [ 2.20353388, -2.81084256],
       [-0.97038056, -2.95140291]])

# Apply UDFs  (user defined functions)

In [37]:
def func(x):
    """Return the quartiles (25th, 50th and 75th percentiles) of x.

    x is passed straight to np.percentile; the three percentile values come
    back in that order.
    """
    return np.percentile(x, q=[25, 50, 75])

In [40]:
# apply the function along the axis

# axis=0 hands each column (a 1-D slice running down the rows) to func
#   note each column has 3 results - the quartiles defined in q
np.apply_along_axis(func1d = func, axis=0, arr=arr)

# could do the same per row - pass axis=1 so each row is handed to func

array([[ 0.33302073, -0.16586014],
       [ 1.03000632,  0.49529121],
       [ 1.6951908 ,  1.16643536]])

In [41]:
# func just forwards its argument to np.percentile, which accepts any
#   array-like, so it can also be called directly on a single column
#
# so, note this ends up being another way to do the previous
func(arr[:,0]) , func(arr[:,1])

(array([0.33302073, 1.03000632, 1.6951908 ]),
 array([-0.16586014,  0.49529121,  1.16643536]))