# This notebook contains APIs in Python often used by Data Scientist.

# **1. NUMPY**

In [3]:
import numpy as np

## 1.1 Basics of Array

(1) Initialization

In [4]:
#create a NumPy array from a python list
np.array([1,2,3,4], dtype = 'float32')

array([1., 2., 3., 4.], dtype=float32)

In [5]:
#create a multi-dimensional array (from nested list)
np.array([range(i,i+3) for i in [2,4,6]])

array([[2, 3, 4],
       [4, 5, 6],
       [6, 7, 8]])

In [6]:
#create an array with ten 0's
np.zeros(10, dtype=int)

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

In [7]:
#create a 3x5 floating-point array filled with ones
np.ones((3,5), dtype=float)

array([[1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.],
       [1., 1., 1., 1., 1.]])

In [8]:
#create a 3x5 array filled with 3.14
np.full((3,5), 3.14)

array([[3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14],
       [3.14, 3.14, 3.14, 3.14, 3.14]])

In [9]:
#create an array filled with linear sequence
#start at 0, end 20, step by 2
np.arange(0,20,2)

array([ 0,  2,  4,  6,  8, 10, 12, 14, 16, 18])

In [10]:
#create an array of evenly seperated sequence
#start at 0, end at 100, 5 points in total
np.linspace(0, 100, 5)

array([  0.,  25.,  50.,  75., 100.])

In [11]:
#create a 3x3 array of *uniformly* distributed random values 
#between 0 and 1
np.random.random((3,3))

array([[0.67201734, 0.3179899 , 0.60715536],
       [0.45222801, 0.2861667 , 0.70581716],
       [0.82725687, 0.12275929, 0.37996205]])

In [12]:
#create a 3x3 array of *normally* distributed random values
#with mean 0 and standard deviation 1
np.random.normal(0, 1, (3,3))

array([[ 1.03476205e+00, -1.68605916e+00, -1.49831874e+00],
       [-7.57880652e-04, -2.70248959e+00,  1.90587641e+00],
       [-7.28123540e-02,  3.67080130e-03,  6.38648488e-02]])

In [13]:
#create a 3x3 array of random integers in the interval [0,10)
np.random.randint(0, 10, (3,3))

array([[8, 5, 9],
       [9, 2, 9],
       [9, 2, 1]])

In [14]:
#create a 4x4 identity matrix
np.eye(4)

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [15]:
#create an uninitialized array of five integers
#the value will be whatever happens to already exist at that memory
np.empty(5)

array([  0.,  25.,  50.,  75., 100.])

(2) Attributes

Each array has attributes: <br>
`ndim`, the number of dimentsions <br>
`shape`, the size of each dimension <br>
`size`, the total size of the array <br>
`dtype`, the data type of the array <br>
`itemsize`, the size (in bytes) of each item <br>
`nbytes`, the total size (in bytes) of the array <br>


In [16]:
np.random.seed(0)

x1 = np.random.randint(10, size=6) # 1D array
x2 = np.random.randint(10, size=(3,4)) # 2D array
x3 = np.random.randint(10, size=(3,4,5)) # 3D array

In [17]:
print("x3's attibutes: ", x3.ndim, x3.shape, x3.size, x3.dtype, x3.itemsize, x3.nbytes)

x3's attibutes:  3 (3, 4, 5) 60 int64 8 480


(3) Indexing

In [18]:
# for 1D array
print(x1)
print(x1[4], x1[-1])

[5 0 3 3 7 9]
7 9


In [19]:
# for multi-dimensional array
print(x2)
print(x2[0,0], x2[-1,-2])

[[3 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]
3 7


(4) Slicing

To access a subarray, we can use *slice* notation (`:`) in this way: <br>
`x[start:stop:step]`, by default start=0, stop=len(x), step=1 <br>
<br>
*Note*: We create a *reference* in this way, so any modification on subarray of `x` will also change the original array. To make a *copy* (allocate a new block of memory), the method `.copy()` is added. <br>
Generally speaking, the opetaion such as setnull() which modifies the array will create a copy simultaneously, but the operation such as reshape() which doesn't influence the value of any data will create a reference.

In [20]:
# for 1D array
x = np.arange(10)
print(x)
print(x[::2])
print(x[1::2])
print(x[::-1]) #convenient way for reversion

[0 1 2 3 4 5 6 7 8 9]
[0 2 4 6 8]
[1 3 5 7 9]
[9 8 7 6 5 4 3 2 1 0]


In [21]:
#for multi-dimensional array
print(x2)
print(x2[:2, :3]) # first two rows, first three columns
print(x2[:2, ::2]) # first two rows. every other columns
print(x2[::-1, ::-1]) # reverse

[[3 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]
[[3 5 2]
 [7 6 8]]
[[3 2]
 [7 8]]
[[7 7 6 1]
 [8 8 6 7]
 [4 2 5 3]]


In [22]:
# access rows and columns
print(x2[:,0]) # first column
print(x2[0,:]) # first row
print(x2[0])   # also first row

[3 7 1]
[3 5 2 4]
[3 5 2 4]


In [23]:
# Difference between *reference* and *copy*
x2_ref = x2[:2, :3]
x2_copy = x2[:2, :3].copy()
x2_ref[0,0] = 0
print(x2)
x2_copy[0,0] = 99
print(x2)

[[0 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]
[[0 5 2 4]
 [7 6 8 8]
 [1 6 7 7]]


(5) Reshaping

Use `reshape()` method to reshape to any dimension <br>
Also use `np.newaxis` to add a new axis<br>

In [24]:
grid_ref = x1.reshape((2,3)) # the size of x1 must be equal to 2*3
print(x1)
print(grid_ref) # on most cases, grid_ref is a reference, but with non-contiguous memory buffers it may not be.

[5 0 3 3 7 9]
[[5 0 3]
 [3 7 9]]


In [25]:
# reshape 1D array to multi-dimensional array
print(x1)

print(x1.reshape(1, 6)) # into a row
print(x1[np.newaxis, :]) # also into a row

print(x1.reshape(6, 1)) # into a column
print(x1[:, np.newaxis]) #also into a column

[5 0 3 3 7 9]
[[5 0 3 3 7 9]]
[[5 0 3 3 7 9]]
[[5]
 [0]
 [3]
 [3]
 [7]
 [9]]
[[5]
 [0]
 [3]
 [3]
 [7]
 [9]]


(6) Joining and splitting

For concatenation of arrays:<br>
`np.concatenate`, can concatenate at defined axis <br>
`np.vstack`, concatenate vertically <br>
`np.hstack`, horizontally <br>
`np.dstack`, concatenate in the third axis <br>
For seperation of arrays: <br>
`np.split`, split an array along specified axis (default=0) <br>
`np.hsplit`, horizontally <br>
`np.vsplit`, vertically <br>
`np.dsplit`, along the 3rd axis <br>

In [98]:
# usage of np.concatenate
# np.concatenate([a,b,c,...])
a = np.array([0,1,2,3,4])
b = np.array([5,6,7,8,9])
c = np.concatenate([a,b])
print("ab\n",c)
print("abab\n",np.concatenate([a,b,a,b])) # nested concatenation
c = c.reshape((2,5))
print("c\n", c)
print("c\nc\n", np.concatenate([c,c])) # join along the first (default) axis
print("cc, axis=1\n",np.concatenate([c,c], axis=1)) # join along the second axis

ab
 [0 1 2 3 4 5 6 7 8 9]
abab
 [0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9]
c
 [[0 1 2 3 4]
 [5 6 7 8 9]]
c
c
 [[0 1 2 3 4]
 [5 6 7 8 9]
 [0 1 2 3 4]
 [5 6 7 8 9]]
cc, axis=1
 [[0 1 2 3 4 0 1 2 3 4]
 [5 6 7 8 9 5 6 7 8 9]]


In [27]:
# usage of np.vstack (vertical stack), np.hstack (horizontal) and np.dstack (depth)
a = np.array([0,1,2,3,4])
b = np.array([5,6,7,8,9])
print("a+b, vertical\n",np.vstack([a,b]))
print("a+b, horizontal\n", np.hstack([a,b]))
print("a+b, in 3rd axis\n", np.dstack([a,b]))

a+b, vertical
 [[0 1 2 3 4]
 [5 6 7 8 9]]
a+b, horizontal
 [0 1 2 3 4 5 6 7 8 9]
a+b, in 3rd axis
 [[[0 5]
  [1 6]
  [2 7]
  [3 8]
  [4 9]]]


In [28]:
# usage of np.split
# np.split(c, [*SLIPPING POINTS*])
c = np.array([0,1,2,3,4,5,6,7,8,9])
a,b,d = np.split(c, [2,7])
print(a)
print(b)
print(d)

[0 1]
[2 3 4 5 6]
[7 8 9]


In [29]:
# usage of np.vsplit, np.hsplit
c = np.arange(16).reshape((4,4))
print(c)
a,b = np.vsplit(c, [2])
print("vsplit\na", a, "\nb", b)
a,b = np.hsplit(c, [2])
print("hsplit\na", a, "\nb", b)

[[ 0  1  2  3]
 [ 4  5  6  7]
 [ 8  9 10 11]
 [12 13 14 15]]
vsplit
a [[0 1 2 3]
 [4 5 6 7]] 
b [[ 8  9 10 11]
 [12 13 14 15]]
hsplit
a [[ 0  1]
 [ 4  5]
 [ 8  9]
 [12 13]] 
b [[ 2  3]
 [ 6  7]
 [10 11]
 [14 15]]


## 1.2 Computation on NumPy Array: Universal Functions (ufuncs)

Vectorized operations are implemented via ufuncs, whose main purpose is to QUICKLY execute repeated operations on values in NumPy arrays. Ufuncs exist in two flavors: unary ufuncs (single input) and binary ufuncs (two inputs).

In [30]:
x = np.arange(1, 6)
print("x:\n",x, "\n1/x:\n", 1/x)
y = np.arange(5, 0, -1)
print("y:\n",y, "\nx/y:\n", x/y)
print("2**np.vstack([x,y]):\n",2**np.vstack([x,y]))

x:
 [1 2 3 4 5] 
1/x:
 [1.         0.5        0.33333333 0.25       0.2       ]
y:
 [5 4 3 2 1] 
x/y:
 [0.2 0.5 1.  2.  5. ]
2**np.vstack([x,y]):
 [[ 2  4  8 16 32]
 [32 16  8  4  2]]


(1) Array arithmetic

In [31]:
x = np.arange(5)
print("x:\n",x)
print("x+5:\n", x+5)  # np.add(x,5)
print("x-5:\n", x-5)  # np.subtract(x,5)
print("x*2:\n", x*2)  # np.multiply(x,2)
print("x/2:\n", x/2)  # np.divide(x,2)
print("x//2:\n", x//2)  # np.floor_divide(x,2)
print("-x:\n", -x)    # np.negative(x)
print("x**2:\n", x**2) # np.power(x,2)
print("2**x:\n", 2**x) # np.power(2,x)
print("x%2:\n", x%2)  # np.mod(x,2)

x:
 [0 1 2 3 4]
x+5:
 [5 6 7 8 9]
x-5:
 [-5 -4 -3 -2 -1]
x*2:
 [0 2 4 6 8]
x/2:
 [0.  0.5 1.  1.5 2. ]
x//2:
 [0 0 1 1 2]
-x:
 [ 0 -1 -2 -3 -4]
x**2:
 [ 0  1  4  9 16]
2**x:
 [ 1  2  4  8 16]
x%2:
 [0 1 0 1 0]


(2) Absolute value

In [32]:
x = np.arange(-6,-1)
print("x:\n",x)
print("abs(x):\n", abs(x)) # np.abs(x)

x:
 [-6 -5 -4 -3 -2]
abs(x):
 [6 5 4 3 2]


(3) Trigonometric functions

`sin(x) cos(x) tan(x) arcsin(x) arccos(x) arctan(x)`, `x` is in rad

(4) Exponents and logarithms

`np.exp(x)` = e^x <br>
`np.exp2(x)` = 2^x <br>
`np.power(3,x)` = 3^x <br>
<br>
`np.log(x)` = ln(x) <br>
`np.log2(x)` = log2(x) <br>
`np.log10(x)` = log10(x) <br>
<br>
Those are more precise when `x` is very small: <br>
`np.expm1(x)` = exp(x) - 1 <br>
`np.log1p(x)` = ln(1+x) <br>

(5) Specialized ufuncs

In [34]:
from scipy import special

In [35]:
# Gamma functions (generalized factorials) and related functions
x = [1, 5, 10]
print("gamma(x):\n", special.gamma(x))
print("ln|gamma(x):\n", special.gammaln(x))
print("beta(x, 2):\n", special.beta(x,2))

gamma(x):
 [1.0000e+00 2.4000e+01 3.6288e+05]
ln|gamma(x):
 [ 0.          3.17805383 12.80182748]
beta(x, 2):
 [0.5        0.03333333 0.00909091]


In [36]:
# Error function (integral of Gaussian)
# its comlement, and its inverse
x = np.array([0, 0.3, 0.7, 1.0])
print("erf(x):\n", special.erf(x))
print("erfc(x):\n", special.erfc(x))
print("erfinv(x):\n", special.erfinv(x))

erf(x):
 [0.         0.32862676 0.67780119 0.84270079]
erfc(x):
 [1.         0.67137324 0.32219881 0.15729921]
erfinv(x):
 [0.         0.27246271 0.73286908        inf]


(6) Advanced Ufunc Features

Specify the `out` attribute.

In [37]:
# specifying `out' is faster than using 'y=x*5'
# the later first write x*5 to empty memory then
# copy data to y

x = np.arange(5)
y = np.empty(5)
np.multiply(x, 5, out=y)
print(y)

# `:` is also appliable
y = np.zeros(15)
np.add(x, 10, out=y[5:15:2])
print(y)

[ 0.  5. 10. 15. 20.]
[ 0.  0.  0.  0.  0. 10.  0. 11.  0. 12.  0. 13.  0. 14.  0.]


In [48]:
# aggregates

## `reduce`, like the next half of mapreduce
x = np.arange(1,10)
print(x)
print("The sum is:", np.add.reduce(x))
print("The product is:", np.multiply.reduce(x))
print("The accumulated sum is:", np.add.accumulate(x))
print("The accumulated product is", np.multiply.accumulate(x))

[1 2 3 4 5 6 7 8 9]
The sum is: 45
The product is: 362880
The accumulated sum is: [ 1  3  6 10 15 21 28 36 45]
The accumulated product is [     1      2      6     24    120    720   5040  40320 362880]


In [49]:
# Outer products - the opposite of inner product

x = np.arange(1,6)
print(x)
print("inner product\n", np.add.reduce(np.multiply(x,x)))
print("outer product\n", np.multiply.outer(x,x))

[1 2 3 4 5]
inner product
 55
outer product
 [[ 1  2  3  4  5]
 [ 2  4  6  8 10]
 [ 3  6  9 12 15]
 [ 4  8 12 16 20]
 [ 5 10 15 20 25]]


## 1.3 Aggregations: Min, Max, and Everything In Between

**Those are available aggregation functions**<br>
Func NaN-safe Version	Description<br>
`np.sum`	` np.nansum`	Compute sum of elements<br>
`np.prod`	 `np.nanprod`	Compute product of elements<br>
`np.mean`	 `np.nanmean`	Compute mean of elements<br>
`np.std`	 `np.nanstd`	Compute standard deviation<br>
`np.var`	`np.nanvar`	Compute variance<br>
`np.min`	`np.nanmin`	Find minimum value<br>
`np.max`	`np.nanmax`	Find maximum value<br>
`np.argmin`	`np.nanargmin`	Find index of minimum value<br>
`np.argmax`	`np.nanargmax`	Find index of maximum value<br>
`np.median`	`np.nanmedian`	Compute median of elements<br>
`np.percentile`	`np.nanpercentile`	<br>Compute rank-based statistics of elements. E.g., `np.percentile(x, 25)` compute 25th percentile<br>
`np.any`	`N/A`	Evaluate whether any elements are true<br>
`np.all`	`N/A`	Evaluate whether all elements are true<br>

(1) Sum

np.sum(x) is faster than sum(x)

In [50]:
# difference between two
x = np.random.randint(1, 10, (4,4))
print(x)
print("sum(x):\n", sum(x))
print("np.sum(x) or x.sum():\n", np.sum(x))

[[1 2 3 5]
 [3 1 4 3]
 [1 8 6 1]
 [3 8 3 3]]
sum(x):
 [ 8 19 16 12]
np.sum(x) or x.sum():
 55


(2) Min & Max

`np.min` faster than `min`
`np.max` faster than `max`

In [51]:
x = np.arange(0,10).reshape([2,5])
print(x)
print("np.min(x) or x.min():\n", np.min(x))
print("np.max(x) or x.max():\n", np.max(x))
print("x.min(axis=0)\n", x.min(axis=0))
print("x.min(axis=1)\n", x.min(axis=1))

[[0 1 2 3 4]
 [5 6 7 8 9]]
np.min(x) or x.min():
 0
np.max(x) or x.max():
 9
x.min(axis=0)
 [0 1 2 3 4]
x.min(axis=1)
 [0 5]


## 1.4 Broadcasting

Broadcasting rules apply to any binary `ufunc`. <br>
Rules:<br>
(a) If the two arrays differ in their number of dimensions, the shape of the one with fewer dimensions is padded with ones on its leading (left) side.<br>
(b) If the shape of the two arrays does not match in any dimension, the array with shape equal to 1 in that dimension is stretched to match the other shape.<br>
(c) If in any dimension the sizes disagree and neither is equal to 1, an error is raised.<br>

In [42]:
a = np.arange(0,3)
b = np.arange(3,6)
c = np.ones((3,3))
print("a\n", a)
print("b\n", b)
print("c\n", c )
print("a+b\n", a+b)
print("c+a\n", c+a)

a
 [0 1 2]
b
 [3 4 5]
c
 [[1. 1. 1.]
 [1. 1. 1.]
 [1. 1. 1.]]
a+b
 [3 5 7]
c+a
 [[1. 2. 3.]
 [1. 2. 3.]
 [1. 2. 3.]]


In [43]:
a = np.ones((2,3))
b = np.arange(3)
print("a.shape=",a.shape)
print(a)
print("b.shape=",b.shape)
print(b)


# by rule (a), b.shape = (3,) --> (1,3)
# by rule (b), b.shape = (1,3)--> (2,3), same as a.shape
print("a+b.shape=", (a+b).shape)
print(a+b)

a.shape= (2, 3)
[[1. 1. 1.]
 [1. 1. 1.]]
b.shape= (3,)
[0 1 2]
a+b.shape= (2, 3)
[[1. 2. 3.]
 [1. 2. 3.]]


In [44]:
a = np.arange(3)
b = np.arange(3,6).reshape(3,1)
print("a.shape=",a.shape)
print(a)
print("b.shape=",b.shape)
print(b)

# by rule (a), a.shape = (3,)  --> (1,3)
# by rule (b), a.shape = (1,3) --> (3,3)
#              b.shape = (3,1) --> (3,3)
print("(a+b).shape=", (a+b).shape)
print(a+b)

a.shape= (3,)
[0 1 2]
b.shape= (3, 1)
[[3]
 [4]
 [5]]
(a+b).shape= (3, 3)
[[3 4 5]
 [4 5 6]
 [5 6 7]]


In [45]:
a = np.ones((3,2))
b = np.arange(3)
print("a.shape=", a.shape)
print(a)
print("b.shape=", b.shape)
print(b)

# by rule (a) and (b), b.shape = (3,) --> ... --> (3,3)
# but a.shape = (3,2), according to rule (c), an error happens
#print(a+b)

a.shape= (3, 2)
[[1. 1.]
 [1. 1.]
 [1. 1.]]
b.shape= (3,)
[0 1 2]


## 1.5 Comparisons, Masks, and Boolean Logic

(1) Comparison Operations as ufuncs:<br>
`>`,`>=`,`<`,`<=`,`==`,`!=`; or<br>
`np.greater`,`np.greater_euqal`,`np.less`,`np.less_equal`,`np.equal`,`np.not_equal`

In [46]:
# comparison
x = np.arange(5)
print("x\n",x)
print("x<3\n",x<3)
print("x>=2\n",x>=2)
print("2*x==x**2\n", 2*x==x**2)

x
 [0 1 2 3 4]
x<3
 [ True  True  True False False]
x>=2
 [False False  True  True  True]
2*x==x**2
 [ True False  True False False]


(2) Working with Boolean Arrays

We can apply `np.sum()`,`np.any()`and `np.all()`

In [47]:
x = np.arange(5)
print("x\n",x)
print("np.count_nonzero(x<3)\n", np.count_nonzero(x<3))
print("np.sum(x<3)\n",np.sum(x<3)) #sum default to columns
x1 = x[:,np.newaxis]
print("x1\n",x1)
print("np.sum(x1<3,axis=1)\n",np.sum(x1<3,axis=1)) #sum each row

x
 [0 1 2 3 4]
np.count_nonzero(x<3)
 3
np.sum(x<3)
 3
x1
 [[0]
 [1]
 [2]
 [3]
 [4]]
np.sum(x1<3,axis=1)
 [1 1 1 0 0]


(3) Boolean operators
* `&` `np.bitwise_and`
* `|` `np.bitwise_or`
* `^` `np.bitwise_xor`
* `~` `np.bitwise_not`
<br>E.g., `np.sum((x>5) & (x<10))`

(4) Boolean Arrays as Masks

In [52]:
#  Example of a masking operation
x = np.arange(10).reshape(2,5)
print("x=\n",x)
# `(x>3) & (x<7)` is the mask
print("x[(x>3) & (x<7)]=\n",x[(x>3)&(x<7)])

x=
 [[0 1 2 3 4]
 [5 6 7 8 9]]
x[(x>3) & (x<7)]=
 [4 5 6]


## 1.6 Fancy Indexing

Difference types of indexing:
* Simple indices (e.g., `x[0]`)
* Slices (e.g., `x[:5]`)
* Boolean masks (e.g., `x[x>5]`)
* Fancy Indexing

In [53]:
x = np.arange(10)
print("x\n", x)
fancy_index = np.array([[3,7],[5,4]])
x[fancy_index]

x
 [0 1 2 3 4 5 6 7 8 9]


array([[3, 7],
       [5, 4]])

In [54]:
x = np.arange(20).reshape((4,5))
print("x\n",x)
row = np.array([0,3,2])
col = np.array([1,3,4])
# [0,3,2],[1,3,4] = [(0,1), (3,3), (2,4)]
print("x[row,col]\n",x[row,col])

# [row, col] follows the broadcasting rules
row = row[:,np.newaxis]
print("x[row,col]\n", x[row,col])

x
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]
x[row,col]
 [ 1 18 14]
x[row,col]
 [[ 1  3  4]
 [16 18 19]
 [11 13 14]]


In [55]:
# combined indexing
x = np.arange(20).reshape((4,5))
print("x=\n", x)
print("x[2,[2,1,4]]=\n", x[2,[2,1,4]]) # combined with simple indices
print("x[1:,[3,0,2]]=\n",x[1:,[3,0,2]]) # combined with clising
mask = np.array([1,0,0,1,0], dtype=bool) # combined with masking
row = np.array([2,1,3])[:, np.newaxis]
print("x[row,mask]=\n", x[row,mask]) # note: len(mask) = len(x.col)

x=
 [[ 0  1  2  3  4]
 [ 5  6  7  8  9]
 [10 11 12 13 14]
 [15 16 17 18 19]]
x[2,[2,1,4]]=
 [12 11 14]
x[1:,[3,0,2]]=
 [[ 8  5  7]
 [13 10 12]
 [18 15 17]]
x[row,mask]=
 [[10 13]
 [ 5  8]
 [15 18]]


In [56]:
# modify values with fancy indexing
x = np.arange(10)
print(x)
i = np.array([4,2,8])
x[i] = 666
print(x)

[0 1 2 3 4 5 6 7 8 9]
[  0   1 666   3 666   5   6   7 666   9]


In [57]:
# notice the difference
x = np.zeros(10)
i = np.array([1,2,2,3,3,3])
x[i] += 1 # read value of x[i], add 1, then write to x
print(x)

x = np.zeros(10)
i = np.array([1,2,2,3,3,3])
np.add.at(x,i,1) # write in place
print(x)
# TODO find `reduceat()`

[0. 1. 1. 1. 0. 0. 0. 0. 0. 0.]
[0. 1. 2. 3. 0. 0. 0. 0. 0. 0.]


## 1.7 Sorting Arrays
Use build-in function `np.sort()` with default algorithm *quicksort*. For multi-dimensional arrays, we can specify axis to be sorted by using `np.sort(x, axis=0)` or `np.sort(x, axis=1)`. <br>
To select *k* smallest values, we use `np.partition()` which arrange the k smallest values at left side. `axis` can also be specified.

In [58]:
x = np.random.randint(0,100,(10,))
print(x)
print("Sorted array\n",np.sort(x))
print("Sorted indices\n", np.argsort(x))

print("Sort in place")
x.sort()
print(x)

[62 35 94 67 82 46 99 20 81 50]
Sorted array
 [20 35 46 50 62 67 81 82 94 99]
Sorted indices
 [7 1 5 9 0 3 8 4 2 6]
Sort in place
[20 35 46 50 62 67 81 82 94 99]


In [59]:
# k smallest values
x = np.random.randint(0,100,(10,))
np.partition(x,3) # three smallest values are put at left side

array([14, 11, 10, 27, 58, 36, 41, 86, 43, 65])

## 1.8 Structured Array

In [60]:
name = ['Alice', 'Bob', "Cathy", "Doug"]
age = [25,45,37,19]
weight = [55.0, 85.5, 68.0, 61.5]

In [61]:
data = np.zeros(4, dtype={'names':('name', 'age', 'weight'), 'formats':('U10', 'i4', 'f8')})
print(data.dtype)

[('name', '<U10'), ('age', '<i4'), ('weight', '<f8')]


In [62]:
data['name'] = name
data['age'] = age
data['weight'] = weight
print(data)

[('Alice', 25, 55. ) ('Bob', 45, 85.5) ('Cathy', 37, 68. )
 ('Doug', 19, 61.5)]


In [63]:
# indexing
print(data['name'])
print(data[0])
print(data[-1]['name'])

['Alice' 'Bob' 'Cathy' 'Doug']
('Alice', 25, 55.)
Doug


## 1.9 Additional Notes

In [64]:
# randomly choose indices with no repeats
x = np.arange(20)[:,np.newaxis]
indices = np.random.choice(x.shape[0],10,replace=False) # replace=False ==> No duplicates!
print(indices)
print(x[indices])

[ 4  5 18  1  7  9 15 12 19 13]
[[ 4]
 [ 5]
 [18]
 [ 1]
 [ 7]
 [ 9]
 [15]
 [12]
 [19]
 [13]]


In [65]:
# select data not in indices
print(x[~indices])

[[15]
 [14]
 [ 1]
 [18]
 [12]
 [10]
 [ 4]
 [ 7]
 [ 0]
 [ 6]]


In [66]:
# notes of function `np.searchsorted`
#help(np.searchsorted)

# 2 PANDAS

In [68]:
import pandas as pd

## 2.1 Introduction

(1) Initialization of a Series object:<br>
`pd.Series(data, index=index)` <br>
data: can be a list, a scalar, or a dictionary<br>
index: optional

In [69]:
print(pd.Series([2,4,6]),"\n")
print(pd.Series(5, index=[10,20,30]),"\n")
print(pd.Series({2:'a', 1:'b', 3:'c'}),'\n')
print(pd.Series({2:'a', 1:'b', 3:'c'}, index=[3,2]),'\n')

0    2
1    4
2    6
dtype: int64 

10    5
20    5
30    5
dtype: int64 

2    a
1    b
3    c
dtype: object 

3    c
2    a
dtype: object 



In [70]:
data = pd.Series([0.25,0.5,0.75,1.0])
print("data:\n",data,'\n')
# Series has attributes `values` and `index`
print("data.values:\n",data.values)
print("data.index:\n",data.index)

# Access data
print("data[1]:\n",data[1])
print("data[1:3]\n",data[1:3])

# Change index
data.index = ['a','b','c','d']
print("\ndata:\n",data)
print("data['b']:\n",data['b'])
print(data.index)
data.index = [4,2,3,1]
print("\ndata:\n",data)
print("data[4]:\n",data[4])
print(data.index)

data:
 0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64 

data.values:
 [0.25 0.5  0.75 1.  ]
data.index:
 RangeIndex(start=0, stop=4, step=1)
data[1]:
 0.5
data[1:3]
 1    0.50
2    0.75
dtype: float64

data:
 a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
data['b']:
 0.5
Index(['a', 'b', 'c', 'd'], dtype='object')

data:
 4    0.25
2    0.50
3    0.75
1    1.00
dtype: float64
data[4]:
 0.25
Int64Index([4, 2, 3, 1], dtype='int64')


(2) Initialization of DataFrame


In [71]:
data = [{'a': i, 'b': 2 * i}
        for i in range(3)]
pd.DataFrame(data)

Unnamed: 0,a,b
0,0,0
1,1,2
2,2,4


In [72]:
pd.DataFrame([{'a': 1, 'b': 2}, {'b': 3, 'c': 4}])

Unnamed: 0,a,b,c
0,1.0,2,
1,,3,4.0


In [73]:
name = ['A', 'b', 'Hello', 'Coffin', 'dance']
age = pd.Series([1,2,5,999,666], index=name)
height = pd.Series([150,200,130,97,55], index=name)
boo = pd.Series([5,4,3,2,1], index=name)

In [74]:
df = pd.DataFrame({'age':age, 'height':height, 'boo':boo})
print("df\n",df)

print("\ndf.index\n",df.index)
print("df.columns\n",df.columns)

print("\ndf['height']\n",df['height'])

df
         age  height  boo
A         1     150    5
b         2     200    4
Hello     5     130    3
Coffin  999      97    2
dance   666      55    1

df.index
 Index(['A', 'b', 'Hello', 'Coffin', 'dance'], dtype='object')
df.columns
 Index(['age', 'height', 'boo'], dtype='object')

df['height']
 A         150
b         200
Hello     130
Coffin     97
dance      55
Name: height, dtype: int64


(3) Initialization of Index
<br>
Note: `Index` is *immutable*

In [75]:
ind = pd.Index([2,3,5,7,11])
ind

Int64Index([2, 3, 5, 7, 11], dtype='int64')

## 2.2 Data Indexing and Selection

(1) Series

In [76]:
# Series as dictionary


data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
print("data:\n", data)

# dictionary-like interfaces
print(data['b'])
print('a' in data)
print(data.keys())
list(data.items())
data['e'] = 1.25
print(data)

data:
 a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
0.5
True
Index(['a', 'b', 'c', 'd'], dtype='object')
a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64


In [77]:
# Series as one-dimensional array

data = pd.Series([0.25, 0.5, 0.75, 1.0],
                 index=['a', 'b', 'c', 'd'])
print("data:\n", data)

# interfaces
print(data['a':'c']) #slicing by EXPLICIT index, inding data['c']
print(data[0:2]) #slicing by IMPLICIT integer index, excluding data[2]
print(data[(data>0.3) & (data<0.8)]) #masking
print(data[['a', 'd']]) #fancy indexing

data:
 a    0.25
b    0.50
c    0.75
d    1.00
dtype: float64
a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.25
d    1.00
dtype: float64


Due to the confusion from *EXPLICIT* and *IMPLICIT* indexing, we use indexers: `loc`, `iloc`, and `ix`
* `loc` ==> explicit index
* `iloc` ==> implicit index
* `ix` ==> hybrid of the two, same as `[]`-based indexing

(2) DataFrame

In [78]:
# as dictionary
area = pd.Series({'California': 423967, 'Texas': 695662,
                  'New York': 141297, 'Florida': 170312,
                  'Illinois': 149995})
pop = pd.Series({'California': 38332521, 'Texas': 26448193,
                 'New York': 19651127, 'Florida': 19552860,
                 'Illinois': 12882135})
data = pd.DataFrame({'area':area, 'pop':pop})
print("data\n",data,"\n")

print(data['area']) # or we can use `data.area` for string index
data['density'] = data['pop'] / data['area']
print("\n",data)

data
               area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860
Illinois    149995  12882135 

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64

               area       pop     density
California  423967  38332521   90.413926
Texas       695662  26448193   38.018740
New York    141297  19651127  139.076746
Florida     170312  19552860  114.806121
Illinois    149995  12882135   85.883763


In [79]:
# as two-dimensionoal array

print(data.values,"\n")
print(data.T,"\n") # transpose
print(data.values[0],"\n")
print(data['area'],"\n")
print(data.iloc[:3,:2], "\n")
print(data.loc[:'Illinois', :'pop'], "\n")
print(data.loc[data.density>100,['pop','density']])

[[4.23967000e+05 3.83325210e+07 9.04139261e+01]
 [6.95662000e+05 2.64481930e+07 3.80187404e+01]
 [1.41297000e+05 1.96511270e+07 1.39076746e+02]
 [1.70312000e+05 1.95528600e+07 1.14806121e+02]
 [1.49995000e+05 1.28821350e+07 8.58837628e+01]] 

           California         Texas      New York       Florida      Illinois
area     4.239670e+05  6.956620e+05  1.412970e+05  1.703120e+05  1.499950e+05
pop      3.833252e+07  2.644819e+07  1.965113e+07  1.955286e+07  1.288214e+07
density  9.041393e+01  3.801874e+01  1.390767e+02  1.148061e+02  8.588376e+01 

[4.23967000e+05 3.83325210e+07 9.04139261e+01] 

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64 

              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127 

              area       pop
California  423967  38332521
Texas       695662  26448193
New York    141297  19651127
Florida     170312  19552860

In [80]:
# Additional notes

# indexing refers to *colunms*
print(data['area'])
#print(data['New York']) ==> This is wrong

# but slicing refers to *rows*
print(data['New York':'Florida'])
#print(data['area':'pop']) ==> Wroing!
print(data[1:3])

# masking also operates on *rows*
print(data[data.density>100])

California    423967
Texas         695662
New York      141297
Florida       170312
Illinois      149995
Name: area, dtype: int64
            area       pop     density
New York  141297  19651127  139.076746
Florida   170312  19552860  114.806121
            area       pop     density
Texas     695662  26448193   38.018740
New York  141297  19651127  139.076746
            area       pop     density
New York  141297  19651127  139.076746
Florida   170312  19552860  114.806121


## 2.3 Operating on Data

|     |                                 |
|:---:|:-------------------------------:|
|`+`  |`add()`|
|`-`  | `sub()`,`subtract()`|
|`*`  | `mul()`, `multiply()`|
|`/`  | `truediv()`, `div()`, `divide()`|
|`//` | `floordiv()`|
|`%`  | `mod()`|
|`**` | `pow()`|

(1) Ufuncs: Index Preservation

NumPy ufunc is also applicable to DataFrame

In [81]:
rng = np.random.RandomState(42)
ser = pd.Series(rng.randint(0,10,4))
print(ser)
print(np.exp(ser))

0    6
1    3
2    7
3    4
dtype: int64
0     403.428793
1      20.085537
2    1096.633158
3      54.598150
dtype: float64


In [82]:
df = pd.DataFrame(rng.randint(0,10,(3,4)), columns=['a','b','c','d'])
print(df)
print(np.sin(df*np.pi/4))

   a  b  c  d
0  6  9  2  6
1  7  4  3  7
2  7  2  5  4
          a             b         c             d
0 -1.000000  7.071068e-01  1.000000 -1.000000e+00
1 -0.707107  1.224647e-16  0.707107 -7.071068e-01
2 -0.707107  1.000000e+00 -0.707107  1.224647e-16


(2) UFuncs: Index Alignment

Series

In [83]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])
print(A)
print(B)

0    2
1    4
2    6
dtype: int64
1    1
2    3
3    5
dtype: int64


In [84]:
A + B

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64

In [85]:
A.add(B, fill_value=0)

0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64

DataFrame

In [86]:
A = pd.DataFrame(rng.randint(0,20,(2,2)), columns=['a','b'])
B = pd.DataFrame(rng.randint(0,10,(3,3)), columns=['b','a','c'])
print(A,"\n")
print(B)

   a   b
0  1  11
1  5   1 

   b  a  c
0  4  0  9
1  5  8  0
2  9  2  6


In [87]:
A + B

Unnamed: 0,a,b,c
0,1.0,15.0,
1,13.0,6.0,
2,,,


In [88]:
A.add(B, fill_value=A.stack().mean())

Unnamed: 0,a,b,c
0,1.0,15.0,13.5
1,13.0,6.0,4.5
2,6.5,13.5,10.5


(3) Ufuncs: Operations Between DataFrame and Series

Recall the rules of broadcasting!

In [89]:
B - B.iloc[0] # row-wise

Unnamed: 0,b,a,c
0,0,0,0
1,1,8,-9
2,5,2,-3


In [90]:
B.subtract(B['b'], axis=0) # column-wise

Unnamed: 0,b,a,c
0,0,-4,5
1,0,3,-5
2,0,-7,-3


## 2.4 Handling Missing Data

In [91]:
df_na = pd.DataFrame({'Name':['A', 'b', 'Hello', 'Coffin', 'dance'], 'Age':[1, 2, 5, None, 666], 'Height':[150, None, 130, 97, 55], 10:[None,4,3,2,None]})
df_na

Unnamed: 0,Name,Age,Height,10
0,A,1.0,150.0,
1,b,2.0,,4.0
2,Hello,5.0,130.0,3.0
3,Coffin,,97.0,2.0
4,dance,666.0,55.0,


In [92]:
df_na.isnull()

Unnamed: 0,Name,Age,Height,10
0,False,False,False,True
1,False,False,True,False
2,False,False,False,False
3,False,True,False,False
4,False,False,False,True


In [93]:
df_na[df_na.notnull()]

Unnamed: 0,Name,Age,Height,10
0,A,1.0,150.0,
1,b,2.0,,4.0
2,Hello,5.0,130.0,3.0
3,Coffin,,97.0,2.0
4,dance,666.0,55.0,


In [94]:
#default: axis='rows' or 1, how='any'
#df_na.dropna() 

#df_na.dropna(axis=1) #or axis = 'columns'

#only drop columns w/ all NaN's
#df_na.dropna(axis=1, how='all') 
 
#if the axis has # of non-null < thresh, drop it!
#In other words, the threshold is the minimum # of valid data
df_na.dropna(axis=1, thresh=4)

Unnamed: 0,Name,Age,Height
0,A,1.0,150.0
1,b,2.0,
2,Hello,5.0,130.0
3,Coffin,,97.0
4,dance,666.0,55.0


In [95]:
df_na.fillna(0) #fill w/ 0
#df_na.fillna(0, inplace=True) #Modify the original data, not a copy
#df_na.fillna(df_na.mean()) #fill w/ mean value
#df_na.fillna(method='ffill') #forward fill, fill w/ previous value
#df_na.fillna(method='bfill') #back fill, fill w/ next value
#df_na.fillna(method='ffill', axis=1)

Unnamed: 0,Name,Age,Height,10
0,A,1.0,150.0,0.0
1,b,2.0,0.0,4.0
2,Hello,5.0,130.0,3.0
3,Coffin,0.0,97.0,2.0
4,dance,666.0,55.0,0.0


In [96]:
df_na.mean()

Age       168.5
Height    108.0
10          3.0
dtype: float64

## 2.5 Hierarchical Indexing

## 2.10 Additional notes

In [97]:
# select data not in index
indices = [0, 7, 8, 9]
df[~df.index.isin(indices)]

Unnamed: 0,a,b,c,d
1,7,4,3,7
2,7,2,5,4
