In [78]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


In [2]:
#create 1 dimensional array
data1 = [6, 8, 2, 4]
arr1 = np.array(data1)
arr1


array([6, 8, 2, 4])

In [3]:
#create 2-dimensional array
data2 = [[2,3,4],[5,6,1]]
arr2 = np.array(data2)
arr2

array([[2, 3, 4],
       [5, 6, 1]])

In [4]:
#dimensions
arr2.ndim

2

In [5]:
#shape
arr2.shape

(2, 3)

In [6]:
#dtype
arr1.dtype

dtype('int32')

In [7]:
#create array of zeros (you can parse length or shape of array as argument)
arr3 = np.zeros((3, 4))
arr3

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.]])

In [8]:
#fill the whole array with certain value
arr4 = np.full((2,3), 5)
arr4

array([[5, 5, 5],
       [5, 5, 5]])

In [9]:
#specify dtype
arr5 = np.array([1, 3,  4], dtype = np.float64)
arr5.dtype

dtype('float64')

In [10]:
#array indexing and slicing
arr1 = np.arange(10)
arr1

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [11]:
arr1[4]

4

In [12]:
arr1[2:4]

array([2, 3])

In [13]:
#elements with minus sign means start counting from the last element
#if a[i, j] and i > j then a[i, j] = []
arr1[3:-7]

array([], dtype=int32)

In [14]:
#omitted fields means start from the beginning or stop at the end of the array
arr1[4:]

array([4, 5, 6, 7, 8, 9])

In [15]:
arr1[:5]

array([0, 1, 2, 3, 4])

In [16]:
#In multidimensional array, if you omit later indices, the returned object will be a low dimensional ndarray consiting
#all elements along higher indices
array3d = np.array([[[2, 4], [1, 3], [8, 5]], [[3, 10], [21, 45], [9, 9]]])
array3d

array([[[ 2,  4],
        [ 1,  3],
        [ 8,  5]],

       [[ 3, 10],
        [21, 45],
        [ 9,  9]]])

In [17]:
array3d[0]

array([[2, 4],
       [1, 3],
       [8, 5]])

In [18]:
array3d[1][1]

array([21, 45])

In [19]:
array3d[1][1][0]

21

In [20]:
#when you assign a number to one dimension of the array, all elements in that dimension will be equal to the given number
#in the other word, both scalar value and array can be assigned to one dimension of the array

oldvalues = array3d[0].copy()
array3d[0] = 1
array3d

array([[[ 1,  1],
        [ 1,  1],
        [ 1,  1]],

       [[ 3, 10],
        [21, 45],
        [ 9,  9]]])

In [21]:
array3d[0] = oldvalues
array3d

array([[[ 2,  4],
        [ 1,  3],
        [ 8,  5]],

       [[ 3, 10],
        [21, 45],
        [ 9,  9]]])

In [23]:
array3d[0][1] = 1
array3d

array([[[2, 4],
        [1, 1],
        [8, 5]],

       [[1, 1],
        [1, 1],
        [1, 1]]])

In [24]:
#you can locate cells/dimension using below syntax
array3d[1, 0]

array([1, 1])

In [26]:
#slicing in multidimensional array goes starts from the first bracket
array3d[:1]

array([[[2, 4],
        [1, 1],
        [8, 5]]])

In [30]:
#you can pass multiple slices just like multiple indexes
array3d[:1, :2]

array([[[2, 4],
        [1, 1]]])

In [32]:
#but not like this
array3d[:1][:1]

array([[[2, 4],
        [1, 1],
        [8, 5]]])

In [35]:
name = np.array(['Bob', 'Zoey', 'Kazi', 'Bob', 'Tyson', 'West', 'West'])
data = np.random.randn(7, 4)
name


array(['Bob', 'Zoey', 'Kazi', 'Bob', 'Tyson', 'West', 'West'],
      dtype='<U5')

In [36]:
data

array([[ 0.4953815 , -0.45035555, -0.60982348, -0.23655716],
       [ 1.66337308,  0.14942182, -0.56429456,  0.00777951],
       [-0.14411453, -1.39749429, -0.59321881,  1.75994342],
       [-0.84666658,  1.60129361, -1.06054227, -0.97830368],
       [-0.24545564, -1.10831378,  0.53100579, -0.15630685],
       [-1.38588362, -0.17672807, -1.17590726, -1.07506872],
       [ 1.23553996,  1.35669948, -1.07252821, -0.95173691]])

In [39]:
#Comparing name with the string 'Bob' yield a boolean array
name == 'Bob'

array([ True, False, False,  True, False, False, False], dtype=bool)

In [40]:
#This boolean array can be passed when indexing the array
#REMEMBER: the boolean array must have the same length as the array it's indexing
data[name == 'Bob']

array([[ 0.4953815 , -0.45035555, -0.60982348, -0.23655716],
       [-0.84666658,  1.60129361, -1.06054227, -0.97830368]])

In [41]:
#To select everything BUT [option], you can use != or ~
data[~(name == 'Bob')]

array([[ 1.66337308,  0.14942182, -0.56429456,  0.00777951],
       [-0.14411453, -1.39749429, -0.59321881,  1.75994342],
       [-0.24545564, -1.10831378,  0.53100579, -0.15630685],
       [-1.38588362, -0.17672807, -1.17590726, -1.07506872],
       [ 1.23553996,  1.35669948, -1.07252821, -0.95173691]])

In [47]:
arr1 = np.arange(32).reshape((8, 4))
arr1

array([[ 0,  1,  2,  3],
       [ 4,  5,  6,  7],
       [ 8,  9, 10, 11],
       [12, 13, 14, 15],
       [16, 17, 18, 19],
       [20, 21, 22, 23],
       [24, 25, 26, 27],
       [28, 29, 30, 31]])

In [50]:
#fancy indexing
#in this example, the elements [1, 0], [5, 3], [7, 1], [2, 2] are selected
#regladless of how many dimensionals the array has, fancy indexing is always one-dimensional
#fancy indexing, unlike slicing, always copies data into a new array (the original array will not be modified)

arr1[[1, 5, 7, 2], [0, 3, 1, 2]]

array([ 4, 23, 29, 10])

In [51]:
#array transposing
arr1.T

array([[ 0,  4,  8, 12, 16, 20, 24, 28],
       [ 1,  5,  9, 13, 17, 21, 25, 29],
       [ 2,  6, 10, 14, 18, 22, 26, 30],
       [ 3,  7, 11, 15, 19, 23, 27, 31]])

In [53]:
#for higher dimensional array, transpose will accept a tuple of axis numbers to permute the axes:
arr = np.arange(16).reshape((2, 2, 4))
arr

array([[[ 0,  1,  2,  3],
        [ 4,  5,  6,  7]],

       [[ 8,  9, 10, 11],
        [12, 13, 14, 15]]])

In [60]:
#the axes has been recorded with the second axis first, the first axis second, and the last axis unchaged -> therefore the order is 1, 0, 2
#keep in mind that transposing copies data into a new array, thus the original array is unaffected
arr2 = arr.transpose((1, 0, 2))
arr2

array([[[ 0,  1,  2,  3],
        [ 8,  9, 10, 11]],

       [[ 4,  5,  6,  7],
        [12, 13, 14, 15]]])

In [61]:
#ndarray also has a method called swapaxes, which is more general than transposing
arr.swapaxes(1, 2)

array([[[ 0,  4],
        [ 1,  5],
        [ 2,  6],
        [ 3,  7]],

       [[ 8, 12],
        [ 9, 13],
        [10, 14],
        [11, 15]]])

In [63]:
#4.2 Universal functions: functions that apply element-wise operations on data in ndarrays
#Unary functions: abs, fabs (floating point absolute), sqrt, square, exp (e^x), log, log10, log2, log1p, sign, ceil, floor, rint (round elements to the nearest integer)
#                 modf (return fractional and integral parts of array as a seperate array, isnan(return boolean array indicating whether each value is NaN)
#                 isfinite, isinf, cos, cosh, sin, sinh, tan, tanh, ...
#                 logical_not (compute truth value of not x element-wise (equivalent to ~arr))
np.sqrt(arr)


array([[[ 0.        ,  1.        ,  1.41421356,  1.73205081],
        [ 2.        ,  2.23606798,  2.44948974,  2.64575131]],

       [[ 2.82842712,  3.        ,  3.16227766,  3.31662479],
        [ 3.46410162,  3.60555128,  3.74165739,  3.87298335]]])

In [65]:
#Binary functions:  add, subtract, multiply, divide, floor_divide, power, maximum, fmax (like maximum but ignore NaN), minimum, fmin, mod, copysign
#                   greater, greater_equal, less, less_equal, equal, not_equal, logical_and, logical_or, logical_xor (perform element-wise comparison, yielding boolean array)
np.add(arr, 3)

array([[[ 3,  4,  5,  6],
        [ 7,  8,  9, 10]],

       [[11, 12, 13, 14],
        [15, 16, 17, 18]]])

In [67]:
#the practice of replacing explicit loops with array expression in data processing is called vectorization
#vectorization is often be one or two (or more) order of magnitude faster than their pure Python equivalents
arr1 = [1, 2, 3, 4]
arr2 = [5, 6, 7, 8]
arr3, arr4 = np.meshgrid(arr1, arr2)
arr3

array([[1, 2, 3, 4],
       [1, 2, 3, 4],
       [1, 2, 3, 4],
       [1, 2, 3, 4]])

In [68]:
    arr4

array([[5, 5, 5, 5],
       [6, 6, 6, 6],
       [7, 7, 7, 7],
       [8, 8, 8, 8]])

In [74]:
z = np.sqrt(arr3 ** 2 + arr4 ** 2)
z

array([[ 5.09901951,  5.38516481,  5.83095189,  6.40312424],
       [ 6.08276253,  6.32455532,  6.70820393,  7.21110255],
       [ 7.07106781,  7.28010989,  7.61577311,  8.06225775],
       [ 8.06225775,  8.24621125,  8.54400375,  8.94427191]])

In [82]:
#list comprehension
xarr = np.array([1.1, 1.2, 1.3, 1.4, 1.5])
yarr = np.array([2.1, 2.2, 2.3, 2.4, 2.5])
cond = np.array([True, False, True, True, False])
result = [(x if c else y) for x, y, c in zip(xarr, yarr, cond)]
result

[1.1000000000000001, 2.2000000000000002, 1.3, 1.3999999999999999, 2.5]

In [83]:
#np.where, where condition is put first
result = np. where (cond, xarr, yarr)
result 

array([ 1.1,  2.2,  1.3,  1.4,  2.5])

In [84]:
# Mathematical and Statistical Methods
arr = np.random.randn(5, 4)
arr

array([[-1.96507854,  0.16439871,  0.067458  ,  2.51499534],
       [ 1.374731  , -0.67059883, -1.62022114,  1.24543769],
       [-0.069987  ,  0.74446844,  1.47017094,  0.13550725],
       [ 0.35968334, -1.2836328 ,  1.54941807,  0.74914887],
       [-1.03102036,  0.76797271, -0.13233188,  1.02821217]])

In [87]:
#calculate mean
arr.mean()
#alternate method: np.mean(arr)

0.26993660037920891

In [91]:
#you can also specify which axis you want to calculate 
#(axis = 0: calculate mean of columns, axis = 1: calculate mean of rows)
arr.mean(axis = 1)

array([ 0.19544338,  0.08233718,  0.57003991,  0.34365437,  0.15820816])

In [92]:
#method like cumsum or cumprod do not aggregate, and instead producing an array of the intermediate result
arr2 = np.array([1, 2, 3, 4, 5, 6, 7, 8])
arr2.cumsum()

array([ 1,  3,  6, 10, 15, 21, 28, 36], dtype=int32)

In [102]:
#same with sum or mean, you can choose to specify certain axis in multidimensional array
#axis = 0: plus along the column axis; axis = 1: plus along the row axis
arr.cumsum(axis = 0)

array([[-1.96507854,  0.16439871,  0.067458  ,  2.51499534],
       [-0.59034753, -0.50620012, -1.55276314,  3.76043303],
       [-0.66033454,  0.23826833, -0.08259219,  3.89594028],
       [-0.30065119, -1.04536447,  1.46682588,  4.64508915],
       [-1.33167156, -0.27739176,  1.334494  ,  5.67330132]])

In [103]:
#boolean values are coerced to 1 (True) and 0 (False) in the preceding methods. Thus, sum is often use as a mean of counting True values in a boolean arrays
arr = np.random.randn(100)
(arr > 0).sum()

55

In [106]:
# .any() checks if one or more values in an array is True, .all() checks if all of them are True
(arr >0).any()

True

In [107]:
(arr > 0).all()

False

In [108]:
#numpy sort algorithm is in-place, meaning the original array will be modified
arr.sort()
arr

array([-2.74233219, -2.39658149, -2.34692257, -2.23717583, -2.02341519,
       -1.86720658, -1.78858224, -1.66967307, -1.53988863, -1.34942425,
       -1.15945696, -1.15546908, -1.08877569, -1.06212345, -1.04847655,
       -1.03994114, -0.90658102, -0.88160623, -0.88150343, -0.86885002,
       -0.84346107, -0.81326997, -0.72658903, -0.64969308, -0.61173409,
       -0.58748936, -0.58023118, -0.54151776, -0.5018768 , -0.5004582 ,
       -0.48050347, -0.38706308, -0.38299149, -0.38039617, -0.33313215,
       -0.32939155, -0.2688039 , -0.23570903, -0.19484005, -0.17816939,
       -0.14475342, -0.13172458, -0.12664016, -0.06492166, -0.05257277,
        0.01852777,  0.02521138,  0.02742347,  0.02831418,  0.0438673 ,
        0.05666927,  0.07103935,  0.0873246 ,  0.13381068,  0.2360863 ,
        0.27628806,  0.30508081,  0.30576008,  0.31826219,  0.32117968,
        0.34182983,  0.3453236 ,  0.38289564,  0.39095365,  0.39147629,
        0.39981431,  0.41939263,  0.42499768,  0.48613433,  0.48

In [117]:
#you can also specify the axis you want to sort
#also, notice that reshape will specify number of columns before rows
arr = arr.reshape(20, 5)
arr.sort(1)
arr

array([[-2.74233219, -2.39658149, -2.34692257, -2.23717583, -2.02341519],
       [-1.86720658, -1.78858224, -1.66967307, -1.53988863, -1.34942425],
       [-1.15945696, -1.15546908, -1.08877569, -1.06212345, -1.04847655],
       [-1.03994114, -0.90658102, -0.88160623, -0.88150343, -0.86885002],
       [-0.84346107, -0.81326997, -0.72658903, -0.64969308, -0.61173409],
       [-0.58748936, -0.58023118, -0.54151776, -0.5018768 , -0.5004582 ],
       [-0.48050347, -0.38706308, -0.38299149, -0.38039617, -0.33313215],
       [-0.32939155, -0.2688039 , -0.23570903, -0.19484005, -0.17816939],
       [-0.14475342, -0.13172458, -0.12664016, -0.06492166, -0.05257277],
       [ 0.01852777,  0.02521138,  0.02742347,  0.02831418,  0.0438673 ],
       [ 0.05666927,  0.07103935,  0.0873246 ,  0.13381068,  0.2360863 ],
       [ 0.27628806,  0.30508081,  0.30576008,  0.31826219,  0.32117968],
       [ 0.34182983,  0.3453236 ,  0.38289564,  0.39095365,  0.39147629],
       [ 0.39981431,  0.41939263,  0.4

In [120]:
#select quantile of an array by first sorting the array
arr = arr.reshape(100)
arr[int(0.05 * len(arr))] #5% quantile


-1.8672065776558284

In [123]:
#Numpy has a set of operation for one-dimensional ndarrays.
arr = np.array(["Bob", "Ellie", "Joel", "Bob", "Joe", "Joel"])
np.unique(arr)

array(['Bob', 'Ellie', 'Joe', 'Joel'],
      dtype='<U5')

In [124]:
#in1d(x, y) method compute a boolean array indicating whether an element of x is contained in y
arr1 = np.array([1, 4, 5, 2])
arr2 = np.array([1, 52, 23, 4, 10, 20])
np.in1d(arr1, arr2)

array([ True,  True, False, False], dtype=bool)

In [127]:
#4.5 Linear Algebra
#x.dot(y) to calculate x.y (dot product), it is also equivalent to np.dot(x, y)
x = np.array([[1, 2, 3], [4, 9, 6]])
y = np.array([[2, 1, 5, 9], [4, 12, 1,6], [5, 3, 10, 3]])
x.dot(y)

array([[ 25,  34,  37,  30],
       [ 74, 130,  89, 108]])

In [128]:
#the @ symbol also works as an infix operator to perform matrix multiplication
x @ np.ones(3)

array([  6.,  19.])

In [130]:
#Pseudorandom number generator
#This is how you get a 4x4 array of samples from standard normal distribution using normal:
sample = np.random.normal(size = (4,4))
sample

array([[-0.84343294, -1.4095342 ,  0.88953167,  0.62739246],
       [-0.96944668, -0.16624471,  1.02202889,  0.18730987],
       [-1.16841696, -1.64488436,  0.43532711, -0.06012125],
       [ 0.61570204,  0.34987347, -0.49756572, -1.07726218]])

In [131]:
#We can change Numpy's random number generation seed 
np.random.seed(1323)

In [132]:
#The data generation functions in numpy.random use a global random seed. To avoid global random seed, you can use numpy.random.RandomState to create a generator isolated from others
rng = np.random.RandomState(1234)
rng. randn(10)

array([ 0.47143516, -1.19097569,  1.43270697, -0.3126519 , -0.72058873,
        0.88716294,  0.85958841, -0.6365235 ,  0.01569637, -2.24268495])

In [133]:
#perform random step with Numpy
nsteps = 1000
draws = np.random.randint(0, 2, size = nsteps)
steps = np.where(draws > 0, 1, -1)
walk = steps.cumsum()

In [134]:
walk.min()

-38

In [135]:
walk.max()

38

In [138]:
#To determine the "first crossing time", meaning the step which the random walk reachesa particular value, 
# we use argmax to return the first index of the maximum value in the boolean array (In boolean array, True is max value)
(walk > 10).argmax()

90

In [142]:
#we can also perform multiple random walks at the same time
np.random.seed(124)
nwalks = 5000
nsteps = 1000
draws = np.random.randint(0, 2, size=(nwalks, nsteps))
steps = np.where(draws>0, 1, -1)
walks = steps.cumsum(1)
walks

array([[ -1,  -2,  -1, ...,  30,  29,  28],
       [ -1,   0,   1, ...,  30,  29,  28],
       [ -1,  -2,  -1, ...,   8,   9,   8],
       ..., 
       [ -1,  -2,  -3, ...,  12,  11,  10],
       [ -1,   0,  -1, ..., -12, -13, -14],
       [  1,   2,   3, ..., -28, -27, -26]], dtype=int32)

In [145]:
walks.max()

113

In [144]:
walks.min()

-144

In [146]:
#we can do some basic statistic to compare each walk
#In this example, we check how many walks manages to walk more than 50 steps in either directions
hits50 = (np.abs(walks>=50)).any(1)
hits50

array([False, False, False, ..., False, False, False], dtype=bool)

In [147]:
np.sum(hits50)

540

In [149]:
#We can also calculate average first crossing time among the rows that hits 50
crossing_times = (np.abs(walks>=50)).argmax(1)
np.mean(crossing_times)

73.533199999999994