### Numpy 101
https://www.machinelearningplus.com/101-numpy-exercises-python/

#### import numpy and see the version

In [None]:
import numpy as np

print(np.__version__)

#### create 1D array

In [297]:
np.arange(10)

a = np.arange(10)
b = a.reshape(1,10)
print(a, a.shape)
print(b, b.shape)

[0 1 2 3 4 5 6 7 8 9] (10,)
[[0 1 2 3 4 5 6 7 8 9]] (1, 10)


#### create a boolean array

In [None]:
np.full((3,3), True)

#### extract items that satisfy a given condition from 1D array

In [None]:
arr = np.arange(10)
arr[arr%2 != 0]

#### replace items that satisfy a condition with another value in numpy array

In [None]:
arr = np.arange(10)
arr[arr%2 != 0] = -1
arr

#### replace items that satisfy a condition without affecting the original array

In [None]:
arr = np.arange(10)
out = np.where(arr%2 != 0, -1, arr)
print(arr)
out

#### reshape an array

In [None]:
arr = np.arange(10)
arr.reshape(2,-1)

#### stack two arrays vertically

In [None]:
a = np.arange(10).reshape(2,-1)
b = np.repeat(1,10).reshape(2,-1)

print(np.vstack((a,b)))
print(np.concatenate((a,b), axis=0))
print(np.r_[a,b])

#### stack two arrays horizontally

In [None]:
a = np.arange(10).reshape(2,-1)
b = np.repeat(1,10).reshape(2,-1)

print(np.hstack((a,b)))
print(np.concatenate((a,b), axis=1))
print(np.c_[a,b])

#### generate custom sequences in numpy without hardcoding

In [None]:
a = np.array([1,2,3])

np.r_[np.repeat(a,3), np.tile(a,3)]

#### get the common items between two python numpy arrays

In [None]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.intersect1d(a,b)

#### remove from one array those items that exist in another

In [None]:
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])

print(np.intersect1d(a,np.setxor1d(a,b)))
print(np.setdiff1d(a,b))

#### get the positions where elements of two arrays match

In [None]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.where(a==b)

#### extract all numbers between a given range from a numpy array

In [None]:
a = np.arange(15)

a[(a>=5) & (a<=10)]

#### make a python function that handles scalars to work on numpy arrays

In [None]:
def maxx(x, y):
    """Get the maximum of two items"""
    if x >= y:
        return x
    else:
        return y

a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])

print(list(map(maxx,a,b)))
np.vectorize(maxx, otypes=[float])(a,b)


#### swap two columns in a 2d numpy array

In [None]:
arr = np.arange(9).reshape(3,3)
print(arr)

arr[:, [1,0,2]]

#### swap two rows in a 2d numpy array

In [None]:
arr = np.arange(9).reshape(3,3)
print(arr)

arr[[1,0,2], :]

#### reverse the rows of a 2D array

In [None]:
arr = np.arange(9).reshape(3,3)

arr[::-1,:]

#### reverse the columns of a 2D array

In [None]:
arr = np.arange(9).reshape(3,3)

arr[:,::-1]

#### create a 2D array containing random floats between 5 and 10

In [None]:
np.random.uniform(5,10,(5,3))

#### print only 3 decimal places in python numpy array

In [None]:
rand_arr = np.random.random((5,3))

np.set_printoptions(3)
print(rand_arr)

#### pretty print a numpy array by suppressing the scientific notation (like 1e10)

In [None]:
np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3
print(rand_arr)

np.set_printoptions(suppress=True, precision=6)

#### limit the number of items printed in output of numpy array

In [None]:
a = np.arange(15)

np.set_printoptions(threshold=6)
print(a)

#### print the full numpy array without truncating

In [None]:
np.set_printoptions(threshold=6)
a = np.arange(15)

np.set_printoptions(threshold=np.nan)
a

#### import a dataset with numbers and texts keeping the text intact in python numpy

In [None]:
# url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
iris[:3]

#### extract a particular column from 1D array of tuples

In [None]:
iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)

species = np.array([row[4] for row in iris_1d])
species[:5]

#### convert a 1d array of tuples to a 2d numpy array

In [None]:
iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)
print(iris_1d.shape)
print(iris_1d.ndim)
print(type(iris_1d))
print(iris_1d[:4])
print(type(iris_1d[0]))

iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
print(iris_2d.shape)
print(iris_2d.ndim)
print(type(iris_2d))
iris_2d[:4]

In [None]:
# Note: The different processing for different dtype
# https://stackoverflow.com/questions/25247190/numpy-void-type-how-to-use-it
dt = np.dtype([('num',np.int),('name',np.str)])
arr1 = np.array([(1,'4'),(5,'8')], dtype=dt)
print(type(arr1[0]))
print(arr1.shape)

arr2 = np.array([(1,'4'),(5,'8')], dtype=None)
print(type(arr2[0]))
print(arr2.shape)

iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)
print(iris_1d[:5])
print(iris_1d.shape) # 150*1
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
print(iris_2d[:5])
print(iris_2d.shape) # 150*5

#### compute the mean, median, standard deviation of a numpy array

In [None]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

mu, med, sd = np.mean(sepallength), np.median(sepallength), np.std(sepallength)
print(mu, med, sd)

#### normalize an array so the values range exactly between 0 and 1

In [None]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

smax, smin = np.max(sepallength), np.min(sepallength)
s = (sepallength - smin) / (smax - smin)
s

#### compute the softmax score

In [None]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

def softmax(x):
    return np.exp(x)/np.sum(np.exp(x), axis=0)

softmax(sepallength)

#### find the percentile scores of a numpy array

In [None]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

np.percentile(sepallength,q=[5, 95])

#### insert values at random positions in an array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

i, j = np.where(iris_2d)
iris_2d[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan
iris_2d[:5]

In [None]:
# np.nan != np.nan, because np.nan is a float!!!
# https://stackoverflow.com/questions/13003202/python-nan-nan?noredirect=1&lq=1

#### find the position of missing values in numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

print("Number of missing values:", np.isnan(iris_2d[:, 0]).sum())
print("Position of missing values:", np.where(np.isnan(iris_2d[:, 0])))

#### filter a numpy array based on two or more conditions

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])

iris_2d[(iris_2d[:,2]>1.5) & (iris_2d[:,0]<5.0)]

#### drop rows that contain a missing value from a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

no_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
iris_2d[no_nan_in_row][:5]

#### find the correlation between two columns of a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])

np.corrcoef(iris_2d[:,0], iris_2d[:,2])

#### find if a given array has any null values

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

print(np.any(np.isnan(iris_2d)))
print(np.isnan(iris_2d).any())

#### replace all missing values with 0 in a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

print(iris_2d[-5:])
iris_2d[np.where(np.isnan(iris_2d))] = 0
iris_2d[-5:]

#### find the count of unique values in a numpy array

In [265]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

species = iris_2d[:,4]
np.unique(species, return_counts=True)

# numpy.unique(ar, return_index=False, return_inverse=False, return_counts=False)
# Find the unique elements of an array.
# Returns the sorted unique elements of an array.

(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'], dtype=object),
 array([50, 50, 50], dtype=int64))

#### convert a numeric to a categorical (text) array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

# iris_2d[:,0:4] = iris_2d[:,0:4].astype(float)
# print(iris_2d[:,2].max())
# print(iris_2d[:,2].min())
# print(iris_2d[:,1].dtype)
bins = np.digitize(iris_2d[:,2].astype(float), [0, 3, 5, 7])
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[x] for x in bins]
petal_length_cat[:4]

#### create a new column from existing columns of a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])

a = np.pi * iris_2d[:,2] * np.sqrt(iris_2d[:,0]) / 3
print(iris_2d.shape)
print(a.shape)
np.column_stack((iris_2d, a))[:5]

#### do probabilistic sampling in numpy

In [215]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

print(iris_2d.shape)
print(np.unique(iris_2d[:,4], return_counts=True))

# 1
species = iris_2d[:, 4]
a = np.unique(iris_2d[:,4])
np.random.seed(100)
species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])

# 2
prob = np.r_[np.linspace(0, 0.500, num=50), np.linspace(0.501, 0.751, num=50), np.linspace(0.751, 1.000, num=50)]
# numpy.random.random(size=None)  Return random floats in the half-open interval [0.0, 1.0).
# numpy.searchsorted(a, v, side=’left’, sorter=None) Find indices where elements should be inserted to maintain order.
index = np.searchsorted(prob, np.random.random(150)) # Has duplicated value
species_out = species[index]
print(np.unique(species_out, return_counts=True))

(150, 5)
(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'], dtype=object), array([50, 50, 50], dtype=int64))
(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'], dtype=object), array([75, 35, 40], dtype=int64))


#### get the second largest value of an array when grouped by another array

In [225]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

print(iris_2d[iris_2d[:,4]==b'Iris-setosa'][:,2].astype(float))
np.unique(iris_2d[iris_2d[:,4]==b'Iris-setosa'][:,2].astype(float))[-2]

# Note: numpy.unique(ar, return_index=False, return_inverse=False, return_counts=False)
# Find the unique elements of an array. Returns the sorted unique elements of an array(The sorted unique values).

[ 1.4  1.4  1.3  1.5  1.4  1.7  1.4  1.5  1.4  1.5  1.5  1.6  1.4  1.1  1.2
  1.5  1.3  1.4  1.7  1.5  1.7  1.5  1.   1.7  1.9  1.6  1.6  1.5  1.4  1.6
  1.6  1.5  1.5  1.4  1.5  1.2  1.3  1.5  1.3  1.5  1.3  1.3  1.3  1.6  1.9
  1.4  1.6  1.4  1.5  1.4]


1.7

#### sort a 2D array by a column

In [259]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

iris_2d[np.argsort(iris_2d[:,0], axis=0)][:20]

# numpy.argsort(a, axis=-1, kind=’quicksort’, order=None)
# Returns the indices that would sort an array.

(150,)

#### find the most frequent value in a numpy array

In [275]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

value, count = np.unique(iris_2d[:,2], return_counts=True) # 返回升序排列的数列值和数列中每个值的个数，返回值中可以看出值为b'1.5'的个数最多
print(value, count)
print(value[np.argsort(count)[-1]])

[b'1.0' b'1.1' b'1.2' b'1.3' b'1.4' b'1.5' b'1.6' b'1.7' b'1.9' b'3.0'
 b'3.3' b'3.5' b'3.6' b'3.7' b'3.8' b'3.9' b'4.0' b'4.1' b'4.2' b'4.3'
 b'4.4' b'4.5' b'4.6' b'4.7' b'4.8' b'4.9' b'5.0' b'5.1' b'5.2' b'5.3'
 b'5.4' b'5.5' b'5.6' b'5.7' b'5.8' b'5.9' b'6.0' b'6.1' b'6.3' b'6.4'
 b'6.6' b'6.7' b'6.9'] [ 1  1  2  7 12 14  7  4  2  1  2  2  1  1  1  3  5  3  4  2  4  8  3  5  4
  5  4  8  2  2  2  3  6  3  3  2  2  3  1  1  1  2  1]


b'1.5'

#### find the position of the first occurrence of a value greater than a given value

In [307]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

# 1
index = np.searchsorted(iris_2d[:,3].astype(float), 1.0, side='right')
print(index)

# 2
print(iris[:, 3].astype('float') > 1.0)
print((iris[:, 3].astype('float') > 1.0).shape)
np.argmax(iris[:, 3].astype('float') > 1.0, axis=0)

# 3
np.argwhere(iris[:, 3].astype(float) > 1.0)[0]

50
[False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False False False False False False False False False False False
 False False  True  True  True  True  True  True  True False  True  True
 False  True False  True  True  True  True False  True  True  True  True
  True  True  True  True  True  True  True False  True False  True  True
  True  True  True  True  True  True  True  True  True False  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True  True  True  True  True  True  True
  True  True  True  True  True  True]
(150,)


array([50], dtype=int64)

#### replace all values greater than a given value to a given cutoff

In [303]:
np.random.seed(100)
a = np.random.uniform(1, 50, 20)

print(a)
# 1
a[a>30] = 30
a[a<10] = 10
print(a)

# 2
print(np.clip(a, a_min=10, a_max=30))

# 3
print(np.where(a<10, 10, np.where(a>30, 30, a)))

[ 27.626842  14.6401    21.801362  42.39403    1.231224   6.956887
  33.866705  41.466785   7.698623  29.179573  44.674776  11.250904
  10.081083   6.310468  11.765177  48.952565  40.772474   9.42511
  40.995013  14.429614]
[ 27.626842  14.6401    21.801362  30.        10.        10.        30.
  30.        10.        29.179573  30.        11.250904  10.081083  10.
  11.765177  30.        30.        10.        30.        14.429614]
[ 27.626842  14.6401    21.801362  30.        10.        10.        30.
  30.        10.        29.179573  30.        11.250904  10.081083  10.
  11.765177  30.        30.        10.        30.        14.429614]
[ 27.626842  14.6401    21.801362  30.        10.        10.        30.
  30.        10.        29.179573  30.        11.250904  10.081083  10.
  11.765177  30.        30.        10.        30.        14.429614]


#### compute the row wise counts of all possible values in an array

In [304]:
np.random.seed(100)
arr = np.random.randint(1,11,size=(6, 10))
print(arr)



[[ 9  9  4  8  8  1  5  3  6  3]
 [ 3  3  2  1  9  5  1 10  7  3]
 [ 5  2  6  4  5  5  4  8  2  2]
 [ 8  8  1  3 10 10  4  3  6  9]
 [ 2  1  8  7  3  1  9  3  6  2]
 [ 9  2  6  5  3  9  4  6  1 10]]
