### Numpy 101
https://www.machinelearningplus.com/101-numpy-exercises-python/

#### import numpy and see the version

In [1]:
import numpy as np

print(np.__version__)

1.13.3


#### create 1D array

In [2]:
np.arange(10)

a = np.arange(10)
b = a.reshape(1,10)
print(a, a.shape)
print(b, b.shape)

[0 1 2 3 4 5 6 7 8 9] (10,)
[[0 1 2 3 4 5 6 7 8 9]] (1, 10)


#### create a boolean array

In [3]:
np.full((3,3), True)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]], dtype=bool)

#### extract items that satisfy a given condition from 1D array

In [4]:
arr = np.arange(10)
arr[arr%2 != 0]

array([1, 3, 5, 7, 9])

#### replace items that satisfy a condition with another value in numpy array

In [5]:
arr = np.arange(10)
arr[arr%2 != 0] = -1
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

#### replace items that satisfy a condition without affecting the original array

In [6]:
arr = np.arange(10)
out = np.where(arr%2 != 0, -1, arr)
print(arr)
out

[0 1 2 3 4 5 6 7 8 9]


array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

#### reshape an array

In [7]:
arr = np.arange(10)
arr.reshape(2,-1)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

#### stack two arrays vertically

In [8]:
a = np.arange(10).reshape(2,-1)
b = np.repeat(1,10).reshape(2,-1)

print(np.vstack((a,b)))
print(np.concatenate((a,b), axis=0))
print(np.r_[a,b])

[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]
[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]
[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]


#### stack two arrays horizontally

In [9]:
a = np.arange(10).reshape(2,-1)
b = np.repeat(1,10).reshape(2,-1)

print(np.hstack((a,b)))
print(np.concatenate((a,b), axis=1))
print(np.c_[a,b])

[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]
[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]
[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]


#### generate custom sequences in numpy without hardcoding

In [10]:
a = np.array([1,2,3])

np.r_[np.repeat(a,3), np.tile(a,3)]

array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

#### get the common items between two python numpy arrays

In [11]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.intersect1d(a,b)

array([2, 4])

#### remove from one array those items that exist in another

In [12]:
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])

print(np.intersect1d(a,np.setxor1d(a,b)))
print(np.setdiff1d(a,b))

[1 2 3 4]
[1 2 3 4]


#### get the positions where elements of two arrays match

In [13]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.where(a==b)

(array([1, 3, 5, 7], dtype=int64),)

#### extract all numbers between a given range from a numpy array

In [14]:
a = np.arange(15)

a[(a>=5) & (a<=10)]

array([ 5,  6,  7,  8,  9, 10])

#### make a python function that handles scalars to work on numpy arrays

In [15]:
def maxx(x, y):
    """Return the larger of ``x`` and ``y``; ``x`` wins when they compare equal."""
    return x if x >= y else y

a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])

print(list(map(maxx,a,b)))
np.vectorize(maxx, otypes=[float])(a,b)


[6, 7, 9, 8, 9, 7, 5]


array([ 6.,  7.,  9.,  8.,  9.,  7.,  5.])

#### swap two columns in a 2d numpy array

In [16]:
arr = np.arange(9).reshape(3,3)
print(arr)

arr[:, [1,0,2]]

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

#### swap two rows in a 2d numpy array

In [17]:
arr = np.arange(9).reshape(3,3)
print(arr)

arr[[1,0,2], :]

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

#### reverse the rows of a 2D array

In [18]:
arr = np.arange(9).reshape(3,3)

arr[::-1,:]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

#### reverse the columns of a 2D array

In [None]:
arr = np.arange(9).reshape(3,3)

arr[:,::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

#### create a 2D array containing random floats between 5 and 10

In [None]:
np.random.uniform(5,10,(5,3))

#### print only 3 decimal places in python numpy array

In [None]:
rand_arr = np.random.random((5,3))

np.set_printoptions(3)
print(rand_arr)

#### pretty print a numpy array by suppressing the scientific notation (like 1e10)

In [None]:
np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3
print(rand_arr)

np.set_printoptions(suppress=True, precision=6)

#### limit the number of items printed in output of numpy array

In [None]:
a = np.arange(15)

np.set_printoptions(threshold=6)
print(a)

#### print the full numpy array without truncating

In [None]:
import sys

# Start from a truncated display so the effect of lifting the limit is visible.
np.set_printoptions(threshold=6)
a = np.arange(15)

# threshold=np.nan is rejected by current NumPy releases; sys.maxsize is the
# supported way to ask for an untruncated representation.
np.set_printoptions(threshold=sys.maxsize)
a

#### import a dataset with numbers and texts keeping the text intact in python numpy

In [None]:
# url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
iris[:3]

#### extract a particular column from 1D array of tuples

In [None]:
iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)

species = np.array([row[4] for row in iris_1d])
species[:5]

#### convert a 1d array of tuples to a 2d numpy array

In [None]:
iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)
print(iris_1d.shape)
print(iris_1d.ndim)
print(type(iris_1d))
print(iris_1d[:4])
print(type(iris_1d[0]))

iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
print(iris_2d.shape)
print(iris_2d.ndim)
print(type(iris_2d))
iris_2d[:4]

In [None]:
# Note: the type of each element depends on the dtype used to build the array.
# https://stackoverflow.com/questions/25247190/numpy-void-type-how-to-use-it

# Structured dtype: the np.int / np.str aliases were removed in NumPy 1.24, so
# use the builtin int. The text field gets an explicit width ('U8') because a
# bare `str` field has zero width and would not keep the characters.
dt = np.dtype([('num', int), ('name', 'U8')])
arr1 = np.array([(1, '4'), (5, '8')], dtype=dt)
print(type(arr1[0]))  # each element is a structured scalar (numpy.void)
print(arr1.shape)     # (2,): one record per tuple

# No dtype given: the tuples are treated as rows of an ordinary 2-D array.
arr2 = np.array([(1, '4'), (5, '8')], dtype=None)
print(type(arr2[0]))  # each element is a row (numpy.ndarray)
print(arr2.shape)     # (2, 2)

iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)
print(iris_1d[:5])
print(iris_1d.shape) # 150*1
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
print(iris_2d[:5])
print(iris_2d.shape) # 150*5

#### compute the mean, median, standard deviation of a numpy array

In [None]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

mu, med, sd = np.mean(sepallength), np.median(sepallength), np.std(sepallength)
print(mu, med, sd)

#### normalize an array so the values range exactly between 0 and 1

In [None]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

smax, smin = np.max(sepallength), np.min(sepallength)
s = (sepallength - smin) / (smax - smin)
s

#### compute the softmax score

In [None]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

def softmax(x):
    """Compute the softmax of ``x`` along axis 0.

    Args:
        x: array-like of numbers. For a 2-D input each column is normalised
            independently, because the reduction runs over axis 0.

    Returns:
        An array with the same shape as ``x`` whose entries along axis 0 are
        positive and sum to 1.
    """
    x = np.asarray(x)
    if x.size == 0:
        # Nothing to normalise; np.max would raise on an empty reduction.
        return np.exp(x)
    # Subtracting the maximum does not change the result mathematically, but
    # it keeps np.exp from overflowing to inf (and the ratio from becoming nan)
    # when the inputs are large.
    exps = np.exp(x - np.max(x, axis=0))
    return exps / np.sum(exps, axis=0)

softmax(sepallength)

#### find the percentile scores of a numpy array

In [None]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

np.percentile(sepallength,q=[5, 95])

#### insert values at random positions in an array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

i, j = np.where(iris_2d)
iris_2d[np.random.choice((i), 20), np.random.choice((j), 20)] = np.nan
iris_2d[:5]

In [None]:
# np.nan != np.nan: NaN is a float value that, per IEEE 754, compares unequal to everything, including itself. Use np.isnan() to test for it.
# https://stackoverflow.com/questions/13003202/python-nan-nan?noredirect=1&lq=1

#### find the position of missing values in numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

print("Number of missing values:", np.isnan(iris_2d[:, 0]).sum())
print("Position of missing values:", np.where(np.isnan(iris_2d[:, 0])))

#### filter a numpy array based on two or more conditions

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])

iris_2d[(iris_2d[:,2]>1.5) & (iris_2d[:,0]<5.0)]

#### drop rows that contain a missing value from a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

no_nan_in_row = np.array([~np.any(np.isnan(row)) for row in iris_2d])
iris_2d[no_nan_in_row][:5]

#### find the correlation between two columns of a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])

np.corrcoef(iris_2d[:,0], iris_2d[:,2])

#### find if a given array has any null values

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

print(np.any(np.isnan(iris_2d)))
print(np.isnan(iris_2d).any())

#### replace all missing values with 0 in a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

print(iris_2d[-5:])
iris_2d[np.where(np.isnan(iris_2d))] = 0
iris_2d[-5:]

#### find the count of unique values in a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

species = iris_2d[:,4]
np.unique(species, return_counts=True)

# numpy.unique(ar, return_index=False, return_inverse=False, return_counts=False)
# Find the unique elements of an array.
# Returns the sorted unique elements of an array.

#### convert a numeric to a categorical (text) array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

# iris_2d[:,0:4] = iris_2d[:,0:4].astype(float)
# print(iris_2d[:,2].max())
# print(iris_2d[:,2].min())
# print(iris_2d[:,1].dtype)
bins = np.digitize(iris_2d[:,2].astype(float), [0, 3, 5, 7])
label_map = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petal_length_cat = [label_map[x] for x in bins]
petal_length_cat[:4]

#### create a new column from existing columns of a numpy array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='float', usecols=[0,1,2,3])

a = np.pi * iris_2d[:,2] * np.sqrt(iris_2d[:,0]) / 3
print(iris_2d.shape)
print(a.shape)
np.column_stack((iris_2d, a))[:5]

#### do probabilistic sampling in numpy

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

print(iris_2d.shape)
print(np.unique(iris_2d[:,4], return_counts=True))

# 1
species = iris_2d[:, 4]
a = np.unique(iris_2d[:,4])
np.random.seed(100)
species_out = np.random.choice(a, 150, p=[0.5, 0.25, 0.25])

# 2
prob = np.r_[np.linspace(0, 0.500, num=50), np.linspace(0.501, 0.751, num=50), np.linspace(0.751, 1.000, num=50)]
# numpy.random.random(size=None)  Return random floats in the half-open interval [0.0, 1.0).
# numpy.searchsorted(a, v, side='left', sorter=None) Find indices where elements should be inserted to maintain order.
index = np.searchsorted(prob, np.random.random(150)) # Has duplicated value
species_out = species[index]
print(np.unique(species_out, return_counts=True))

#### get the second largest value of an array when grouped by another array

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

print(iris_2d[iris_2d[:,4]==b'Iris-setosa'][:,2].astype(float))
np.unique(iris_2d[iris_2d[:,4]==b'Iris-setosa'][:,2].astype(float))[-2]

# Note: numpy.unique(ar, return_index=False, return_inverse=False, return_counts=False)
# Find the unique elements of an array. Returns the sorted unique elements of an array(The sorted unique values).

#### sort a 2D array by a column

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

iris_2d[np.argsort(iris_2d[:,0], axis=0)][:20]

# numpy.argsort(a, axis=-1, kind='quicksort', order=None)
# Returns the indices that would sort an array.

#### find the most frequent value in a numpy array

In [None]:
# Load every column of the file as raw objects (dtype='object').
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

# np.unique returns the distinct values in ascending order together with the
# number of times each one occurs.
# (Original author's note: in their run, b'1.5' had the highest count.)
value, count = np.unique(iris_2d[:,2], return_counts=True)
print(value, count)
# argsort orders the counts ascending, so its last index points at the largest count.
print(value[np.argsort(count)[-1]])

#### find the position of the first occurrence of a value greater than a given value

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

# Convert column 3 to float once, from this cell's own iris_2d, so the cell
# runs on its own and every approach below looks at the same data.
col = iris_2d[:, 3].astype(float)

# 1: binary search.
# NOTE(review): np.searchsorted only gives a meaningful index when the input is
# already sorted ascending - confirm that holds for this column before relying
# on this value; approaches 2 and 3 make no such assumption.
index = np.searchsorted(col, 1.0, side='right')
print(index)

# 2: argmax of a boolean mask returns the position of the first True.
exceeds = col > 1.0
print(exceeds)
print(exceeds.shape)
print(np.argmax(exceeds, axis=0))

# 3: argwhere lists every matching position; take the first one.
np.argwhere(exceeds)[0]

#### replace all values greater than a given value to a given cutoff

In [None]:
np.random.seed(100)
a = np.random.uniform(1, 50, 20)

print(a)
# 1: boolean-mask assignment. Work on a copy so `a` stays untouched and the
# two approaches below are applied to the raw values, not to data that has
# already been clipped.
clipped = a.copy()
clipped[clipped > 30] = 30
clipped[clipped < 10] = 10
print(clipped)

# 2: np.clip does both bounds in one call and returns a new array.
print(np.clip(a, a_min=10, a_max=30))

# 3: nested np.where - lower bound first, then upper bound, else keep the value.
print(np.where(a < 10, 10, np.where(a > 30, 30, a)))

#### get the positions of top n values from a numpy array

In [None]:
np.random.seed(100)
a = np.random.uniform(1,50, 20)

print(a)
np.sort(a)[-5:]
print(np.argsort(a)[-5:])
print(a[np.argsort(a)][-5:])

#### compute the row wise counts of all possible values in an array

In [None]:
np.random.seed(100)
arr = np.random.randint(1,11,size=(6, 10))
print(arr)

# Every value randint(1, 11) can produce (the upper bound is exclusive).
possible_values = np.arange(1, 11)

# 1: compare each element with each possible value via broadcasting
# (rows x cols x values), then count the matches within every row. The result
# has one column per possible value, regardless of how many columns arr has.
counts = (arr[:, :, None] == possible_values).sum(axis=1)
print(counts)

# 2: np.unique per row, then look each possible value up in that row's
# value -> count mapping, defaulting to 0 when the value is absent.
num_counts_array = [np.unique(row, return_counts=True) for row in arr]
print(num_counts_array)
lookups = [dict(zip(vals.tolist(), cnts.tolist())) for vals, cnts in num_counts_array]
[[lookup.get(v, 0) for v in possible_values.tolist()] for lookup in lookups]

#### convert an array of arrays into a flat 1d array

In [None]:
arr1 = np.arange(3)
arr2 = np.arange(3,7)
arr3 = np.arange(7,10)

# The pieces have different lengths (3, 4, 3), so they can only be stored as a
# 1-D array of array objects. Current NumPy raises for such ragged input unless
# dtype=object is requested explicitly.
array_of_arrays = np.array([arr1, arr2, arr3], dtype=object)
print(array_of_arrays)

# 1: walk every inner array and collect its items.
print(np.array([a for arr in array_of_arrays for a in arr]))

# 2: let NumPy join the inner arrays end to end.
print(np.concatenate(array_of_arrays))

#### generate one-hot encodings for an array in numpy

In [None]:
np.random.seed(101) 
arr = np.random.randint(1,4, size=6)
print(arr)

value = np.max(arr)
np.eye(value)[arr-1]

#### create row numbers grouped by a categorical variable

In [None]:
species = np.genfromtxt('iris.data.txt', delimiter=',', dtype='str', usecols=4)
np.random.seed(100)
species_small = np.sort(np.random.choice(species, size=20))
print(species_small)

# 1
value, count = np.unique(species_small, return_counts=True)
print(np.concatenate([np.arange(0, i) for i in count]))

# 2
print([i for val in np.unique(species_small) for i, grp in enumerate(species_small[species_small==val])])

#### create group ids based on a given categorical variable

In [None]:
species = np.genfromtxt('iris.data.txt', delimiter=',', dtype='str', usecols=4)
np.random.seed(100)
species_small = np.sort(np.random.choice(species, size=20))
print(species_small)

# return_inverse yields, for every element, the index of its value among the
# sorted unique values - which is the group id. Each id stays at its element's
# own position, so the result is also correct for input that is not sorted,
# and no Python-level nested loop is needed.
output = np.unique(species_small, return_inverse=True)[1].tolist()
print(output)

#### rank items in an array using numpy

In [None]:
np.random.seed(10)
a = np.random.randint(20, size=10)
print(a)

print(np.argsort(a))
print(np.argsort(np.argsort(a)))

#### rank items in a multidimensional array using numpy

In [None]:
np.random.seed(10)
a = np.random.randint(20, size=[2,5])
print(a)

print(np.ravel(a).argsort().argsort().reshape(a.shape))

#### find the maximum value in each row of a numpy array 2d

In [None]:
np.random.seed(100)
a = np.random.randint(1,10, [5,3])

# 1
print([row.max() for row in a])

# 2
print(np.amax(a, axis=1))

#### compute the min-by-max for each row for a numpy array 2d

In [None]:
np.random.seed(100)
a = np.random.randint(1,10, [5,3])

print(np.amin(a, axis=1)/np.amax(a, axis=1))

#### find the duplicate records in a numpy array

In [None]:
np.random.seed(100)
a = np.random.randint(0, 5, 10)
print('Array: ', a)

out = np.full(a.shape[0], True)
out[np.unique(a, return_index=True)[1]] = False
print(out)

#### find the grouped mean in numpy

In [None]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

numerical_col = iris_2d[:, 1].astype('float')
species_col = iris_2d[:, 4]
[[item, numerical_col[species_col==item].mean()] for item in np.unique(species_col)]

#### convert a PIL image to numpy array

In [None]:
from PIL import Image
pic = Image.open("Denali_Mt_McKinley.jpg")
pic = pic.resize([10,10])

pixels = np.array(pic)
print(pixels)

#### drop all missing values from a numpy array

In [None]:
arr = np.array([1,2,3,np.nan,5,6,7,np.nan])

arr[~np.isnan(arr)]

#### compute the euclidean distance between two arrays

In [None]:
a = np.array([1,2,3,4,5])
b = np.array([4,5,6,7,8])

print(np.sqrt(np.sum((np.power((a-b),2)))))
print(np.linalg.norm(a-b))

#### find all the local maximum (or peaks) in a 1d array

In [None]:
a = np.array([1, 3, 7, 1, 2, 6, 0, 1])

# 1: slide a (left, mid, right) window over the interior points and flag the
# ones that are strictly greater than both neighbours.
b = [(mid > left) & (mid > right) for left, mid, right in zip(a[:-2], a[1:-1], a[2:])]
# +1 because the first interior point sits at index 1 of `a`.
print(np.where(np.array(b))[0] + 1)

# 2: the sign of the slope flips from +1 to -1 at a peak, so the difference of
# consecutive slope signs is -2 there.
slope_signs = np.sign(np.diff(a))
doublediff = np.diff(slope_signs)
print(np.where(doublediff == -2)[0] + 1)

#### subtract a 1d array from a 2d array, where each item of 1d array subtracts from respective row

In [None]:
a_2d = np.array([[3,3,3],[4,4,4],[5,5,5]])
b_1d = np.array([1,2,3])

a_2d - b_1d.reshape(3, 1)

In [None]:
# difference between reshape(n,) and reshape(n, 1)
arr = np.random.uniform(1,10, size=(3,3))
print(arr.reshape(9,))
print(arr.reshape(9,1))

#### find the index of n'th repetition of an item in an array

In [None]:
x = np.array([1, 2, 1, 1, 3, 4, 3, 1, 1, 2, 1, 1, 2])

n = 5
np.where(x==1)[0][n-1]

#### convert numpy's datetime64 object to datetime's datetime object

In [None]:
dt64 = np.datetime64('2018-02-25 22:10:10')

# 1
print(dt64.tolist())

# 2
from datetime import datetime
print(dt64.astype(datetime))

####  compute the moving average of a numpy array

In [None]:
np.random.seed(100)
Z = np.random.randint(10, size=10)

def moving_average(a, n=3):
    """Return the mean of every run of ``n`` consecutive items of ``a``.

    ``a`` must provide ``.cumsum()`` (e.g. a NumPy array); it is not modified.
    The result has ``len(a) - n + 1`` entries for a 1-D input.
    """
    window_sums = a.cumsum()
    # Subtracting the running total from n positions earlier turns each
    # cumulative sum into the sum of just the last n items.
    earlier_totals = window_sums[:-n]
    window_sums[n:] = window_sums[n:] - earlier_totals
    # The first n - 1 entries cover fewer than n items, so drop them.
    full_windows = window_sums[n - 1:]
    return full_windows / n

moving_average(Z)

#### create a numpy array sequence given only the starting point, length and the step

In [None]:
def array_gen(start, length, step):
    """Build the sequence ``start, start + step, start + 2*step, ...``.

    Args:
        start: first value of the sequence.
        length: number of items to generate.
        step: difference between consecutive items.

    Returns:
        A 1-D array with exactly ``length`` items.
    """
    # Scaling an integer index guarantees the element count. Passing a computed
    # stop to np.arange can yield one item too many with a float step because
    # of rounding in ``start + step * length``.
    return start + step * np.arange(length)

array_gen(5, 10, 3)

#### fill in missing dates in an irregular series of numpy dates

In [None]:
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-25'), 2)
print(dates)

# Expand every gap [date, date + gap) into consecutive days. np.concatenate
# joins pieces of any length, so this also works when the gaps between dates
# are not all the same size (stacking them into one array and reshaping only
# works when every piece has equal length).
filled_in = np.concatenate([np.arange(date, (date+d)) for date, d in zip(dates, np.diff(dates))])
# The half-open ranges above stop just before the final date, so append it.
np.hstack([filled_in, dates[-1]])

#### create strides from a given 1D array

In [None]:
arr = np.arange(15)
print(arr)

length = 5
strides = 3
# The last window that still fits starts at len(arr) - length, so the stop of
# the range must be one past that index; otherwise a final window landing
# exactly on that start would be dropped.
windows = [arr[i:i + length] for i in range(0, len(arr) - length + 1, strides)]
windows