### Numpy 101
https://www.machinelearningplus.com/101-numpy-exercises-python/

#### import numpy and see the version

In [1]:
import numpy as np

print(np.__version__)

1.13.3


#### create 1D array

In [2]:
np.arange(10)

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

#### create a boolean array

In [3]:
np.full((3,3), True)

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]], dtype=bool)

#### extract items that satisfy a given condition from 1D array

In [4]:
arr = np.arange(10)
arr[arr%2 != 0]

array([1, 3, 5, 7, 9])

#### replace items that satisfy a condition with another value in numpy array

In [5]:
arr = np.arange(10)
arr[arr%2 != 0] = -1
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

#### replace items that satisfy a condition without affecting the original array

In [6]:
arr = np.arange(10)
out = np.where(arr%2 != 0, -1, arr)
print(arr)
out

[0 1 2 3 4 5 6 7 8 9]


array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

#### reshape an array

In [7]:
arr = np.arange(10)
arr.reshape(2,-1)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

#### stack two arrays vertically

In [8]:
a = np.arange(10).reshape(2,-1)
b = np.repeat(1,10).reshape(2,-1)

print(np.vstack((a,b)))
print(np.concatenate((a,b), axis=0))
print(np.r_[a,b])

[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]
[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]
[[0 1 2 3 4]
 [5 6 7 8 9]
 [1 1 1 1 1]
 [1 1 1 1 1]]


#### stack two arrays horizontally

In [9]:
a = np.arange(10).reshape(2,-1)
b = np.repeat(1,10).reshape(2,-1)

print(np.hstack((a,b)))
print(np.concatenate((a,b), axis=1))
print(np.c_[a,b])

[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]
[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]
[[0 1 2 3 4 1 1 1 1 1]
 [5 6 7 8 9 1 1 1 1 1]]


#### generate custom sequences in numpy without hardcoding

In [10]:
a = np.array([1,2,3])

np.r_[np.repeat(a,3), np.tile(a,3)]

array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3, 1, 2, 3])

#### get the common items between two python numpy arrays

In [11]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.intersect1d(a,b)

array([2, 4])

#### remove from one array those items that exist in another

In [12]:
a = np.array([1,2,3,4,5])
b = np.array([5,6,7,8,9])

print(np.intersect1d(a,np.setxor1d(a,b)))
print(np.setdiff1d(a,b))

[1 2 3 4]
[1 2 3 4]


#### get the positions where elements of two arrays match

In [13]:
a = np.array([1,2,3,2,3,4,3,4,5,6])
b = np.array([7,2,10,2,7,4,9,4,9,8])

np.where(a==b)

(array([1, 3, 5, 7], dtype=int64),)

#### extract all numbers between a given range from a numpy array

In [14]:
a = np.arange(15)

a[(a>=5) & (a<=10)]

array([ 5,  6,  7,  8,  9, 10])

#### make a python function that handles scalars to work on numpy arrays

In [15]:
def maxx(x, y):
    """Get the maximum of two items"""
    if x >= y:
        return x
    else:
        return y

a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])

print(list(map(maxx,a,b)))
np.vectorize(maxx, otypes=[float])(a,b)


[6, 7, 9, 8, 9, 7, 5]


array([ 6.,  7.,  9.,  8.,  9.,  7.,  5.])

#### swap two columns in a 2d numpy array

In [16]:
arr = np.arange(9).reshape(3,3)
print(arr)

arr[:, [1,0,2]]

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

#### swap two rows in a 2d numpy array

In [17]:
arr = np.arange(9).reshape(3,3)
print(arr)

arr[[1,0,2], :]

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

#### reverse the rows of a 2D array

In [18]:
arr = np.arange(9).reshape(3,3)

arr[::-1,:]

array([[6, 7, 8],
       [3, 4, 5],
       [0, 1, 2]])

#### reverse the columns of a 2D array

In [19]:
arr = np.arange(9).reshape(3,3)

arr[:,::-1]

array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

#### create a 2D array containing random floats between 5 and 10

In [20]:
np.random.uniform(5,10,(5,3))

array([[ 8.48024419,  6.1461809 ,  9.28665464],
       [ 7.4036986 ,  8.66272296,  8.68395803],
       [ 6.82659655,  8.42415717,  6.23769006],
       [ 7.1444372 ,  5.44986867,  8.08754185],
       [ 9.25117311,  8.02052598,  8.86475471]])

#### print only 3 decimal places in python numpy array

In [21]:
rand_arr = np.random.random((5,3))

np.set_printoptions(3)
print(rand_arr)

[[ 0.047  0.294  0.887]
 [ 0.143  0.211  0.048]
 [ 0.45   0.993  0.18 ]
 [ 0.455  0.707  0.832]
 [ 0.373  0.856  0.167]]


#### pretty print a numpy array by suppressing the scientific notation (like 1e10)

In [22]:
np.random.seed(100)
rand_arr = np.random.random([3,3])/1e3
print(rand_arr)

np.set_printoptions(suppress=True, precision=6)

[[  5.434e-04   2.784e-04   4.245e-04]
 [  8.448e-04   4.719e-06   1.216e-04]
 [  6.707e-04   8.259e-04   1.367e-04]]


#### limit the number of items printed in output of numpy array

In [23]:
a = np.arange(15)

np.set_printoptions(threshold=6)
print(a)

[ 0  1  2 ..., 12 13 14]


#### print the full numpy array without truncating

In [24]:
np.set_printoptions(threshold=6)
a = np.arange(15)

np.set_printoptions(threshold=np.nan)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

#### import a dataset with numbers and texts keeping the text intact in python numpy

In [25]:
# url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
iris[:3]

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa']], dtype=object)

#### extract a particular column from 1D array of tuples

In [26]:
iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)

species = np.array([row[4] for row in iris_1d])
species[:5]

array([b'Iris-setosa', b'Iris-setosa', b'Iris-setosa', b'Iris-setosa',
       b'Iris-setosa'],
      dtype='|S18')

#### convert a 1d array of tuples to a 2d numpy array

In [27]:
iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)
print(iris_1d.shape)
print(iris_1d.ndim)
print(type(iris_1d))
print(iris_1d[:4])
print(type(iris_1d[0]))

iris_2d = np.array([row.tolist()[:4] for row in iris_1d])
print(iris_2d.shape)
print(iris_2d.ndim)
print(type(iris_2d))
iris_2d[:4]

(150,)
1
<class 'numpy.ndarray'>
[( 5.1,  3.5,  1.4,  0.2, b'Iris-setosa')
 ( 4.9,  3. ,  1.4,  0.2, b'Iris-setosa')
 ( 4.7,  3.2,  1.3,  0.2, b'Iris-setosa')
 ( 4.6,  3.1,  1.5,  0.2, b'Iris-setosa')]
<class 'numpy.void'>
(150, 4)
2
<class 'numpy.ndarray'>


array([[ 5.1,  3.5,  1.4,  0.2],
       [ 4.9,  3. ,  1.4,  0.2],
       [ 4.7,  3.2,  1.3,  0.2],
       [ 4.6,  3.1,  1.5,  0.2]])

In [45]:
# Note: The different processing for different dtype
# https://stackoverflow.com/questions/25247190/numpy-void-type-how-to-use-it
dt = np.dtype([('num',np.int),('name',np.str)])
arr1 = np.array([(1,'4'),(5,'8')], dtype=dt)
print(type(arr1[0]))
print(arr1.shape)

arr2 = np.array([(1,'4'),(5,'8')], dtype=None)
print(type(arr2[0]))
print(arr2.shape)

iris_1d = np.genfromtxt('iris.data.txt', delimiter=',', dtype=None)
print(iris_1d[:5])
print(iris_1d.shape)
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
print(iris_2d[:5])
print(iris_2d.shape)

<class 'numpy.void'>
(2,)
<class 'numpy.ndarray'>
(2, 2)
[( 5.1,  3.5,  1.4,  0.2, b'Iris-setosa')
 ( 4.9,  3. ,  1.4,  0.2, b'Iris-setosa')
 ( 4.7,  3.2,  1.3,  0.2, b'Iris-setosa')
 ( 4.6,  3.1,  1.5,  0.2, b'Iris-setosa')
 ( 5. ,  3.6,  1.4,  0.2, b'Iris-setosa')]
(150,)
[[b'5.1' b'3.5' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.9' b'3.0' b'1.4' b'0.2' b'Iris-setosa']
 [b'4.7' b'3.2' b'1.3' b'0.2' b'Iris-setosa']
 [b'4.6' b'3.1' b'1.5' b'0.2' b'Iris-setosa']
 [b'5.0' b'3.6' b'1.4' b'0.2' b'Iris-setosa']]
(150, 5)


#### compute the mean, median, standard deviation of a numpy array

In [33]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

mu, med, sd = np.mean(sepallength), np.median(sepallength), np.std(sepallength)
print(mu, med, sd)

5.84333333333 5.8 0.825301291785


#### normalize an array so the values range exactly between 0 and 1

In [37]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

smax, smin = np.max(sepallength), np.min(sepallength)
s = (sepallength - smin) / (smax - smin)
s

array([ 0.222222,  0.166667,  0.111111,  0.083333,  0.194444,  0.305556,
        0.083333,  0.194444,  0.027778,  0.166667,  0.305556,  0.138889,
        0.138889,  0.      ,  0.416667,  0.388889,  0.305556,  0.222222,
        0.388889,  0.222222,  0.305556,  0.222222,  0.083333,  0.222222,
        0.138889,  0.194444,  0.194444,  0.25    ,  0.25    ,  0.111111,
        0.138889,  0.305556,  0.25    ,  0.333333,  0.166667,  0.194444,
        0.333333,  0.166667,  0.027778,  0.222222,  0.194444,  0.055556,
        0.027778,  0.194444,  0.222222,  0.138889,  0.222222,  0.083333,
        0.277778,  0.194444,  0.75    ,  0.583333,  0.722222,  0.333333,
        0.611111,  0.388889,  0.555556,  0.166667,  0.638889,  0.25    ,
        0.194444,  0.444444,  0.472222,  0.5     ,  0.361111,  0.666667,
        0.361111,  0.416667,  0.527778,  0.361111,  0.444444,  0.5     ,
        0.555556,  0.5     ,  0.583333,  0.638889,  0.694444,  0.666667,
        0.472222,  0.388889,  0.333333,  0.333333, 

#### compute the softmax score

In [38]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

def softmax(x):
    return np.exp(x)/np.sum(np.exp(x), axis=0)

softmax(sepallength)

array([ 0.00222 ,  0.001817,  0.001488,  0.001346,  0.002008,  0.002996,
        0.001346,  0.002008,  0.001102,  0.001817,  0.002996,  0.001644,
        0.001644,  0.000997,  0.00447 ,  0.004044,  0.002996,  0.00222 ,
        0.004044,  0.00222 ,  0.002996,  0.00222 ,  0.001346,  0.00222 ,
        0.001644,  0.002008,  0.002008,  0.002453,  0.002453,  0.001488,
        0.001644,  0.002996,  0.002453,  0.003311,  0.001817,  0.002008,
        0.003311,  0.001817,  0.001102,  0.00222 ,  0.002008,  0.001218,
        0.001102,  0.002008,  0.00222 ,  0.001644,  0.00222 ,  0.001346,
        0.002711,  0.002008,  0.01484 ,  0.008144,  0.013428,  0.003311,
        0.009001,  0.004044,  0.007369,  0.001817,  0.009947,  0.002453,
        0.002008,  0.00494 ,  0.005459,  0.006033,  0.003659,  0.010994,
        0.003659,  0.00447 ,  0.006668,  0.003659,  0.00494 ,  0.006033,
        0.007369,  0.006033,  0.008144,  0.009947,  0.01215 ,  0.010994,
        0.005459,  0.004044,  0.003311,  0.003311, 

#### find the percentile scores of a numpy array

In [41]:
iris = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')
sepallength = np.array([row.tolist()[0] for row in iris], dtype='float')

np.percentile(sepallength,q=[5, 95])

array([ 4.6  ,  7.255])

#### insert values at random positions in an array

In [44]:
iris_2d = np.genfromtxt('iris.data.txt', delimiter=',', dtype='object')

iris_2d.shape

(150,)