# 101 NumPy Exercises for Data Analysis (Python)

 _Source:  https://www.machinelearningplus.com/python/101-numpy-exercises-python/_

** 1.- Import numpy as np and see the version**

In [3]:
import numpy as np
np.__version__

'1.14.0'

** 2.- How to create a 1D array? **
   - Create a 1D array of numbers from 0 to 9

In [5]:
# With a single argument, np.arange starts at 0 and stops before 10.
a = np.arange(10)
a

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

** 3.- How to create a boolean array? **
  - Create a 3×3 numpy array of all True’s

In [16]:
# Fill a 3x3 grid with True; dtype=bool keeps the elements boolean.
a = np.full((3, 3), True, dtype=bool)
a

array([[ True,  True,  True],
       [ True,  True,  True],
       [ True,  True,  True]])

** 4.- How to extract items that satisfy a given condition from 1D array? **
   - Extract all odd numbers from arr

In [13]:
arr = np.arange(10)
# Boolean-mask indexing: keep only the elements whose remainder mod 2 is 1.
arr[arr % 2 == 1]



array([1, 3, 5, 7, 9])

** 5.- How to replace items that satisfy a condition with another value in numpy array? **
  - Replace all odd numbers in arr with -1

In [10]:
# Overwrite every odd entry of `arr` with -1, in place (arr itself is modified).
np.putmask(arr, arr % 2 != 0, -1)
arr

array([ 0, -1,  2, -1,  4, -1,  6, -1,  8, -1])

** 6.- How to replace items that satisfy a condition without affecting the original array? **
  - Replace all odd numbers in arr with -1 without changing arr

In [18]:
# Start from a fresh array so this cell does not depend on whatever an earlier
# cell left in `arr` (the previous exercise overwrites its odd entries in place).
arr = np.arange(10)

# np.where builds a NEW array: -1 where the value is odd, the original value
# elsewhere. `arr` itself is left untouched.
arrModified = np.where(arr % 2 == 1, -1, arr)
# alternatively use
# arrModified = np.copy(arr)
# arrModified[arrModified % 2 != 0] = -1
print(arr)
print(arrModified)

[0 1 2 3 4 5 6 7 8 9]
[ 0 -1  2 -1  4 -1  6 -1  8 -1]


** 7.- How to reshape an array? **
  - Convert a 1D array to a 2D array with 2 rows

In [25]:
arr = np.arange(10)
# -1 lets NumPy infer the number of columns from the array size (5 here).
arr.reshape(2, -1)
# alternative with the shape spelled out
# arr.reshape(2, 5)

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9]])

** 8.- How to stack two arrays vertically? **
  - Stack arrays a and b vertically

In [38]:
a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)
# Stack row-wise: the rows of b are appended below the rows of a.
np.vstack([a, b])
# Method 2:
#np.concatenate((a, b), axis=0)
# Method 3:
#np.r_[a, b]

array([[0, 1, 2, 3, 4],
       [5, 6, 7, 8, 9],
       [1, 1, 1, 1, 1],
       [1, 1, 1, 1, 1]])

** 9.- How to stack two arrays horizontally? **
   - Stack the arrays a and b horizontally.

In [39]:
a = np.arange(10).reshape(2, -1)
b = np.repeat(1, 10).reshape(2, -1)
# Stack column-wise: the columns of b are appended to the right of a.
np.hstack([a, b])
# Method 2:
#np.concatenate((a, b), axis=1)

# Method 3:
#np.c_[a, b]

array([[0, 1, 2, 3, 4, 1, 1, 1, 1, 1],
       [5, 6, 7, 8, 9, 1, 1, 1, 1, 1]])

** 10.- How to generate custom sequences in numpy without hardcoding? **
  - Create the following pattern without hardcoding. Use only numpy functions and the below input array a.

In [56]:
a = np.array([1, 2, 3])
# Repeat every element three times (1,1,1,2,2,2,3,3,3) ...
b = np.repeat(a, 3)
# ... then append the whole array tiled twice (1,2,3,1,2,3).
np.r_[b, np.tile(a, 2)]

# alternative use
#np.hstack([np.repeat(a, 3), np.tile(a, 2)])
# NOTE(review): an earlier version of this alternative tiled `a` 3 times, which
# yields 18 items instead of the 15 produced here - confirm which pattern the
# exercise expects.


array([1, 1, 1, 2, 2, 2, 3, 3, 3, 1, 2, 3, 1, 2, 3])

** 11.- How to get the common items between two python numpy arrays? **
   - Get the common items between a and b

In [66]:
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])

# Sorted, de-duplicated values that occur in both arrays.
np.intersect1d(a, b, assume_unique=False)


array([2, 4])

** 12.- How to remove from one array those items that exist in another? **
  - From array a remove all items present in array b

In [78]:
a = np.arange(1, 6)
b = np.arange(5, 10)
print(a,b)
# Keep the items of a whose membership test against b is False.
a[~np.isin(a, b)]

# Alternative use
# np.setdiff1d(a,b)

[1 2 3 4 5] [5 6 7 8 9]


array([1, 2, 3, 4])

** 13.- How to get the positions where elements of two arrays match? **
   - Get the positions where elements of a and b match

In [81]:
a = np.array([1, 2, 3, 2, 3, 4, 3, 4, 5, 6])
b = np.array([7, 2, 10, 2, 7, 4, 9, 4, 9, 8])

# Element-wise comparison gives a boolean array; nonzero returns a tuple
# holding the indices where it is True.
np.nonzero(a == b)

(array([1, 3, 5, 7]),)

** 14.- How to extract all numbers between a given range from a numpy array? **
   - Get all items between 5 and 10 from a.

In [98]:
a = np.array([2, 6, 1, 9, 10, 3, 27])
# Combine both bounds into one boolean mask (5 <= value <= 10) and index with it.
a[np.logical_and(a >= 5, a <= 10)]

# Alternative use (np.where gives the matching indices, so index a with them)
# a[np.where((a>=5) & (a<=10))]

array([ 6,  9, 10])

** 15.- How to make a python function that handles scalars to work on numpy arrays? **
  - Convert the function maxx that works on two scalars, to work on two arrays.

In [110]:
def maxx(x, y):
    """Return the larger of two scalars; x is returned when they are equal."""
    return x if x >= y else y


a = np.array([5, 7, 9, 8, 6, 4, 5])
b = np.array([6, 3, 4, 8, 9, 7, 1])

# np.vectorize wraps the scalar function so it is applied element by element
# to the two arrays.
pairMax = np.vectorize(maxx)

pairMax(a, b)
    

array([6, 7, 9, 8, 9, 7, 5])

** 16.- How to swap two columns in a 2d numpy array? **
  - Swap columns 1 and 2 in the array arr.

In [146]:
arr = np.arange(9).reshape(3, 3)
print(arr)
# Pick the columns in the order 1, 0, 2, i.e. with the first two swapped.
np.take(arr, [1, 0, 2], axis=1)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[1, 0, 2],
       [4, 3, 5],
       [7, 6, 8]])

** 17.- How to swap two rows in a 2d numpy array? **
  - Swap rows 1 and 2 in the array arr:

In [147]:
arr = np.arange(9).reshape(3, 3)
print(arr)
# Pick the rows in the order 1, 0, 2, i.e. with the first two swapped.
np.take(arr, [1, 0, 2], axis=0)

[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[3, 4, 5],
       [0, 1, 2],
       [6, 7, 8]])

** 19.- How to reverse the columns of a 2D array? **
  - Reverse the columns of a 2D array arr.

In [154]:
arr = np.arange(9).reshape(3, -1)
print(arr)

# Flip left-to-right: the column order is reversed, the rows stay in place.
np.fliplr(arr)


[[0 1 2]
 [3 4 5]
 [6 7 8]]


array([[2, 1, 0],
       [5, 4, 3],
       [8, 7, 6]])

** 20.- How to create a 2D array containing random floats between 5 and 10? **
   - Create a 2D array of shape 5x3 to contain random decimal numbers between 5 and 10.

In [169]:
# Draw a 5x3 array of floats from the uniform distribution on [5, 10).
x = np.random.uniform(low=5, high=10, size=(5, 3))
x

array([[8.64128082, 7.90304249, 5.69307691],
       [7.1679364 , 9.37886172, 7.51012144],
       [9.23000027, 5.86419845, 8.79863706],
       [5.62736394, 9.94383594, 6.01858064],
       [8.44725249, 8.75168736, 7.32301138]])

** 21.- How to print only 3 decimal places in python numpy array? **
   - Print or show only 3 decimal places of the numpy array rand_arr.

In [25]:
rand_arr = np.random.rand(5, 3)
print(type(rand_arr))

# Display at most 3 digits after the decimal point. This is a global print
# setting: it changes how arrays are shown, not the stored values.
np.set_printoptions(precision=3)
rand_arr

<class 'numpy.ndarray'>


array([[0.649, 0.838, 0.912],
       [0.238, 0.545, 0.018],
       [0.404, 0.157, 0.725],
       [0.81 , 0.034, 0.524],
       [0.645, 0.613, 0.508]])

** 22.- How to pretty print a numpy array by suppressing the scientific notation (like 1e10)? **
   - Pretty print rand_arr by suppressing the scientific notation (like 1e10)

In [29]:
# Seed the global generator so the same small numbers are drawn every run.
np.random.seed(100)
rand_arr = np.random.random((3, 3)) / 1000

# suppress=True prints small floats in fixed-point form instead of scientific
# notation; precision=6 keeps six digits after the decimal point.
np.set_printoptions(precision=6, suppress=True)
rand_arr

array([[0.000543, 0.000278, 0.000425],
       [0.000845, 0.000005, 0.000122],
       [0.000671, 0.000826, 0.000137]])

** 23.- How to limit the number of items printed in output of numpy array? **
   - Limit the number of items printed in python numpy array a to a maximum of 6 elements.

In [35]:
a = np.arange(15)

# Arrays with more than `threshold` items are summarised with "..." when shown.
np.set_printoptions(suppress=False, threshold=6)
a


array([ 0,  1,  2, ..., 12, 13, 14])

** 24.- How to print the full numpy array without truncating? **
   - Print the full numpy array a without truncating.

In [39]:
a = np.arange(15)
# First force truncation (more than 6 items would be summarised with "...") ...
np.set_printoptions(threshold=6)
# ... then raise the threshold back to 1000 (the default value) so that the
# 15-item array is shown in full.
np.set_printoptions(threshold=1000)
a

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

** 25.- How to import a dataset with numbers and texts keeping the text intact in python numpy? **
   - Import the iris dataset keeping the text intact.

In [46]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# dtype=object leaves every field exactly as read (byte strings), so the
# species text column is kept intact next to the numeric columns.
iris = np.genfromtxt(url, dtype=object, delimiter=',')
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

iris[:10]  # first 10 rows

array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa'],
       [b'5.0', b'3.6', b'1.4', b'0.2', b'Iris-setosa'],
       [b'5.4', b'3.9', b'1.7', b'0.4', b'Iris-setosa'],
       [b'4.6', b'3.4', b'1.4', b'0.3', b'Iris-setosa'],
       [b'5.0', b'3.4', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.4', b'2.9', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.1', b'1.5', b'0.1', b'Iris-setosa']], dtype=object)

** 26.- How to extract a particular column from 1D array of tuples? **
   - Extract the text column species from the 1D iris imported in previous question.

In [50]:
# Column 4 holds the species text; show it for the first 10 rows.
iris[:10, 4]

array([b'Iris-setosa', b'Iris-setosa', b'Iris-setosa', b'Iris-setosa',
       b'Iris-setosa', b'Iris-setosa', b'Iris-setosa', b'Iris-setosa',
       b'Iris-setosa', b'Iris-setosa'], dtype=object)

** 27.- How to convert a 1d array of tuples to a 2d numpy array? **
   - Convert the 1D iris to 2D array iris_2d by omitting the species text field.

In [75]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# Read only the first four columns, leaving out the species text field
# (column 4).
iris_1d = np.genfromtxt(url, delimiter=',', usecols=[0, 1, 2, 3],
                        dtype=None, encoding=None)
iris_1d[:10]  # first 10 rows

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

** 28.- How to compute the mean, median, standard deviation of a numpy array? **
   - Find the mean, median, standard deviation of iris's sepallength (1st column)

In [100]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype=float)
# Column 0 is the sepal length; summarise it with three statistics.
mean = iris[:, 0].mean()
median = np.median(iris[:, 0])
std = iris[:, 0].std()
print("The mean is: ", mean)
print("The median is: ", median)
print("The std is: ", std)

The mean is:  5.843333333333334
The median is:  5.8
The std is:  0.8253012917851409


** 29.- How to normalize an array so the values range exactly between 0 and 1? **
   - Create a normalized form of iris's sepallength whose values range exactly between 0 and 1 so that the minimum has value 0 and maximum has value 1.

In [113]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, delimiter=',', dtype='float', usecols=[0])

sepalMax, sepalMin = sepallength.max(), sepallength.min()
print(Sepalmax, sepalMin)
sepalForm = (sepallength - sepalMin)/(sepalMax - sepalMin)
# Alternatively use
#sepalForm = (sepallength - sepalMin)/sepallength.ptp()
# help(sepallength.ptp)
print(sepalForm)

7.9 4.3
[0.222222 0.166667 0.111111 0.083333 0.194444 0.305556 0.083333 0.194444
 0.027778 0.166667 0.305556 0.138889 0.138889 0.       0.416667 0.388889
 0.305556 0.222222 0.388889 0.222222 0.305556 0.222222 0.083333 0.222222
 0.138889 0.194444 0.194444 0.25     0.25     0.111111 0.138889 0.305556
 0.25     0.333333 0.166667 0.194444 0.333333 0.166667 0.027778 0.222222
 0.194444 0.055556 0.027778 0.194444 0.222222 0.138889 0.222222 0.083333
 0.277778 0.194444 0.75     0.583333 0.722222 0.333333 0.611111 0.388889
 0.555556 0.166667 0.638889 0.25     0.194444 0.444444 0.472222 0.5
 0.361111 0.666667 0.361111 0.416667 0.527778 0.361111 0.444444 0.5
 0.555556 0.5      0.583333 0.638889 0.694444 0.666667 0.472222 0.388889
 0.333333 0.333333 0.416667 0.472222 0.305556 0.472222 0.666667 0.555556
 0.361111 0.333333 0.333333 0.5      0.416667 0.194444 0.361111 0.388889
 0.388889 0.527778 0.222222 0.388889 0.555556 0.416667 0.777778 0.555556
 0.611111 0.916667 0.166667 0.833333 0.666667 0.80555

** 30.- How to compute the softmax score? **
   - Compute the softmax score of sepallength.

In [115]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
# Only column 0 (sepal length) is read, parsed as floats.
sepallength = np.genfromtxt(url, usecols=[0], dtype=float, delimiter=',')

def softmax(x):
    """Return the softmax scores of x: exp(x) / sum(exp(x)).

    The maximum is subtracted before exponentiating. This leaves the result
    mathematically unchanged but keeps np.exp from overflowing (which would
    give nan) when x contains large values.

    Parameters
    ----------
    x : array_like
        Input scores.

    Returns
    -------
    numpy.ndarray
        Array with the same shape as x; for non-empty input the entries sum
        to 1. An empty input yields an empty array.
    """
    x = np.asarray(x)
    if x.size == 0:
        # np.max is undefined on an empty array; exp of an empty array is empty.
        return np.exp(x)
    exp_shifted = np.exp(x - np.max(x))
    return exp_shifted / np.sum(exp_shifted)
# Display the softmax score of every value in sepallength.
softmax(sepallength)

array([0.00222 , 0.001817, 0.001488, 0.001346, 0.002008, 0.002996,
       0.001346, 0.002008, 0.001102, 0.001817, 0.002996, 0.001644,
       0.001644, 0.000997, 0.00447 , 0.004044, 0.002996, 0.00222 ,
       0.004044, 0.00222 , 0.002996, 0.00222 , 0.001346, 0.00222 ,
       0.001644, 0.002008, 0.002008, 0.002453, 0.002453, 0.001488,
       0.001644, 0.002996, 0.002453, 0.003311, 0.001817, 0.002008,
       0.003311, 0.001817, 0.001102, 0.00222 , 0.002008, 0.001218,
       0.001102, 0.002008, 0.00222 , 0.001644, 0.00222 , 0.001346,
       0.002711, 0.002008, 0.01484 , 0.008144, 0.013428, 0.003311,
       0.009001, 0.004044, 0.007369, 0.001817, 0.009947, 0.002453,
       0.002008, 0.00494 , 0.005459, 0.006033, 0.003659, 0.010994,
       0.003659, 0.00447 , 0.006668, 0.003659, 0.00494 , 0.006033,
       0.007369, 0.006033, 0.008144, 0.009947, 0.01215 , 0.010994,
       0.005459, 0.004044, 0.003311, 0.003311, 0.00447 , 0.005459,
       0.002996, 0.005459, 0.010994, 0.007369, 0.003659, 0.003

** 31.- How to find the percentile scores of a numpy array? **
   - Find the 5th and 95th percentile of iris's sepallength

In [118]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
sepallength = np.genfromtxt(url, usecols=[0], dtype=float, delimiter=',')

# Passing a list of percentiles returns one value per requested percentile.
np.percentile(sepallength, q=[5, 95])
# help(np.percentile)

array([4.6  , 7.255])

** 32.- How to insert values at random positions in an array? **
   - Insert np.nan values at 20 random positions in iris_2d dataset

In [133]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype='object')
print(iris_2d.shape)

# Pick 20 DISTINCT cells among the first four columns. Drawing row and column
# indices independently can hit the same cell twice and leave fewer than 20
# NaNs, so sample flat positions without replacement and convert them back to
# (row, column) pairs.
n_rows, n_value_cols = iris_2d.shape[0], 4
flat_positions = np.random.choice(n_rows * n_value_cols, size=20, replace=False)
nan_rows, nan_cols = np.unravel_index(flat_positions, (n_rows, n_value_cols))
iris_2d[nan_rows, nan_cols] = np.nan
iris_2d[:10]


(150, 5)


array([[b'5.1', b'3.5', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.0', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.7', b'3.2', b'1.3', b'0.2', b'Iris-setosa'],
       [b'4.6', b'3.1', b'1.5', b'0.2', b'Iris-setosa'],
       [b'5.0', b'3.6', b'1.4', b'0.2', b'Iris-setosa'],
       [b'5.4', b'3.9', b'1.7', b'0.4', b'Iris-setosa'],
       [b'4.6', b'3.4', b'1.4', b'0.3', b'Iris-setosa'],
       [b'5.0', b'3.4', b'1.5', b'0.2', b'Iris-setosa'],
       [b'4.4', b'2.9', b'1.4', b'0.2', b'Iris-setosa'],
       [b'4.9', b'3.1', b'1.5', b'0.1', b'Iris-setosa']], dtype=object)

** 33.- How to find the position of missing values in numpy array? **
   - Find the number and position of missing values in iris_2d's sepallength (1st column)

In [148]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype=float)
# Scatter NaNs over random (row, column) positions in the first four columns.
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# np.isnan marks the missing entries of column 0; summing the boolean mask
# counts them and np.where lists their row indices.
print("Number of missing values: ", np.isnan(iris_2d[:, 0]).sum())
print("Positions of missing values: ", np.where(np.isnan(iris_2d[:, 0])))
# np.info(np.where)
# np.info(np.where)

Number of missing values:  4
Positions of missing values:  (array([  4, 111, 119, 149]),)


** 34.- How to filter a numpy array based on two or more conditions? **
   - Filter the rows of iris_2d that have petallength (3rd column) > 1.5 and sepallength (1st column) < 5.0

In [156]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype=float, usecols=[0, 1, 2, 3])

# Index the rows directly with the combined boolean mask:
# column 2 greater than 1.5 AND column 0 less than 5.0.
iris_2d[(iris_2d[:, 2] > 1.5) & (iris_2d[:, 0] < 5.0)]

array([[4.8, 3.4, 1.6, 0.2],
       [4.8, 3.4, 1.9, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [4.9, 2.4, 3.3, 1. ],
       [4.9, 2.5, 4.5, 1.7]])

** 35.- How to drop rows that contain a missing value from a numpy array? **
   - Select the rows of iris_2d that do not have any nan value.

In [167]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype=float, usecols=[0, 1, 2, 3])
# Scatter NaNs over random (row, column) positions.
iris_2d[np.random.randint(150, size=20), np.random.randint(4, size=20)] = np.nan

# A row is kept when none of its entries is NaN; show the first 5 such rows.
iris_2d[~np.isnan(iris_2d).any(axis=1)][:5]

array([[4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4]])

** 36.- How to find the correlation between two columns of a numpy array? **
  - Find the correlation between SepalLength(1st column) and PetalLength(3rd column) in iris_2d

In [176]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype=float, usecols=[0, 1, 2, 3])

# corrcoef returns the 2x2 correlation matrix of the two columns; append
# [0, 1] to read off the single correlation coefficient.
np.corrcoef(iris_2d.T[0], iris_2d.T[2])

array([[1.      , 0.871754],
       [0.871754, 1.      ]])

** 37.- How to find if a given array has any null values? **
   - Find out if iris_2d has any missing values.

In [177]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype=float, usecols=[0, 1, 2, 3])

# True as soon as at least one entry of the array is NaN.
np.any(np.isnan(iris_2d))

False

** 38.- How to replace all missing values with 0 in a numpy array? **
   - Replace all occurrences of nan with 0 in the numpy array

In [194]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris_2d = np.genfromtxt(url, delimiter=',', dtype=float, usecols=[0, 1, 2, 3])
# Scatter NaNs over random (row, column) positions.
iris_2d[np.random.randint(iris_2d.shape[0], size=20), np.random.randint(iris_2d.shape[1], size=20)] = np.nan

# Overwrite every NaN entry with 0, in place.
np.putmask(iris_2d, np.isnan(iris_2d), 0)
iris_2d[:10]


array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [0. , 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1]])

** 39.- How to find the count of unique values in a numpy array? **
   - Find the unique values and the count of unique values in iris's species

In [208]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype=object)
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Take column 4 (species) and rebuild it as a plain array from a Python list
# of its values.
species = np.array(iris[:, 4].tolist())
print(species.shape)

# Get the unique values and the counts
np.unique(species, return_counts=True)

(150,)


(array([b'Iris-setosa', b'Iris-versicolor', b'Iris-virginica'],
       dtype='|S15'), array([50, 50, 50]))

** 40.-  How to convert a numeric to a categorical (text) array? **
   - Bin the petal length (3rd) column of iris_2d to form a text array, such that if petal length is:
    
    * Less than 3 --> 'small'
    * 3-5 --> 'medium'
    * >=5 --> 'large'

In [214]:
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data'
iris = np.genfromtxt(url, delimiter=',', dtype=object)
names = ('sepallength', 'sepalwidth', 'petallength', 'petalwidth', 'species')

# Bin petallength (column 2): with edges [0, 3, 5, 10] np.digitize returns
# 1 for [0, 3), 2 for [3, 5), 3 for [5, 10) and 4 for values >= 10.
petalLengthBin = np.digitize(iris[:, 2].astype(float), bins=[0, 3, 5, 10])
# help(np.digitize)

# Map each bin code to its category label
labelMap = {1: 'small', 2: 'medium', 3: 'large', 4: np.nan}
petalLengthCat = [labelMap[code] for code in petalLengthBin.tolist()]

# View
petalLengthCat[:4]

['small', 'small', 'small', 'small']