# Vital Functions for Data Analysis

In [2]:
import numpy as np

#### 1. How to get index locations that satisfy a given condition?

In [3]:
# Build the small 1-D integer array that the next few cells query.
arr_rand = np.array(
    [8, 8, 3, 7, 7, 0, 4, 2, 5, 2]
)
print("Array: ", arr_rand)

Array:  [8 8 3 7 7 0 4 2 5 2]


In [4]:
# np.where locates the positions in the array where a given condition holds true.
# Called with only a condition it returns a tuple of index arrays, as the
# printed output of this cell shows.

index_gt5 = np.where(arr_rand > 5) # Tuple holding the positions where value > 5
print("Positions where value > 5: ", index_gt5)

Positions where value > 5:  (array([0, 1, 3, 4], dtype=int64),)


In [5]:
# Take items at given index.
# index_gt5 is the 1-tuple returned by np.where, so the result shown for this
# cell is 2-D (a single row) rather than a flat array.
arr_rand.take(index_gt5)

array([[8, 8, 7, 7]])

In [6]:
# If value > 5, then yield 'gt5' else 'le5'.
# With three arguments np.where chooses element-wise between the two values,
# producing an array of labels the same length as arr_rand.
np.where(arr_rand > 5, 'gt5', 'le5')

array(['gt5', 'gt5', 'le5', 'gt5', 'gt5', 'le5', 'le5', 'le5', 'le5',
       'le5'], dtype='<U3')

In [10]:
# Index of the largest value (8 appears twice here; position 0 is reported)
print('Position of max value: ', arr_rand.argmax())

# Index of the smallest value
print('Position of min value: ', arr_rand.argmin())

Position of max value:  0
Position of min value:  5


#### 2. How to import and export data as a csv file?

In [12]:
# Load the CSV straight from its URL into a single integer-typed array.
path = 'https://raw.githubusercontent.com/selva86/datasets/master/Auto.csv'
data = np.genfromtxt(
    path,
    dtype='int',
    delimiter=',',
    skip_header=1,        # skip the first line (presumably the header row)
    filling_values=-999,  # placeholder used in place of missing values
)
data[:3]  # see first 3 rows

array([[  18,    8,  307,  130, 3504,   12,   70,    1, -999],
       [  15,    8,  350,  165, 3693,   11,   70,    1, -999],
       [  18,    8,  318,  150, 3436,   11,   70,    1, -999]])

As an example, the cell above reads a .csv file from a URL. Since all elements in a numpy array must be of the same data type, the last column, which is text, will be imported as ‘nan’ by default.

By setting the 'filling_values' argument you can replace the missing values with something else.

How to handle datasets that have both number and text columns?

If you MUST keep the text column as it is, without replacing it with a placeholder, you can set the dtype either to ‘object’ or to None.

In [13]:
# dtype='object' keeps every field, including the quoted text column, as the
# raw value read from the file (bytes objects in the output shown for this cell).
data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype='object')
data2[:3]  # see first 3 rows

array([[b'18', b'8', b'307', b'130', b'3504', b'12', b'70', b'1',
        b'"chevrolet chevelle malibu"'],
       [b'15', b'8', b'350', b'165', b'3693', b'11.5', b'70', b'1',
        b'"buick skylark 320"'],
       [b'18', b'8', b'318', b'150', b'3436', b'11', b'70', b'1',
        b'"plymouth satellite"']], dtype=object)

In [14]:
# dtype=None lets genfromtxt pick a type per column; the result shown for this
# cell is a structured array with fields f0..f8 rather than a plain 2-D array.
# NOTE(review): the truncated text in this cell's output looks like the tail of
# a warning emitted by this call -- confirm and address if so.
data3 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype=None)
data3[:3]  # see first 3 rows

  """Entry point for launching an IPython kernel.


array([(18., 8, 307., 130, 3504, 12. , 70, 1, b'"chevrolet chevelle malibu"'),
       (15., 8, 350., 165, 3693, 11.5, 70, 1, b'"buick skylark 320"'),
       (18., 8, 318., 150, 3436, 11. , 70, 1, b'"plymouth satellite"')],
      dtype=[('f0', '<f8'), ('f1', '<i4'), ('f2', '<f8'), ('f3', '<i4'), ('f4', '<i4'), ('f5', '<f8'), ('f6', '<i4'), ('f7', '<i4'), ('f8', 'S38')])

In [15]:
# Save the array as a csv file.
# `data` was loaded with dtype='int'; np.savetxt's default format ('%.18e')
# would write each value in scientific notation (e.g. 1.8e+01), so request
# plain integer formatting to keep the file readable and faithful to the array.
np.savetxt("out.csv", data, delimiter=",", fmt="%d")

#### 3. How to concatenate two numpy arrays column wise and row wise

There are 3 different ways of concatenating two or more numpy arrays.

Method 1: np.concatenate by changing the axis parameter to 0 and 1

Method 2: np.vstack and np.hstack

Method 3: np.r_ and np.c_

In [16]:
# Two 4x4 float arrays, all zeros and all ones, used by the stacking examples.
a = np.zeros((4, 4))
b = np.ones((4, 4))
print(a, b, sep="\n")

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [17]:
# Vertical stack equivalents (row wise): b's rows are appended below a's.
# All three expressions build the same 8x4 array; only the last is displayed.
np.concatenate((a, b), axis=0)
np.vstack((a, b))
np.r_[a, b]

array([[0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [0., 0., 0., 0.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [18]:
# Horizontal stack equivalents (column wise): b's columns are appended to the right of a's.
# All three expressions build the same 4x8 array; only the last is displayed.
np.concatenate((a, b), axis=1)
np.hstack((a, b))
np.c_[a, b]

array([[0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.]])

In [19]:
# Create more complex number sequences in 1d arrays:
# np.r_ joins the lists and scalars in the order given into one flat array.
np.r_[[1,2,3], 0, 0, [4,5,6]]

array([1, 2, 3, 0, 0, 4, 5, 6])

#### 4. How to sort a numpy array based on one or more columns?

In [20]:
# 8x4 array of random integers drawn from 1..5 (the upper bound 6 is excluded).
# No seed is set in this cell, so the values differ between runs.
arr = np.random.randint(1, 6, size=(8, 4))
arr

array([[1, 1, 2, 1],
       [2, 4, 4, 1],
       [1, 4, 2, 2],
       [1, 3, 5, 2],
       [4, 4, 3, 3],
       [4, 4, 5, 4],
       [4, 1, 4, 2],
       [2, 2, 2, 2]])

In [21]:
# Sort each column of arr independently (axis=0).
# Rows are NOT kept together: every column is reordered on its own.
np.sort(arr, axis=0)

array([[1, 1, 2, 1],
       [1, 1, 2, 1],
       [1, 2, 2, 2],
       [2, 3, 3, 2],
       [2, 4, 4, 2],
       [4, 4, 4, 2],
       [4, 4, 5, 3],
       [4, 4, 5, 4]])

4.1 How to sort a numpy array based on 1 column using argsort?

np.argsort returns the index positions that would sort a given 1d array.

In [22]:
# argsort yields, for each sorted slot, the position of that element in x.
x = np.array([1, 10, 5, 2, 8, 9])
sort_index = x.argsort()
print(sort_index)

[0 3 2 4 5 1]


In [23]:
# Indexing x with the argsort result gives x in ascending order.
x[sort_index]

array([ 1,  2,  5,  8,  9, 10])

In [24]:
# Show arr again for reference before sorting it by column.
arr

array([[1, 1, 2, 1],
       [2, 4, 4, 1],
       [1, 4, 2, 2],
       [1, 3, 5, 2],
       [4, 4, 3, 3],
       [4, 4, 5, 4],
       [4, 1, 4, 2],
       [2, 2, 2, 2]])

In [25]:
# Argsort the first column: row positions ordered by ascending value in column 0.
sorted_index_1stcol = arr[:, 0].argsort()
sorted_index_1stcol

array([0, 2, 3, 1, 7, 4, 5, 6], dtype=int64)

In [26]:
# Sort 'arr' by first column without disturbing the integrity of rows:
# indexing with the row order moves whole rows, so each row's values stay together.
arr[sorted_index_1stcol]

array([[1, 1, 2, 1],
       [1, 4, 2, 2],
       [1, 3, 5, 2],
       [2, 4, 4, 1],
       [2, 2, 2, 2],
       [4, 4, 3, 3],
       [4, 4, 5, 4],
       [4, 1, 4, 2]])

In [27]:
# Descending sort: reverse the ascending row order with [::-1] before indexing.
arr[sorted_index_1stcol[::-1]]

array([[4, 1, 4, 2],
       [4, 4, 5, 4],
       [4, 4, 3, 3],
       [2, 2, 2, 2],
       [2, 4, 4, 1],
       [1, 3, 5, 2],
       [1, 4, 2, 2],
       [1, 1, 2, 1]])

4.2 How to sort a numpy array based on 2 or more columns?

In [28]:
# Sort by column 0, then by column 1.
# Note the key order: the LAST key passed (column 0) is the primary sort key,
# and the earlier key (column 1) breaks ties.
lexsorted_index = np.lexsort((arr[:, 1], arr[:, 0])) 
lexsorted_index

array([0, 3, 2, 7, 1, 6, 4, 5], dtype=int64)

In [29]:
# Reorder whole rows using the lexsort result.
arr[lexsorted_index]

array([[1, 1, 2, 1],
       [1, 3, 5, 2],
       [1, 4, 2, 2],
       [2, 2, 2, 2],
       [2, 4, 4, 1],
       [4, 1, 4, 2],
       [4, 4, 3, 3],
       [4, 4, 5, 4]])

#### 5. Working with dates

NumPy implements dates through the np.datetime64 object, which supports precision down to nanoseconds.

You can create one from a standard YYYY-MM-DD formatted date string.

In [30]:
# Create a datetime64 object.
# The string includes a time of day, so the value carries the time part too.
date64 = np.datetime64('2019-06-17 10:10:10')
date64

numpy.datetime64('2019-06-17T10:10:10')

In [31]:
# Drop the time part from the datetime64 object by converting it to day ('D') units.
dt64 = np.datetime64(date64, 'D')
dt64

numpy.datetime64('2019-06-17')

In [32]:
# Create the timedeltas (individual units of time)
tenminutes = np.timedelta64(10, 'm')  # 10 minutes
tenseconds = np.timedelta64(10, 's')  # 10 seconds
tennanoseconds = np.timedelta64(10, 'ns')  # 10 nanoseconds

# dt64 is in day units, so adding a plain integer advances it by that many days.
print('Add 10 days: ', dt64 + 10)
# Adding a finer-grained timedelta gives a result in that finer unit
# (see the printed values for this cell).
print('Add 10 minutes: ', dt64 + tenminutes)
print('Add 10 seconds: ', dt64 + tenseconds)
print('Add 10 nanoseconds: ', dt64 + tennanoseconds)

Add 10 days:  2019-06-27
Add 10 minutes:  2019-06-17T00:10
Add 10 seconds:  2019-06-17T00:00:10
Add 10 nanoseconds:  2019-06-17T00:00:00.000000010


In [33]:
# Convert np.datetime64 back to a string (a plain str in the date's own unit).
np.datetime_as_string(dt64)

'2019-06-17'

In [34]:
# Check whether a single date is a business day (called with default settings).
print('Date: ', dt64)
print("Is it a business day?: ", np.is_busday(dt64))  

Date:  2019-06-17
Is it a business day?:  True


In [35]:
# Daily date sequence; like any arange, the end date itself is excluded.
dates = np.arange(
    np.datetime64('2019-06-17'),
    np.datetime64('2019-06-27'),
)
print(dates)

# Element-wise business-day flag for every date in the sequence
np.is_busday(dates)

['2019-06-17' '2019-06-18' '2019-06-19' '2019-06-20' '2019-06-21'
 '2019-06-22' '2019-06-23' '2019-06-24' '2019-06-25' '2019-06-26']


array([ True,  True,  True,  True,  True, False, False,  True,  True,
        True])

#### 6. vectorize – Make a scalar function work on vectors

The scalar function `foo` defined below works perfectly when applied to a scalar (an individual number), but fails when applied to an array-like input such as a list.

In [36]:
# Define a scalar function
def foo(x):
    """Scalar-only demo function.

    Returns ``x**2`` when ``x % 2 == 1`` and ``x / 2`` otherwise.
    Raises TypeError for inputs that do not support ``%`` with an int,
    such as a plain Python list.
    """
    if x % 2 == 1:
        return x**2
    return x/2

# On a scalar
print('x = 10 returns ', foo(10))
print('x = 11 returns ', foo(11))

# On a vector, doesn't work: a list does not support `%`.
# The failure is the point of this demo, so catch it and print the message
# instead of letting the exception abort a "Restart & Run All".
try:
    print('x = [10, 11, 12] returns ', foo([10, 11, 12]))  # Error
except TypeError as err:
    print('TypeError:', err)

x = 10 returns  5.0
x = 11 returns  121


TypeError: unsupported operand type(s) for %: 'list' and 'int'

In [37]:
# Vectorize foo(). Make it work on vectors.
# otypes=[float] fixes the output dtype to float, which is why the squared
# (integer) results also print as floats in this cell's output.
foo_v = np.vectorize(foo, otypes=[float])

# Works on flat and nested lists alike; the result keeps the input's shape.
print('x = [10, 11, 12] returns ', foo_v([10, 11, 12]))
print('x = [[10, 11, 12], [1, 2, 3]] returns ', foo_v([[10, 11, 12], [1, 2, 3]]))

x = [10, 11, 12] returns  [  5. 121.   6.]
x = [[10, 11, 12], [1, 2, 3]] returns  [[  5. 121.   6.]
 [  1.   1.   9.]]


apply_along_axis – Apply a function column wise or row wise

In [38]:
# Seed the global generator so the 4x10 array of integers in 1..9 is repeatable.
np.random.seed(100)
arr_x = np.random.randint(1, 10, size=(4, 10))
arr_x

array([[9, 9, 4, 8, 8, 1, 5, 3, 6, 3],
       [3, 3, 2, 1, 9, 5, 1, 7, 3, 5],
       [2, 6, 4, 5, 5, 4, 8, 2, 2, 8],
       [8, 1, 3, 4, 3, 6, 9, 2, 1, 8]])

How to find the difference of the maximum and the minimum value in each row?

In [39]:
# Define func1d
def max_minus_min(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    highest = np.max(x)
    lowest = np.min(x)
    return highest - lowest

# axis=1: the function receives each row in turn -> one value per row
print('Row wise: ', np.apply_along_axis(max_minus_min, axis=1, arr=arr_x))

# axis=0: the function receives each column in turn -> one value per column
print('Column wise: ', np.apply_along_axis(max_minus_min, axis=0, arr=arr_x))

Row wise:  [8 8 6 8]
Column wise:  [7 8 2 7 6 5 8 5 5 5]
