# Vital Functions for Data Analysis

In [None]:
import numpy as np

#### 1. How to get index locations that satisfy a given condition ?

In [None]:
# Create a 1-D integer array; the following cells in this section query it
arr_rand = np.array([8, 8, 3, 7, 7, 0, 4, 2, 5, 2])
print("Array: ", arr_rand)

In [None]:
# np.where locates the positions in the array where a given condition holds true.
# Called with only a condition, it returns a tuple holding one index array per
# dimension, so for this 1-D input the result is a 1-tuple.

index_gt5 = np.where(arr_rand > 5) # Positions where value > 5
print("Positions where value > 5: ", index_gt5)

In [None]:
# Take items at given index.
# index_gt5 is the 1-tuple returned by np.where, so take() treats it as a
# (1, k) index array and the result is 2-D; arr_rand[index_gt5] would give
# the same values as a flat 1-D array.
arr_rand.take(index_gt5)

In [None]:
# Three-argument form of np.where: element-wise choice.
# If value > 5, then yield 'gt5' else 'le5'; the result has the same shape as arr_rand.
np.where(arr_rand > 5, 'gt5', 'le5')

In [None]:
# Location of the max (when the maximum occurs more than once, the first position is reported)
print('Position of max value: ', np.argmax(arr_rand))  

# Location of the min (same first-occurrence rule on ties)
print('Position of min value: ', np.argmin(arr_rand))

#### 2. How to import and export data as a csv file?

In [None]:
# Import data from csv file url (needs network access: `path` is a remote URL).
# skip_header=1 drops the header row, dtype='float' asks for every column to be
# parsed as a float, and filling_values=-999 is the placeholder used for
# missing values.
path = 'https://raw.githubusercontent.com/selva86/datasets/master/Auto.csv'
data = np.genfromtxt(path, delimiter=',', skip_header=1, filling_values=-999, dtype='float')
data[:3]  # see first 3 rows

As an example, let’s read a .csv file from a URL. Since all elements in a numpy array must be of the same data type, the last column, which contains text, will be imported as ‘nan’ by default.

By setting the 'filling_values' argument you can replace the missing values with something else.

How to handle datasets that have both numeric and text columns?

If you MUST keep the text column as it is, without replacing it with a placeholder, you can set the dtype to either ‘object’ or None.

In [None]:
# dtype='object' stores the fields as Python objects rather than floats, so the
# text column is kept instead of being replaced by a placeholder.
data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype='object')
data2[:3]  # see first 3 rows

In [None]:
# dtype=None lets genfromtxt determine the dtype of each column individually
# from its contents.
data3 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype=None)
data3[:3]  # see first 3 rows

In [None]:
# Save the array as a csv file ("out.csv" is a relative path, so it is written
# to the current working directory)
np.savetxt("out.csv", data, delimiter=",")

#### 3. How to concatenate two numpy arrays column wise and row wise

There are 3 different ways of concatenating two or more numpy arrays.

Method 1: np.concatenate by changing the axis parameter to 0 and 1

Method 2: np.vstack and np.hstack

Method 3: np.r_ and np.c_

In [None]:
# Two 4x4 arrays that are easy to tell apart once stacked: all zeros and all ones
a = np.zeros([4, 4])
b = np.ones([4, 4])
print(a)
print(b)

In [None]:
# Vertical Stack Equivalents (Row wise)
# Only the last expression of a cell is displayed, so the first two forms are
# checked against it instead of being computed and silently discarded.
stacked_rows = np.r_[a, b]
assert np.array_equal(np.concatenate([a, b], axis=0), stacked_rows)
assert np.array_equal(np.vstack([a, b]), stacked_rows)
stacked_rows

In [None]:
# Horizontal Stack Equivalents (Column wise)
# Only the last expression of a cell is displayed, so the first two forms are
# checked against it instead of being computed and silently discarded.
stacked_cols = np.c_[a, b]
assert np.array_equal(np.concatenate([a, b], axis=1), stacked_cols)
assert np.array_equal(np.hstack([a, b]), stacked_cols)
stacked_cols

In [None]:
# Create more complex number sequences in 1d arrays: np.r_ concatenates the
# lists and scalars in the order given, here yielding 1, 2, 3, 0, 0, 4, 5, 6.
np.r_[[1,2,3], 0, 0, [4,5,6]]

#### 4. How to sort a numpy array based on one or more columns?

In [None]:
# Seed the legacy global generator so this random 8x4 array (integers 1..5,
# the upper bound 6 is exclusive) is identical on every Restart & Run All.
np.random.seed(100)
arr = np.random.randint(1,6, size=[8, 4])
arr

In [None]:
# Sort each column of arr independently (np.sort returns a sorted copy).
# Values move between rows, so the original row groupings are not preserved.
np.sort(arr, axis=0)

4.1 How to sort a numpy array based on 1 column using argsort?

np.argsort returns the index positions that would make a given 1d array sorted.

In [None]:
# Get the index positions that would sort the array:
# sort_index[i] is the position in x of the i-th smallest value
x = np.array([1, 10, 5, 2, 8, 9])
sort_index = np.argsort(x)
print(sort_index)

In [None]:
# Indexing with the argsort result yields x in ascending order
x[sort_index]

In [None]:
# Argsort the first column: row indices ordered by ascending value in column 0
sorted_index_1stcol = arr[:, 0].argsort()
sorted_index_1stcol

In [None]:
# Sort 'arr' by first column without disturbing the integrity of rows:
# indexing with the row order moves whole rows, so each row's values stay together
arr[sorted_index_1stcol]

In [None]:
# Descending sort: walk the ascending row order backwards
arr[sorted_index_1stcol[::-1]]

4.2 How to sort a numpy array based on 2 or more columns?

In [None]:
# Sort by column 0, then by column 1.
# np.lexsort treats the LAST key in the tuple as the primary sort key, which is
# why column 0 is listed after column 1.
lexsorted_index = np.lexsort((arr[:, 1], arr[:, 0])) 
lexsorted_index

In [None]:
# Reorder whole rows using the lexsort result
arr[lexsorted_index]

#### 5. Working with dates

Numpy implements dates through the np.datetime64 object, which supports precision down to nanoseconds.

You can create one using a standard YYYY-MM-DD formatted date string.

In [None]:
# Create a datetime64 object from a date-time string (date plus a time of day)
date64 = np.datetime64('2019-06-17 10:10:10')
date64

In [None]:
# Drop the time part from the datetime64 object by converting it to day ('D') resolution
dt64 = np.datetime64(date64, 'D')
dt64

In [None]:
# Create the timedeltas (individual units of time)
tenminutes = np.timedelta64(10, 'm')  # 10 minutes
tenseconds = np.timedelta64(10, 's')  # 10 seconds
tennanoseconds = np.timedelta64(10, 'ns')  # 10 nanoseconds

# A bare integer is counted in dt64's own unit, which is days here
print('Add 10 days: ', dt64 + 10)
# Adding a finer-grained timedelta yields a result expressed in that finer unit
print('Add 10 minutes: ', dt64 + tenminutes)
print('Add 10 seconds: ', dt64 + tenseconds)
print('Add 10 nanoseconds: ', dt64 + tennanoseconds)

In [None]:
# Convert np.datetime64 back to a string (dt64 has day resolution, so only the date is rendered)
np.datetime_as_string(dt64)

In [None]:
print('Date: ', dt64)
# No weekmask or holidays are passed, so np.is_busday uses its defaults
# (Monday to Friday valid, no holidays)
print("Is it a business day?: ", np.is_busday(dt64))  

In [None]:
# Create date sequence: np.arange excludes the stop value, so this covers
# 2019-06-17 through 2019-06-26 in one-day steps
dates = np.arange(np.datetime64('2019-06-17'), np.datetime64('2019-06-27'))
print(dates)

# Check if its a business day (element-wise: one boolean per date)
np.is_busday(dates)

#### 6. vectorize – Make a scalar function work on vectors

The function `foo` defined below works perfectly when applied to a scalar (an individual number), but fails when applied to an array.

In [None]:
# Define a scalar function
def foo(x):
    """Square odd inputs and halve everything else.

    Written for a single number: returns ``x**2`` when ``x % 2 == 1``,
    otherwise ``x/2`` (true division).
    """
    is_odd = (x % 2 == 1)
    return x**2 if is_odd else x/2

# On a scalar
print('x = 10 returns ', foo(10))
print('x = 11 returns ', foo(11))

# On a vector, doesn't work: `list % int` is not defined, so foo raises TypeError.
# The error is the point of this demo, so catch and print it; an uncaught
# exception would stop Restart & Run All at this cell.
try:
    print('x = [10, 11, 12] returns ', foo([10, 11, 12]))
except TypeError as err:
    print('x = [10, 11, 12] fails with TypeError: ', err)

In [None]:
# Vectorize foo(). Make it work on vectors.
# np.vectorize wraps foo so it is called once per element; otypes=[float]
# fixes the output dtype to float. It is a convenience wrapper (essentially a
# loop), not a performance optimization.
foo_v = np.vectorize(foo, otypes=[float])

print('x = [10, 11, 12] returns ', foo_v([10, 11, 12]))
print('x = [[10, 11, 12], [1, 2, 3]] returns ', foo_v([[10, 11, 12], [1, 2, 3]]))

Apply_along_axis – Apply a function column wise or row wise

In [None]:
# Create a 4x10 random array of integers from 1 to 9 (the upper bound 10 is
# exclusive); the fixed seed makes the values reproducible
np.random.seed(100)
arr_x = np.random.randint(1,10,size=[4,10])
arr_x

How to find the difference of the maximum and the minimum value in each row?

In [None]:
# Define func1d
def max_minus_min(x):
    """Return the spread of ``x``: its largest value minus its smallest."""
    largest = np.max(x)
    smallest = np.min(x)
    return largest - smallest

# Apply along the rows: axis=1 hands each row of arr_x to max_minus_min as a 1-D slice
print('Row wise: ', np.apply_along_axis(max_minus_min, 1, arr=arr_x))

# Apply along the columns: axis=0 hands each column of arr_x to max_minus_min
print('Column wise: ', np.apply_along_axis(max_minus_min, 0, arr=arr_x))