In [1]:
import numpy as np

In [3]:
# get index locations that satisfy a given condition using np.where

In [2]:
# Create an array

arr_rand = np.array([8, 8, 3, 7, 7, 1, 4, 2, 5, 2, 0])
print("Array: ", arr_rand)

Array:  [8 8 3 7 7 1 4 2 5 2 0]


In [3]:
# Positions where value > 5
index_gt5 = np.where(arr_rand > 5)
print("Positions where value > 5: ", index_gt5)

Positions where value > 5:  (array([0, 1, 3, 4], dtype=int64),)


In [5]:
# Extract the matching values using the array's take method.

# Take items at the given index positions.
# NOTE: index_gt5 is the one-element tuple returned by np.where, so take()
# yields a 2-D result of shape (1, 4) rather than a flat array; indexing with
# index_gt5[0] would give the flat 1-D equivalent.
arr_rand.take(index_gt5)

array([[8, 8, 7, 7]])

In [19]:
# If value > 5, then yield 'gt5' else 'le5'
np.where(arr_rand > 5, 'gt5', 'le5')

array(['gt5', 'gt5', 'le5', 'gt5', 'gt5', 'le5', 'le5', 'le5', 'le5',
       'le5', 'le5'],
      dtype='<U3')

In [20]:
# Location of the max
print('Position of max value: ', np.argmax(arr_rand))  

Position of max value:  0


In [21]:
# Location of the min
print('Position of min value: ', np.argmin(arr_rand))  

Position of min value:  10


In [4]:
# import and export data as a csv file
# np.genfromtxt function. It can import datasets from web URLs, 
# handle missing values, multiple delimiters, handle irregular number of columns etc.

# Turn off scientific notation
np.set_printoptions(suppress=True)  

In [5]:
# Import data from csv file url
path = 'https://raw.githubusercontent.com/selva86/datasets/master/Auto.csv'

In [6]:
data = np.genfromtxt(path, delimiter=',', skip_header=1, filling_values=-999, dtype='float')

In [7]:
# see first 3 rows
data[:3]

array([[  18. ,    8. ,  307. ,  130. , 3504. ,   12. ,   70. ,    1. ,
        -999. ],
       [  15. ,    8. ,  350. ,  165. , 3693. ,   11.5,   70. ,    1. ,
        -999. ],
       [  18. ,    8. ,  318. ,  150. , 3436. ,   11. ,   70. ,    1. ,
        -999. ]])

In [8]:
data.shape

(392, 9)

In [None]:
# Notice that all the values in the last column are the same: -999.

# That happened because of `dtype='float'`.
# The last column in the file contains text values, and since all the values in a numpy array
# have to be of the same `dtype`, `np.genfromtxt` could not convert them to float and
# used the `filling_values` placeholder (-999) instead.


In [None]:
# How to handle datasets that have both number and text columns?
# Set the dtype to 'object' or to None.

In [9]:
# Option 1: dtype='object'. This result is overwritten by the next statement
# and is never displayed.
data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype='object')
# Option 2: dtype=None. The displayed result is a structured array with one
# field per column (f0..f8), each with its own type; the text column is kept
# as byte strings.
data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype=None)
data2[:3]  # see first 3 rows

  data2 = np.genfromtxt(path, delimiter=',', skip_header=1, dtype=None)


array([(18., 8, 307., 130, 3504, 12. , 70, 1, b'"chevrolet chevelle malibu"'),
       (15., 8, 350., 165, 3693, 11.5, 70, 1, b'"buick skylark 320"'),
       (18., 8, 318., 150, 3436, 11. , 70, 1, b'"plymouth satellite"')],
      dtype=[('f0', '<f8'), ('f1', '<i4'), ('f2', '<f8'), ('f3', '<i4'), ('f4', '<i4'), ('f5', '<f8'), ('f6', '<i4'), ('f7', '<i4'), ('f8', 'S38')])

In [10]:
# Save the array as a csv file
np.savetxt("out.csv", data, delimiter=",")

In [None]:
# concatenate two numpy arrays columnwise and row wise
# There are 3 different ways of concatenating two or more numpy arrays.

# Method 1: np.concatenate by changing the axis parameter to 0 and 1
# Method 2: np.vstack and np.hstack
# Method 3: np.r_ and np.c_

In [11]:
a = np.zeros([4, 4])
b = np.ones([4, 4])
print(a)
print(b)

[[0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]
 [0. 0. 0. 0.]]
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]
 [1. 1. 1. 1.]]


In [13]:
# stack the arrays vertically.

# Vertical Stack Equivalents (Row wise)
np.concatenate([a, b], axis=0)  
 

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])

In [14]:
np.vstack([a,b])  


array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])

In [32]:
np.r_[a,b] 

array([[ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 0.,  0.,  0.,  0.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.],
       [ 1.,  1.,  1.,  1.]])

In [None]:
# horizontally (columns wise)

In [16]:
# Horizontal Stack Equivalents (Column wise)
np.concatenate([a, b], axis=1) 

array([[ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.]])

In [34]:
np.hstack([a,b])  

array([[ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.]])

In [35]:
np.c_[a,b]

array([[ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.],
       [ 0.,  0.,  0.,  0.,  1.,  1.,  1.,  1.]])

In [37]:
# How to sort a numpy array based on one or more columns?
# Build an 8x4 array of random integers (low=1 inclusive, high=6 exclusive).
# NOTE: no random seed is set before this call, so re-running the cell will
# produce different values than the output recorded in this notebook.
arr = np.random.randint(1,6, size=[8, 4])
arr

array([[4, 5, 4, 5],
       [1, 4, 3, 3],
       [2, 3, 4, 3],
       [1, 2, 3, 1],
       [4, 3, 3, 2],
       [5, 3, 1, 3],
       [2, 5, 3, 3],
       [2, 2, 3, 4]])

In [38]:
# With np.sort and axis=0, all the columns are sorted in ascending order independently
# of each other.

In [40]:
# Sort each column of arr independently.
# This compromises the integrity of the rows: in simple terms, the values in each row
# get mixed with values from other rows.
np.sort(arr, axis=0)

array([[1, 2, 1, 1],
       [1, 2, 3, 2],
       [2, 3, 3, 3],
       [2, 3, 3, 3],
       [2, 3, 3, 3],
       [4, 4, 3, 3],
       [4, 5, 4, 4],
       [5, 5, 4, 5]])

In [41]:
# sort a numpy array based on 1 column using argsort?

# first understand what np.argsort does

In [None]:
# np.argsort returns the index positions that would make a given 1d array sorted.

In [42]:
# Get the index positions that would sort the array
x = np.array([1, 10, 5, 2, 8, 9])

sort_index = np.argsort(x)
print(sort_index)

[0 3 2 4 5 1]


In [None]:
# In array ‘x’, the 0th item is the smallest, 3rd item is the second smallest and so on.

In [43]:
x[sort_index]

array([ 1,  2,  5,  8,  9, 10])

In [None]:
# do an argsort on the 1st column and use the resulting index positions to sort arr.

In [44]:
# Argsort the first column
sorted_index_1stcol = arr[:, 0].argsort()

In [45]:
# Sort 'arr' by first column without disturbing the integrity of rows
arr[sorted_index_1stcol]

array([[1, 4, 3, 3],
       [1, 2, 3, 1],
       [2, 3, 4, 3],
       [2, 5, 3, 3],
       [2, 2, 3, 4],
       [4, 5, 4, 5],
       [4, 3, 3, 2],
       [5, 3, 1, 3]])

In [None]:
# To sort it in decreasing order, simply reverse the argsorted index.

In [49]:
# Descending sort
arr[sorted_index_1stcol[::-1]]

array([[5, 3, 1, 3],
       [4, 3, 3, 2],
       [4, 5, 4, 5],
       [2, 2, 3, 4],
       [2, 5, 3, 3],
       [2, 3, 4, 3],
       [1, 2, 3, 1],
       [1, 4, 3, 3]])

In [50]:
sorted_index_1stcol

array([1, 3, 2, 6, 7, 0, 4, 5], dtype=int64)

In [51]:
sorted_index_1stcol[::-1]

array([5, 4, 0, 7, 6, 2, 3, 1], dtype=int64)

In [54]:
np.flip(sorted_index_1stcol, 0)

array([5, 4, 0, 7, 6, 2, 3, 1], dtype=int64)

In [62]:
sorted_index_1stcol[: :-1]

array([5, 4, 0, 7, 6, 2, 3, 1], dtype=int64)

In [63]:
# Slicing with a step: seq[::y] picks every y-th element of a list / array.
# NOTE: this rebinds `a`, which earlier in the notebook held the 4x4 zeros
# array used by the concatenation cells; re-running those cells after this
# one would operate on this list instead.
a = [1,2,3,4,5,6,7,8,9]

a[::3]

[1, 4, 7]

In [64]:
# reverses
a[:: -1]

[9, 8, 7, 6, 5, 4, 3, 2, 1]

In [None]:
# sort a numpy array based on 2 or more columns?

# You can do this using np.lexsort by passing a tuple of columns based on which the 
# array should be sorted.

In [None]:
# place the column to be sorted first at the rightmost side inside the tuple.

In [65]:
# Sort by column 0, then by column 1
lexsorted_index = np.lexsort((arr[:, 1], arr[:, 0])) 

In [66]:
arr[lexsorted_index]

array([[1, 2, 3, 1],
       [1, 4, 3, 3],
       [2, 2, 3, 4],
       [2, 3, 4, 3],
       [2, 5, 3, 3],
       [4, 3, 3, 2],
       [4, 5, 4, 5],
       [5, 3, 1, 3]])

In [None]:
# Working with dates
# Numpy implements dates through the np.datetime64 object, which supports precision down to
# nanoseconds.
# You can create one from a standard YYYY-MM-DD formatted date string.

In [67]:
# Create a datetime64 object
date64 = np.datetime64('2018-02-04 23:10:10')
date64

numpy.datetime64('2018-02-04T23:10:10')

In [68]:
# Drop the time part from the datetime64 object
dt64 = np.datetime64(date64, 'D')
dt64

numpy.datetime64('2018-02-04')

In [69]:
# Create the timedeltas (individual units of time)
tenminutes = np.timedelta64(10, 'm')  # 10 minutes
tenseconds = np.timedelta64(10, 's')  # 10 seconds
tennanoseconds = np.timedelta64(10, 'ns')  # 10 nanoseconds

# Adding a plain integer advances dt64 in its own unit, which is days here.
print('Add 10 days: ', dt64 + 10)
# Adding a finer-grained timedelta gives a result expressed in that finer
# unit (minutes, seconds, nanoseconds), as the printed values show.
print('Add 10 minutes: ', dt64 + tenminutes)
print('Add 10 seconds: ', dt64 + tenseconds)
print('Add 10 nanoseconds: ', dt64 + tennanoseconds)

Add 10 days:  2018-02-14
Add 10 minutes:  2018-02-04T00:10
Add 10 seconds:  2018-02-04T00:00:10
Add 10 nanoseconds:  2018-02-04T00:00:00.000000010


In [70]:
# Convert np.datetime64 back to a string
np.datetime_as_string(dt64)

'2018-02-04'

In [71]:
# create a sequence of dates?
# Create date sequence
dates = np.arange(np.datetime64('2018-02-01'), np.datetime64('2018-02-10'))
print(dates)

# Check if its a business day
np.is_busday(dates)

['2018-02-01' '2018-02-02' '2018-02-03' '2018-02-04' '2018-02-05'
 '2018-02-06' '2018-02-07' '2018-02-08' '2018-02-09']


array([ True,  True, False, False,  True,  True,  True,  True,  True], dtype=bool)

In [None]:
# vectorize – Make a scalar function work on vectors


In [12]:
# Define a scalar function
def foo(x):
    """Return the square of ``x`` when it is odd, otherwise half of ``x``.

    This is a scalar-only function: the parity check is a plain Python
    condition, so passing a list raises ``TypeError`` on the ``%`` operation.

    Parameters
    ----------
    x : number
        A single numeric value.

    Returns
    -------
    number
        ``x**2`` if ``x % 2 == 1``; ``x/2`` (true division) otherwise.
    """
    is_odd = (x % 2 == 1)
    return x**2 if is_odd else x/2

In [73]:
# On a scalar
print('x = 10 returns ', foo(10))
print('x = 11 returns ', foo(11))

x = 10 returns  5.0
x = 11 returns  121


In [75]:
# On a vector, doesn't work
print('x = [10, 11, 12] returns ', foo([10, 11, 12]))  # Error

TypeError: unsupported operand type(s) for %: 'list' and 'int'

In [76]:
# Vectorize foo(). Make it work on vectors.
foo_v = np.vectorize(foo, otypes=[float])

In [78]:
print('x = [10, 11, 12] returns \n',              foo_v([10, 11, 12]))
print('x = [[10, 11, 12], [1, 2, 3]] returns \n', foo_v([[10, 11, 12], [1, 2, 3]]))

x = [10, 11, 12] returns 
 [   5.  121.    6.]
x = [[10, 11, 12], [1, 2, 3]] returns 
 [[   5.  121.    6.]
 [   1.    1.    9.]]


In [None]:
# apply_along_axis – Apply a function column wise or row wise

In [79]:
# Create a 4x10 random array
np.random.seed(100)
arr_x = np.random.randint(1,10,size=[4,10])
arr_x

array([[9, 9, 4, 8, 8, 1, 5, 3, 6, 3],
       [3, 3, 2, 1, 9, 5, 1, 7, 3, 5],
       [2, 6, 4, 5, 5, 4, 8, 2, 2, 8],
       [8, 1, 3, 4, 3, 6, 9, 2, 1, 8]])

In [80]:
# find the difference of the maximum and the minimum value in each row?

In [81]:
# the normal approach would be to write a for-loop that iterates along each row and 
# then compute the max-min in each iteration.

In [None]:
# That sounds alright but it can get cumbersome if you want to do the same column wise

In [None]:
# using the numpy.apply_along_axis

In [None]:
# It takes as arguments:

# Function that works on a 1D vector (func1d)
# Axis along which to apply func1d. For a 2D array, 1 is row wise and 0 is column wise.
# Array on which func1d should be applied.

In [82]:
# Define func1d
def max_minus_min(x):
    """Return the spread of ``x``: its largest value minus its smallest.

    Intended as the 1-D function handed to ``np.apply_along_axis``.

    Parameters
    ----------
    x : array_like
        Values to reduce.

    Returns
    -------
    scalar
        ``np.max(x) - np.min(x)``.
    """
    highest = np.max(x)
    lowest = np.min(x)
    return highest - lowest

In [83]:
# Apply along the rows
print('Row wise: ', np.apply_along_axis(max_minus_min, 1, arr=arr_x))

Row wise:  [8 8 6 8]


In [84]:
# Apply along the columns
print('Column wise: ', np.apply_along_axis(max_minus_min, 0, arr=arr_x))

Column wise:  [7 8 2 7 6 5 8 5 5 5]


In [None]:
# What is missing in numpy?
# So far we have covered a good number of techniques to do data manipulations with numpy. 
# But there are a considerable number of things you can’t do with numpy directly. 

# No direct function to merge two 2D arrays based on a common column.
# No direct way to create pivot tables.
# No direct way of doing 2D cross tabulations.
# No direct method to compute statistics (like mean) grouped by unique values in an array.
# And more..

# pandas is the answer.