In [1]:
# Numpy
# what happens in the background of Pandas
# built on C++
# We could potentially do everything we are about to do ourselves
# would be ineffecient and take a lot of time

In [2]:
# Numpy is a lot more efficient than Python
# ~50x - 100X faster
# Arrays, C++ Arrays = Contiguous memory, optomized searches
# Lists in python - can contain multiple types and aren't "Arrays"
# Numpy Arrays are all one data type, stored in an nd-array



In [5]:
import numpy as np

print(np.__version__)

2.0.0


In [10]:
# Creating an array

arr = np.array([1,2,3,4,5])

arr

array([1, 2, 3, 4, 5])

In [18]:
# Dimensions - n-dimensional array
# 0-3D arrays

# 0D Array AKA Scalar

zero_arr = np.array(42)

zero_arr

# 1D Array AKA Array

one_arr = np.array([1,2,3,4,5])

one_arr

# 2D Array AKA Matrix

two_arr = np.array([[1,2,3], [4,5,6]])

two_arr

# 3D Array AKA Tensor

three_arr = np.array([[[1,2,3],[4,5,6]],[[7,8,9],[10,11,12]]])

three_arr

print(zero_arr.ndim, one_arr.ndim, two_arr.ndim, three_arr.ndim)

0 1 2 3


In [23]:
# Indexing into our nd arrays

# Python indexing - Do not do this
print(two_arr[0][1])

# Numpy indexing
print(two_arr[0, 1])
print(three_arr[0, 1, 2])

2
2
6


In [30]:
# Numpy arrays are a single data type

# Numpy has its own data types

# i - integers up to int64
# b - boolean
# u - unsigned ints
# f - float up to float128
# c - complex
# m - timedelta
# M - datetime
# O - object
# S - string
# U - unicode strings
# V - void type - fixed chunk of memory that is reserved

print(arr.dtype)

arr = np.array([1,2,3,4,5], dtype= 'S')

print(arr.dtype)

new_arr = arr.astype('f')
print(new_arr.dtype)

|S1
|S1
float32


In [35]:
# Checking our matrixes shape

print(arr.shape)
print(two_arr.shape)
print(three_arr.shape)


(5,)
(2, 3)
(2, 2, 3)


In [40]:
# Reshape arrays

arr = np.array([1,2,3,4,5,6,7,8,9,10,11,12])
print(arr)
new_arr = arr.reshape(4,3)
print(new_arr)


[ 1  2  3  4  5  6  7  8  9 10 11 12]
[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [43]:
# If we don't know full dimensions, we can sub in -1

new_arr = arr.reshape(-1,3)
print(new_arr)

[[ 1  2  3]
 [ 4  5  6]
 [ 7  8  9]
 [10 11 12]]


In [46]:
# To flatten arrays

arr = new_arr.reshape(-1)
print(arr)

[ 1  2  3  4  5  6  7  8  9 10 11 12]


In [51]:
# Iterating through arrays

for x in np.nditer(three_arr):
    print(x)

1
2
3
4
5
6
7
8
9
10
11
12


In [54]:
# Joining arrays
# Join based on axis
# axis = 0 is based on rows, axis = 1 is based on columns

arr1 = np.array([[1,2],[3,4]])
arr2 = np.array([[5,6],[7,8]])
arr = np.concatenate((arr1, arr2), axis= 0)
print(arr)

[[1 2]
 [3 4]
 [5 6]
 [7 8]]


In [57]:
# Join based on columns

arr = np.concatenate((arr1,arr2), axis=1)
print(arr)

[[1 2 5 6]
 [3 4 7 8]]


In [62]:
# Stacking
# stack(), hstack(), dstack()


arr1 = np.array([1,2,3])
arr2 = np.array([4,5,6])
arr = np.stack((arr1,arr2))
print(arr)

arr = np.hstack((arr1, arr2))
print(arr)

arr = np.dstack((arr1,arr2))
print(arr)

[[1 2 3]
 [4 5 6]]
[1 2 3 4 5 6]
[[[1 4]
  [2 5]
  [3 6]]]


In [65]:
# Split array into n parts

arr = np.array([1, 2, 3, 4, 5])
new_arr = np.array_split(arr, 3)
print(new_arr)

[array([1, 2]), array([3, 4]), array([5])]


In [69]:
# Searching array - where() method

arr =  np.array([1,2,3,4,5,6,4,4])
x = np.where(arr == 4)
print(x)

(array([3, 6, 7]),)


In [72]:
# searchsorted() - performs binary search

arr =  np.array([1,2,3,4,5,6,9,10])
x = np.searchsorted(arr, 6)
print(x)

5


In [74]:
# Sorting arrays
# Can still use python sort()
# np.sort() -more efficient

arr = np.array([[5,2,1],[6,3,9]])
print(np.sort(arr))

[[1 2 5]
 [3 6 9]]


In [78]:
# Filtering arrays
# Filter based off condition or boolean array

arr = np.array([40,41,42,430])

x = [False, True, False, True]

filtered_arr = arr[x]
print(filtered_arr)

[ 41 430]


In [84]:
# Copies vs Views
# When we work with Big Data we need to carefully consider what we copy
# 4 GB file -> 8 GB
# 11 GB file -> 22 GB
# Use a view
# view is a reference, doesn't copy array just references it

arr = np.array([1,2,3,4])
view = arr.view()
copy = arr.copy()
arr[0] = 15

print(arr)
print(view)
print(copy)

# .base shows what an objcect is based off of (where the reference is pointing)
print(view.base)
print(copy.base)

[15  2  3  4]
[15  2  3  4]
[1 2 3 4]
[15  2  3  4]
None


In [85]:
import time

In [91]:
# Use Python List
start = time.time()
list = [i for i in range(1_000_000)]
squared = [x**2 for x in list]
stop = time.time()

print(stop-start)

0.4637181758880615


In [99]:
# Use numpy array
start = time.time()
arr = np.arange(1_000_000)
squared = arr**2
stop = time.time()

print(stop-start)

0.00997304916381836
