## NumPy

### Creating numpy arrays

In [2]:
# Create a 1-D array from a Python list (np.array accepts a tuple as well)
import numpy as np

a = np.array([1, 2, 3])
print(a)

[1 2 3]


In [6]:
# A list of equal-length rows produces a 2-D array (2 rows x 3 columns here).
b = np.array([
    [1, 2, 3],
    [4, 5, 6],
])
print(b)

[[1 2 3]
 [4 5 6]]


In [7]:
# Using built-in constructors: a float array of zeros with 2 rows and 3 columns
a = np.zeros(shape=(2, 3))
a

array([[0., 0., 0.],
       [0., 0., 0.]])

In [8]:
# 2 x 3 float array with every element set to 1.0
a = np.ones(shape=(2, 3))
a

array([[1., 1., 1.],
       [1., 1., 1.]])

In [9]:
# Integers from 1 up to (but not including) 10, in steps of 2
a = np.arange(start=1, stop=10, step=2)
a

array([1, 3, 5, 7, 9])

In [10]:
# 5 evenly spaced values covering the closed interval [0, 1]
a = np.linspace(start=0, stop=1, num=5)
a

array([0.  , 0.25, 0.5 , 0.75, 1.  ])

In [12]:
# Draw from a dedicated, seeded generator so the sampled values are the same
# on every Restart & Run All (the unseeded global state gave new numbers each run).
rng = np.random.default_rng(seed=42)
# 2 x 3 array of floats drawn uniformly from the half-open interval [0, 1)
a = rng.random((2, 3))
a

array([[0.39233558, 0.45666642, 0.35967731],
       [0.4038544 , 0.62777086, 0.50275395]])

### Operations on numpy arrays

In [3]:
# Arithmetic between same-shape arrays is applied element by element.
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])

c = np.add(a, b)  # the ufunc behind the `+` operator
c

array([5, 7, 9])

In [4]:
a = np.array([1, 2, 3])

# Universal functions are evaluated on every element and return a new array.
b, c = np.sin(a), np.log(a)
print(b)
print(c)

[0.84147098 0.90929743 0.14112001]
[0.         0.69314718 1.09861229]


In [7]:
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
])

# No axis given: add up every element of the array.
sum_all = a.sum()
print(sum_all)

# axis=0 is the first dimension (it runs down the rows), so the rows are
# collapsed and one total per column is left.
sum_axis0 = a.sum(axis=0)
print(sum_axis0)

# axis=1 is the second dimension (it runs across the columns), so the columns
# are collapsed and one total per row is left.
sum_axis1 = a.sum(axis=1)
print(sum_axis1)

21
[5 7 9]
[ 6 15]


In [None]:
# Exercise: can you get the mean instead of the sum?

### Indexing and Slicing

In [8]:
# 1D array indexing: positions start at 0, negative positions count from the end
a = np.array([10, 20, 30, 40, 50])
for position in (0, 2, -1):  # first, third and last element
    print(a[position])

10
30
50


In [9]:
# 2D array indexing: a[row, column]
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
for row, col in ((0, 0), (1, 2), (-1, -1)):  # top-left, row 1 / column 2, bottom-right
    print(a[row, col])

1
6
9


In [11]:
# 1D slicing: a[start:stop] keeps index start and stops before index stop
a = np.array([10, 20, 30, 40, 50])
print(a[1:4])  # indices 1, 2 and 3 (index 4 is excluded)
print(a[:3])   # indices 0, 1 and 2 (everything before index 3)
print(a[2:]) # index 2 through the end (index 2 is included)

[20 30 40]
[10 20 30]
[30 40 50]


In [13]:
# 2D slicing: a[row_slice, column_slice]
a = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
print(a[:2, 1:3])  # rows 0-1, columns 1-2
print(a[1:, :2])  # rows 1-2, columns 0-1

[[2 3]
 [5 6]]
[[4 5]
 [7 8]]


### Advanced indexing

In [15]:
# Boolean masking: keep only the elements for which the condition is True
a = np.array([10, 20, 30, 40, 50])
above_25 = a > 25
print(a[above_25])

[30 40 50]


In [16]:
# Integer-array indexing: pass an array of positions to pick exactly those elements
a = np.array([10, 20, 30, 40, 50])
indices = np.array([0, 2, 4])
selected = a[indices]
print(selected)

[10 30 50]


In [17]:
# Mixing indexing and slicing: pick rows 0 and 2, then keep columns 1 and 2 of each
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
row_ids = [0, 2]
print(a[row_ids, 1:3])

[[2 3]
 [8 9]]


### Modifying Arrays with Indexing and Slicing

In [19]:
# Assigning to an index overwrites that element of the array in place.
a = np.array([10, 20, 30, 40, 50])
a[1] = 25  # replace the value at index 1 (20) with 25
a

array([10, 25, 30, 40, 50])

In [20]:
a = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])
# Assigning a scalar to a slice overwrites every selected element:
# here rows 1 onward, columns 0 and 1.
a[1:, :2] = 0
print(a)

[[1 2 3]
 [0 0 6]
 [0 0 9]]


### Reshaping

In [23]:
# A flat array holding the integers 0..8; its shape is the 1-tuple (9,)
a = np.arange(9)
a, a.shape

(array([0, 1, 2, 3, 4, 5, 6, 7, 8]), (9,))

In [24]:
# Lay the 9 elements out as 3 rows x 3 columns; the result is only displayed, `a` is not reassigned
a.reshape(3, 3)

array([[0, 1, 2],
       [3, 4, 5],
       [6, 7, 8]])

### Useful functions 

In [25]:
# dot product: multiply element-wise, then add up the products
a = np.array([1, 2, 3])
b = np.array([1, 1, 1])
a @ b  # for 1-D arrays the @ operator gives the same inner product as np.dot(a, b)

6

In [27]:
# L2 norm: library call next to the manual formula sqrt(sum(a_i ** 2))
np.linalg.norm(a, ord=2), np.sum(a ** 2) ** 0.5

(3.7416573867739413, 3.7416573867739413)

In [28]:
# L1 norm: library call next to the manual formula sum(|a_i|)
np.linalg.norm(a, ord=1), np.sum(np.abs(a))

(6.0, 6)

### Reading csv files with Pandas

In [29]:
import pandas as pd
# Load the ratings file into a DataFrame; the relative path is resolved
# against the current working directory.
df = pd.read_csv("tiny_ratings.csv")
df

Unnamed: 0,user_id,movie_id,rating
0,1,11,5
1,1,12,3
2,2,11,4
3,2,13,2
4,3,12,4
5,3,13,5


In [34]:
# itertuples() yields one lightweight namedtuple per row. Unlike iterrows() it
# does not build a Series for every row, so it is faster and each column keeps
# its own dtype. index=False leaves the row label out of the tuple.
for row in df.itertuples(index=False):
    print(row.user_id, row.movie_id, row.rating)

1 11 5
1 12 3
2 11 4
2 13 2
3 12 4
3 13 5


### Exercise:
Compute the cosine similarity between two vectors. Cosine similarity is the dot product of the two vectors divided by the product of their L2 norms: $\cos(a, b) = \dfrac{a \cdot b}{\lVert a \rVert_2 \, \lVert b \rVert_2}$.

In [40]:
# Input vectors for the exercise
a = np.array([5, 1, 2])
b = np.array([4, 2, 1])


### Concatenate multiple NumPy arrays 
* np.concatenate() concatenates arrays along an existing axis
   > Specify the list of arrays to concatenate <br>
   > Specify the axis to concatenate along: `axis` <br>
* np.stack() concatenates arrays along a new axis

In [44]:
# 2 x 3 array of integer ones (dtype passed explicitly; the default would be float)
a1 = np.ones(shape=(2, 3), dtype=int)
a1

array([[1, 1, 1],
       [1, 1, 1]])

In [43]:
# 2 x 3 array in which every element is the fill value 2
a2 = np.full(shape=(2, 3), fill_value=2)
a2

array([[2, 2, 2],
       [2, 2, 2]])

In [46]:
# axis=0 puts a2 underneath a1: the row counts add up, the column counts must match
a3 = np.concatenate((a1, a2), axis=0)
a1.shape, a2.shape, a3.shape

((2, 3), (2, 3), (4, 3))

In [47]:
a3  # display the current contents of a3

array([[1, 1, 1],
       [1, 1, 1],
       [2, 2, 2],
       [2, 2, 2]])

In [49]:
# axis=1 puts a2 to the right of a1: the column counts add up, the row counts must match
a3 = np.concatenate((a1, a2), axis=1)
a3, a3.shape

(array([[1, 1, 1, 2, 2, 2],
        [1, 1, 1, 2, 2, 2]]),
 (2, 6))

In [50]:
# np.stack joins the arrays along a brand-new axis (axis=0, the default, puts it first)
a3 = np.stack((a1, a2), axis=0)
print(a3.shape)
a3

(2, 2, 3)


array([[[1, 1, 1],
        [1, 1, 1]],

       [[2, 2, 2],
        [2, 2, 2]]])

### Data types
This example demonstrates how changing the data type of a NumPy array can significantly reduce memory usage, especially for large arrays. If the range of values in the array can be accommodated by a smaller data type, this approach can lead to substantial memory savings.

In [55]:
# One million integers in [0, 100), stored with 8 bytes each (int64)
n_elements = 1_000_000
large_random_array = np.random.randint(low=0, high=100, size=n_elements, dtype=np.int64)
default_memory_usage = large_random_array.nbytes  # element count x bytes per element
print(f"Memory usage with default dtype (int64): {default_memory_usage} bytes")

Memory usage with default dtype (int64): 8000000 bytes


In [56]:
# astype() silently wraps values that do not fit into the target type, so make
# sure the whole value range is representable as int8 before downcasting.
int8_limits = np.iinfo(np.int8)
if large_random_array.min() < int8_limits.min or large_random_array.max() > int8_limits.max:
    raise ValueError("large_random_array contains values that do not fit in int8")

# Same values, 1 byte per element instead of 8
small_random_array = large_random_array.astype(np.int8)
memory_usage = small_random_array.nbytes
print(f"Memory usage with int8: {memory_usage} bytes")

Memory usage with int8: 1000000 bytes


### Exercise:
You have an array of data. Your goal is to identify outliers and replace their values with the mean of the data (excluding the outliers).

Generate data using the following command and assume that anything larger than 90 is an outlier:
`data = np.random.randint(0, 101, size=50)`

### Exercise:
Given the following matrix:
```python
matrix = np.array([[75, 82, 90],
                   [62, 70, 85],
                   [90, 88, 92],
                   [80, 78, 84]])
```
Write code that rearranges the rows in descending order of the values in the last column (the row with the largest last-column value goes first).

In [57]:
# Input matrix for the row-reordering exercise (4 rows x 3 columns)
matrix = np.array([[75, 82, 90],
                   [62, 70, 85],
                   [90, 88, 92],
                   [80, 78, 84]])