### Basic NumPy examples

In [None]:
# import NumPy, np is the "standard" alias for numpy
# in conventional data analytics and ML code
import numpy as np

In [811]:
# RUN THIS CODE CELL IN ORDER TO USE THE CUSTOM STYLES
# THE ONLY PURPOSE OF THIS IS TO MAKE NUMPY ARRAYS
# VISUALLY MORE PLEASING
from IPython.core.interactiveshell import InteractiveShell

# Custom HTML formatter for NumPy arrays
# you can customize the colors if you wish!
# NOTE: this can be a bit buggy with some 
# more advanced NumPy operations, try removing
# this custom formatting code if problems arise

# NOTE: you can control the amount of decimals by changing 
# precision value below (9 is enough for the NumPy exercises)
def array_to_html(arr, precision=9):
    """Render a NumPy array as an HTML table string.

    1-D arrays become a single-column table (one value per table row) and
    2-D arrays become a row/column grid. Numeric values are rounded to
    ``precision`` decimals before being converted to text, and every cell
    gets the same width (that of the longest value) so the columns line up.

    Arrays without a table layout here (0-D scalars and arrays with more
    than two dimensions) are returned as their plain ``repr`` wrapped in a
    ``<pre>`` element.

    Parameters
    ----------
    arr : array-like
        Data to render; converted with ``np.asarray``.
    precision : int, default 9
        Number of decimals kept for numeric data.

    Returns
    -------
    str
        HTML markup for the array.
    """
    # Local import keeps this cell self-contained.
    from html import escape

    arr = np.asarray(arr)

    # Scalars and 3-D+ arrays cannot be laid out as rows/columns of cells.
    if arr.ndim == 0 or arr.ndim > 2:
        return f"<pre>{escape(repr(arr))}</pre>"

    # Rounding only makes sense for numeric data; bool/str/object arrays are
    # converted to text unchanged.
    if np.issubdtype(arr.dtype, np.number):
        str_arr = np.round(arr, precision).astype(str)
    else:
        str_arr = arr.astype(str)

    # Width of the longest value; default=0 keeps empty arrays from raising.
    max_width = max(map(len, str_arr.ravel().tolist()), default=0)

    cell_style = (
        "padding: 5px; border: 1px solid black; font-weight: bold; "
        f"width: {max_width}ch; text-align: center;"
    )

    # A vector is shown as one column, i.e. every value is its own table row.
    rows = str_arr[:, np.newaxis] if arr.ndim == 1 else str_arr

    # Collect the pieces and join once instead of growing a string in a loop.
    parts = ["<table style='border: 1px solid black; border-collapse: collapse;'>"]
    for row in rows.tolist():
        cells = "".join(
            f"<td style='{cell_style}'>{escape(val)}</td>" for val in row
        )
        parts.append(f"<tr>{cells}</tr>")
    parts.append("</table>")
    return "".join(parts)

# Register the formatter in IPython so that every np.ndarray shown as a
# cell result is rendered through array_to_html (text/html output).
# The function is passed directly (no lambda wrapper needed), and the
# trailing semicolon hides the return value of for_type() from the cell output.
InteractiveShell.instance().display_formatter.formatters['text/html'].for_type(
    np.ndarray, array_to_html
);

<function __main__.<lambda>(arr)>

### Data generators

In [812]:
# generate a vector (list)
data = np.arange(0, 10)
data

0
0
1
2
3
4
5
6
7
8
9


In [813]:
# we can also define a step size in order to skip numbers
# this example uses a step of 4 (only every 4th number is kept)
# so from 0 up to (but not including) 20 => 0, 4, 8, 12, 16
data = np.arange(0, 20, 4)
data

0
0
4
8
12
16


In [814]:
# NOTE! if you print NumPy data with print()
# you will not see any special styles (because printing
# is now done by Python instead of Jupyter)
print(data)

[ 0  4  8 12 16]


### You can convert conventional Python lists or lists-of-lists into NumPy arrays easily

In [815]:
# np.array is also handy if for some reason your
# data still remains in Python format, and your AI algorithm
# provides an error indicating your data is not in NumPy -format => np.array()
# often resolves this problem (but you might need additional tools, like np.expand_dims())
numbers = [6, 4, 8, 7, 2, 1, 3]
data = np.array(numbers)
data

0
6
4
8
7
2
1
3


In [816]:
# example 2, list of lists
day1 = [-32, -29, -30, -31, -37]
day2 = [-5, -2, 0, 1, -6]
day3 = [-10, -14, -11, -12, -8]

# list of lists (matrix)
temperatures = [day1, day2, day3]

# convert to NumPy -format
data = np.array(temperatures)
data

0,1,2,3,4
-32,-29,-30,-31,-37
-5,-2,0,1,-6
-10,-14,-11,-12,-8


### Some special data generators

In [817]:
# we can generate a collection of zeroes if we want
# this could be used to generate a set of default values
# for a real life dataset => usually 0 => "no"
data = np.zeros(8)
data

0
0.0
0.0
0.0
0.0
0.0
0.0
0.0
0.0


In [818]:
# a matrix version also possible:
data = np.zeros((5, 5))
data

0,1,2,3,4
0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0
0.0,0.0,0.0,0.0,0.0


In [819]:
# we can also change the data type into integer
# astype() is also usable most of the time in pandas
# very handy when the data is in a weird format (from a file etc.)
data = np.zeros((5, 5)).astype(int)
data

0,1,2,3,4
0,0,0,0,0
0,0,0,0,0
0,0,0,0,0
0,0,0,0,0
0,0,0,0,0


In [820]:
# we can generate a collection of ones too
# this could be used to generate a set of default values
# for a real life dataset => usually 1 => "yes"
data = np.ones((5, 5)).astype(int)
data

0,1,2,3,4
1,1,1,1,1
1,1,1,1,1
1,1,1,1,1
1,1,1,1,1
1,1,1,1,1


### linspace => linearly spaced array

In [821]:
# without NumPy, you'd need a Python list,
# a for loop and division calculations

# linspace is often used in visualizations
# (modules like seaborn and matplotlib)
# in order to automatically accommodate the amount
# of data you wish to visualize
data = np.linspace(0, 10, 15)
data

0
0.0
0.714285714
1.428571429
2.142857143
2.857142857
3.571428571
4.285714286
5.0
5.714285714
6.428571429


### Random numbers in NumPy

In [822]:
# random number generation is often used
# in order to create imaginary test data for other modules
# like pandas and seaborn
# very practical if you need a smaller non-sense dataset
# in order to test your advanced data filtering/removal -algorithms
# before applying them into real datasets (to ensure you don't remove too much data)

# matrix version
data = np.random.rand(5, 5)
data

0,1,2,3,4
0.128664847,0.007017896,0.365113092,0.290539269,0.514406629
0.421907013,0.786486937,0.3329849,0.523859284,0.184109958
0.883554265,0.105386679,0.852045319,0.853779335,0.344039779
0.275461749,0.386806491,0.704701077,0.883604697,0.641515348
0.374576795,0.732129002,0.299384576,0.036185065,0.384751369


In [823]:
# vector version, 10 random integers from 1 to 99 (the upper bound 100 is excluded)
data = np.random.randint(1, 100, 10)
data

0
2
28
73
7
38
31
90
51
87
88


In [824]:
# generate random numbers from the standard normal distribution
# the value range is commonly between -3 and +3, but values slightly
# out of bounds are also possible (meaning extreme values => outliers)
data = np.random.randn(5, 5)
data

0,1,2,3,4
0.162602351,0.408238584,-0.953795848,1.876174959,0.074256181
0.611889947,0.640203384,1.569794764,0.605666724,0.015411916
-0.405097393,-1.391876124,-0.248341523,-0.237027409,0.879929273
-1.152812257,0.860504884,-0.910219187,0.026571634,-1.202136996
1.505810166,-1.486100616,1.463196732,-1.172919432,-1.656425359


In [825]:
# in future NumPy -versions the previous number examples
# might be removed, the new recommended way is to 
# use NumPy's random number generator
rng = np.random.default_rng()

# example to replace randn()
data = rng.normal(loc=0.0, scale=1.0, size=(5, 5))
data

0,1,2,3,4
0.752468275,0.060225757,-0.112115429,1.020228381,-0.269344554
0.166757519,0.908247017,-0.24119879,-0.196240535,-1.873428395
-0.364592932,1.098227664,-0.24110286,-0.420298613,-0.630747055
-0.562618661,-0.00887712,-0.687114422,1.245785811,-1.236698352
0.266656576,-0.745564853,1.343097303,-0.074284547,0.102436676


In [826]:
# in future NumPy -versions the previous number examples
# might be removed, the new recommended way is to 
# use NumPy's random number generator
rng = np.random.default_rng()

# example to replace randint()
data = rng.integers(0, 100, size=25).reshape(5, 5)
data

0,1,2,3,4
0,67,84,31,48
78,78,15,91,3
81,87,13,64,54
14,65,57,23,59
84,77,94,77,21


### reshape() -> convert any vector/list into a matrix!

In [827]:
# reshape() is very handy, as long as you have some kind of a vector (list)
# you can reshape it into a matrix easily (no need to remember specialized syntax)

# in this case we have 25 values (0-24) => 5 x 5 matrix (25 values)
# the values have to match, otherwise you'll get an error
data = np.arange(25).reshape(5, 5)
data

0,1,2,3,4
0,1,2,3,4
5,6,7,8,9
10,11,12,13,14
15,16,17,18,19
20,21,22,23,24


In [828]:
# sometimes in ML etc. code, you might need to check
# your current data format in order to adapt it to your algorithm
data.shape

(5, 5)

In [829]:
# sometimes you also need to check what is the datatype
# a common bug => data looks like a number, but it's a text/object instead
data.dtype

dtype('int64')

### NumPy functions and tools

In [830]:
# some example data, 25 random integers from 0 to 99 => 5 x 5 matrix
data = np.random.randint(0, 100, 25).reshape(5, 5)
data

0,1,2,3,4
19,67,11,10,48
29,56,60,20,60
41,61,79,99,45
59,88,93,68,32
21,54,63,79,22


In [831]:
# getting the largest value in the dataset
max_value = data.max()
max_value

np.int32(99)

In [832]:
# getting the smallest value in the dataset
min_value = data.min()
min_value

np.int32(10)

In [833]:
# if you want to remove the np.int32() wrapper
# from the output => just use print()
print(max_value)
print(min_value)

99
10


In [834]:
# argmax gives only a position as if the data was a vector (list)
# not needed commonly, except some specialized ML cases, see below
data.argmax()

np.int64(13)

In [835]:
# imaginary example, but realistic

# in this example, a model has classified hotel feedback
# by whether it has a negative, neutral or positive tone

# indices: 0 => negative, 1 => neutral, 2 => positive

# in AI code, we usually save the actual names of the values 
# since it's easier to understand when our code outputs "Neutral"
# instead of 1
categories = ["Negative", "Neutral", "Positive"]

# this is an imaginary output from a classification hotel feedback model
predictions = [0.37, 0.53, 0.97]

# convert into NumPy -array
data = np.array(predictions)

# get the index with the highest prediction in the list
highest_index = data.argmax()

# get the actual description of this result from the categories
result = categories[highest_index]
print(result)

Positive


### NumPy data indices and selections / selectors ...

In [836]:
# example data
data = np.arange(0, 21, 2)
data

0
0
2
4
6
8
10
12
14
16
18


In [837]:
# single element, just like in Python
data[5]

np.int64(10)

In [838]:
# a sublist of the data, same slicing syntax as in Python
# (named `subset` so that Python's built-in `slice` is not shadowed)
subset = data[3:7]
subset

0
6
8
10
12


In [839]:
# our new test dataset, a matrix
data = np.random.randint(0, 100, 25).reshape(5, 5)
data

0,1,2,3,4
25,31,16,31,22
54,29,31,95,82
31,32,85,17,18
96,40,78,92,15
81,59,75,84,86


In [840]:
# NumPy -version of row/column selection
# this example => row index 3, column index 4 
data[3, 4]

np.int32(15)

In [841]:
# data is now a matrix, so one index => one row

# only the first row
data[0]

0
25
31
16
31
22


In [842]:
# means => rows from the start (index 0) up to, but not including, index 2
# and all columns starting from index 1

# see also the examples in the materials

# see the previously generated matrix for reference
# this is a slice of the original matrix
data[:2, 1:]

0,1,2,3
31,16,31,22
29,31,95,82


### Conditional filtering

In [843]:
# example data
data = np.arange(0, 11)
data

0
0
1
2
3
4
5
6
7
8
9


In [844]:
# the recommended way to filter, use this syntax:
# this example => keep only the values over 5
filtered = data[data > 5]
filtered

0
6
7
8
9
10


In [845]:
# example two, numbers divisible by 2
filtered = data[data % 2 == 0]
filtered

0
0
2
4
6
8
10


### NumPy -slices and broadcasting

In [846]:
# example data
data = np.arange(0, 10)
data

0
0
1
2
3
4
5
6
7
8
9


In [847]:
# broadcast (override) the first elements to be 100
data[0:5] = 100
data

0
100
100
100
100
100
5
6
7
8
9


In [848]:
# let's take a slice of the data into a NEW VARIABLE
# NOTE: the variable name `slice` shadows Python's built-in `slice`;
# the following cell assigns through this same name
slice = data[0:5]
slice

0
100
100
100
100
100


In [849]:
# replace the values only in the slice (not original data)
slice[:] = 150
slice

0
150
150
150
150
150


In [850]:
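# let's check the original data, see how the assignment also
# changed the original data! this is not a bug: a NumPy slice is a view
# of the original array (no copy is made, which saves memory)
# use data[0:5].copy() if you need an independent copy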
data

0
150
150
150
150
150
5
6
7
8
9


### Calculation operations

In [851]:
# our example data, imaginary salaries
salaries = np.arange(2000, 4000, 200)
salaries

0
2000
2200
2400
2600
2800
3000
3200
3400
3600
3800


In [852]:
# let's give a 150 € raise to everyone
salaries = salaries + 150

In [853]:
salaries

0
2150
2350
2550
2750
2950
3150
3350
3550
3750
3950


In [854]:
# let's also give another +10% increase
salaries = salaries * 1.1

In [855]:
salaries

0
2365.0
2585.0
2805.0
3025.0
3245.0
3465.0
3685.0
3905.0
4125.0
4345.0


In [856]:
# broadcasting can be also used as a trick
# to generate certain values

# generate a bunch of ones => multiply all by 7
# => we have a bunch of 7s
data = np.ones(10).astype(int) * 7
data

0
7
7
7
7
7
7
7
7
7
7


### NaN / inf -values in NumPy data

In [857]:
# generate a dataset
data = np.arange(0 ,11)
data

0
0
1
2
3
4
5
6
7
8
9


In [858]:
# NaN and inf-values are problematic, because they cannot be used
# as-is by most AI algorithms

# NaN => not a number => usually a missing value
result = data / data
result

  result = data / data


0
""
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0
1.0


In [859]:
# inf => infinite value => very rare in real datasets
result = 1 / data
result

  result = 1 / data


0
inf
1.0
0.5
0.333333333
0.25
0.2
0.166666667
0.142857143
0.125
0.111111111


### sum() etc. 

In [860]:
# new test data
values = np.arange(0, 100).reshape(10, 10)
values

0,1,2,3,4,5,6,7,8,9
0,1,2,3,4,5,6,7,8,9
10,11,12,13,14,15,16,17,18,19
20,21,22,23,24,25,26,27,28,29
30,31,32,33,34,35,36,37,38,39
40,41,42,43,44,45,46,47,48,49
50,51,52,53,54,55,56,57,58,59
60,61,62,63,64,65,66,67,68,69
70,71,72,73,74,75,76,77,78,79
80,81,82,83,84,85,86,87,88,89
90,91,92,93,94,95,96,97,98,99


In [861]:
# get total of all values
total = np.sum(values)
print(total)

4950


In [862]:
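# get grouped sums based on columns, returns a vector (list)
# axis=0 => sum down the rows => one total per column
# axis=1 => sum across the columns => one total per row
# (pandas uses the same convention: DataFrame.sum(axis=0) also gives one total per column)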
total = np.sum(values, axis=0)
total


0
450
460
470
480
490
500
510
520
530
540


In [863]:
# get grouped sums based on ROWS, returns a vector (list)
total = np.sum(values, axis=1)
total


0
45
145
245
345
445
545
645
745
845
945


In [864]:
# standard deviation is sometimes used to determine
# the dataset optimization quality
# but pandas has better tools for this
deviation = np.std(values)
print(deviation)

28.86607004772212


**Extra: expand_dims() / squeeze()**

In [None]:
# in ML, we sometimes need to modify the data structure to match ML
# algorithm requirements => expand_dims() and squeeze() are useful
predictions = [0.37, 0.53, 0.97]

# convert to NumPy-array
data = np.array(predictions)

# expand dimensions twice and reduce once
data = np.expand_dims(data, axis=1)
data = np.expand_dims(data, axis=1)
data = np.squeeze(data, axis=1)

print(data)

[[0.37]
 [0.53]
 [0.97]]
