# Introduction to NumPy and pandas

### NumPy

In [111]:
import numpy as np
import pandas as pd


np.random.seed(0)


In [47]:
# Numpy arrays

np.array([1, 2, 3, 4, 5])


array([1, 2, 3, 4, 5])

In [48]:
# Arrays can be created with a given type

np.array([1, 2, 3, 4, 5], dtype=np.float64)


array([1., 2., 3., 4., 5.])

In [49]:
# NumPy casts a newly assigned value to the array's dtype; if the cast is impossible, an error is raised

l = np.array([1, 2, 3, 4, 5])
l[0] = 13.37
l


array([13,  2,  3,  4,  5])

In [50]:
# Assigning a value that cannot be cast to the array's dtype raises a ValueError.
# The error is caught and printed so the notebook still runs from top to bottom.
l = np.array([1, 2, 3, 4, 5])
try:
    l[0] = 'A'
except ValueError as error:
    print(f"{type(error).__name__}: {error}")


ValueError: invalid literal for int() with base 10: 'A'

In [51]:
# Creating Numpy arrays of fixed size

np.zeros(5)


array([0., 0., 0., 0., 0.])

In [52]:
np.ones(5)


array([1., 1., 1., 1., 1.])

In [53]:
# np.empty only allocates memory without initialising it: the values shown are
# whatever happened to be in that memory and must not be relied upon
np.empty(5)


array([1., 1., 1., 1., 1.])

In [54]:
np.arange(5)

array([0, 1, 2, 3, 4])

In [55]:
# Creating arrays with random values

np.random.randint(10, size=5)

array([5, 0, 3, 3, 7])

In [56]:
# Creating multi-dimensional arrays

m = np.ones((3,4))
m

array([[1., 1., 1., 1.],
       [1., 1., 1., 1.],
       [1., 1., 1., 1.]])

In [57]:
#   We can access the array properties as follows

m.ndim


2

In [58]:
m.shape


(3, 4)

In [59]:
# Elements can be accessed like ordinary lists

l = np.arange(5)
l[2]


2

In [60]:
# Chained indexing works like nested lists; m[1, 2] is the idiomatic NumPy form
# and avoids creating the temporary row m[1]
m = np.random.randint(10, size=(3,4))
m[1][2]


6

In [61]:
# Slicing notation can also be used

l = np.arange(5)
l[1:4] # From index 1 (inclusive) to 4 (exclusive)


array([1, 2, 3])

In [62]:
l[::2] # Every second element


array([0, 2, 4])

In [65]:
m = np.random.randint(10, size=(3,4))
m[1:, 0:2] # From row 1 to the end, columns 0 and 1 (the stop index 2 is excluded)


array([[1, 3],
       [7, 0]])

In [66]:
m[::, 3:] # Every row, only last column


array([[8],
       [3],
       [9]])

In [68]:
# Slicing returns a view, not a copy: the view shares its data with the
# original array, so writing through v also changes m (see the next cell)

v = m[::, 3:]
v[0][0] = 42
v

array([[42],
       [ 3],
       [ 9]])

In [69]:
m

array([[ 0,  2,  3, 42],
       [ 1,  3,  3,  3],
       [ 7,  0,  1,  9]])

In [70]:
# To get an independent copy that does not share data with the original, use the copy() method

v = m[::, 3:].copy()

In [71]:
# Performance comparison between python loops and numpy functions

m = np.random.randint(10, size=1000000)  # An array with one million elements


def standard_double(array):
    """Double every element of a 1-D array using an explicit Python loop.

    This is the deliberately slow baseline for the timing comparison with
    ``numpy_double``.

    Parameters
    ----------
    array : numpy.ndarray
        One-dimensional array of numbers.

    Returns
    -------
    numpy.ndarray
        New array with the same length and dtype as ``array`` in which every
        element is twice the corresponding input element.
    """
    # Allocate the result with the input's dtype: np.empty defaults to float64,
    # which would silently turn integer input into floats and make the result
    # differ from what `array * 2` returns.
    output = np.empty(array.size, dtype=array.dtype)
    for i in range(array.size):
        output[i] = array[i] * 2
    return output


def numpy_double(array):
    """Return a new array holding each element of ``array`` multiplied by two.

    The multiplication is a single vectorised NumPy operation, so no Python
    loop is involved.
    """
    doubled = array * 2  # the scalar 2 is applied to every element at once
    return doubled


In [75]:
%timeit standard_double(m)

442 ms ± 36.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [76]:
%timeit numpy_double(m)

1.77 ms ± 148 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [80]:
# We can apply operations element by element between arrays of the same shape

np.array([1, 2, 3]) + np.array([4, 5, 6])


array([5, 7, 9])

In [86]:
# Or operations element by element between an array and a scalar
# This is called 'broadcasting'

np.array([1, 2, 3]) * 2


array([2, 4, 6])

In [85]:
# The last operation is conceptually the same as the following

np.array([1, 2, 3]) * np.array([2, 2, 2])


array([2, 4, 6])

In [88]:
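# Broadcasting is applied when the array shapes are compatible: shapes are compared
# from the rightmost dimension, and two dimensions match when they are equal or one of them is 1
# In this case, (4, 3) and (1, 3) are compatible: 3 equals 3, and the 1 is stretched to 4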

a1 = np.ones((4, 3))
a1


array([[1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.],
       [1., 1., 1.]])

In [87]:
a2 = np.ones((1, 3)) 
a2


array([[1., 1., 1.]])

In [89]:
a1 + a2

array([[2., 2., 2.],
       [2., 2., 2.],
       [2., 2., 2.],
       [2., 2., 2.]])

In [93]:
# In this case, the rightmost dimensions differ (3 vs 4) and neither is 1, so the arrays are not compatible

a3 = np.ones((1, 4))
a3

array([[1., 1., 1., 1.]])

In [94]:
# The shapes (4, 3) and (1, 4) cannot be broadcast, so NumPy raises a ValueError.
# The error is caught and printed so the notebook still runs from top to bottom.
try:
    a1 + a3
except ValueError as error:
    print(f"{type(error).__name__}: {error}")

ValueError: operands could not be broadcast together with shapes (4,3) (1,4) 

In [95]:
# There are built-in functions for basic aggregations

np.arange(10).mean()

4.5

In [96]:
np.ones((4,4)).sum()


16.0

In [101]:
# In case of n-dimensional arrays, aggregations can be applied axis-wise

m = np.array([
    [6, 5, 1, 1],
    [8, 9, 3, 2],
    [9, 3, 8, 5],
    [1, 0, 1, 9]
])


In [102]:
m.sum(axis=0) # Sum along axis 0 (down the rows): one total per column


array([24, 17, 13, 17])

In [103]:
m.sum(axis=1) # Sum along axis 1 (across the columns): one total per row

array([13, 22, 25, 11])

In [105]:
# Broadcasting also applies to comparison operations

l = np.array([1, 2, 3, 4])
l < 3

array([ True,  True, False, False])

In [110]:
# When comparing n-dimensional arrays, their dimensions should be compatible

m = np.array(
    [[1., 5., 9., 13.], 
    [2., 6., 10., 14.], 
    [3., 7., 11., 15.], 
    [4., 8., 12., 16.]]
)
m <= np.array([1, 0, 15, 14])


array([[ True, False,  True,  True],
       [False, False,  True,  True],
       [False, False,  True, False],
       [False, False,  True, False]])

### Pandas

In [113]:
# For one-dimensional data, pandas Series should be used

s = pd.Series([1, 2, 3, 4, 5])
s

0    1
1    2
2    3
3    4
4    5
dtype: int64

In [114]:
# pandas uses NumPy arrays under the hood

type(s.values)


numpy.ndarray

In [116]:
# So indexing works the same

s[0]

1

In [117]:
s[1:3]

1    2
2    3
dtype: int64

In [118]:
# Unlike NumPy arrays, Series maintain an index to label data

s.index

RangeIndex(start=0, stop=5, step=1)

In [120]:
# Indexes can be arbitrary

s = pd.Series([1, 2, 3, 4, 5], index=["a", "b", "c", "d", "e"])
s

a    1
b    2
c    3
d    4
e    5
dtype: int64

In [126]:
# Series can be created from dicts, and the keys are used as indexes

s = pd.Series({"a": 1, "b": 2, "c": 3, "d": 4, "e": 5})
s


a    1
b    2
c    3
d    4
e    5
dtype: int64

In [127]:
# For tabular data, pandas DataFrames should be used

paid = {
    "Louvre Museum": 5988065, 
    "Orsay Museum": 1850092, 
    "Pompidou Centre": 2620481, 
    "National Natural History Museum": 404497
}
free = {
    "Louvre Museum": 4117897, 
    "Orsay Museum": 1436132, 
    "Pompidou Centre": 1070337, 
    "National Natural History Museum": 344572
}
museums = pd.DataFrame({"paid": paid, "free": free})
museums


Unnamed: 0,paid,free
Louvre Museum,5988065,4117897
Orsay Museum,1850092,1436132
Pompidou Centre,2620481,1070337
National Natural History Museum,404497,344572


In [132]:
# In this case, there are 2 indexes, for rows and columns

museums.index


Index(['Louvre Museum', 'Orsay Museum', 'Pompidou Centre',
       'National Natural History Museum'],
      dtype='object')

In [133]:
museums.columns


Index(['paid', 'free'], dtype='object')

In [135]:
# We can use indexing and slicing notation to get subsets of columns or rows:

museums["free"]


Louvre Museum                      4117897
Orsay Museum                       1436132
Pompidou Centre                    1070337
National Natural History Museum     344572
Name: free, dtype: int64

In [137]:
# .loc selects the rows (label slice, both ends included) and the column in a
# single step instead of chaining two indexing operations
museums.loc["Louvre Museum":"Orsay Museum", "paid"]

Louvre Museum    5988065
Orsay Museum     1850092
Name: paid, dtype: int64

In [138]:
# We can write a Boolean condition inside the brackets to match data. 
# This is known as masking:

museums[museums["paid"] > 2000000]


Unnamed: 0,paid,free
Louvre Museum,5988065,4117897
Pompidou Centre,2620481,1070337


In [139]:
# We can easily set new columns with the indexing notation:

museums["total"] = museums["paid"] + museums["free"]
museums

Unnamed: 0,paid,free,total
Louvre Museum,5988065,4117897,10105962
Orsay Museum,1850092,1436132,3286224
Pompidou Centre,2620481,1070337,3690818
National Natural History Museum,404497,344572,749069


In [140]:
# We can apply aggregation operations to rows and columns

museums["total"].sum()


17832073

In [141]:
museums["total"].mean()


4458018.25

In [143]:
# To load CSV files, there's a simple function
# index_col=0 makes the first column of the file the row index

museums = pd.read_csv("./museums.csv", index_col=0)
museums


Unnamed: 0_level_0,paid,free
name,Unnamed: 1_level_1,Unnamed: 2_level_1
Louvre Museum,5988065,4117897
Orsay Museum,1850092,1436132
Pompidou Centre,2620481,1070337
National Natural History Museum,404497,344572


In [144]:
# Saving to CSV is a similar process
# By default to_csv also writes the row index as the first column of the file

museums["total"] = museums["paid"] + museums["free"]
museums.to_csv("museums_with_total.csv")
