# NumPy and pandas
## General
### NumPy
NumPy is used for performing numerical computations on arrays and matrices, such as mean, median, percentiles and linear algebra computations. Simply install numpy with pip `pip3 install numpy`. 

### Pandas
Pandas is used for handling tabular datasets that usually combine different types of data columns (integer, float, nominals, etc). Pandas requires NumPy. To install: `pip3 install pandas`.

## Numpy examples

### The basics

In [4]:
# zeros and ones. array shape
import numpy as np
a = np.zeros((2, 4))
b = np.ones((2, 4))
print(f"a:\n{a}")
print(f"b:\n{b}")
print(f"a+b:\n{a+b}")
print(f"a-2b:\n{a-2*b}")
print(f"shape:\n{a.shape}")

a:
[[0. 0. 0. 0.]
 [0. 0. 0. 0.]]
b:
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]]
a+b:
[[1. 1. 1. 1.]
 [1. 1. 1. 1.]]
a-2b:
[[-2. -2. -2. -2.]
 [-2. -2. -2. -2.]]
shape:
(2, 4)


In [5]:
# creating arrays from lists and array types
import numpy as np
a = np.array([1, 2, 5])
b = np.array([2.0, 10, -1])
print(f"a+b{a + b}")
print(a.dtype)
print(b.dtype)

a+b[ 3. 12.  4.]
int64
float64


In [6]:
# numpy.arange. basic operations
import numpy as np
a = np.arange(0, 20, 5)
b = np.arange(0, 20, 5) - 10
print(f"a:{a}")
print(f"a-10:{a-10}")
print(f"a^2:{a ** 2}")
print(f"a-b:{a-b}")
print(f"cos(b * pi / 20):{np.cos(b * np.pi / 20.0)}")

a:[ 0  5 10 15]
a-10:[-10  -5   0   5]
a^2:[  0  25 100 225]
a-b:[10 10 10 10]
cos(b * pi / 20):[6.12323400e-17 7.07106781e-01 1.00000000e+00 7.07106781e-01]


In [8]:
# element-wise product matrix product
import numpy as np
A = np.array([[0, 2], [1, 1]])
B = np.array([[-1, 1], [1, 1]])
print(f"A .* B =\n {A * B}")     # element-wise
print(f"A * B =\n {A.dot(B)}")  # matrix product

A .* B =
 [[0 2]
 [1 1]]
A * B =
 [[2 2]
 [0 2]]


In [20]:
# reshaping arrays
import numpy as np
x = np.arange(10)
print(x)
print(x.reshape(2, 5))

[0 1 2 3 4 5 6 7 8 9]
[[0 1 2 3 4]
 [5 6 7 8 9]]


### Numpy statistics
The following example reads the temperatures from NYC in the last 150 years on the same day (9th April). The csv file contains 3 rows, namely date, max day temp and min day temp. Date is saved in a list and the two temperatures in numpy arrays:

In [69]:
import csv
import numpy as np

years = []
max_t, min_t = np.array([]), np.array([])
# read the csv file of New York min and max temperatures of 9th April for the last 150 years:
with open('data_ny_temperatures.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    for ir, row in enumerate(reader):
        if ir>0:
            max_t = np.append(max_t, float(row[1]))
            min_t = np.append(min_t, float(row[2]))
            years.append(int(row[0].split('-')[0]))

print(f"Average max-day temperature is {max_t.mean():.1f}")
print(f"Median max-day temperature is {np.median(max_t):.1f}")
print(f"Average max-day temperature is {min_t.mean():.1f}")
print(f"Median max-day temperature is {np.median(min_t):.1f}")

print(f"The maximum max-day temp was {np.max(max_t):.1f} in {years[np.argmax(max_t)]}")
print(f"The maximum min-day temp was {np.max(min_t):.1f} in {years[np.argmax(min_t)]}")
print(f"The minimum max-day temp was {np.min(max_t):.1f} in {years[np.argmin(max_t)]}")
print(f"The minimum max-day temp was {np.min(min_t):.1f} in {years[np.argmin(min_t)]}")

max_t_p_10 = np.percentile(max_t, 10)
max_t_p_90 = np.percentile(max_t, 90)
years_max_10 = [y for i, y in enumerate(years) if max_t[i] < max_t_p_10]
print(years_max_10)
years_max_90 = [y for i, y in enumerate(years) if max_t[i] > max_t_p_90]
print(years_max_90)
min_t_p_10 = np.percentile(min_t, 10)
min_t_p_90 = np.percentile(min_t, 90)
years_min_10 = [y for i, y in enumerate(years) if min_t[i] < min_t_p_10]
print(years_min_10)
years_min_90 = [y for i, y in enumerate(years) if min_t[i] > min_t_p_90]
print(years_min_90)


import plotly
import plotly.graph_objs as go
from scipy import signal
win = signal.hann(25)
max_t_filtered = signal.convolve(max_t, win, mode='same') / sum(win)
max_t_filtered = signal.convolve(max_t_filtered, win, mode='same') / sum(win)
data1 = go.Scatter(x=years, y=max_t)
data2 = go.Scatter(x=years, y=max_t_filtered)
plotly.offline.iplot([data1, data2])


Average max-day temperature is 55.3
Median max-day temperature is 54.0
Average max-day temperature is 39.6
Median max-day temperature is 39.0
The maximum max-day temp was 86.0 in 1991
The maximum min-day temp was 68.0 in 1991
The minimum max-day temp was 39.0 in 1885
The minimum max-day temp was 25.0 in 1977
[1874, 1884, 1885, 1900, 1907, 1911, 1917, 1935, 1974, 1979, 1982, 1996, 1997, 2003]
[1871, 1879, 1921, 1929, 1934, 1945, 1959, 1968, 1970, 1981, 1991, 2001, 2002, 2013]
[1876, 1880, 1885, 1888, 1891, 1900, 1917, 1920, 1950, 1958, 1972, 1977, 1997, 2000]
[1871, 1895, 1915, 1921, 1922, 1929, 1959, 1968, 1970, 1980, 1981, 1991, 2002, 2012, 2013]


A note on speed: if you need to append a large number of elements in a numpy array, it is much faster to append it to a list and then convert the list to numpy array (instead of using the numpy.append() method). And list comprehension is obvioysly even faster. 

In [39]:
import numpy as np
import time

t1 = time.time()
a = np.array([])
for i in range(1, 10000):
    a = np.append(a, i)
t2 = time.time()
print(f"numpy.append(): {1000 * (t2 - t1):.2f} msecs")

t1 = time.time()
a = []
for i in range(1, 10000):
    a.append(i)
a = np.array(a)
t2 = time.time()
print(f"list append and numpy array conversion: {1000 * (t2 - t1):.2f} msecs")

t1 = time.time()
a = [i for i in range(1, 1000)]
a = np.array(a)
t2 = time.time()
print(f"list comprehension and numpy array conversion: {1000 * (t2 - t1):.2f} msecs")

numpy.append(): 87.97 msecs
list append and numpy array conversion: 1.96 msecs
list comprehension and numpy array conversion: 0.22 msecs
