# NumPy and pandas
## General
### NumPy
NumPy is used for performing numerical computations on arrays and matrices, such as mean, median, percentiles and linear algebra computations. To install it with pip: `pip3 install numpy`.

### Pandas
Pandas is used for handling tabular datasets that usually combine columns of different data types (integers, floats, nominal values, etc.). Pandas requires NumPy. To install: `pip3 install pandas`.

## Numpy examples

### The basics

In [1]:
# arrays filled with zeros / ones, basic arithmetic on them, and the array shape
import numpy as np
a = np.zeros((2, 4))
b = np.ones((2, 4))
# every result is printed as "<label>:" followed by the value on the next line
for label, value in (
    ("a", a),
    ("b", b),
    ("a+b", a + b),
    ("a-2b", a - 2 * b),
    ("shape", a.shape),
):
    print(f"{label}:\n{value}")

AttributeError: module 'numpy' has no attribute 'zeros'

In [None]:
# build arrays from Python lists and inspect the resulting element types
import numpy as np
a = np.array([1, 2, 5])        # integers only
b = np.array([2.0, 10, -1])    # contains a float
a_plus_b = a + b
print(f"a+b{a_plus_b}")
# dtype of each operand and of their sum
for arr in (a, b, a_plus_b):
    print(arr.dtype)

In [None]:
# astype() gives back the array converted to the requested element type
import numpy as np
b = np.array([2.1, 10, -5])
# NOTE(review): -5 lies outside the uint8 range, so the value printed for it
# is presumably not meaningful - confirm this is the intended illustration
b_reduced = b.astype(np.uint8)
for arr in (b, b_reduced):
    print(arr.dtype)
print(b_reduced)

In [None]:
import numpy as np
# two small 8-bit unsigned matrices whose element-wise sum exceeds 255 in places
x = np.array([[200, 100], [100, 200]], dtype='uint8')
y = np.array([[255, 100], [100, 255]], dtype='uint8')
print(x)
print(y)
# adding in uint8 wraps around instead of producing the true sum
wrapped_sum = x + y
print(wrapped_sum)
print("(Result is overfloated!)")
print("\n Results with type conversion:")
# converting both operands to a wider integer type first gives the true sum
wide_sum = x.astype(np.int32) + y.astype(np.int32)
print(wide_sum)

In [None]:
# np.arange() and element-wise arithmetic on the arrays it produces
import numpy as np
a = np.arange(0, 20, 5)
b = a - 10    # same values as np.arange(0, 20, 5) - 10
print(f"a:{a}")
print(f"a-10:{b}")
print(f"a^2:{a * a}")
print(f"a-b:{a - b}")
angles = b * np.pi / 20.0
print(f"cos(b * pi / 20):{np.cos(angles)}")

In [None]:
# element-wise product versus matrix product of two 2x2 arrays
import numpy as np
A = np.array([[0, 2], [1, 1]])
B = np.array([[-1, 1], [1, 1]])
# the * operator multiplies element by element
print(f"A .* B =\n {A * B}")
# the @ operator (same as A.dot(B)) is the matrix product
print(f"A * B =\n {A @ B}")

In [None]:
# give the same 10 values a new 2x5 shape
import numpy as np
x = np.arange(10)
print(x)
two_by_five = x.reshape((2, 5))
print(two_by_five)

### Numpy statistics
The following example reads the temperatures recorded in NYC on the same day (9th April) over the last 150 years. The csv file contains 3 columns, namely date, max day temp and min day temp. The year of each date is saved in a list and the two temperatures in numpy arrays. The following code extracts some basic statistics, including the mean value, median value, max, min, and the 10th and 90th percentiles.

In [None]:
import csv
import numpy as np

# Read the New York min / max temperatures of 9th April for the last 150 years.
# The values are collected in plain lists and converted to numpy arrays once,
# after the loop: np.append() copies the whole array on every call.
years, max_values, min_values = [], [], []
with open('data_ny_temperatures.csv', newline='') as csvfile:
    reader = csv.reader(csvfile, delimiter=',', quotechar='|')
    next(reader, None)  # skip the header row (no-op for an empty file)
    for row in reader:
        # row layout: date, max day temperature, min day temperature
        years.append(int(row[0].split('-')[0]))  # keep only the year of the date
        max_values.append(float(row[1]))
        min_values.append(float(row[2]))
max_t = np.array(max_values, dtype=float)
min_t = np.array(min_values, dtype=float)

print(f"Average max-day temperature is {max_t.mean():.1f}")
print(f"Median max-day temperature is {np.median(max_t):.1f}")
print(f"Average min-day temperature is {min_t.mean():.1f}")
print(f"Median min-day temperature is {np.median(min_t):.1f}")

# extreme values, together with the year in which each one was recorded
print(f"The maximum max-day temp was {np.max(max_t):.1f} in {years[np.argmax(max_t)]}")
print(f"The maximum min-day temp was {np.max(min_t):.1f} in {years[np.argmax(min_t)]}")
print(f"The minimum max-day temp was {np.min(max_t):.1f} in {years[np.argmin(max_t)]}")
print(f"The minimum min-day temp was {np.min(min_t):.1f} in {years[np.argmin(min_t)]}")

# years whose max-day temperature is below the 10th / above the 90th percentile
max_t_p_10 = np.percentile(max_t, 10)
max_t_p_90 = np.percentile(max_t, 90)
years_max_10 = [y for y, t in zip(years, max_t) if t < max_t_p_10]
# Note: the comprehension above is equivalent to the following loop:
#   years_max_10 = []
#   for y, t in zip(years, max_t):
#       if t < max_t_p_10:
#           years_max_10.append(y)
print(years_max_10)
years_max_90 = [y for y, t in zip(years, max_t) if t > max_t_p_90]
print(years_max_90)

# same selection for the min-day temperature
min_t_p_10 = np.percentile(min_t, 10)
min_t_p_90 = np.percentile(min_t, 90)
years_min_10 = [y for y, t in zip(years, min_t) if t < min_t_p_10]
print(years_min_10)
years_min_90 = [y for y, t in zip(years, min_t) if t > min_t_p_90]
print(years_min_90)

A note on speed: if you need to append a large number of elements to a numpy array, it is much faster to append them to a list and then convert the list to a numpy array (instead of using numpy.append()). And a list comprehension is obviously even faster.

In [None]:
import numpy as np
import time

# Timings use time.perf_counter(): it is the clock intended for measuring
# short durations, whereas time.time() follows the system wall clock.

# 1) grow a numpy array one element at a time with np.append()
t1 = time.perf_counter()
a = np.array([])
for i in range(1, 10000):
    a = np.append(a, i)
t2 = time.perf_counter()
print(f"numpy.append(): {1000 * (t2 - t1):.2f} msecs")

# 2) append to a Python list and convert to a numpy array once at the end
t1 = time.perf_counter()
a = []
for i in range(1, 10000):
    a.append(i)
a = np.array(a)
t2 = time.perf_counter()
print(f"list append and numpy array conversion: {1000 * (t2 - t1):.2f} msecs")

# 3) build the list with a comprehension, then convert it
t1 = time.perf_counter()
a = [i for i in range(1, 10000)]
a = np.array(a)
t2 = time.perf_counter()
print(f"list comprehension and numpy array conversion: {1000 * (t2 - t1):.2f} msecs")

Talking about statistics, two of the most important quantities used in random variable statistics (whatever quantity they measure) are the mean and the standard deviation. We've already seen the mean in some examples above. The standard deviation measures how close the values of the variable are to their mean value. Below, we show how to compute the mean and std of a sequence and how to standardize the values of the sequence so that they have a standard deviation of 1 and a mean value of 0. This is a very important process, used in machine learning and data science before training models and before predicting. An alternative is the max / min normalization, not shown here.

In [None]:
import numpy as np
import numpy.random
# draw n_samples random values from a normal distribution (mean m, std s)
m, s, n_samples = 10, 5, 1000
x = np.random.normal(loc=m, scale=s, size=n_samples)
# mean / std estimated from the drawn samples
m_est, s_est = x.mean(), x.std()

print(f"mean is {m_est:.3f} and std is {s_est:.3f}")
# standardization: z = (x - mean) / std, using the estimated mean and std
x_norm = (x - m_est) / s_est
print(f"after standardization mean is {x_norm.mean():.3f} and std is {x_norm.std():.3f}")

### Numpy slicing and row - column operations

In [None]:
import numpy as np
# a 4x3 array holding the numbers 1..12, row by row
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
print("x:", x, sep="\n")
# drop the first row and the last column
sub_block = x[1:, :-1]
print("\nx[1:, :-1]:", sub_block, sep="\n")

In [None]:
# statistics over the whole array, per column (axis=0) and per row (axis=1)
import numpy as np
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
for label, value in (
    ("global mean", x.mean()),
    ("global min", x.min()),
    ("global max", x.max()),
    ("column-wise mean", x.mean(axis=0)),
    ("row-wise mean", x.mean(axis=1)),
):
    print(f"{label} {value}")

### Broadcasting
Broadcasting in numpy is a very powerful mechanism that allows numpy operators to work on arrays of different shapes.

We saw previously that element-to-element operations are possible in numpy when arrays have the same dimensions. However, operations on arrays that do not share the same shape are also possible in numpy because of broadcasting. Broadcasting can be performed when, comparing the shapes of the arrays dimension by dimension (starting from the last one), the two sizes are either equal or one of them is 1. Below are some broadcasting examples:

In [None]:
# broadcasting examples
import numpy as np

# example 1: a scalar is combined with every element of a 2D array
x = np.array([[1, 2], [3, 4]])
print(x + 2)

# example 2: a 1D array of length 3 acts as a 1x3 row that is added to
# each of the 4 rows of x (it is NOT a 3x1 column)
x = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9], [10, 11, 12]])
y = np.array([1, 2, 3])
print(f"add a {x.shape[0]}x{x.shape[1]} with a 1x{y.shape[0]} numpy array:")
print(x + y)

# example 3: a 4x1 column is added to each of the 3 columns of x
y = np.array([1, 2, 3, 4]).reshape(4, 1)
print(f"add a {x.shape[0]}x{x.shape[1]} with a {y.shape[0]}x{y.shape[1]} numpy array:")
print(x + y)

In [None]:
import numpy as np
# Standardize all features at once: broadcasting replaces an explicit loop.
# Each row of X is an instance, each column is a feature.
X = np.array([
    [200, 0.10],
    [220, 0.15],
    [250, 0.11],
    [300, 0.15],
    [320, 0.16],
    [240, 0.14],
])

# per-feature (i.e. per-column) mean and standard deviation
m, s = X.mean(axis=0), X.std(axis=0)

# m and s hold one value per column and are applied to every row of X
X_norm = (X - m) / s
# every column of X_norm now has mean = 0 and std = 1:
X_norm

## Pandas
### Pandas data structures
There are two basic types used in pandas: *series* and *dataframes*.
Series is a 1D labeled array that holds any data type (integers, strings, floats etc). To define a Series we need its data and its indices. Obviously the index must be of the same length as the data. If the index is not defined, then the default value is \[0, ..., len(data) - 1\].

#### Series

In [None]:
# define a Series from its data (10 random values) and its index labels
import pandas as pd
import numpy as np
labels = [f'index{i}' for i in range(10)]
s = pd.Series(np.random.randn(10), index=labels)
print("series:")
print(s)
print("s.index")
print(s.index)

In [None]:
# a Series can also be built from a dict: the keys are used as the index
values_by_label = {'a': 2.1, 'c': 1.9, 'b': 1, 'd': -1}
s = pd.Series(values_by_label)
print("series:")
print(s)
print("s.index")
print(s.index)

In [None]:
# a Series can be indexed by position (through .iloc) and by label.
# A plain s[1] on a label-indexed Series relies on a deprecated fallback that
# treats the integer as a position, so the positional access is made explicit.
print(s.iloc[1], s['c'])

In [None]:
# a Series also provides the reductions known from numpy arrays:
s_mean = s.mean()
s_median = s.median()
s_mean, s_median

In [None]:
# ... and numpy functions can be applied to a Series directly:
cos_of_s = np.cos(s)
cos_of_s

In [None]:
# slices work as they do for numpy arrays (here: all but the last two elements):
all_but_last_two = s[:-2]
all_but_last_two

In [None]:
# keep only the elements whose value is greater than 0.5
above_half = s > 0.5
s[above_half]

In [None]:
# BUT arithmetic differs from numpy: + is aligned on the index labels, and the
# result is indexed by the union of the labels of both series.
# Labels that are missing from either series get NaN in the result.
a = pd.Series([2.1, 1, -1], index=['a', 'b', 'c'])
b = pd.Series([1, 1, -1, -1], index=['a', 'd', 'g', 'c'])
a_plus_b = a + b
a_plus_b

#### DataFrame
When your data is tabular with row index and column index, the go-to choice is pandas.DataFrame. DataFrame  is a 2D data structure with columns of potentially different types. Conceptually, DataFrame can be considered as a data table stored in a spreadsheet, a csv, a json file or a database. 

There are several ways to construct a DataFrame object, below are two of the most frequent:

In [None]:
# build a DataFrame from a dict that maps each column name to its list of values
import pandas as pd
d = dict(
    name=["james", "theodore", "jane", "maria"],
    score=[4., 3., 2., 5.],
)
df = pd.DataFrame(d)
print(df.columns)
print(df)

In [None]:
# build a DataFrame from a list of dicts, one dict per row
# (this format copes more easily with "sparse" tables - aka missing data -
#  because a row can simply leave out the keys it has no value for)
import pandas as pd
d = [
    {'name': name, 'score': score}
    for name, score in (('james', '4'), ('theodore', '3'), ('jane', '2'), ('maria', '5'))
]
d[0]['note'] = 'this is a note'  # only the first row carries a note
df = pd.DataFrame(d)
print(df.columns)
print(df)

#### More on DataFrames

In [None]:
# load the temperatures CSV file again, this time with pandas:
import pandas as pd
df = pd.read_csv("data_ny_temperatures.csv")
column_names = list(df.columns)
print(f"{len(column_names)} columns {column_names}")
n_rows = df.shape[0]
print(f"{n_rows} rows")
df

In [None]:
# SELECT a single column by its name:
import pandas as pd
df = pd.read_csv("data_ny_temperatures.csv")
date_column = df['date']
date_column

In [None]:
# turn the values of a column into a numpy array:
import pandas as pd
df = pd.read_csv("data_ny_temperatures.csv")
maxt_column = df['maxt']
maxt_column.to_numpy()

In [None]:
# ... or into a plain Python list (only every 20th value is shown)
import pandas as pd
df = pd.read_csv("data_ny_temperatures.csv")
mint_values = df['mint'].to_list()
mint_values[::20]

In [None]:
# INSERT new columns by assigning to a column name that does not exist yet
import pandas as pd
df = pd.read_csv("data_ny_temperatures.csv")
# a column computed row by row from existing columns ...
temperature_sum = df['mint'] + df['maxt']
df['meant'] = temperature_sum / 2
# ... or a fixed (non-array) value, which is added to ALL rows
df['note'] = 'this is a note'
df

In [None]:
# DELETE a column: del removes it from the dataframe in place
import pandas as pd
df = pd.read_csv("data_ny_temperatures.csv")
column_to_remove = 'maxt'
del df[column_to_remove]
df

We've seen that indexing columns is done like in dicts, e.g. df['maxt']. What about indexing rows and assigning values to individual cells?

In [None]:
import pandas as pd
df = pd.read_csv("data_ny_temperatures.csv")
# iloc indexes rows by integer position
first_row = df.iloc[0]
print(first_row)
# df.loc can be used as well: it takes a LABEL instead of an integer position
# (useful if the dataframe has been defined with labels in rows, see next examples)

# ASSIGN a value to a specific CELL, addressed by row label and column name ...
df.loc[0, 'maxt'] = -10

# ... or, equivalently, by row position and column position
maxt_position = df.columns.get_loc("maxt")
df.iloc[0, maxt_position] = -10
print(df)

In the following example
 * we set the index of the temperatures matrix from default (integers) to the date
 * we demonstrate how to use the loc method to index when non-integer indices are used

In [None]:
import pandas as pd
# read the file and use the "date" column as the row index
df = pd.read_csv("data_ny_temperatures.csv").set_index("date")
print(df)
# rows can now be looked up by their date label through loc
row_2018 = df.loc["2018-04-09"]
row_2018

In [None]:
# SLICING rows of a dataframe that is indexed by date
import pandas as pd
df = pd.read_csv("data_ny_temperatures.csv").set_index("date")
# every 20th row
every_20th_row = df[::20]
print(every_20th_row)
# rows at positions 2 and 3 (the end of an integer slice is excluded)
rows_2_and_3 = df[2:4]
print(rows_2_and_3)
# slices can also be written with the (non-integer) index labels
between_dates = df["1929-04-09": "1944-04-09"]
print(between_dates)

In [None]:
# SELECTION of rows through boolean conditions on the columns
very_hot = df['maxt'] > 80                    # maxt above 80
small_range = df['maxt'] - df['mint'] < 5     # less than 5 difference between maxt and mint
very_cold = df['mint'] < 28                   # mint below 28
print(df[very_hot])
print(df[small_range])
# rows with a very high max OR a very low min temperature
print(df[very_hot | very_cold])

In [None]:
# SORTING: order the rows by the values of the 'mint' column
import pandas as pd
df = (
    pd.read_csv("data_ny_temperatures.csv")
    .set_index("date")
    .sort_values(by='mint')
)
print(df)