## 2. Data Wrangling (using Numpy & Pandas!)

#### Import Libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Numpy
NumPy provides many of the most commonly used math functions and can operate on arrays with any number of dimensions (N-dimensional arrays).

#### Numpy Arrays
Arrays are the NumPy-specific data container.

In [None]:
###Empty array
arr = np.array([])
print(arr.shape)
print(arr)

In [None]:
###1D array / vector
arr = np.array([1, 2, 3, 4, 5, 6])

What is this array's shape?

In [None]:
arr.shape

In [None]:
###2D array / matrix
arr = np.array([[1, 2, 3], [4, 5, 6]])
arr

What is this array's shape?

In [None]:
arr.shape

In [None]:
###3D array...arrays can have any N number of dimensions
arr = np.array([[[1, 2, 3], [4, 5, 6]], [[1, 2, 3], [4, 5, 6]]])
arr

What is this array's shape?

In [None]:
arr.shape

Below are other ways to quickly initialize arrays

In [None]:
np.ones(5)

In [None]:
np.zeros((2, 3, 2))

In [None]:
np.diag((3, 7, 2))

In [None]:
np.identity(6)

In [None]:
# -1 asks NumPy to infer that dimension from the element count;
# with zero elements the inferred size is 0, so the result has shape (5, 0)
np.array([]).reshape((5, -1))

#### Array Operations

In [None]:
###All the usual math operations can be applied to arrays
arr1 = np.ones((3, 3))
arr2 = np.array([[2, 2, 2], [2, 2, 2], [2, 2, 2]])

In [None]:
arr1 + arr2

In [None]:
arr1 - arr2

In [None]:
arr1 += 1.
arr1

In [None]:
arr1 *= 2.
arr1

In [None]:
arr1 * arr2

In [None]:
arr1**arr2

In [None]:
###Be mindful of broadcasting
arr1 = np.ones((2, 2))
arr2 = np.ones((3, 3))

# Shapes (2, 2) and (3, 3) cannot be broadcast together: each pair of dimensions
# must either be equal or contain a 1. The resulting ValueError is caught and
# printed so the demonstration is still visible without halting "Run All".
try:
    arr1 + arr2 #This will give error...why?
except ValueError as err:
    print(f'ValueError: {err}')

In [None]:
###Some of you may have already noticed: ARRAY OPERATIONS ARE *NOT* MATRIX ALGEBRA!
mat2 = np.identity(3)
mat1 = np.array([
    [1, 2, 3],
    [4, 5, 6],
    [7, 8, 9],
])

# `*` multiplies element by element, so only the diagonal of mat1 survives
print(mat1 * mat2, end = '\n\n')
# `@` (the operator form of np.matmul) is the true matrix product: mat1 times I gives mat1
print(mat1 @ mat2)

###For matrix operations, use the np.linalg library! Docs: https://numpy.org/doc/stable/reference/routines.linalg.html

#### Array Indexing

In [None]:
###Let's create a big N-dim array: the integers 0..999 arranged as a 10 x 10 x 10 cube
arr = np.arange(1000).reshape(10, 10, 10)
arr

In [None]:
###NEVER FORGET INDEXES START FROM 0!
arr[0, :, :]

In [None]:
arr[0:3, :, :]

In [None]:
arr[0, 4:6, 0]

In [None]:
arr[-1, :, :]

In [None]:
arr[-2:, :, :]

In [None]:
###Now for really complex indexing...
arr[1:3, 4, -2:]

In [None]:
###Boolean indexing
boolean_arr = arr % 2 == 0

In [None]:
arr[boolean_arr]

In [None]:
###This can get complex too...
boolean_arr = arr[0, :, :] % 2 == 0

In [None]:
###And I can even chain indexing...
# boolean_arr is a 2-D mask, so it selects along the first two axes of arr and
# returns one row (of length 10) per True entry; the second index then keeps
# only the first 5 of those rows.
arr[boolean_arr, :][:5, :]

NOTE: When it comes to indexing, keep it simple (KISS)!

#### Numpy Example - AR Model Simulation
NumPy is a favorite library because of its speed, but you have to __vectorize__ your code to take full advantage of that speed. __Vectorization__ means operating on whole arrays (vectors) at once, which reduces the need for explicit Python loops.

TL;DR:

looping = slow = bad

vectorization = fast = good

In [None]:
def AR2_model(init_X, num_timesteps, num_samples):
    """
    Simulate an AR(2) process with explicit Python loops (the slow reference version).

    x(i, t) = beta1 * x(i, t - 1) + beta2 * x(i, t - 2) + e(i, t)

    Parameters
    ----------
    init_X : array-like of length 2
        Starting values x(i, 0) and x(i, 1); the same pair is used for every sample.
    num_timesteps : int
        Number of time steps per sample (columns of the result).
    num_samples : int
        Number of independent sample paths (rows of the result).

    Returns
    -------
    numpy.ndarray of shape (num_samples, num_timesteps)
        Columns 0 and 1 hold init_X; every later column follows the recursion above.

    Raises
    ------
    ValueError
        If init_X does not contain exactly two values.
    """
    beta1 = 0.3; beta2 = 0.5; sigma = 0.2
    # Noise is drawn for every (sample, t); the draws for t = 0 and t = 1 are
    # unused because those steps are fixed by init_X.
    noise = np.random.normal(loc = 0., scale = sigma, size = (num_samples, num_timesteps))

    init_X = np.asarray(init_X, dtype = float)
    if init_X.shape != (2,):
        raise ValueError('init_X must contain exactly two initial values')

    generated_data = []
    for i in range(num_samples):
        # Start each path with as many initial values as fit in the horizon
        path = list(init_X[:num_timesteps])
        for t in range(2, num_timesteps):
            path.append(beta1 * path[t - 1] + beta2 * path[t - 2] + noise[i, t])
        generated_data.append(path)

    # reshape keeps the 2-D shape even when num_samples or num_timesteps is 0
    return np.array(generated_data, dtype = float).reshape(num_samples, num_timesteps)

In [None]:
###This function does the same thing, just faster
def AR2_model_fast(init_X, num_timesteps, num_samples):
    """
    Simulate an AR(2) process, vectorized across samples.

    x(i, t) = beta1 * x(i, t - 1) + beta2 * x(i, t - 2) + e(i, t)

    Each time step depends on the two before it, so the loop over t remains;
    the loop over samples is replaced by whole-column array operations.

    Parameters
    ----------
    init_X : array-like of length 2
        Starting values x(i, 0) and x(i, 1); the same pair is used for every sample.
    num_timesteps : int
        Number of time steps per sample (columns of the result).
    num_samples : int
        Number of independent sample paths (rows of the result).

    Returns
    -------
    numpy.ndarray of shape (num_samples, num_timesteps)
        Columns 0 and 1 hold init_X; every later column follows the recursion above.

    Raises
    ------
    ValueError
        If init_X does not contain exactly two values.
    """
    beta1 = 0.3; beta2 = 0.5; sigma = 0.2
    # Noise is drawn for every (sample, t); the draws for t = 0 and t = 1 are
    # unused because those steps are fixed by init_X.
    noise = np.random.normal(loc = 0., scale = sigma, size = (num_samples, num_timesteps))

    init_X = np.asarray(init_X, dtype = float)
    if init_X.shape != (2,):
        raise ValueError('init_X must contain exactly two initial values')

    X = np.zeros((num_samples, num_timesteps))
    # Slicing on both sides keeps this valid when num_timesteps < 2
    X[:, :2] = init_X[:num_timesteps]
    for t in range(2, num_timesteps):
        # One array operation advances every sample by one time step
        X[:, t] = beta1 * X[:, t - 1] + beta2 * X[:, t - 2] + noise[:, t]

    return X

In [None]:
%timeit AR2_model(init_X = np.array([1.0, 1.2]), num_timesteps = 120, num_samples = 10000)

In [None]:
%timeit AR2_model_fast(init_X = np.array([1.0, 1.2]), num_timesteps = 120, num_samples = 10000)

### Pandas

#### DataFrames
DataFrames are the pandas-specific data container.

In [None]:
arr1 = np.ones((3, 3))
arr2 = np.array([[1, 1, 1], [2, 2, 2], [3, 3, 3]])

df1 = pd.DataFrame(arr1, columns = ['A', 'B', 'C'])
df1

In [None]:
df2 = pd.DataFrame(arr2, columns = ['A', 'B', 'C'])
df1 + df2

In [None]:
###Dataframes are only 2D
# A small 3-D array is built right here so the cell does not depend on a variable
# left over from an earlier section. The ValueError is caught and printed so the
# demonstration is still visible without halting "Run All".
try:
    df3 = pd.DataFrame(np.zeros((2, 3, 4))) #This will give error
except ValueError as err:
    print(f'ValueError: {err}')

In [None]:
### dataframe <-> array
# .values returns the frame's data as a NumPy array;
# the pandas documentation recommends DataFrame.to_numpy() for the same purpose
df1.values

In [None]:
###DataFrame indexing is not much different from array indexing
# .loc selects by label (here: the row labelled 0, all columns);
# use .iloc when you want integer positions instead
df2.loc[0, :]

In [None]:
df2.loc[:, 'A']

In [None]:
df2.iloc[:, -1]

#### Pandas Example - Data Cleaning

In [None]:
# NOTE: machine-specific absolute folder; change this to the directory on your
# machine that contains FRB_H15.csv before running the cells below
path = 'C:/Box/rlee/Workspace/Python Workshops'

In [None]:
###We can load data from csv / txt files into DataFrames
treasury_frame = pd.read_csv(f'{path}/FRB_H15.csv')
treasury_frame.head(10)

We need to clean this dataset.

In [None]:
# Keep an untouched copy of the frame before modifying it
treasury_frame_orig = treasury_frame.copy()

###Start by relabeling columns
# One name per column: the date plus eleven maturities
# ('yield_20yr' was previously misspelled 'yeild_20yr')
treasury_frame.columns = [
    'Date', 'yield_1mo', 'yield_3mo', 'yield_6mo', 'yield_1yr', 'yield_2yr', 
    'yield_3yr', 'yield_5yr', 'yield_7yr', 'yield_10yr', 'yield_20yr', 'yield_30yr'
]

###WRITE CODE HERE



treasury_frame

Is the data clean now? What else should we do?

In [None]:
###WRITE CODE HERE





In [None]:
###Save cleaned dataset back to local directory
# NOTE(review): to_csv writes the index as the first column by default; if the
# cleaned frame still carries a plain integer index, consider passing
# index=False so it is not saved as an extra unnamed column — confirm.
treasury_frame.to_csv(f'{path}/FRB_H15_clean.csv')