# 1. Libraries & Sample Data
The first step is to load our Python libraries and the sample data. The dataset represents the Apple (AAPL) stock price as daily (1d) bars for the year 2010 (note that the CSV file name suggests a six-month window spanning 2009–2010).

In [None]:
# Load Python Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from IPython.display import display, HTML
# DataFrame display setup: lift the pandas row-display limit
pd.set_option("display.max_rows", None)
def display_df(df):
    """Show ``df`` as an HTML table inside a scrollable container.

    The table markup produced by ``df.to_html()`` is wrapped in a ``<div>``
    that is 200px high, scrolls on overflow and is only as wide as its
    content, which puts the scrollbar right next to the DataFrame.
    """
    table_html = df.to_html()
    wrapped_html = f"<div style='height: 200px; overflow: auto; width: fit-content'>{table_html}</div>"
    display(HTML(wrapped_html))

In [None]:
# Load Sample Data: read the AAPL feature CSV (path is relative to the working directory)
sample_csv_path = 'AAPL_2009-2010_6m_all_features_1d.csv'
data = pd.read_csv(sample_csv_path)

# 5. State Space Representation
Now we have a dataset with OHLC prices plus some technical indicators. Using this data, construct the state space matrix, where the features of the state space are the Close Price, 5-day Moving Average, 20-day Moving Average, Bollinger Bands (upper and lower), and 20-day Historical Volatility of the Close Price.

In [None]:
# Construct the State Space Matrix
# Keep the date plus the six state features, in this fixed column order.
state_columns = ['Date', 'Close', 'MA5', 'MA20', 'BB_upper', 'BB_lower', 'Vol20']
dataset = data.reset_index().loc[:, state_columns]
display_df(dataset)

# 6. Z-Score Normalization
Now that we have cleaned our data and created our features of interest, we must normalize the data. For this example, we use the sklearn StandardScaler, which centers the data and scales it to unit variance, i.e. it performs z-score normalization, $z = (x - \mu) / \sigma$, for us. Here we do this in a simple, non-rolling fashion: each feature's mean and standard deviation are computed once over the whole series.

In [None]:
# Display & Plot Un-normalized Dataset
display_df(dataset)
# Draw the three series with a single DataFrame.plot call so that pandas adds
# a legend; separate Series.plot calls leave the lines unlabeled and therefore
# indistinguishable. The x axis is the frame's integer row index.
ax = dataset[['Close', 'BB_upper', 'BB_lower']].plot(
    rot=45,
    title='Close price with Bollinger Bands (un-normalized)',
)
ax.set_xlabel('Row index')
ax.set_ylabel('Price');

In [None]:
# Normalize Dataset with StandardScaler
# Every feature column gets its own scaler, fitted on the full column; the
# fitted scalers are collected in `normlist` in column order. 'Date' is copied
# through unchanged.
normlist = []
static_normed_dataset = pd.DataFrame(index=dataset.index)
for feature in dataset.columns:
    if feature != 'Date':
        scaler = StandardScaler()
        feature_frame = dataset[[feature]]  # single-column frame: the scaler expects 2-D input
        static_normed_dataset[feature] = scaler.fit_transform(feature_frame).ravel()
        normlist.append(scaler)
    else:
        static_normed_dataset[feature] = dataset[feature]

In [None]:
# Plot Normalized Features: Close, MA20, BB Upper, BB Lower
# A single DataFrame.plot call gives each line a legend entry; separate
# Series.plot calls leave the lines unlabeled.
ax = static_normed_dataset[['Close', 'MA20', 'BB_upper', 'BB_lower']].plot(
    title='Static z-score: Close, MA20 and Bollinger Bands',
)
ax.set_xlabel('Row index')
ax.set_ylabel('z-score');

In [None]:
# Plot Normalized Features: Close, MA20, MA5
# A single DataFrame.plot call gives each line a legend entry; separate
# Series.plot calls leave the lines unlabeled.
ax = static_normed_dataset[['Close', 'MA20', 'MA5']].plot(
    title='Static z-score: Close and moving averages',
)
ax.set_xlabel('Row index')
ax.set_ylabel('z-score');

In [None]:
# Plot Normalized Features: Close, Volatility
# A single DataFrame.plot call gives each line a legend entry; separate
# Series.plot calls leave the lines unlabeled.
ax = static_normed_dataset[['Close', 'Vol20']].plot(
    title='Static z-score: Close and 20-day volatility',
)
ax.set_xlabel('Row index')
ax.set_ylabel('z-score');

# 7. Rolling Z-Score Normalization
Now that we have cleaned our data and created our features of interest, we must normalize the data. For this example, we again use the sklearn StandardScaler, which centers the data and scales it to unit variance. Because our data is a time series, it is best practice to normalize it in a rolling fashion rather than with statistics taken over the whole series. We choose 20 days as the normalization window and fit a fresh StandardScaler on each consecutive, non-overlapping 20-day block.

In [None]:
# Display the raw (un-normalized) dataset again, as the reference input for
# the rolling normalization in the next cell
display_df(dataset)

In [None]:
# Normalize the chosen price data & features
#
# Block-wise ("rolling", non-overlapping) z-score normalization: the rows are
# split into consecutive windows of `step` rows and every window is scaled by
# its own freshly fitted StandardScaler (per-feature zero mean, unit variance).
#
# Implementation notes:
# * Windows are cut by position with half-open slices, so each row belongs to
#   exactly one window of at most `step` rows. Label-based `.loc[a:b]` slices
#   include both endpoints, which yields step + 1 rows per window and scales
#   every boundary row twice.
# * `range(0, n_rows, step)` never produces an empty trailing window, so the
#   scaler is never fitted on zero samples when the row count is an exact
#   multiple of `step`.
# * The last window may hold fewer than `step` rows; it is normalized on its own.
# * The result columns are float64 rather than object dtype.
step = 20
feature_cols = [col for col in dataset.columns if col != 'Date']
raw_values = dataset[feature_cols].to_numpy(dtype=float)
normed_values = np.empty_like(raw_values)
for start in range(0, len(raw_values), step):
    window = slice(start, start + step)
    # StandardScaler standardizes each column independently, so one scaler per
    # window handles all features at once.
    normed_values[window] = StandardScaler().fit_transform(raw_values[window])

# Start from a copy so that 'Date', the index and the column order carry over.
normed_dataset = dataset.copy()
normed_dataset[feature_cols] = normed_values
display_df(normed_dataset)


In [None]:
# Plot Normalized Features: Close, MA20, BB Upper, BB Lower
# A single DataFrame.plot call gives each line a legend entry; separate
# Series.plot calls leave the lines unlabeled. astype(float) makes sure the
# columns are numeric even if they are stored with object dtype.
ax = normed_dataset[['Close', 'MA20', 'BB_upper', 'BB_lower']].astype(float).plot(
    title='Rolling z-score: Close, MA20 and Bollinger Bands',
)
ax.set_xlabel('Row index')
ax.set_ylabel('z-score (per window)');

In [None]:
# Plot Normalized Features: Close, MA20, MA5
# A single DataFrame.plot call gives each line a legend entry; separate
# Series.plot calls leave the lines unlabeled. astype(float) makes sure the
# columns are numeric even if they are stored with object dtype.
ax = normed_dataset[['Close', 'MA20', 'MA5']].astype(float).plot(
    title='Rolling z-score: Close and moving averages',
)
ax.set_xlabel('Row index')
ax.set_ylabel('z-score (per window)');

In [None]:
# Plot Normalized Features: Close, Volatility
# A single DataFrame.plot call gives each line a legend entry; separate
# Series.plot calls leave the lines unlabeled. astype(float) makes sure the
# columns are numeric even if they are stored with object dtype.
ax = normed_dataset[['Close', 'Vol20']].astype(float).plot(
    title='Rolling z-score: Close and 20-day volatility',
)
ax.set_xlabel('Row index')
ax.set_ylabel('z-score (per window)');

In [None]:
# Write the rolling-normalized dataset to a CSV file (path is relative to the working directory)
normed_csv_path = 'AAPL_2009-2010_6m_normed_1d.csv'
normed_dataset.to_csv(normed_csv_path)