# 1. Libraries & Sample Data
The first step is to load our Python libraries and load the sample data from a CSV file. The dataset represents the Apple stock price (1d bars) for the year 2010.

In [None]:
# Load Python Libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler

from IPython.display import display, HTML
# for dataframe display: no limit on the number of rows shown
pd.options.display.max_rows = None


def display_df(df):
    """Render ``df`` as an HTML table inside a fixed-height, scrollable box."""
    # Puts the scrollbar next to the DataFrame
    box_open = "<div style='height: 200px; overflow: auto; width: fit-content'>"
    display(HTML(f"{box_open}{df.to_html()}</div>"))

In [None]:
# Load the raw sample data from a local CSV file into a DataFrame
data = pd.read_csv('aapl_2010_3m_RAW.csv')

# 2. Exploratory Data Analysis
Next, we want to analyze our data. Display the data as a dataframe, and plot some relevant data so you can get an idea of what our dataset looks like.

In [None]:
# Display the raw data as a DataFrame (rendered through the display_df helper)
display_df(data)

In [None]:
# Index data by Date: the 'Date' column becomes the row index
data = data.set_index('Date')
display_df(data)

In [None]:
# Plot the Close Data (one line, indexed by Date)
data['Close'].plot()

# 3. Data Cleaning
Next, we need to clean our data for training our model. This requires handling NaN values: we count them, forward-fill the gaps, and then verify the result.

In [None]:
# Check for null values: count the missing entries in each column
null_counts = data.isna().sum()
print('Number of Null Values =', null_counts)

In [None]:
# forward fill missing values: each NaN takes the last valid value above it
# in the same column (a NaN in the very first row has nothing to copy from
# and stays NaN)
data=data.ffill()
display_df(data)

In [None]:
# Re-check for null values after the forward fill (per-column counts)
null_counts = data.isna().sum()
print('Number of Null Values =', null_counts)

In [None]:
# Save the cleaned data; reset_index() turns the index back into a column.
# NOTE(review): to_csv also writes the new integer index as an unnamed first
# column by default -- confirm whether downstream readers expect it
# (otherwise pass index=False).
data.reset_index().to_csv('aapl_2010_3m_CLEAN.csv')

# 4. Feature Definition
Now that we have cleaned our stock data, we can define some financial indicators, or "features", to train our model on. We will be calculating some popular indicators: 20-day Close Moving Average, 5-day Close Moving Average, 20-day Close Bollinger Bands, and 20-day Historical Volatility of the Close price.

In [None]:
# Build the indicator columns from the Close price.
close = data['Close']
roll5 = close.rolling(window=5)
roll20 = close.rolling(window=20)

# Moving averages and the 20-row rolling standard deviation
data['MA5'] = roll5.mean()
data['MA20'] = roll20.mean()
data['STD20'] = roll20.std()

# Bollinger Bands: MA20 plus/minus two rolling standard deviations
band_width = data['STD20'] * 2
data['BB_upper'] = data['MA20'] + band_width
data['BB_lower'] = data['MA20'] - band_width

# Log returns and their 20-row rolling std, scaled by sqrt(252)
log_returns = np.log(close / close.shift(1))
data['Log_Ret'] = log_returns
data['Vol20'] = log_returns.rolling(window=20).std() * np.sqrt(252)

display_df(data)

In [None]:
# Remove rows with MA=NaN: drop every row that contains a NaN in any column
# (the rolling calculations leave their first rows undefined)
data = data.dropna(axis=0)
display_df(data)

In [None]:
# Plot Features: Close, MA20, BB Upper, BB Lower
# The columns are plotted together as one DataFrame so that every line gets
# a legend entry; separate Series.plot() calls draw unlabeled lines that
# cannot be told apart.
data[['Close', 'MA20', 'BB_upper', 'BB_lower']].plot()

In [None]:
# Plot Features: Close, MA20, MA5
# The columns are plotted together as one DataFrame so that every line gets
# a legend entry; separate Series.plot() calls draw unlabeled lines that
# cannot be told apart.
data[['Close', 'MA20', 'MA5']].plot()

In [None]:
# Plot Features: Volatility (the Vol20 column, one line)
data['Vol20'].plot()

# 5. Normalization
Now that we have cleaned our data and created our features of interest, we must normalize our data. For this example, we use the sklearn StandardScaler, which centers the data and scales it to unit variance. Because our data is a time series, it is best practice to do this in a rolling fashion. We choose 20 days as our window for normalization, and run the StandardScaler in a rolling (non-overlapping) fashion.

In [None]:
# Keep Date, Close and the engineered indicators; every other column
# (e.g. Log_Ret) is left out. reset_index() turns the index back into a
# regular column so that 'Date' can be selected by name.
selected_columns = ['Date', 'Close', 'MA5', 'MA20', 'STD20', 'BB_upper', 'BB_lower', 'Vol20']
dataset = data.reset_index()[selected_columns]
display_df(dataset)

In [None]:
# Normalize the chosen price data & features
#
# Every feature is standardized (zero mean, unit variance) independently
# inside consecutive, non-overlapping windows of `step` rows. A shorter final
# window is standardized on its own. 'Date' is carried over unchanged.
step = 20
feature_cols = [col for col in dataset.columns if col != 'Date']

# Start from a copy and store the features as floats so the result is numeric;
# an empty DataFrame that is filled slice by slice ends up with object dtype.
normed_dataset = dataset.copy()
normed_dataset[feature_cols] = normed_dataset[feature_cols].astype(float)

# Positional, half-open slices [start, start + step) are used on purpose:
# label-based .loc slices include their end point, which would make each
# window step + 1 rows long and share a row with the next one. range() also
# never yields an empty trailing window, which StandardScaler would reject.
for start in range(0, len(dataset), step):
    window = dataset.iloc[start:start + step][feature_cols]
    # StandardScaler works column-wise, so each feature is still scaled with
    # its own mean and standard deviation within the window.
    normed_dataset.loc[window.index, feature_cols] = (
        StandardScaler().fit_transform(window.to_numpy(dtype=float))
    )
display_df(normed_dataset)


In [None]:
# Plot Normalized Features: Close, MA20, BB Upper, BB Lower
# The columns are plotted together as one DataFrame so that every line gets
# a legend entry. The float cast guards against the columns being stored
# with object dtype.
normed_dataset[['Close', 'MA20', 'BB_upper', 'BB_lower']].astype(float).plot()

In [None]:
# Plot Normalized Features: Close, MA20, MA5
# The columns are plotted together as one DataFrame so that every line gets
# a legend entry. The float cast guards against the columns being stored
# with object dtype.
normed_dataset[['Close', 'MA20', 'MA5']].astype(float).plot()

In [None]:
# Plot Normalized Features: Close, Volatility
# The columns are plotted together as one DataFrame so that every line gets
# a legend entry. The float cast guards against the columns being stored
# with object dtype.
normed_dataset[['Close', 'Vol20']].astype(float).plot()