# Machine Learning Nanodegree

### Capstone Project
---

## Stock Price Prediction


---

### Read in the Data


In [None]:
%autosave 60
# import all libraries
import io
import os
import time, requests
import matplotlib.pyplot as plt
import numpy as np 
import pandas as pd 

import boto3
import sagemaker
from sagemaker import get_execution_role

%matplotlib inline

In [None]:
!pip -q install -r requirements.txt --upgrade --no-cache-dir

In [None]:
# SageMaker session and the execution role used for training/deployment
sagemaker_session = sagemaker.Session()
role = get_execution_role()

# bucket returned by the session's default_bucket(); used for all S3 paths below
bucket = sagemaker_session.default_bucket()

In [None]:
# Project root; the working directory is switched to it so that relative paths
# used later in the notebook resolve against the project.
source_dir = '/home/ec2-user/SageMaker/ML_Stock_Price_Prediction'
os.chdir(source_dir)

# data_dir keeps its trailing slash so that code building file names by plain
# string concatenation (data_dir + file name) keeps working.
data_dir = source_dir + '/data/'
processed_data_dir = source_dir +'/data/processedData'

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
for directory in (data_dir, processed_data_dir):
    os.makedirs(directory, exist_ok=True)

In [None]:
# Ticker symbol and date range (YYYY-MM-DD strings) handed to get_historical_data
symbol = 'GOOGL'
start  = '2009-11-01'
end    = '2019-11-01'

In [None]:
import yfinance as yf

# def get_epoch_time(date):
#     ''' Get the epoch time for a particular date
#         param | date: Date in the format YYYY-MM-DD
#     '''
#     os.environ['TZ']='EST+5ETD'
#     date_time = date + ' 00:00:00'
#     pattern = '%Y-%m-%d %H:%M:%S'
    
#     epoch = int(time.mktime(time.strptime(date_time, pattern)))
#     return epoch

def get_historical_data(symbol,start_date,end_date, data_dir):
    ''' Download daily quotes for a ticker with yfinance and cache them as CSV.
        param | symbol: Ticker symbol; it is upper-cased before use
        param | start_date: Start of the requested range, format 'yyyy-mm-dd'
        param | end_date: End of the requested range, format 'yyyy-mm-dd'
        param | data_dir: Directory in which '<SYMBOL>.csv' is written
        return | DataFrame with the columns Date, Open, High, Low, Close, Adj_Close, Volume
    '''
    symbol = symbol.upper()
    # os.path.join works whether or not data_dir ends with a path separator
    csv_path = os.path.join(data_dir, '{0}.csv'.format(symbol))
    col_names = ['Date','Open','High','Low','Close','Adj_Close','Volume']

    # assumes yf.download returns a date index plus exactly the six columns
    # Open, High, Low, Close, Adj Close, Volume in that order - TODO confirm
    # for the installed yfinance version
    quotes = yf.download(symbol, start=start_date, end=end_date)

    # The file is written WITHOUT a header row; the date index becomes the first column.
    quotes.to_csv(csv_path, index=True, header=False)

    # Read back the very file that was just written. header=None is required because
    # the file has no header row: header=0 would treat the first trading day as a
    # header and silently discard it.
    return pd.read_csv(csv_path, header=None, names=col_names)
    
#     col_names = ['Date','Open','High','Low','Close','Volume']
#     stocks = pd.read_csv(url_string, header=0, names=col_names) 
    
#     df = pd.DataFrame(stocks)
#     return df

In [None]:
data = get_historical_data(symbol, start, end, data_dir) # date range is given by `start` and `end` defined above

In [None]:
data.head()

In [None]:
data.describe()

In [None]:
from src import viz_functions as viz, processData as ppd 

In [None]:
stocks = ppd.remove_data(data)

# Show the first and the last rows of the dataframe, separated by a divider line
print(stocks.head(), "---", stocks.tail(), sep="\n")

In [None]:
viz.plot_basic(stocks)

In [None]:
# NOTE(review): this rebinds `stocks` to its own normalised version, so running the
# cell a second time would apply normalise_data to already-normalised data -
# confirm whether a separate name (or a re-run guard) is wanted.
stocks = ppd.normalise_data(stocks)
print(stocks.head())

In [None]:
viz.plot_basic(stocks)

In [None]:
# Make sure that the folder exists before the processed CSV is written
# (exist_ok=True turns this into a no-op when it is already there)
os.makedirs(processed_data_dir, exist_ok=True)

In [None]:
# Persist the processed frame as <symbol>_processed.csv: header row, no index column
stocks.to_csv('{0}/{1}_processed.csv'.format(processed_data_dir, symbol),
              header=True,
              index=False)

## Linear Regression 

In [None]:
# LinearLearner estimator from the SageMaker SDK
from sagemaker import LinearLearner

# S3 location handed to the estimator as its output path
prefix = 'stockPrices'
output_path = 's3://{0}/{1}/output'.format(bucket, prefix)

# Regressor with squared loss, 15 epochs, trained on one ml.c4.xlarge instance
linear = LinearLearner(
    role=role,
    sagemaker_session=sagemaker_session,
    output_path=output_path,
    train_instance_count=1,
    train_instance_type='ml.c4.xlarge',
    predictor_type='regressor',
    loss='squared_loss',
    epochs=15,
)

In [None]:
stocks = pd.read_csv(processed_data_dir + '/{0}_processed.csv'.format(symbol))
display(stocks.head())

In [None]:
train_fraction = 0.8
X_train, X_test, y_train, y_test = ppd.train_test_split(stocks, train_fraction)

# Report the shape of every split, one line per split
print("\n".join(
    "{} {}".format(label, split.shape)
    for label, split in (
        ("x_train", X_train),
        ("y_train", y_train),
        ("x_test", X_test),
        ("y_test", y_test),
    )
))

In [None]:
# cast the training features/labels to float32 (astype changes the dtype only)
# NOTE(review): presumably X_train / y_train are already numpy arrays here - confirm
train_x_np = X_train.astype('float32')
train_y_np = y_train.astype('float32')

print("y_train", train_y_np.shape)

# wrap features and labels in the RecordSet that linear.fit() is called with
formatted_train_data = linear.record_set(train_x_np , labels=train_y_np)

In [None]:
%%time 
# train the estimator on formatted training data
linear.fit(formatted_train_data)

In [None]:
# convert features/labels to numpy
test_x_np = X_test.astype('float32')
test_y_np = y_test.astype('float32')

In [None]:
%%time 
# deploy and create a predictor
linear_predictor = linear.deploy(initial_instance_count=1, instance_type='ml.t2.medium')

### Evaluate Model

In [None]:
# from sagemaker.predictor import csv_serializer, json_deserializer

result = linear_predictor.predict(test_x_np[0])
print (result)

In [None]:
from sklearn.metrics import mean_squared_error
import math

# code to evaluate the endpoint on test data
# returns a variety of model metrics
def evaluate(predictor, test_features, test_labels, verbose=True, num_batches=100):
    """
    Evaluate a regression model on a test set given the prediction endpoint.
    Plots the predictions against the true values and computes the mean squared error.
    :param predictor: A prediction endpoint
    :param test_features: Test features, one row per data point
    :param test_labels: True target values for the test data
    :param verbose: If True, prints the MSE and RMSE
    :param num_batches: Maximum number of requests the test set is split into
    :return: A dictionary with the keys 'Predictions' (array of predicted values)
             and 'Score' (mean squared error).
    """

    # Split the test set into at most `num_batches` chunks so each request stays small.
    # np.array_split yields empty chunks when there are fewer rows than batches;
    # those are skipped instead of being sent to the endpoint.
    batches = [batch for batch in np.array_split(test_features, num_batches) if len(batch) > 0]
    prediction_batches = [predictor.predict(batch) for batch in batches]

    # Every record returned by the endpoint carries its prediction under label['score'];
    # collect that value for each data point of each batch, in order
    test_preds = np.array([record.label['score'].float32_tensor.values[0]
                           for batch in prediction_batches
                           for record in batch])

    viz.plot_prediction(test_labels, test_preds)

    # calculate the score based on mean squared error (true values first, by convention)
    score = mean_squared_error(test_labels, test_preds)

    # printing the metrics
    if verbose:
        print("{:<11} {:.8f} MSE ({:.8f} RMSE)".format('Score:', score, math.sqrt(score)))
        print()

    return {'Predictions': test_preds, 'Score': score}

In [None]:
print('Metrics for simple, LinearLearner.\n')

# get metrics for linear predictor
# evaluate() returns a dict, so its values are looked up by key; tuple-unpacking
# the dict would only bind its key strings ('Predictions', 'Score').
metrics = evaluate(linear_predictor,
                   test_x_np,
                   test_y_np,
                   verbose=True) # verbose means we'll print out the metrics
predictions = metrics['Predictions']

In [None]:
# Deletes a predictor's endpoint
def delete_endpoint(predictor):
    """
    Delete the SageMaker endpoint behind a predictor (best effort).
    :param predictor: Predictor exposing the endpoint name as `.endpoint`
                      (`.endpoint_name` is used as a fallback)
    :return: None. The outcome is printed; ordinary failures are reported, not raised.
    """
    # Resolve the name once, outside the try block, so a predictor without the
    # attribute fails loudly instead of raising again inside the error handler.
    endpoint_name = getattr(predictor, 'endpoint', None) or predictor.endpoint_name
    try:
        boto3.client('sagemaker').delete_endpoint(EndpointName=endpoint_name)
        print('Deleted {}'.format(endpoint_name))
    except Exception as err:
        # `Exception` instead of a bare except: KeyboardInterrupt/SystemExit still
        # propagate, and the real reason is shown rather than assuming "already deleted".
        print('Could not delete {} (it may already be deleted): {}'.format(endpoint_name, err))

In [None]:
# delete the predictor endpoint 
delete_endpoint(linear_predictor)

## XGBoost

In [None]:
from sagemaker.amazon.amazon_estimator import get_image_uri
from sagemaker.predictor import csv_serializer

In [None]:
stocks = pd.read_csv(processed_data_dir + '/{0}_processed.csv'.format(symbol))
display(stocks.head())

display(stocks.tail())

In [None]:
train_fraction, val_fraction = 0.8, 0.2

X_train, X_val, X_test, y_train, y_val, y_test = ppd.train_test_split(
    stocks, train_fraction, val_frac=val_fraction)

# Report the shape of every split, one line per split
print("\n".join(
    "{} {}".format(label, split.shape)
    for label, split in (
        ("x_train", X_train),
        ("y_train", y_train),
        ("x_validation", X_val),
        ("y_validation", y_val),
        ("x_test", X_test),
        ("y_test", y_test),
    )
))

In [None]:
# Test features and test labels are written to separate CSV files (no header, no index)
X_test, y_test = pd.DataFrame(X_test), pd.DataFrame(y_test)
X_test.to_csv(os.path.join(processed_data_dir, 'XGB_features_test.csv'), index=False, header=False)
y_test.to_csv(os.path.join(processed_data_dir, 'XGB_labels_test.csv'), index=False, header=False)

# Validation and training files: label column(s) first, followed by the features
validation_frame = [pd.DataFrame(part) for part in (y_val, X_val)]
pd.concat(validation_frame, axis=1).to_csv(
    os.path.join(processed_data_dir, 'XGB_validation.csv'), index=False, header=False)
del validation_frame

pd.concat([pd.DataFrame(part) for part in (y_train, X_train)], axis=1).to_csv(
    os.path.join(processed_data_dir, 'XGB_train.csv'), index=False, header=False)

In [None]:
prefix = 'stockPrices-xgboost'
output_path = 's3://{0}/{1}/output'.format(bucket, prefix)

# Upload the test, validation and training CSV files (in that order) under `prefix`
test_location, val_location, train_location = (
    sagemaker_session.upload_data(os.path.join(processed_data_dir, csv_name), key_prefix=prefix)
    for csv_name in ('XGB_features_test.csv', 'XGB_validation.csv', 'XGB_train.csv')
)

In [None]:
# get_image_uri is used to look up the training container image for the
# 'xgboost' algorithm in the region of the current session.
container = get_image_uri(sagemaker_session.boto_region_name, 'xgboost')

# Now that we know which container to use, we can construct the estimator object.
xgb = sagemaker.estimator.Estimator(container,     # The name of the training container
                                    role = role,             # The IAM role to use (our current role in this case)
                                    train_instance_count=1,  # The number of instances to use for training
                                    train_instance_type='ml.m4.xlarge', # The type of instance to use for training
                                    output_path= output_path,  # Where to save the output (the model artifacts)
                                    sagemaker_session=sagemaker_session) # The current SageMaker session

In [None]:
# Hyperparameters of the XGBoost training job: tree booster with max_depth=3,
# eta=0.1, up to num_round=1000 rounds and early_stopping_rounds=10.
# NOTE(review): 'reg:linear' is the older name of the squared-error regression
# objective; newer XGBoost releases call it 'reg:squarederror' - confirm against
# the container version in use.
xgb.set_hyperparameters(base_score=0.5, 
                        booster='gbtree',
                        max_depth=3,
                        eta=0.1,
                        gamma=0,
                        alpha=0,
                        min_child_weight=13,
                        subsample=1.0,
                        objective='reg:linear',
                        early_stopping_rounds=10,
                        num_round=1000)

In [None]:
%%time
# This is a wrapper around the location of our train and validation data, to make sure that SageMaker
# knows our data is in csv format.
s3_input_train = sagemaker.s3_input(s3_data=train_location, content_type='csv')
s3_input_validation = sagemaker.s3_input(s3_data=val_location, content_type='csv')

xgb.fit({'train': s3_input_train, 'validation': s3_input_validation})

In [None]:
# xgb_transformer = xgb.transformer(instance_count = 1, instance_type = 'ml.m4.xlarge')

In [None]:
# xgb_transformer.transform(test_location, content_type='text/csv', split_type='Line')

In [None]:
# xgb_transformer.wait()

In [None]:
# !aws s3 cp --recursive $xgb_transformer.output_path $processed_data_dir

In [None]:
# Y_pred = pd.read_csv(os.path.join(processed_data_dir, 'XGB_features_test.csv.out'), header=None)

In [None]:
%%time
xgb_predictor = xgb.deploy(initial_instance_count=1, instance_type='ml.m4.xlarge')

In [None]:
# We need to tell the endpoint what format the data we are sending is in
xgb_predictor.content_type = 'text/csv'
xgb_predictor.serializer = csv_serializer

# Send the plain feature array: X_test was wrapped in a pandas DataFrame in the
# CSV-export cell, and iterating a DataFrame yields column labels, not rows.
predictions = xgb_predictor.predict(np.asarray(X_test))
# The response is a comma delimited string, returned as bytes when no deserializer
# is configured, so decode it before breaking it up into a numpy array.
if isinstance(predictions, bytes):
    predictions = predictions.decode('utf-8')
predictions = np.fromstring(predictions, sep=',')
predictions = pd.DataFrame(predictions)

In [None]:
# Plot the endpoint's predictions against the true test labels.
# `Y_pred` only exists in the commented-out batch-transform cells, so using it here
# raised a NameError; `predictions` is what the endpoint cell actually produces.
viz.plot_prediction(y_test, predictions)
# plt.scatter(y_test, predictions)

In [None]:
from sklearn.metrics import mean_squared_error
import math

score = mean_squared_error(y_test, predictions)
print("{:<11} {:.8f} MSE ({:.8f} RMSE)".format('Score:', score, math.sqrt(score)))

In [None]:
# delete the predictor endpoint 
delete_endpoint(xgb_predictor)

## Long Short-Term Memory (LSTM) Networks

In [None]:
stocks = pd.read_csv(processed_data_dir + '/{0}_processed.csv'.format(symbol))
display(stocks.head())

In [None]:
train_fraction, val_fraction = 0.8, 0.2
history_size, target_size = 200, 14

X_train, X_val, X_test, y_train, y_val, y_test = ppd.create_dataset_lstm(
    stocks,
    train_frac=train_fraction,
    history_size=history_size,
    target_size=target_size,
    val_frac=val_fraction,
)

# Features are reshaped to (-1, history_size, 1) and targets to (-1, target_size);
# the test split is left as returned and reshaped later
X_train, y_train = X_train.reshape((-1, history_size, 1)), y_train.reshape((-1, target_size))
X_val, y_val = X_val.reshape((-1, history_size, 1)), y_val.reshape((-1, target_size))

# Report the shape of every split, one line per split
print("\n".join(
    "{} {}".format(label, split.shape)
    for label, split in (
        ("x_train", X_train),
        ("y_train", y_train),
        ("x_validation", X_val),
        ("y_validation", y_val),
        ("x_test", X_test),
        ("y_test", y_test),
    )
))

In [None]:
from src.lstm.lstm_model import build_lstm_model

In [None]:
%%time
model = build_lstm_model(input_dim = X_train.shape[2], output_dim = target_size, return_sequences=True)
model.compile(loss='mean_squared_error', optimizer='adam')

In [None]:
model.summary()

In [None]:
%%time
# Train for 100 epochs on the training split only.
# NOTE(review): X_val / y_val are prepared earlier in the notebook but are not passed
# to fit(), and batch_size reuses history_size - confirm both are intended.
model.fit(X_train,
          y_train,
          epochs=100,
          batch_size = history_size)

In [None]:
X_test = X_test.reshape((-1, history_size, 1))
y_test = y_test.reshape((-1, target_size))

In [None]:
%%time
predictions = model.predict(X_test)

In [None]:
# NOTE(review): y_test[:, 4] selects a single target column while `predictions` is
# passed unsliced - confirm plot_prediction expects this, or whether predictions
# needs the same column selection.
viz.plot_prediction(y_test[:,4],predictions)

## Data Cleanup

In [None]:
# Remove the data directories together with everything they contain.
# shutil.rmtree replaces `%rm -rf $var/*`: when a variable is undefined the magic can
# hand the unexpanded `$var` to the shell (where it expands to nothing), whereas this
# version stops with a NameError before anything is deleted.
import shutil

# ignore_errors=True mirrors `rm -f`: directories that are already gone are not an error
for directory in (processed_data_dir, data_dir):
    shutil.rmtree(directory, ignore_errors=True)