# Notes for user:

*   This notebook uses a COVID-19 dataset together with a forex dataset to predict the forex spot price of a currency pair.
*   What to ask user before using this application:
*   What currency pair are you interested in? USD/GBP (This layout means that the base currency is USD)
*   In other words, the user wants to know how much one USD is worth in GBP.
*   Therefore, we look at the death rates in the US and the UK.
*   LSTMs are good at modelling changes over time. We also need to consider that what happened more recently is a better predictor than what happened a long time ago. This is why CNNs might be interesting to explore.


In [None]:
# Use some functions from tensorflow_docs
!pip install git+https://github.com/tensorflow/docs

In [None]:
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print(tf.__version__)

In [None]:
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling

# Data Collection


In [None]:
# Load the COVID-19 dataset with pandas; "?" entries are parsed as NaN.
# Null rows are NOT dropped here: the dropna() call below is commented out.
# NOTE(review): comment='\t' tells the parser to ignore the rest of a line after a tab
# character - confirm this is really wanted for covid.csv.
covid_dataset_data= pd.read_csv('covid.csv', na_values = "?", comment='\t', skipinitialspace=True)
# Work on a copy so the frame returned by read_csv stays untouched.
covid_dataset = covid_dataset_data.copy()
# covid_dataset = covid_dataset.dropna()
covid_dataset.head()

In [None]:
# Isolate the rows for the country behind the currency in question.
# The pair is USD/GBP, so the relevant locations are 'United States' and 'United Kingdom';
# only the United States rows are selected here.
# Known problem: the loaded data was observed to stop at 'Trinidad and Tobago', so the
# United Kingdom / United States rows could not be displayed - check that covid.csv is complete.
# `.copy()` gives an independent frame: a later cell assigns to its 'date' column, which
# would otherwise raise pandas' SettingWithCopyWarning.
us_death_rates = covid_dataset.loc[
    covid_dataset.location == 'United States',
    ['location', 'date', 'total_deaths', 'total_cases', 'cvd_death_rate'],
].copy()
us_death_rates.head()

In [None]:
# us_death_rates = covid_dataset.loc[covid_dataset.location=='Zimbabwe', ['location', 'date', 'total_deaths','total_cases','cvd_death_rate']]
# us_death_rates.head()

In [None]:
# (rows, columns) of the United States subset; zero rows means the location filter matched nothing.
us_death_rates.shape

In [None]:
# Import the forex price history from GBP_USD.csv; "?" entries are parsed as NaN.
# NOTE(review): comment='\t' tells the parser to ignore the rest of a line after a tab
# character - confirm this is really wanted for this file.
forex_dataset_data= pd.read_csv('GBP_USD.csv', na_values = "?", comment='\t', skipinitialspace=True)
# Work on a copy so the frame returned by read_csv stays untouched.
forex_dataset = forex_dataset_data.copy()
# forex_dataset = forex_dataset_data.dropna()


# Data Preprocessing / Data Cleaning & Collating

In [None]:
# Filter dataset: drop the columns that are not used as features.
# Deriving the result from the raw frame (instead of popping columns in place) keeps this
# cell idempotent: re-running it no longer raises KeyError for an already removed column.
forex_dataset = forex_dataset_data.drop(columns=['Open', 'High', 'Low', 'Change %'])

In [None]:
# Show the last rows of the filtered forex frame.
forex_dataset.tail()

In [None]:
# Isolate covid_date
# Parse the 'YYYY-MM-DD' strings of the covid 'date' column into time.struct_time objects
# and write them back, so they can be matched against the forex 'Date' column in the merge.
# NOTE(review): this cell is not idempotent - on a second run the column no longer holds
# strings, so x.replace(...) fails. pd.to_datetime would be the more usual choice, but the
# forex 'Date' column must be converted the same way or the merge keys will not match.
import time 
us_date = us_death_rates['date']
new_us_dates = [time.strptime(x.replace("-"," "), '%Y %m %d')for x in us_date]
us_death_rates['date'] = new_us_dates
us_death_rates.head()


In [None]:
# Isolate forex_date
# Parse the forex 'Date' strings (commas are stripped first, then read as
# '<abbreviated month> <day> <year>') into time.struct_time objects, the same type used
# for the covid 'date' column, so both columns can serve as merge keys.
# NOTE(review): not idempotent - a second run fails because the column no longer holds strings.
import time 
forex_date = forex_dataset['Date']
new_forex_dates = [time.strptime(x.replace(',', ""), '%b %d %Y')for x in forex_date]
forex_dataset['Date'] = new_forex_dates
forex_dataset.tail()


In [None]:
# Number of rows in the forex frame before merging.
print(len(forex_dataset))

In [None]:
# Join the forex rows with the US covid rows on their date columns.
# No `how` is given, so pandas performs its default inner join: only dates that are
# present in BOTH frames survive.
dataset = forex_dataset.merge(us_death_rates, left_on='Date', right_on='date')
# Number of merged rows (shown as the cell output).
len(dataset)


In [None]:
# Remove the covid 'date' column; the forex 'Date' column is kept as the date of each row.
# We pop the 'date' from the covid_dataset because this column starts later than the forex column
# We want to see the forex price before covid happened
# NOTE(review): popping a column does not bring back any rows - whether pre-covid forex rows
# are present is decided by the merge, not here.
# NOTE(review): pop() mutates `dataset`, so running this cell twice raises KeyError.
dataset.pop('date')

In [None]:
# Problem: This still starts at 2019,12,13 when we want 2019,12,02
# However this does not affect our data graph for some reason - scroll down
# NOTE(review): pd.merge was called without `how`, i.e. as an inner join, so forex dates
# without a matching covid row are not in `dataset` - presumably the cause of the later start.
dataset.tail()

In [None]:
# Display the total_deaths column together with its dtype.
# An `object` dtype means generic Python objects, which in practice is usually strings.
# NOTE(review): if the dtype shown is object, convert with pd.to_numeric before doing
# arithmetic on this column.
dataset['total_deaths']

In [None]:
# View what date looks like
# `date` is the merged 'Date' column; its length sizes the generated date axis further down.
date = dataset['Date']
# Only the last expression of a cell is displayed, so this head() produces no output.
date.head()
len(date)

In [None]:
# Build a date axis for plotting: len(date) consecutive calendar days starting 2019-12-31.
# NOTE(review): these dates are generated, not taken from dataset['Date']; they match the
# real rows only if the data is daily without gaps, starts on 2019-12-31 and is in
# ascending order - confirm, otherwise the x-axis of the plots below is mislabelled.
dateFormatting = pd.DataFrame({'new_date': pd.date_range('2019-12-31', periods=len(date))})
# dateFormatting['new_date'] = [d.date() for d in dateFormatting['my_timestamp']]
# dateFormatting['new_time'] = [d.time() for d in dateFormatting['my_timestamp']]
print(dateFormatting)


In [None]:
# Convert date to list so that we can use the dates to plot on graph
my_xticks = dateFormatting['new_date'].tolist()
# Prints every generated timestamp; my_xticks[:5] would be enough for a quick check.
print(my_xticks)

In [None]:
# Display the merged 'Date' column (the values parsed from the forex file).
dataset['Date']

# Plotting Features before Normalisation

In [None]:
# Plot the forex price against the generated date axis.
# Notice how the dates start and end

from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from matplotlib import dates as mpl_dates
from matplotlib import rcParams

# The 'seaborn' style sheet was renamed to 'seaborn-v0_8' in matplotlib 3.6 and the old
# name is gone in newer releases, so use whichever name this installation provides.
for _style_name in ('seaborn-v0_8', 'seaborn'):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
# Set the figure size after the style, because a style sheet may reset it.
rcParams['figure.figsize'] = 15,6

dates = np.array(dateFormatting['new_date'])
price = np.array(dataset['Price'])
# plt.plot_date is deprecated; plt.plot accepts datetime x-values directly.
# marker='o' reproduces the default marker that plot_date used.
plt.plot(dates, price, marker='o', linestyle='solid')
plt.gcf().autofmt_xdate()
date_format = mpl_dates.DateFormatter('%b,%d,%Y')
plt.title('Time Series Price of USD compare to GBP')
plt.xlabel('Date')
plt.ylabel('Price')
plt.gca().xaxis.set_major_formatter(date_format)

In [None]:
# Scratch cell: keep the total_deaths values as an array (`test`) plus a reversed
# copy of them (`reversed_arr`), then show the shape of the price array.
test = np.array(dataset['total_deaths'])
list_arr = np.array(test)
reversed_arr = list_arr[::-1]
price.shape

In [None]:
# Running total of the total_deaths values, taken from the last row to the first
# (the data is accumulated from the end because of the row order of `dataset`).
# NOTE(review): if 'total_deaths' is already a cumulative count in the source data, this
# accumulates it a second time - confirm which column holds the daily figures.
total_us_deaths = dataset['total_deaths']
list_arr = np.array(total_us_deaths)
reversed_arr = list_arr[::-1]
# np.cumsum replaces the hand-written accumulation loop; `day` stays a plain list.
day = np.cumsum(reversed_arr).tolist()


In [None]:
# Plots the accumulated death counts of a country against the generated date axis.
dates = np.array(dateFormatting['new_date'])
us_deaths = np.array(day)
# plt.plot_date is deprecated; plt.plot accepts datetime x-values directly.
# marker='o' reproduces the default marker that plot_date used.
plt.plot(dates, us_deaths, marker='o', linestyle='solid')
plt.gcf().autofmt_xdate()
date_format = mpl_dates.DateFormatter('%b,%d,%Y')
plt.title('Time Series of Total Deaths of US')
plt.xlabel('Date')
plt.ylabel('Total Deaths')
# The formatter was created but never attached to the axis; apply it as the price plot does.
plt.gca().xaxis.set_major_formatter(date_format)


In [None]:
# Shape of the total_deaths array; it should match price.shape.
test.shape

In [None]:
# Shape of the price array; it should match test.shape.
price.shape

In [None]:
# One single-column frame per feature; both are scaled in the normalisation section.
price_frame = pd.DataFrame({"Price": np.asarray(price)})
death_frame = pd.DataFrame({"Total_Deaths_US": np.asarray(day)})
# Preview both features side by side. DataFrame.append was removed in pandas 2.0 and it
# stacked the rows underneath each other, leaving NaN-padded columns.
pd.concat([price_frame, death_frame], axis=1)

# Normalisation of Features

In [None]:
# Scale each feature to the range [0, 1].
# NOTE(review): both scalers are fitted on the full series, before the train/validation/test
# split, so the scaling uses information from the evaluation data.
from sklearn.preprocessing import MinMaxScaler

# Each feature gets its own scaler. Refitting a single scaler for the price would discard
# the deaths scaling, making it impossible to transform the deaths column back.
deaths_scaler = MinMaxScaler(feature_range=(0,1))
normalisation_us_deaths = deaths_scaler.fit_transform(death_frame).reshape(-1,1)

# `scaler` is the PRICE scaler; the name is kept because later cells use it to convert
# predictions back to price units.
scaler = MinMaxScaler(feature_range=(0,1))
normalisation_price = scaler.fit_transform(price_frame).reshape(-1,1)
# 8 decimal: readable format
# val = [print("{:.8f}".format(float(x)))for x in normalisation_us_deaths]
# print(normalisation_price)
# print(val)
# print(normalisation_price)

In [None]:
# Inspect the data
# Overlay both normalised features on one axis to compare how they move over time.
# NOTE(review): no x-values are passed, so the x-axis is the row index, not a calendar
# date as the 'Date' label suggests.
import matplotlib.pyplot as plt
plt.title('Time Series of Total Deaths of US & USD/GBP')
plt.plot(normalisation_us_deaths,label="us_covid_death_rates")
plt.plot(normalisation_price, label="USD/GBP price")
plt.ylabel('Normalisation between 0 and 1')
plt.xlabel('Date')
plt.legend() 
plt.show()

# Preprocessing the Data - Train, Validation & Test


In [None]:
# Combine the two (n, 1) feature columns into one (n, 2) array:
# column 0 = normalised deaths, column 1 = normalised price.
new_dataset = np.concatenate((normalisation_us_deaths, normalisation_price), axis=1)
new_dataset

In [None]:
# Chronological train / validation / test split.
# The rows are kept in order (no shuffling, no cross validation) because each value
# depends on the ones before it.
# First 65% of the rows -> training; the remainder is halved (rounded down) into
# validation and test, with the test part taking any left-over row.
training_size = int(0.65 * len(new_dataset))
test_size = (len(new_dataset) - training_size) // 2
_val_end = training_size + test_size
train_data = new_dataset[:training_size]
valset_data = new_dataset[training_size:_val_end]
test_data = new_dataset[_val_end:]
for _part in (train_data, valset_data, test_data):
    print(_part.shape)

In [None]:
# First four validation rows (columns: normalised deaths, normalised price).
print(valset_data[:4])

In [None]:
# Row counts used for the split boundaries.
training_size, test_size

In [None]:
# Actual lengths of the training and test partitions.
len(train_data), len(test_data)

In [None]:
# train_data

In [None]:
# Convert an array of values into a dataset matrix
# time_step = how many previous days you have to consider before making a prediction
def create_dataset(dataset, time_step=1, target_col=1, include_last=False):
  """Slice a 2-D series into sliding windows and next-step targets.

  Window ``i`` holds rows ``i .. i + time_step - 1`` (all columns); its target is the
  value of column ``target_col`` in row ``i + time_step``.

  Args:
    dataset: array-like of shape (n_rows, n_features), ordered in time.
    time_step: number of consecutive rows in each input window (must be >= 1).
    target_col: column index used as the prediction target (default 1).
    include_last: the historical behaviour (default ``False``) stops one window early
      and never uses the final row as a target. Pass ``True`` to include that window.

  Returns:
    ``(X, Y)`` where ``X`` has shape (n_windows, time_step, n_features) and ``Y`` has
    shape (n_windows,). Both are empty arrays when the series is too short.

  Raises:
    ValueError: if ``time_step`` is smaller than 1.
  """
  if time_step < 1:
    raise ValueError("time_step must be a positive integer")
  data = np.asarray(dataset)
  # The last valid start index is len(data) - time_step - 1; the default drops it as well
  # so that existing callers keep getting exactly the same number of samples.
  n_windows = len(data) - time_step - (0 if include_last else 1)
  dataX, dataY = [], []
  for i in range(max(n_windows, 0)):
    dataX.append(data[i:i + time_step])
    dataY.append(data[i + time_step][target_col])
  return np.array(dataX), np.array(dataY)

In [None]:
# weekly_timestep means taking in 7 days of features at a time
# (the `time_step=1` in create_dataset's signature is only the default; 7 is passed
# explicitly below and overrides it).
# Each partition is windowed separately, so no window spans two partitions.
weekly_timestep = 7
X_train, Y_train = create_dataset(train_data, weekly_timestep)
X_val, Y_val = create_dataset(valset_data, weekly_timestep)
X_test, Y_test = create_dataset(test_data, weekly_timestep)



In [None]:
# Finished preprocessing the data: show the shapes of the test windows and targets.
# Two separate statements: joining the prints with a comma built a (None, None) tuple
# that was echoed as the cell output.
print(X_test.shape)
print(Y_test.shape)

# Stacked LSTM Model

In [None]:
# Create the stacked LSTM model
# One LSTM after the other
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
def lstm_model(lr_rate, hidden_layer, optimizer_one, input_shape=(7, 2)):
  """Build and compile a three-layer stacked LSTM regressor.

  Args:
    lr_rate: learning rate handed to the optimizer.
    hidden_layer: number of units in each of the three LSTM layers.
    optimizer_one: optimizer name (case-insensitive): 'sgd', 'rmsprop', 'adam',
      'adagrad', 'adadelta', 'adamax', 'nadam' or 'ftrl'.
    input_shape: (time steps, features) of one sample; defaults to the (7, 2)
      windows used in this notebook.

  Returns:
    A compiled Sequential model with one output unit and mean-squared-error loss.

  Raises:
    ValueError: if ``optimizer_one`` is not a supported name. Previously every name
      other than 'adam' silently produced Adamax, which would corrupt a grid search
      over optimizers.
  """
  optimizer_classes = {
      'sgd': keras.optimizers.SGD,
      'rmsprop': keras.optimizers.RMSprop,
      'adam': keras.optimizers.Adam,
      'adagrad': keras.optimizers.Adagrad,
      'adadelta': keras.optimizers.Adadelta,
      'adamax': keras.optimizers.Adamax,
      'nadam': keras.optimizers.Nadam,
      'ftrl': keras.optimizers.Ftrl,
  }
  optimizer_name = str(optimizer_one).lower()
  if optimizer_name not in optimizer_classes:
    raise ValueError(
        "Unsupported optimizer {!r}; expected one of {}".format(
            optimizer_one, sorted(optimizer_classes)))
  opt = optimizer_classes[optimizer_name](learning_rate=lr_rate)

  model = Sequential()
  model.add(InputLayer(input_shape=input_shape))
  # The first two LSTM layers return the full sequence so the next LSTM receives one
  # vector per time step; the last one returns only its final state for the Dense head.
  model.add(LSTM(hidden_layer, return_sequences=True))
  model.add(LSTM(hidden_layer, return_sequences=True))
  model.add(LSTM(hidden_layer))
  model.add(Dense(1))
  model.compile(loss='mean_squared_error', optimizer=opt)
  return model

## Hyperparameter Tuning


*   Optimizers:
[Overview of different Optimizers for neural networks](https://medium.com/datadriveninvestor/overview-of-different-optimizers-for-neural-networks-e0ed119440c3#:~:text=Adagrad%20eliminates%20the%20need%20to,is%20no%20longer%20able%20learning.)
*   [Conceptual Guide for HP Tuning](https://medium.com/@jackstalfort/hyperparameter-tuning-using-grid-search-and-random-search-f8750a464b35)
*   [Consider this for Grid Search & Random Search](https://scikit-learn.org/stable/auto_examples/model_selection/plot_randomized_search.html)
*   [And this](https://stackoverflow.com/questions/58137140/randomizedsearchcv-with-keras-lstm-regression)
![Grid Search & Random Search](https://miro.medium.com/max/1200/1*ZTlQm_WRcrNqL-nLnx6GJA.png)



## Grid Search

*   We must choose candidates for good hidden layers and optimizers. Then plot a comparison.
*   List of optimizers: 
*   optimizers = ['SGD', 'RMSprop','adam','adagrad', 'adadelta', 'adamax','nadam', 'ftrl']
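*   Callbacks: `EarlyStopping` with `restore_best_weights=True` restores the weights from the best epoch, which gives us the best model seen during training.
*   Consider changing the batch size: a smaller `batch_size` may train better here, because the dataset is small.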




In [None]:
# Grid search over hidden-layer size, optimizer and learning rate.
hidden_layer_list = [14,20]
best_hidden_layer = 1
optimizer_list = ['adam','adamax']
best_optimizer = 'adam'
# NOTE(review): both learning rates are identical, so every configuration is trained
# twice - presumably a second, different value was intended.
learning_rate_list = [3e-3, 3e-3]
best_learning_rate = 3e-3
# Start from +inf so the first evaluated model always becomes the current best; with the
# former start value of 1, a search whose losses all exceeded 1 kept the placeholders above.
least_loss = float('inf')



for each_hidden_layer in hidden_layer_list:
  for each_optimizer in optimizer_list:
    for each_learning_rate in learning_rate_list:
      our_model = lstm_model(each_learning_rate, each_hidden_layer, each_optimizer)
      our_model.fit(X_train, Y_train,
                validation_data=(X_val, Y_val),
                epochs=20, batch_size=10, verbose=1,
                # Stop after 3 epochs without improvement and keep the best weights.
                callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])
      # Select hyperparameters on the VALIDATION set. Choosing them on the test set
      # would leak test information and make the final test score optimistic.
      loss = our_model.evaluate(X_val, Y_val)
      if loss < least_loss:
          least_loss = loss
          best_learning_rate = each_learning_rate
          best_optimizer = each_optimizer
          best_hidden_layer = each_hidden_layer
  # Progress report after each hidden-layer size: best configuration found so far.
  print(least_loss, best_learning_rate, best_optimizer, best_hidden_layer)

# Best Model Prediction

In [None]:
# Re-train a fresh model with the best hyperparameters found by the grid search.
best_param_model = lstm_model(best_learning_rate, best_hidden_layer, best_optimizer)
# Use the same early-stopping callback as the grid search so the final model is trained
# under the conditions its hyperparameters were selected for.
best_param_model.fit(X_train, Y_train, validation_data=(X_val, Y_val),
                     epochs=20, batch_size=10, verbose=1,
                     callbacks=[tf.keras.callbacks.EarlyStopping(patience=3, restore_best_weights=True)])
# Mean squared error on the held-out test windows (normalised units).
loss = best_param_model.evaluate(X_test, Y_test)


In [None]:
# Predict on the training and the test windows (outputs are still normalised).
train_predict = best_param_model.predict(X_train)
test_predict = best_param_model.predict(X_test)
# Show both shapes; train_predict.shape was listed twice before.
train_predict.shape, test_predict.shape

In [None]:
# Transform back to original form
# `scaler` was last fitted on the price frame, so this converts the normalised predictions
# back to price units.
# NOTE(review): the variables are overwritten in place, so this cell is not idempotent -
# running it a second time inverse-transforms values that are already prices.
train_predict=scaler.inverse_transform(train_predict)
test_predict=scaler.inverse_transform(test_predict)
# print(train_predict, test_predict)

# Performance Metric




In [None]:
# Calculate the RMSE performance metrics
# Output for the train dataset
import math
from sklearn.metrics import mean_squared_error
# train_predict has already been converted back to price units, while Y_train is still
# normalised to [0, 1]; bring the targets to the same scale before comparing them.
Y_train_price = scaler.inverse_transform(np.reshape(Y_train, (-1, 1)))
math.sqrt(mean_squared_error(Y_train_price, train_predict))

In [None]:
# RMSE on the test dataset.
# test_predict has already been converted back to price units, while Y_test is still
# normalised to [0, 1]; bring the targets to the same scale before comparing them.
Y_test_price = scaler.inverse_transform(np.reshape(Y_test, (-1, 1)))
math.sqrt(mean_squared_error(Y_test_price, test_predict))

In [None]:
# Evaluating: How well is model performing 
# Probability distribution rather than time series
# Tensorflow??
# KL Divergence
# Consider this for evaluation: https://www.tensorflow.org/api_docs/python/tf/keras/losses/KLDivergence
# kl divergence measures difference between 2 probability distributions, 
# measure of what i am predicting and what should have been predicting

# Actual Data, Train Prediction & Test Prediction data

In [None]:
# Overlay the train and test predictions on the actual price series.

# Each prediction is for the row that follows a window of `look_back` rows, so the first
# prediction of a partition belongs `look_back` rows after the start of that partition.
look_back = 7

# Actual prices in original units. Only column 1 (price) is transformed: `scaler` was
# fitted on the price alone and must not be applied to the deaths column.
actual_price = scaler.inverse_transform(new_dataset[:, 1].reshape(-1, 1)).ravel()

# Series as long as the dataset, NaN everywhere (NaN points are simply not drawn) except
# where a train prediction exists.
trainPredictPlot = np.full(len(new_dataset), np.nan)
trainPredictPlot[look_back:look_back + len(train_predict)] = np.ravel(train_predict)
print(trainPredictPlot.shape)

# The test partition starts after the training AND validation rows. The former slice
# ended at training_size + test_size, which put the test predictions over the validation
# period and only matched their length by coincidence.
test_start = training_size + test_size + look_back
testPredictPlot = np.full(len(new_dataset), np.nan)
testPredictPlot[test_start:test_start + len(test_predict)] = np.ravel(test_predict)
print(testPredictPlot.shape)

# Plot baseline and predictions
plt.title('Analysis of train, test and actual')
plt.plot(actual_price, label="actual dataset")
plt.plot(trainPredictPlot, label="train_predict")
plt.plot(testPredictPlot, label="test_predict")

# The gap between the train and test predictions is the validation period.
plt.ylabel('Forex price')
plt.xlabel('Date')
plt.legend()
plt.show()

# Custom 4 day prediction

*   Training train data to predict future days and evaluating the accuracy on validation data.
*   Training train data to predict future days, no way of evaluating however this can tell us what is likely to occur.
*   Using last 7 days of training data to predict the future custom days(4). 



In [None]:
# Prepare the input for the custom-day forecast.
# print(test_data[80:])
# is this supposed to be test or train_data
# X_input: the rows of test_data from index 40 onward, flattened into a single row.
# NOTE(review): 40 is a hard-coded offset and the forecasting cell below assigns X_input
# again before using it, so this value is only displayed here.
X_input=test_data[40:].reshape(1,-1)
print(X_input)
X_input.shape

# temp_input: the last 7 rows of the TRAINING data - the seed window for the forecast.
temp_input = train_data[-7:]
temp_input



In [None]:
# Demonstrate prediction for the next 4 days (recursive one-step-ahead forecast):

# Step 1. Start from the last 7 days of features (temp_input, taken from the training set).
# Step 2. Predict the price of the 8th day from the current 7-day window.
# Step 3. Store the predicted price in the output list.
# Step 4. Build the row for the new day: predicted price + deaths carried forward.
# Step 5. Drop the oldest day and append the new row, so the window is 7 days again.
# Step 6. Repeat until custom_day days have been predicted.

from numpy import array
custom_day = 4
lst_output=[]
n_steps=7
n_features=2

# Rolling window of shape (n_steps, n_features); columns are [deaths, price].
# The previous implementation appended with np.append without an axis, which flattened
# the window to 1-D; dropping one scalar per step then shifted the deaths and price
# values into each other's column.
window = array(temp_input, dtype=float).reshape((n_steps, n_features))
for i in range(custom_day):
  # Correct input shape (1,7,2)
  X_input = window.reshape((1, n_steps, n_features))
  Y_hat = best_param_model.predict(X_input, verbose=0)
  lst_output.append(Y_hat[0][0])
  # The model predicts the price only. The future deaths value is unknown, so the last
  # known value is carried forward - a simplifying assumption; replace it with a real
  # deaths forecast when one is available.
  next_row = array([[window[-1, 0], Y_hat[0][0]]], dtype=float)
  window = np.concatenate((window[1:], next_row), axis=0)
# [0.039997526, 0.035484765, 0.029113937, 0.025276443]
# Print the predicted (normalised) forex price for the next 4 days
print(lst_output)

### Plotting evaluation on graph. Training output versus first 4 days of the validation data.

In [None]:
# Custom day plotted
# Compare the predicted days with the first days of the validation data (normalised units).
# day_new is only referenced by the commented-out plot calls below.
day_new = custom_day-1

# Predicted 4 days from training
# Plot and label
# The values are still normalised; the inverse transform is commented out.
custom_day_prediction = np.array(lst_output)
print(custom_day_prediction)
plt.plot(custom_day_prediction, label="Predicted")
# plt.plot(day_new, scaler.inverse_transform([custom_day_prediction]))

# Actual 4 days from validation
# Plot and label
# Column 1 of each validation row is the normalised price; take the first custom_day values.
val_data = [item[1] for item in valset_data] 
custom_day_validation = val_data[:custom_day]
print(custom_day_validation)
plt.plot(custom_day_validation, label="Actual")
# plt.plot(day_new, scaler.inverse_transform([custom_day_validation]))

# Information
plt.title('Predicted custom day from training data versus actual custom day from validation data')
plt.xlabel('Number of custom days')
plt.ylabel('Price')
plt.legend()
plt.show()

# our_model = [0.7160597  0.46945733 0.75719374 0.4917585 ]

In [None]:
# Predicted versus actual custom days, in ORIGINAL price units.
day_new = custom_day-1
# One x position per forecast day: 1 .. custom_day.
day_axis = np.arange(1, custom_day + 1)

# Predicted 4 days from training
# Transform back to the original price scale. The scaler expects a single feature column,
# hence the reshape to (n, 1).
custom_day_prediction = np.array(lst_output)
print(custom_day_prediction)
predicted_price = scaler.inverse_transform(custom_day_prediction.reshape(-1, 1)).ravel()
plt.plot(day_axis[:len(predicted_price)], predicted_price, marker='o', label="Predicted")

# Actual 4 days from validation
# The forecast was seeded with the last training window, so the days it predicts are the
# FIRST custom_day validation days (not the ones after them).
val_data = [item[1] for item in valset_data]
custom_day_validation = val_data[:custom_day]
custom_day_actual_price = scaler.inverse_transform(
    np.array(custom_day_validation, dtype=float).reshape(-1, 1)).ravel()
plt.plot(day_axis[:len(custom_day_actual_price)], custom_day_actual_price, marker='o', label="Actual")

# Information
plt.title('Predicted custom day from training data versus actual custom day from validation data')
plt.xlabel('Number of custom days')
plt.ylabel('Price')
plt.legend()
plt.show()



In [None]:
# Custom day plotted
# NOTE(review): this cell repeats the normalised predicted-vs-actual plot that already
# appears at the start of this section; consider a shared plotting function instead.
# day_new is only referenced by the commented-out plot calls below.
day_new = custom_day-1

# Predicted 4 days from training
# Plot and label
# The values are still normalised; the inverse transform is commented out.
custom_day_prediction = np.array(lst_output)
print(custom_day_prediction)
plt.plot(custom_day_prediction, label="Predicted")
# plt.plot(day_new, scaler.inverse_transform([custom_day_prediction]))

# Actual 4 days from validation
# Plot and label
# Column 1 of each validation row is the normalised price; take the first custom_day values.
val_data = [item[1] for item in valset_data] 
custom_day_validation = val_data[:custom_day]
print(custom_day_validation)
plt.plot(custom_day_validation, label="Actual")
# plt.plot(day_new, scaler.inverse_transform([custom_day_validation]))

# Information
plt.title('Predicted custom day from training data versus actual custom day from validation data')
plt.xlabel('Number of custom days')
plt.ylabel('Price')
plt.legend()
plt.show()

In [None]:
# What we have:
# day_new=np.arange(1,8)
# print(day_new)
# What we want to predict:
# day_pred=np.arange(8,12)
# print(day_pred)

# Conclusion and next steps
We explored the LSTM algorithms:

*   List item
*   List item


In conclusion it seems that ..