# Notes for user:
*   This notebook will use the US gasoline price API dataset and forex to predict the forex spot price of a currency pair.
*   What to ask user before using this application:
1.   What currency pair are you interested in? USD/GBP (This layout means that the base currency is USD)
2.   The currency the user wants to look at is how much the USD is worth to the GBP. Therefore, we look at the price of gasoline of the US and the UK.
*   To do: We will need to organise US gasoline price dataset/ merge the datasets.

In [1]:
!pip install git+https://github.com/tensorflow/docs

Collecting git+https://github.com/tensorflow/docs
  Cloning https://github.com/tensorflow/docs to /tmp/pip-req-build-xk1xvf7i
  Running command git clone -q https://github.com/tensorflow/docs /tmp/pip-req-build-xk1xvf7i
Building wheels for collected packages: tensorflow-docs
  Building wheel for tensorflow-docs (setup.py) ... [?25l[?25hdone
  Created wheel for tensorflow-docs: filename=tensorflow_docs-0.0.0f84004a0ede6e4cb87f98edbf46f6c2784b002bd_-cp36-none-any.whl size=126412 sha256=57d89fa7f04269eb64d469f9344f7befd4dabe89278db3ad43bed870712a4f9c
  Stored in directory: /tmp/pip-ephem-wheel-cache-_yjc0vqi/wheels/eb/1b/35/fce87697be00d2fc63e0b4b395b0d9c7e391a10e98d9a0d97f
Successfully built tensorflow-docs


In [2]:
import pathlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
print(tf.__version__)
import tensorflow_docs as tfdocs
import tensorflow_docs.plots
import tensorflow_docs.modeling
# modelling
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor

#modeling metrics
from sklearn import metrics

2.2.0


# Data Collection

In [3]:
# Import forex
# Need to get the most recent dataset
forex_dataset_data= pd.read_csv('GBP_USD.csv', na_values = "?", comment='\t', skipinitialspace=True)
forex_dataset = forex_dataset_data.copy()
# forex_dataset = forex_dataset_data.dropna()
# Filter dataset


In [4]:
forex_dataset.shape

(155, 6)

In [5]:
forex_dataset.head() 

Unnamed: 0,Date,Price,Open,High,Low,Change %
0,"Jul 03, 2020",1.2482,1.2466,1.2493,1.2439,0.12%
1,"Jul 02, 2020",1.2467,1.2476,1.2531,1.2456,-0.05%
2,"Jul 01, 2020",1.2474,1.2401,1.2492,1.2358,0.59%
3,"Jun 30, 2020",1.2401,1.2298,1.2402,1.2257,0.83%
4,"Jun 29, 2020",1.2299,1.234,1.2391,1.2251,-0.30%


In [6]:
# testing uk_fuel_prices as us_fuel_prices
us_fuel_dataset = pd.read_csv('uk_fuel_prices.csv', encoding= 'unicode_escape')
us_fuel_dataset.head()

Unnamed: 0,Date,ULSP,ULSD,ULSP.1,ULSD.1,ULSP.2,ULSD.2,Unnamed: 7,Unnamed: 8,Unnamed: 9,Unnamed: 10,Unnamed: 11,Unnamed: 12,Unnamed: 13
0,09/06/2003,74.59,76.77,45.82,45.82,17.5,17.5,,ULSP = Ultra low sulpur unleaded petrol,,,,,
1,16/06/2003,74.47,76.69,45.82,45.82,17.5,17.5,,ULSD = Ultra low sulphur diesel,,,,,
2,23/06/2003,74.42,76.62,45.82,45.82,17.5,17.5,,,,,,,
3,30/06/2003,74.35,76.51,45.82,45.82,17.5,17.5,,ULSP and ULSD,,,,,
4,07/07/2003,74.28,76.46,45.82,45.82,17.5,17.5,,Duty rate per litre (£) from 7 March 2001,,,,,0.4582


In [7]:
def retrieve_time_series(api, series_ID):
    """
    Return the time series dataframe, based on table and unique Series ID
    """
    #Retrieve Data By Series ID 
    series_search = us_fuel_dataset.data_by_series(series=series_ID)
    ##Create a pandas dataframe from the retrieved time series
    data = series_search['ULSP, Date']
    us_fuel_dataset = pd.DataFrame(list(data.items()), columns=["Date", "ULSP"])
    return us_fuel_dataset
    
def get_us_fuel_dataset():
  
    #Declare desired series ID
    series_ID='PET.EMM_EPM0_PTE_NUS_DPG.W'
    us_fuel=retrieve_time_series(us_fuel_dataset, series_ID)
    return us_fuel

In [8]:
us_fuel_dataset = get_us_fuel_dataset()
us_fuel_dataset.shape


UnboundLocalError: ignored

In [None]:
us_fuel_dataset.head()

# Data Preprocessing / Data Cleaning & Collating

In [None]:
# Create Time series and price columns
# First convert string type to datetime type for pandas series "date"
# Then convert price to numeric type
us_fuel_dataset['Date'] = us_fuel_dataset['Date'].apply(lambda x: pd.to_datetime(str(x.strip().replace(" ", "")[0:8]), format='%d%m%Y'))
us_fuel_dataset['ULSP'] = pd.to_numeric(us_fuel_dataset['ULSP'])
forex_dataset["Date"] = pd.to_datetime(forex_dataset["Date"])

In [None]:
# Our columns are converted correctly
us_fuel_dataset.dtypes


In [None]:
forex_dataset.dtypes

In [None]:
us_fuel_dataset["ULSP"].plot()

In [None]:
forex_dataset.set_index("Date", inplace=True)
us_fuel_dataset.set_index("Date", inplace=True)
forex_dataset["date"] = forex_dataset.index
us_fuel_dataset["Date"] = us_fuel_dataset.index

In [None]:
us_fuel_dataset["week"] = us_fuel_dataset.index.to_period('W')
forex_dataset["week"] = forex_dataset.index.to_period('W')

In [None]:
forex_dataset.head(15)

In [None]:
dataset = pd.merge(
    forex_dataset,
    us_fuel_dataset,
    left_on="week",
    right_on="week",
    how="left"
)

In [None]:
dataset.head(10)

# Sample Features - Go Over


In [None]:
# Look back last 4 days
# X variables/ features that we care about
# Y variable/ output/ what we want to predict
window_size=12
num_samples=len(dataset)-window_size

# Get indices of access for the data
indices=np.arange(num_samples).astype(np.int)[:,None]+np.arange(window_size+1).astype(np.int)

data = dataset["Price"].values[indices] # Create the 2D matrix of training samples
x = data[:,:-1] # Each row represents 12 days in the past
y = data[:,-1] # Each output value represents the 13rd day

In [None]:
# not data frame anymore now it is a list
# forex_dataset.head() of it
x

In [None]:
y

In [None]:
# 80% of my data
split_fraction=0.8
ind_split=int(split_fraction*num_samples)

In [None]:
# x features and y features
# test and train
x_train = x[:ind_split]
y_train = y[:ind_split]
x_test = x[ind_split:]
y_test = y[ind_split:]

In [None]:
x_train.shape, x_test.shape

In [None]:
y_train.shape, y_test.shape

# Modelling

In [None]:
# Help Functions
# Need to consider the trade offs between the models you use
def get_performance (model_pred):
  #Function returns standard performance metrics
  print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, model_pred).round(4))  
  print('Mean Squared Error:', metrics.mean_squared_error(y_test, model_pred).round(4))  
  print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, model_pred)).round(4))
  
  
  
def get_plot (model_pred):
  plt.scatter(model_pred, y_test, color="gray")
  plt.plot(y_test, y_test, color='red', linewidth=2)

# Baseline

In [None]:
# let your prediction be the most recent observation
# looking at the one day previous 
# np.roll-> similar to shift for a dataframe
y_pred_lag=np.roll(y_test,1)

In [None]:
# np.roll(y,1)

In [None]:
# any has to be better than this
# performance reltive to that very simlpe approach
get_performance(y_pred_lag)

In [None]:
# Why cant I print dates on the bottom of my graph?
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from matplotlib import dates as mpl_dates
plt.style.use('seaborn')
from matplotlib import rcParams
rcParams['figure.figsize'] = 15,6

get_plot(y_pred_lag)
# plt.plot(y_pred_gb)
plt.title('Baseline Model of USD/GBP & US Gas Prices')
plt.xlabel('Date')
plt.ylabel('Price of USD/GBP & Gas Prices')
# dates = np.array(dateFormatting['Date'])
# plt.plot_date(dates,y_pred_gb, linestyle ='solid')

# Linear Regression

In [None]:
# assigning the linear regression model
model_lr=LinearRegression()
# fitting the linear regression model to xtrain, xytrain
model_lr.fit(x_train, y_train)

In [None]:
#help
# ?model_lr.fit()

In [None]:
# x variable and predict the y 
y_pred_lr=model_lr.predict(x_test)

In [None]:
# check how they compare to the baseline
# not a big difference at all only in root mean
# not great
get_performance(y_pred_lr)

In [None]:
# Why cant I print dates on the bottom of my graph?
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from matplotlib import dates as mpl_dates
plt.style.use('seaborn')
from matplotlib import rcParams
rcParams['figure.figsize'] = 15,6

get_plot(y_pred_lr)
# plt.plot(y_pred_gb)
plt.title('Linear Regression Model USD/GBP & US Gas Prices')
plt.xlabel('Date')
plt.ylabel('Price')
# dates = np.array(dateFormatting['Date'])
# plt.plot_date(dates,y_pred_gb, linestyle ='solid')

In [None]:
get_performance(y_pred_lr)

In [None]:
# this should list our 12 co-efficients for all of our variables
# most recent day: 1.18024445
# as you can see other values are - and small, this is why the baseline 
# is so good because the previous day is the msot important value
# change is marginal
# only focus on the the last one
# problem: only print the last output
model_lr.coef_



```
# This is formatted as code
```

# Ridge Regression

In [None]:
model_ridge = Ridge()
model_ridge.fit(x_train, y_train)

#generate predictions
y_pred_ridge=model_ridge.predict(x_test)

In [None]:
# look at difference in baseline and ridge regression
# trying to figure out which one of the features is the most prominent
get_performance(y_pred_ridge)

In [None]:
# Why cant I print dates on the bottom of my graph?
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from matplotlib import dates as mpl_dates
plt.style.use('seaborn')
from matplotlib import rcParams
rcParams['figure.figsize'] = 15,6

get_plot(y_pred_ridge)
# plt.plot(y_pred_gb)
plt.title('Ridge Regression Model of USD/GBP & US Gas Prices')
plt.xlabel('Date')
plt.ylabel('Price')
# dates = np.array(dateFormatting['Date'])
# plt.plot_date(dates,y_pred_gb, linestyle ='solid')

In [None]:
# this should list our 12 co-efficients for all of our variables
# most recent day: 0.09432427
# change is marginal
# only focus on the the last one
# problem: only print the last output
# limit the number of days your looking back
# what is the trend: higher/lower?
# keep same features but apply different features
# k folds cross validation? in terms of splitting the data
model_ridge.coef_

# Gradient Boosting Trees

In [None]:
# Model #2 - Gradient Boosting Trees
# not linear, tree based
model_gb = GradientBoostingRegressor()
model_gb.fit(x_train, y_train)

# Infer
y_pred_gb = model_gb.predict(x_test)

In [None]:
get_performance(y_pred_gb)

In [None]:
# Why cant I print dates on the bottom of my graph?
from datetime import datetime, timedelta
from matplotlib import pyplot as plt
from matplotlib import dates as mpl_dates
plt.style.use('seaborn')
from matplotlib import rcParams
rcParams['figure.figsize'] = 15,6

get_plot(y_pred_gb)
# plt.plot(y_pred_gb)
plt.title('Gradient Boosting Model of USD/GBP & US Gas Prices')
plt.xlabel('Date')
plt.ylabel('Price')
# dates = np.array(dateFormatting['Date'])
# plt.plot_date(dates,y_pred_gb, linestyle ='solid')

# Comparison

In [None]:
# absolute -> only have positives
df_comp=pd.DataFrame({"lag":np.absolute(y_test-y_pred_lag), 
              "lr":np.absolute(y_test-y_pred_lr), 
              "ridge":np.absolute(y_test-y_pred_ridge),
              "gb":np.absolute(y_test-y_pred_gb)})

In [None]:
df_comp.head()

In [None]:
# go over again 51:22
y_pred_lag

In [None]:
#
df_comp.plot.bar(figsize=(16, 6))
# show me days 0 to 10
plt.xlim(11,20)
# unsure why this line doesnt work
# plt.ylim(0,10)

# Conclusion and next steps

Explored 4 different algorithms, ranging from simple to complex:


*   Lag (Previous day's price)
*   Linear Regression 
*   Ridge Regression
*   Gradient Boosting Trees

In conclusion it seems that Linear Regression gives us the best performance with the least complexity. Next steps might include adding additional features such as trend and recent volatility (go over).


