# Univariate results, basic feature engineering

In this notebook the following approaches are implemented:
- no feature engineering
- minmax scaling
- differencing
- differencing + log transforms

In [None]:
from one_dimensional_time_series_forecasting import time_series_prediction, hit_rate
import pandas as pd

# model evaluation metrics
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error

# data preprocessing
from sklearn.preprocessing import normalize
from sklearn.preprocessing import MinMaxScaler

# interactive figures
%matplotlib widget 

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

In [None]:
import os
import sys
import warnings

# Silence warnings unless the interpreter was started with explicit -W options.
# The filter covers this kernel; PYTHONWARNINGS is exported as well.
if len(sys.warnoptions) == 0:
    os.environ["PYTHONWARNINGS"] = "ignore"
    warnings.simplefilter("ignore")

# 1. Baseline: no feature engineering

## 1.1 S&P 500

In [None]:
# Load the S&P 500 price history and keep only the most recent 2000 rows (days).
df = (
    pd.read_csv('./test_data/S&P500_yfinance.csv')
    .iloc[-2000:, :]
    .reset_index(drop=True)
    .drop(columns='Adj Close')  # column is not used in this notebook section
)

# Plot the closing price against the date column. The x axis is labelled 'Date'
# because the rows are daily observations, not months.
df.plot(x='Date', y='Close', figsize=(10, 5), legend=True, xlabel='Date', subplots=True)
plt.tight_layout()
display(df)

In [None]:
# Forecasting parameters: lag window length and the `split` value that is used
# both for scaling below and for train_test_split.
window_length = 10
split = 500

# Min-max scale the closing price. The scaler is fitted on everything except
# the last `split` rows and then applied unchanged to those last rows.
# NOTE(review): this section is titled "no feature engineering" although the
# input is min-max scaled (run name 'sp500_minmax') - confirm which is intended.
close_prices = df['Close']
scaler = MinMaxScaler()
scaled_training_data = scaler.fit_transform(
    close_prices[:-split].to_numpy().reshape(-1, 1)
).flatten()
scaled_test_data = scaler.transform(
    close_prices[-split:].to_numpy().reshape(-1, 1)
).flatten()
scaled_input_data = np.append(scaled_training_data, scaled_test_data)

# Build the forecasting object. Arguments after the run name: time series
# dates, univariate time series, lag window length, number of steps ahead.
blah = time_series_prediction('sp500_minmax', df['Date'], scaled_input_data, window_length, 1)
blah.sliding_window_1(verbose=0)             # recast the series as a supervised ML problem
blah.train_test_split(split=split)           # training / testing split
blah.test_train_plot(ylabel='Close price')   # visualize the split

# Run each forecasting model.
blah.linear_regression()
blah.support_vector_machine(model_tunning=True)
blah.neural_net_mlp(model_tunning=True)
blah.lstm(model_tunning=True, verbose=0)
blah.naive_model()

# Tabulate the results, plot them with the error as second panel, then show
# the final summary.
blah.collect_results()
blah.vis_results_time_series(ylabel='Close price', second_plot='error')
blah.conclusion()

### directional accuracy

In [None]:
# Directional accuracy: how accurate are the predicted price movements of
# each model?

# Rows handed to hit_rate.
# NOTE(review): this keeps every row from position split + window_length
# onward, whereas the scaling cell treats the LAST `split` rows as the test
# segment - confirm this slice selects the intended rows.
first_row = split + window_length
dates = blah.results['date'].iloc[first_row:]
original_values = blah.results['Value'].iloc[first_row:]
lin_predictions = blah.results['Linear'].iloc[first_row:]
svm_predictions = blah.results['SVM'].iloc[first_row:]
nn_predictions = blah.results['NN'].iloc[first_row:]
naive_predictions = blah.results['Naive'].iloc[first_row:]
lstm_predictions = blah.results['LSTM'].iloc[first_row:]

# hit_rate's result is unpacked into (frame, accuracy) for every model.
# `accuracy` is reused, so after this cell it holds the LSTM value.
print('Linear Regression:')
df_lin, accuracy = hit_rate(dates, original_values, lin_predictions)

print('SVM:')
df_svm, accuracy = hit_rate(dates, original_values, svm_predictions)

print('NN:')
df_nn, accuracy = hit_rate(dates, original_values, nn_predictions)

# Previously assigned (without unpacking) to the misspelled name `df_naivev`.
print('Naive:')
df_naive, accuracy = hit_rate(dates, original_values, naive_predictions)

# Previously stored in `df_naive`, so the LSTM frame sat under the naive name.
print('LSTM:')
df_lstm, accuracy = hit_rate(dates, original_values, lstm_predictions)

### walk forward validation results

In [None]:
# Walk-forward validation of the linear regression model.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'LinearReg',
    blah.linear_regression_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation of the support vector regression model.
# The run label is 'SVR' to match the estimator passed in; it was previously
# 'MLP', a copy-paste leftover that mislabelled this run.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'SVR',
    blah.svr_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation of the MLP model.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'MLP',
    blah.mlp_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation labelled 'LSTM'.
# NOTE(review): the estimator passed here is blah.mlp_model, the same object
# the MLP cell uses, so this run is an MLP under an LSTM label. Confirm which
# estimator was intended and pass it instead.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'LSTM',
    blah.mlp_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

## 1.2 Gold price

In [None]:
# Load the gold price history and keep only the most recent 2000 rows (days).
df = (
    pd.read_csv('./test_data/Gold_yfinance.csv')
    .iloc[-2000:, :]
    .reset_index(drop=True)
    .drop(columns='Adj Close')  # column is not used in this notebook section
)

# Plot the closing price against the date column. The x axis is labelled 'Date'
# because the rows are daily observations, not months.
df.plot(x='Date', y='Close', figsize=(10, 5), legend=True, xlabel='Date', subplots=True)
plt.tight_layout()
display(df)

In [None]:
# Forecasting parameters: lag window length and the `split` value that is used
# both for scaling below and for train_test_split.
window_length = 10
split = 500

# Min-max scale the closing price. The scaler is fitted on everything except
# the last `split` rows and then applied unchanged to those last rows.
scaler = MinMaxScaler()
scaled_training_data = scaler.fit_transform(df['Close'][0:-split].to_numpy().reshape(-1, 1)).flatten()
scaled_test_data = scaler.transform(df['Close'][-split:].to_numpy().reshape(-1, 1)).flatten()
scaled_input_data = np.append(scaled_training_data, scaled_test_data)

# Build the forecasting object. The leading run name mirrors the S&P 500 call
# ('sp500_minmax'); it was missing here. Remaining arguments: time series
# dates, univariate time series, lag window length, number of steps ahead.
blah = time_series_prediction('gold_minmax', df['Date'], scaled_input_data, window_length, 1)
blah.sliding_window_1(verbose=0)             # recast the series as a supervised ML problem
blah.train_test_split(split=split)           # training / testing split
blah.test_train_plot(ylabel='Close price')   # visualize the split

# Run each forecasting model.
blah.linear_regression()
blah.support_vector_machine(model_tunning=True)
blah.neural_net_mlp(model_tunning=True)
blah.lstm(model_tunning=True, verbose=0)
blah.naive_model()

# Tabulate first, then plot once. The extra plot call that ran before
# collect_results() has been removed, matching the S&P 500 cell.
blah.collect_results()
blah.vis_results_time_series(ylabel='Close price', second_plot='error')

# Final summary.
blah.conclusion()

In [None]:
# Directional accuracy: how accurate are the predicted price movements of
# each model?

# Rows handed to hit_rate, read from blah.results. The previous source,
# `tabulated_results_0`, is never defined in this notebook.
# NOTE(review): this keeps every row from position split + window_length
# onward, whereas the scaling cell treats the LAST `split` rows as the test
# segment - confirm this slice selects the intended rows.
first_row = split + window_length
dates = blah.results['date'].iloc[first_row:]
original_values = blah.results['Value'].iloc[first_row:]
lin_predictions = blah.results['Linear'].iloc[first_row:]
svm_predictions = blah.results['SVM'].iloc[first_row:]
nn_predictions = blah.results['NN'].iloc[first_row:]
naive_predictions = blah.results['Naive'].iloc[first_row:]
lstm_predictions = blah.results['LSTM'].iloc[first_row:]

# hit_rate's result is unpacked into (frame, accuracy), as in the S&P 500
# section. `accuracy` is reused, so after this cell it holds the LSTM value.
print('Linear Regression:')
df_lin, accuracy = hit_rate(dates, original_values, lin_predictions)

print('SVM:')
df_svm, accuracy = hit_rate(dates, original_values, svm_predictions)

print('NN:')
df_nn, accuracy = hit_rate(dates, original_values, nn_predictions)

print('Naive:')
df_naive, accuracy = hit_rate(dates, original_values, naive_predictions)

# LSTM is fitted in the model cell of this section, so it is scored too.
print('LSTM:')
df_lstm, accuracy = hit_rate(dates, original_values, lstm_predictions)

In [None]:
# Walk-forward validation of the linear regression model.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'LinearReg',
    blah.linear_regression_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation of the support vector regression model.
# The run label is 'SVR' to match the estimator passed in; it was previously
# 'MLP', a copy-paste leftover that mislabelled this run.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'SVR',
    blah.svr_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation of the MLP model.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'MLP',
    blah.mlp_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation labelled 'LSTM'.
# NOTE(review): the estimator passed here is blah.mlp_model, the same object
# the MLP cell uses, so this run is an MLP under an LSTM label. Confirm which
# estimator was intended and pass it instead.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'LSTM',
    blah.mlp_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

## 1.3 USD / EUR exchange rate

In [None]:
# Load the EUR/USD exchange rate history and keep only the most recent 2000
# rows (days).
df = (
    pd.read_csv('./test_data/EurUsd_yfinance.csv')
    .iloc[-2000:, :]
    .reset_index(drop=True)
    .drop(columns='Adj Close')  # column is not used in this notebook section
)

# Plot the closing price against the date column. The x axis is labelled 'Date'
# because the rows are daily observations, not months.
df.plot(x='Date', y='Close', figsize=(10, 5), legend=True, xlabel='Date', subplots=True)
plt.tight_layout()
display(df)

In [None]:
# Forecasting parameters: lag window length and the `split` value that is used
# both for scaling below and for train_test_split.
window_length = 10
split = 500

# Min-max scale the closing price. The scaler is fitted on everything except
# the last `split` rows and then applied unchanged to those last rows.
scaler = MinMaxScaler()
scaled_training_data = scaler.fit_transform(df['Close'][0:-split].to_numpy().reshape(-1, 1)).flatten()
scaled_test_data = scaler.transform(df['Close'][-split:].to_numpy().reshape(-1, 1)).flatten()
scaled_input_data = np.append(scaled_training_data, scaled_test_data)

# Build the forecasting object from the scaled series computed above. The call
# previously passed `input_data`, which is never defined in this notebook.
# The leading run name mirrors the S&P 500 call ('sp500_minmax'); it was
# missing here. Remaining arguments: time series dates, univariate time
# series, lag window length, number of steps ahead.
blah = time_series_prediction('eurusd_minmax', df['Date'], scaled_input_data, window_length, 1)
blah.sliding_window_1(verbose=0)             # recast the series as a supervised ML problem
blah.train_test_split(split=split)           # training / testing split
blah.test_train_plot(ylabel='Close price')   # visualize the split

# Run each forecasting model.
blah.linear_regression()
blah.support_vector_machine(model_tunning=True)
blah.neural_net_mlp(model_tunning=True)
blah.lstm(model_tunning=True, verbose=0)
blah.naive_model()

# Tabulate first, then plot once. The extra plot call that ran before
# collect_results() has been removed, matching the S&P 500 cell.
blah.collect_results()
blah.vis_results_time_series(ylabel='Close price', second_plot='error')

# Final summary.
blah.conclusion()

In [None]:
# Directional accuracy: how accurate are the predicted price movements of
# each model?

# Rows handed to hit_rate, read from blah.results. The previous source,
# `tabulated_results_0`, is never defined in this notebook.
# NOTE(review): this keeps every row from position split + window_length
# onward, whereas the scaling cell treats the LAST `split` rows as the test
# segment - confirm this slice selects the intended rows.
first_row = split + window_length
dates = blah.results['date'].iloc[first_row:]
original_values = blah.results['Value'].iloc[first_row:]
lin_predictions = blah.results['Linear'].iloc[first_row:]
svm_predictions = blah.results['SVM'].iloc[first_row:]
nn_predictions = blah.results['NN'].iloc[first_row:]
naive_predictions = blah.results['Naive'].iloc[first_row:]
lstm_predictions = blah.results['LSTM'].iloc[first_row:]

# hit_rate's result is unpacked into (frame, accuracy), as in the S&P 500
# section. `accuracy` is reused, so after this cell it holds the LSTM value.
print('Linear Regression:')
df_lin, accuracy = hit_rate(dates, original_values, lin_predictions)

print('SVM:')
df_svm, accuracy = hit_rate(dates, original_values, svm_predictions)

print('NN:')
df_nn, accuracy = hit_rate(dates, original_values, nn_predictions)

print('Naive:')
df_naive, accuracy = hit_rate(dates, original_values, naive_predictions)

# LSTM is fitted in the model cell of this section, so it is scored too.
print('LSTM:')
df_lstm, accuracy = hit_rate(dates, original_values, lstm_predictions)

In [None]:
# Walk-forward validation of the linear regression model.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'LinearReg',
    blah.linear_regression_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation of the support vector regression model.
# The run label is 'SVR' to match the estimator passed in; it was previously
# 'MLP', a copy-paste leftover that mislabelled this run.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'SVR',
    blah.svr_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation of the MLP model.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'MLP',
    blah.mlp_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation labelled 'LSTM'.
# NOTE(review): the estimator passed here is blah.mlp_model, the same object
# the MLP cell uses, so this run is an MLP under an LSTM label. Confirm which
# estimator was intended and pass it instead.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'LSTM',
    blah.mlp_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

## 1.4 Crude Oil price

In [None]:
# Load the crude oil price history and keep only the most recent 2000 rows
# (days).
df = (
    pd.read_csv('./test_data/CrudeOil_yfinance.csv')
    .iloc[-2000:, :]
    .reset_index(drop=True)
    .drop(columns='Adj Close')  # column is not used in this notebook section
)

# Plot the closing price against the date column. The x axis is labelled 'Date'
# because the rows are daily observations, not months.
df.plot(x='Date', y='Close', figsize=(10, 5), legend=True, xlabel='Date', subplots=True)
plt.tight_layout()
display(df)

In [None]:
# Forecasting parameters: lag window length and the `split` value that is used
# both for scaling below and for train_test_split.
window_length = 10
split = 500

# Min-max scale the closing price. The scaler is fitted on everything except
# the last `split` rows and then applied unchanged to those last rows.
scaler = MinMaxScaler()
scaled_training_data = scaler.fit_transform(df['Close'][0:-split].to_numpy().reshape(-1, 1)).flatten()
scaled_test_data = scaler.transform(df['Close'][-split:].to_numpy().reshape(-1, 1)).flatten()
scaled_input_data = np.append(scaled_training_data, scaled_test_data)

# Build the forecasting object from the scaled series computed above. The call
# previously passed `input_data`, which is never defined in this notebook.
# The leading run name mirrors the S&P 500 call ('sp500_minmax'); it was
# missing here. Remaining arguments: time series dates, univariate time
# series, lag window length, number of steps ahead.
blah = time_series_prediction('crudeoil_minmax', df['Date'], scaled_input_data, window_length, 1)
blah.sliding_window_1(verbose=0)             # recast the series as a supervised ML problem
blah.train_test_split(split=split)           # training / testing split
blah.test_train_plot(ylabel='Close price')   # visualize the split

# Run each forecasting model.
blah.linear_regression()
blah.support_vector_machine(model_tunning=True)
blah.neural_net_mlp(model_tunning=True)
blah.lstm(model_tunning=True, verbose=0)
blah.naive_model()

# Tabulate first, then plot once. The extra plot call that ran before
# collect_results() has been removed, matching the S&P 500 cell.
blah.collect_results()
blah.vis_results_time_series(ylabel='Close price', second_plot='error')

# Final summary.
blah.conclusion()

In [None]:
# Directional accuracy: how accurate are the predicted price movements of
# each model?

# Rows handed to hit_rate, read from blah.results. The previous source,
# `tabulated_results_0`, is never defined in this notebook.
# NOTE(review): this keeps every row from position split + window_length
# onward, whereas the scaling cell treats the LAST `split` rows as the test
# segment - confirm this slice selects the intended rows.
first_row = split + window_length
dates = blah.results['date'].iloc[first_row:]
original_values = blah.results['Value'].iloc[first_row:]
lin_predictions = blah.results['Linear'].iloc[first_row:]
svm_predictions = blah.results['SVM'].iloc[first_row:]
nn_predictions = blah.results['NN'].iloc[first_row:]
naive_predictions = blah.results['Naive'].iloc[first_row:]
lstm_predictions = blah.results['LSTM'].iloc[first_row:]

# hit_rate's result is unpacked into (frame, accuracy), as in the S&P 500
# section. `accuracy` is reused, so after this cell it holds the LSTM value.
print('Linear Regression:')
df_lin, accuracy = hit_rate(dates, original_values, lin_predictions)

print('SVM:')
df_svm, accuracy = hit_rate(dates, original_values, svm_predictions)

print('NN:')
df_nn, accuracy = hit_rate(dates, original_values, nn_predictions)

print('Naive:')
df_naive, accuracy = hit_rate(dates, original_values, naive_predictions)

# LSTM is fitted in the model cell of this section, so it is scored too.
print('LSTM:')
df_lstm, accuracy = hit_rate(dates, original_values, lstm_predictions)

In [None]:
# Walk-forward validation of the linear regression model.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'LinearReg',
    blah.linear_regression_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation of the support vector regression model.
# The run label is 'SVR' to match the estimator passed in; it was previously
# 'MLP', a copy-paste leftover that mislabelled this run.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'SVR',
    blah.svr_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation of the MLP model.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'MLP',
    blah.mlp_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)

In [None]:
# Walk-forward validation labelled 'LSTM'.
# NOTE(review): the estimator passed here is blah.mlp_model, the same object
# the MLP cell uses, so this run is an MLP under an LSTM label. Confirm which
# estimator was intended and pass it instead.
# Note: the second return value is bound to `df`, which rebinds the name used
# for the loaded price table.
df_walk_forward, df = blah.walk_forward_val(
    'LSTM',
    blah.mlp_model,
    train_len=225,
    test_len=25,
    train_frequency=10,
)

# Absolute (not relative) prediction error, plotted beneath the real values.
df_walk_forward['error'] = (df_walk_forward['real_value'] - df_walk_forward['prediction']).abs()
df_walk_forward.plot(x='date', y=['real_value', 'error'], subplots=True)