Author: Ihsaan Malek, Fall 2020

In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import math
from datetime import datetime, timedelta

In [2]:
'''
RSI = 100 - 100 / (1 + RS)
RS = Relative Strength = Avg_gain / Avg_loss
Avg_gain = average of all up moves over the last N price changes
Avg_loss = average of all down moves over the last N price changes
N = the window size of the RSI
'''
# Adapted from a Stack Overflow answer. TODO: add the attribution link.
def average_gainloss(series, window_size, average):
    '''
    Smooth a series of gains (or losses) with the recurrence

        out[0] = average
        out[k] = out[k-1] * (window_size - 1) / window_size + series[k-1] / window_size

    series      : 1-D array-like of values to smooth
    window_size : smoothing window N (must be non-zero)
    average     : seed value placed at position 0 of the output

    Returns a float array of length len(series) + 1.

    The recurrence is evaluated step by step instead of through the closed
    form cumsum(a**k * x) / a**k: the closed form divides by powers of
    (window_size - 1) / window_size that underflow to zero on long series
    and turn the result into NaN.
    '''
    decay = (window_size - 1) / window_size
    values = np.asarray(series, dtype=float)

    smoothed = np.empty(values.shape[0] + 1, dtype=float)
    smoothed[0] = average
    for k, value in enumerate(values, start=1):
        smoothed[k] = decay * smoothed[k - 1] + value / window_size
    return smoothed


def rsi(df, window_size = 14):
    '''
    Add RSI columns to df, computed from df['close'], and return df.

    df is modified in place. The columns written are, in order:
    'change', 'gain', 'loss', 'avg_gain', 'avg_loss', 'rs', 'rsi'.

    df          : DataFrame with a 'close' column
    window_size : RSI window N

    'avg_gain' / 'avg_loss' are NaN for the first window_size rows. The value
    at row position window_size is the mean of the first window_size
    gains/losses, and later rows are smoothed by average_gainloss.
    If df has window_size rows or fewer, the averages (and so 'rs' and 'rsi')
    are NaN everywhere.

    All slicing is positional, so the result does not depend on df's index
    labels (a date index or a non-zero-based index works the same way as the
    default 0..n-1 index).
    '''
    change = df['close'].diff()
    gain = change.mask(change < 0, 0.0)
    loss = -change.mask(change > 0, -0.0)  # losses are stored as positive numbers

    n_rows = len(df)
    avg_gain = np.full(n_rows, np.nan)
    avg_loss = np.full(n_rows, np.nan)

    if n_rows > window_size:
        # Row 0 of gain/loss is NaN (diff has no predecessor) and is skipped by
        # mean(), so each seed is the mean over exactly window_size changes.
        seed_gain = gain.iloc[:window_size + 1].mean()
        seed_loss = loss.iloc[:window_size + 1].mean()
        avg_gain[window_size:] = average_gainloss(gain.iloc[window_size + 1:].to_numpy(), window_size, seed_gain)
        avg_loss[window_size:] = average_gainloss(loss.iloc[window_size + 1:].to_numpy(), window_size, seed_loss)

    df['change'] = change
    df['gain'] = gain
    df['loss'] = loss
    df['avg_gain'] = avg_gain
    df['avg_loss'] = avg_loss
    df['rs'] = df['avg_gain'] / df['avg_loss']
    df['rsi'] = 100 - (100 / (1 + df['rs']))

    return df
   

In [5]:
def mulitvariate_preprocessing(num_features,time_steps, df, num_predictions, sampling_method, sampling_window = 1):
    '''
    Cut df into fixed-length input windows and shifted target windows.

    Ex:
    num_predictions = 2
    input : t1,t2,t3,t4,t5
    target : t3,t4,t5,t6,t7
    predictions = t6,t7

    num_features    : number of leading columns of df used as input channels
    time_steps      : length of every input and target window
    df              : DataFrame; column 0 is the series being predicted
    num_predictions : how many rows the target window is shifted ahead of its input window
    sampling_method : 'sliding_window' moves the window start by sampling_window rows per sample;
                      any other value jumps by time_steps, i.e. [0,time_steps],[time_steps,2*time_steps] etc.
    sampling_window : stride used by 'sliding_window' (ignored when jumping)

    Returns (X, y), both float32:
        X : (n_samples, num_features, time_steps)
        y : (n_samples, 1, time_steps), taken from column 0

    Only samples whose input AND target windows fit completely inside df are
    produced, so X and y always have the same number of samples; trailing rows
    that cannot fill a window are dropped.

    Raises IndexError if df has fewer than num_features columns, and ValueError
    if the sizes are invalid or df is too short for a single sample.
    '''
    n_rows = df.shape[0]

    if num_features < 1 or time_steps < 1 or num_predictions < 0:
        raise ValueError('num_features and time_steps must be >= 1 and num_predictions must be >= 0')
    if num_features > df.shape[1]:
        raise IndexError('df has fewer columns than num_features')

    # Distance between the starts of two consecutive samples.
    stride = sampling_window if sampling_method == 'sliding_window' else time_steps
    if stride < 1:
        raise ValueError('sampling_window must be >= 1')

    # Last row position at which an input window can start while its shifted
    # target window still ends inside df.
    last_start = n_rows - num_predictions - time_steps
    if last_start < 0:
        raise ValueError('df is too short: need at least time_steps + num_predictions rows')

    n_samples = last_start // stride + 1
    starts = np.arange(n_samples) * stride

    # window_rows[s, t] = row position of time step t in sample s.
    window_rows = starts[:, None] + np.arange(time_steps)

    values = df.iloc[:, :num_features].to_numpy().astype('float32')  # (rows, features)

    # Gather gives (samples, time, features); swap to (samples, features, time).
    x_windows = np.ascontiguousarray(values[window_rows].transpose(0, 2, 1))

    # Targets are column 0, shifted num_predictions rows ahead of the inputs.
    y_train = values[window_rows + num_predictions, 0][:, None, :]

    print('Finish resampling')

    return x_windows, y_train

In [6]:
# Any forecast_size > 1 is multi-step / multi-horizon forecasting. Because of the nature
# of the data and its volatility this can lead to inaccurate predictions, so a recursive
# forecasting method is also needed for the forecast_size = 5 case.

window_size = 14            # RSI window
time_steps= 20              # input length
forecast_size = 5           # shift in y target as well
np.set_printoptions(suppress=True)
training_sample_size = 500
sampling_Style = 'sliding_window' # sliding_window or jump

df=pd.read_csv('SPY.csv')

# Explicit copy: columns are added below, and assigning onto a slice of df
# triggers pandas' SettingWithCopyWarning.
sample = df[['date','close']][:600].copy() #need to ensure extra data for y-target series forecasting
sample['date']= pd.to_datetime(sample['date'], format ='%Y-%m-%d %H:%M:%S' ) #convert string to date time
sample['dayofweek']= sample['date'].dt.dayofweek
sample['std'] = sample['close'].pct_change()*100  # percentage change of close between consecutive rows

sample_rsi = rsi(sample, window_size)  # rsi() adds its columns to `sample` in place and returns it

sample_rsi = sample_rsi.set_index('date')
# Drop the first window_size rows plus enough extra rows that the number of
# remaining rows is a multiple of time_steps.
start_index =((sample.shape[0]-window_size)%time_steps + window_size)
sample_rsi = sample_rsi.iloc[start_index:]
sample_rsi = sample_rsi[['close','dayofweek','rsi','std']]  # predicting feature should be in column index 0

num_features = sample_rsi.shape[1]

# Each split gets forecast_size extra rows so the shifted targets fit inside it.
training_cutoff = training_sample_size + forecast_size
validation_cutoff = time_steps + training_cutoff + forecast_size
testing_cutoff = time_steps + validation_cutoff  + forecast_size

train = sample_rsi.iloc[:training_cutoff]
val = sample_rsi.iloc[training_cutoff:validation_cutoff]
testing = sample_rsi.iloc[validation_cutoff:testing_cutoff]

X_train, y_train = mulitvariate_preprocessing(num_features,time_steps,train,forecast_size,sampling_Style)

X_val, y_val = mulitvariate_preprocessing(num_features,time_steps,val, forecast_size,sampling_Style)

print(X_train.shape, y_train.shape)
print(X_val.shape, y_val.shape)

Finish resampling
Finish resampling
(481, 4, 20) (481, 1, 20)
(1, 4, 20) (1, 1, 20)


In [7]:
#Model Definition

In [8]:
#model Training

In [9]:
# Example of model validation.
# NOTE(review): `model` and `loss` are not defined anywhere in the visible notebook
# (the model-definition and training cells are empty), so this cell fails on a fresh kernel.
model.eval()
y_pred_Test = model(X_val)

# Flatten the whole model output, then keep its last forecast_size entries:
# those are the forecasted (shifted) part of y.
y_pred_vals = y_pred_Test.detach().numpy().reshape(-1)
y_forecasted = y_pred_vals[y_pred_vals.shape[0]-forecast_size:]

# Actual closes for the same final forecast_size rows of the validation split.
y_real_Test = val['close'].tail(forecast_size)

plt.plot(y_real_Test.index, y_real_Test, label = 'Actual')
plt.plot(y_real_Test.index, y_forecasted, label = 'Predicted')
plt.legend()

val_loss = loss(y_pred_Test, y_val)
print('validation loss: ', val_loss)

NameError: name 'model' is not defined