# Итоговая модель прогнозирования stock

In [1]:
sym = 'SBERP'

In [2]:
import torch
from torch import nn, optim
from torch.utils.data import Dataset, DataLoader, Subset

import numpy as np
import pandas as pd
import pandas_ta as ta

import seaborn as sns
sns.set_style("darkgrid")

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams[
    "figure.facecolor"
] = "w"  # force white background on plots when using dark mode in JupyterLab

import plotly.express as px
import plotly.graph_objects as go

import sklearn
from sklearn import metrics
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error

import datetime
from datetime import datetime, date, timedelta
import time

import random
import math

from tqdm import tqdm

import warnings
warnings.filterwarnings('ignore')

In [3]:
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'
DEVICE 

'cpu'

In [4]:
data = pd.read_csv(f'{sym}_day.csv', index_col=0, parse_dates=True)

In [5]:
# функция по созданию сгенерированных признаков
def make_features(data):
    data['year'] = data.index.year
    data['month'] = data.index.month
    data['day'] = data.index.day
    data['dayofweek'] = data.index.dayofweek
    data['is_weekend'] = data.index.isin([5,6])*1
    
    return data

In [6]:
data = make_features(data)

In [7]:
# Adding indicators
data['RSI'] = ta.rsi(data.CLOSE, length=15)
data['EMAF'] = ta.ema(data.CLOSE, length=20)
data['EMAM'] = ta.ema(data.CLOSE, length=100)
data['EMAS'] = ta.ema(data.CLOSE, length=150)
data['VWAP'] = ta.vwap(data.HIGH, data.LOW, data.CLOSE, data.VOL, anchor = "D")

In [8]:
data['Target'] = data['CLOSE'] - data.OPEN
data['Target'] = data['Target'].shift(-1)

In [9]:
data.dropna(inplace=True)
data.reset_index(inplace=True)
data.drop(['VOL', 'TICKER', 'PER', 'TIME' ], axis=1, inplace=True)
data_set = data.iloc[:, 0:13]

In [10]:
sc = MinMaxScaler(feature_range=(-1, 1))
data_scaled = sc.fit_transform(data_set.iloc[:, 1:])

In [11]:
class SeriesDataset(Dataset):
                            def __init__(self, data, window_size):
                                    self.data = data  
                                    self.window_size = window_size
                    
                                    self.y = []
                                    self.sequence = []
                                
                                    for j in range(self.data.shape[1]):
                                                    self.sequence.append([])
                                                    self.y.append([])
                                            
                                                    for i in range(self.data.shape[0] - self.window_size):
                                                            self.sequence[j].append(self.data[i : i + self.window_size, j])
                                                            self.y[j].append(self.data[i + 1 : i + self.window_size + 1, j])
                                  
                                    
                                    
                                    
                                    self.sequence = np.array(np.moveaxis(self.sequence, [0], [2]))
                                    self.y = np.array(np.moveaxis(self.y, [0], [2]))

                                    

                        
                            def __len__(self):
                                                return len(self.data) - self.window_size

                            def __getitem__(self, idx):
                                                sequence = self.sequence[idx]
                                                label = self.y[idx]
                                                pred = self.y[idx][-1, 3]
                                                                                                                                         
                                                sequence = torch.tensor(sequence).float()
                                                label = torch.tensor(label).float()
                                                pred = torch.tensor(pred).float()
                     
                                        
                                                return sequence, label, pred

In [12]:
class My_Model_GRU(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, bidirectional=False, method='GRU', dropout_p = 0.2):
        super(My_Model_GRU, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size * 2 if bidirectional else hidden_size
        self.hidden_var = hidden_size if bidirectional else hidden_size // 2
        self.bidirectional = bidirectional
        self.output_size = output_size
        self.num_layers = num_layers
        self.relu = nn.ReLU()
        self.do = nn.Dropout(dropout_p)
        self.method = method
        if self.method == 'GRU':
                self.net = nn.GRU(input_size=input_size, hidden_size=hidden_size,
                                                num_layers=num_layers, batch_first=True, bidirectional=self.bidirectional)
        elif self.method == 'LSTM':
                self.net = nn.LSTM(input_size=input_size, hidden_size=hidden_size,
                                                num_layers=num_layers, batch_first=True, bidirectional=self.bidirectional)
        self.linear1 = nn.Linear(self.hidden_size, self.hidden_var)
        self.linear2 = nn.Linear(self.hidden_var, output_size)
         
        
    def forward(self, input, future=0, y=None):

        ht, ct = self.net(input)
        output = self.linear1(self.do(ht))
        output = self.relu(output)
        outputs = self.linear2(output)
        teacher = outputs
        
        if future != 0:
            out = []
            c = 0
            for i in range(future):
                if y is not None and random.random() > 0.5:
                    c += 1
                    outputs = y[:, [i], :]  # teacher forcing
                ht, ct = self.net(outputs, ct)
                output = self.linear1(self.do(ht))
                output = self.relu(output)
                output = self.linear2(output)
                out += [output[:,-1,:]]
            
            out = torch.stack(out, 1).squeeze(2)
            
            if c > 0 : outputs = torch.cat((teacher, out), 1) 
            else: outputs = torch.cat((outputs, out), 1) 
      
        return outputs, outputs[:,-1,3]

In [13]:
class Optimization:
    """ A helper class to train, test and diagnose the GRU"""

    def __init__(self, model, loss_fn, optimizer, scheduler):
        self.model = model
        self.loss_fn = loss_fn
        self.optimizer = optimizer
        self.scheduler = scheduler
        self.train_losses = []
        self.val_losses = []
        self.futures = []

    def train(
        self,
        dataloader_train,
        n_epochs,
        dataloader_val = None,
        do_teacher_forcing=None,
    
    ):
        for epoch in range(n_epochs):
            start_time = time.time()
            train_loss = 0
#             loop = tqdm(dataloader_train, desc='Train', colour='green')
            for b, batch in enumerate(dataloader_train):
                x_batch, y_batch, y_b = batch[0], batch[1], batch[2]
                y_pr, y_pred = self._predict(x_batch, y_batch, do_teacher_forcing)
                self.optimizer.zero_grad()
                loss = self.loss_fn(y_pr, y_batch)
                loss.backward()
                self.optimizer.step()
                train_loss += loss.item()
            self.scheduler.step()
            train_loss /= (b+1)
            self.train_losses.append(train_loss)

            self._validation(dataloader_val)

            elapsed = time.time() - start_time
#             print(
#                 "Epoch %d Train loss: %.2f. Validation loss: %.2f. Avg future: %.2f. Elapsed time: %.2fs."
#                 % (epoch + 1, train_loss, self.val_losses[-1], np.average(self.futures), elapsed)
#             )
            
#             torch.save(self.model.state_dict(), f'./chkpt_mymodel_{epoch+1}.pth')
    
    def _predict(self, x_batch, y_batch, do_teacher_forcing):
        if do_teacher_forcing:
            future = random.randint(1, int(y_batch.shape[1]) / 2)
            limit = x_batch.size(1) - future
            y_pr, y_pred = self.model(x_batch[:, :limit, :], future=future, y=y_batch[:, limit:, :])
        else:
            future = 0
            y_pr, y_pred = self.model(x_batch)
        self.futures.append(future)
        return y_pr, y_pred
    
    def _validation(self, dataloader_val):
        if dataloader_val is None:
            return
        with torch.no_grad():
            val_loss = 0
#             loop = tqdm(dataloader_val, desc='Val', colour='green')
            for b, batch in enumerate(dataloader_val):
                x_batch, y_batch, y_b = batch[0], batch[1], batch[2]
                y_pr, y_pred = self.model(x_batch)
                loss = self.loss_fn(y_pr, y_batch)
                val_loss += loss.item()
            val_loss /= b+1
            self.val_losses.append(val_loss)

    def evaluate(self, dataloader_test, future=1):
        with torch.no_grad():
            test_loss = 0
            actual, predicted = [], []
#             loop = tqdm(dataloader_test, desc='Test', colour='green')
            for b, batch in enumerate(dataloader_test):
                x_batch, y_batch, y_b = batch[0], batch[1], batch[2]
                y_pr, y_pred = self.model(x_batch, future=future)
                y_pr = (
                    y_pr[:, -y_batch.shape[1] :, :] if y_pr.shape[1] > y_batch.shape[1] else y_pr
                )
                loss = self.loss_fn(y_pr, y_batch)
                test_loss += loss.item()            
                if x_batch.shape[0] > 1:
                            actual += torch.squeeze(y_batch[:, -1, :]).data.cpu().numpy().tolist()

                            predicted += torch.squeeze(y_pr[:, -1, :]).data.cpu().numpy().tolist()

                else:
                            actual.append(torch.squeeze(y_batch[:, -1, :]).data.cpu().numpy().tolist())
                            predicted.append(torch.squeeze(y_pr[:, -1, :]).data.cpu().numpy().tolist())


            
            test_loss /= (b+1)
            return actual, predicted, test_loss

    def plot_losses(self):
        plt.plot(self.train_losses, label="Training loss")
        plt.plot(self.val_losses, label="Validation loss")
        plt.legend()
        plt.title("Losses")

In [14]:
def to_dataframe(scalar, actual, predicted, columns, future = 0):
    y = pd.DataFrame(scalar.inverse_transform(np.array(actual)), columns = columns)
    y_pred = pd.DataFrame(scalar.inverse_transform(np.array(predicted)), columns = columns)
    mse = mean_squared_error(y, y_pred)
    rmse = np.sqrt(mse)
    mae = mean_absolute_error(y, y_pred)
    
    y['Target'] = y['CLOSE'] - y.OPEN
    y['TargetClass'] = [1 if y.Target[i] > 0 else 0 for i in range(len(y))]
    
    y_pred['Target'] = y_pred['CLOSE'] - y_pred['CLOSE'].shift()
    y_pred['TargetClass'] = [1 if y_pred.Target[i] > 0 else 0 for i in range(len(y_pred))]
    
    if future == 0:
                   f1 = metrics.f1_score(y['TargetClass'], y_pred['TargetClass'])
                   acc = metrics.accuracy_score(y['TargetClass'], y_pred['TargetClass'])
                   recall = metrics.precision_score(y['TargetClass'], y_pred['TargetClass'])
    else: 
                   f1 = metrics.f1_score(y_pred['TargetClass'][future: ], y_pred['TargetClass'][:- future])
                   acc = metrics.accuracy_score(y_pred['TargetClass'][future: ], y_pred['TargetClass'][:- future])
                   recall = metrics.precision_score(y_pred['TargetClass'][future: ], y_pred['TargetClass'][:- future])
            
    
    error = {'MSE': mse,'RMSE': rmse,'MAE': mae, 'f1': f1, 'acc': acc, 'recall': recall}
#     print('MSE: {:.4f}, RMSE: {:.4f}, MAE: {:.4f}, f1: {:.4f}, acc: {:.4f}, recall: {:.4f}'.format(mse, rmse, mae, f1, acc, recall))
    
    return y, y_pred, error

In [15]:
input_size = 12
output_size = 12
BATCH_SIZE = 128
EPOCHS = 10

In [16]:
W = [20, 30, 40, 50, 60, 70, 80, 90]
HIDDEN = [32, 64, 128, 256]
NUM = [1, 2]
met = ['GRU', 'LSTM']
bid = [True, False]
teach = [True, False]

In [18]:
model = []
score = []

for WINDOW_SIZE in tqdm(W):
    
    dataset = SeriesDataset(data_scaled, WINDOW_SIZE)

    splitlimit1 = int(len(dataset) * 0.8)
    train_dataset =  Subset(dataset, np.arange(splitlimit1))
    t_dataset =  Subset(dataset, np.arange(splitlimit1, len(dataset)))
    splitlimit2 = int(len(t_dataset) * 0.5)
    val_dataset =  Subset(t_dataset, np.arange(splitlimit2))
    test_dataset =  Subset(t_dataset, np.arange(splitlimit2, len(t_dataset)))
    
    train_dataloader = DataLoader(train_dataset, batch_size = BATCH_SIZE, shuffle=True)
    train_dataloader1 = DataLoader(train_dataset, batch_size = 1)
    val_dataloader = DataLoader(val_dataset, batch_size = 1)
    test_dataloader = DataLoader(test_dataset, batch_size = 1)

    for HIDDEN_SIZE in HIDDEN:
                    for NUM_L in NUM:
                              for method in met:
                                         for bidirectional in bid:
                                                           for teacher in teach:
                                        
                                                                        model_my = My_Model_GRU(input_size, HIDDEN_SIZE, NUM_L, output_size, bidirectional=bidirectional, method = method)
                                                                        criterion = nn.MSELoss()
                                                                        optimizer = optim.Adam(model_my.parameters(), lr=0.01)
                                                                        scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
              
                                                                        optimization_1 = Optimization(model_my, criterion, optimizer, scheduler)
                                                                        optimization_1.train(train_dataloader, n_epochs = EPOCHS, dataloader_val = val_dataloader, do_teacher_forcing=teacher)

                                                                        actual_1, predicted_1, test_loss_1 = optimization_1.evaluate(test_dataloader, future = 0)
                                                                        df_actual_1, df_predicted_1, error1 = to_dataframe(sc, actual_1, predicted_1, data_set.columns[1:], future = 0) 
                                                                        error1['Loss'] = test_loss_1
                                
                                                                        m1 = f'TEST_W{WINDOW_SIZE}_H{HIDDEN_SIZE}_N{NUM_L}_m{method}_b{bidirectional}_t{teacher}'
                                                                        model.append(m1)
                                                                        score.append(error1)
                                                                        
                                                                        actual_2, predicted_2, val_loss_2 = optimization_1.evaluate(val_dataloader, future=0)
                                                                        df_actual_2, df_predicted_2, error2 = to_dataframe(sc, actual_2, predicted_2, data_set.columns[1:]) 
                                                                        error2['Loss'] = val_loss_2
                                                            
                                                                        m2 = f'VAL_W{WINDOW_SIZE}_H{HIDDEN_SIZE}_N{NUM_L}_m{method}_b{bidirectional}_t{teacher}'                                               
                                                                        model.append(m2)
                                                                        score.append(error2)

                                                                        actual_3, predicted_3, train_loss_3 = optimization_1.evaluate(train_dataloader1, future=0)
                                                                        df_actual_3, df_predicted_3, error3 = to_dataframe(sc, actual_3, predicted_3, data_set.columns[1:])
                                                                        error3['Loss'] = train_loss_3
                                                                        
                                                                        m3 = f'TRAIN_W{WINDOW_SIZE}_H{HIDDEN_SIZE}_N{NUM_L}_m{method}_b{bidirectional}_t{teacher}'
                                                                        model.append(m3)
                                                                        score.append(error3)

100%|██████████████████████████████████████████████████████████████████████████████| 8/8 [26:43:32<00:00, 12026.61s/it]


In [19]:
df_model = pd.DataFrame(score , index = model)
df_model

Unnamed: 0,MSE,RMSE,MAE,f1,acc,recall,Loss
TEST_W20_H32_N1_mGRU_bTrue_tTrue,106.420547,10.316034,6.636041,0.492537,0.468750,0.492537,0.045912
VAL_W20_H32_N1_mGRU_bTrue_tTrue,332.737538,18.241095,11.811974,0.426230,0.453125,0.403101,0.057502
TRAIN_W20_H32_N1_mGRU_bTrue_tTrue,60.967827,7.808190,4.870432,0.499263,0.501468,0.504469,0.014806
TEST_W20_H32_N1_mGRU_bTrue_tFalse,148.379624,12.181118,7.841569,0.513208,0.496094,0.519084,0.049455
VAL_W20_H32_N1_mGRU_bTrue_tFalse,374.524003,19.352623,12.781358,0.419214,0.480469,0.421053,0.054020
...,...,...,...,...,...,...,...
VAL_W90_H256_N2_mLSTM_bFalse_tTrue,483.215512,21.982164,14.585394,0.431535,0.449799,0.409449,0.084486
TRAIN_W90_H256_N2_mLSTM_bFalse_tTrue,58.466068,7.646311,4.924428,0.534917,0.507545,0.507207,0.033973
TEST_W90_H256_N2_mLSTM_bFalse_tFalse,109.780507,10.477619,6.726227,0.534351,0.510040,0.534351,0.072474
VAL_W90_H256_N2_mLSTM_bFalse_tFalse,355.712152,18.860333,12.551012,0.396476,0.449799,0.398230,0.065268


In [33]:
df_model.to_excel('Pipeline_model.xlsx', index = True)
df_model.to_csv('Pipeline_model.csv', index = True)

In [37]:
dfm = pd.read_csv('Pipeline_model.csv', index_col=0)
dfm.describe(include='all')

Unnamed: 0,MSE,RMSE,MAE,f1,acc,recall,Loss
count,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0,1536.0
mean,28475.39,26.775233,17.570795,0.488402,0.493629,0.485487,1.83181
std,486112.3,166.663001,111.831215,0.039603,0.027753,0.043633,22.744952
min,18.34096,4.282635,2.705921,0.333333,0.376984,0.333333,0.002762
25%,69.28534,8.323782,5.272333,0.466601,0.478431,0.458568,0.022521
50%,134.343,11.590623,7.51703,0.494289,0.494489,0.492188,0.047885
75%,322.4755,17.957603,11.818898,0.511124,0.509785,0.510922,0.074661
max,13288760.0,3645.375486,2514.837464,0.630037,0.600791,0.614286,397.92006


In [38]:
dfm

Unnamed: 0,MSE,RMSE,MAE,f1,acc,recall,Loss
TEST_W20_H32_N1_mGRU_bTrue_tTrue,106.420547,10.316034,6.636041,0.492537,0.468750,0.492537,0.045912
VAL_W20_H32_N1_mGRU_bTrue_tTrue,332.737538,18.241095,11.811974,0.426230,0.453125,0.403101,0.057502
TRAIN_W20_H32_N1_mGRU_bTrue_tTrue,60.967827,7.808190,4.870432,0.499263,0.501468,0.504469,0.014806
TEST_W20_H32_N1_mGRU_bTrue_tFalse,148.379624,12.181118,7.841569,0.513208,0.496094,0.519084,0.049455
VAL_W20_H32_N1_mGRU_bTrue_tFalse,374.524003,19.352623,12.781358,0.419214,0.480469,0.421053,0.054020
...,...,...,...,...,...,...,...
VAL_W90_H256_N2_mLSTM_bFalse_tTrue,483.215512,21.982164,14.585394,0.431535,0.449799,0.409449,0.084486
TRAIN_W90_H256_N2_mLSTM_bFalse_tTrue,58.466068,7.646311,4.924428,0.534917,0.507545,0.507207,0.033973
TEST_W90_H256_N2_mLSTM_bFalse_tFalse,109.780507,10.477619,6.726227,0.534351,0.510040,0.534351,0.072474
VAL_W90_H256_N2_mLSTM_bFalse_tFalse,355.712152,18.860333,12.551012,0.396476,0.449799,0.398230,0.065268


In [48]:
dfb = []
dfb.append(dfm[dfm.Loss == dfm.Loss.min()])
dfb.append(dfm[dfm.recall == dfm.recall.max()])
dfb.append(dfm[dfm.acc == dfm.acc.max()])
dfb.append(dfm[dfm.f1 == dfm.f1.max()])
dfb.append(dfm[dfm.MAE == dfm.MAE.min()])
dfb.append(dfm[dfm.RMSE == dfm.RMSE.min()])
dfb.append(dfm[dfm.MSE == dfm.MSE.min()])
resul = pd.concat(dfb)
resul = resul[~resul.index.duplicated(keep='first')]
resul

Unnamed: 0,MSE,RMSE,MAE,f1,acc,recall,Loss
TRAIN_W90_H256_N1_mLSTM_bTrue_tFalse,22.808919,4.775868,3.185392,0.522805,0.510563,0.510536,0.002762
TEST_W50_H256_N1_mLSTM_bFalse_tTrue,125.247845,11.191418,7.396815,0.630037,0.600791,0.614286,0.070256
TRAIN_W30_H256_N1_mGRU_bTrue_tTrue,18.340961,4.282635,2.705921,0.497093,0.490177,0.493269,0.00487
