In [1]:
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
#pip install pandas_ta

In [3]:
import pandas_ta as ta

In [4]:
import random
import os
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
from torch.utils.data import Dataset, DataLoader

In [5]:
bist_daily = pd.read_csv("datasets/bist_100_daily.csv")
# Extract the number of rows and columns by using the shape of the data.
numRows,numColumns = bist_daily.shape
# Extract the time interval.
last_date, first_date = bist_daily.iloc[0].Date, bist_daily.iloc[-1].Date
# Check the availability of the data.
na_cols = bist_daily.columns[bist_daily.isna().any()].tolist()

# Print the information.
print(f"There are {numRows} rows and {numColumns} columns in the initial dataset.")
print(f"The data represents the time frame between the dates '{last_date}' and '{first_date}'.")
if not na_cols:
    print("There are no NA rows.")
else:
    print(f"Columns in the dataset which include NA rows: {na_cols}.")
# Convert columns to numeric values
column_names = ["Price", "Open", "High", "Low"]
for column in column_names:
    bist_daily[column] = bist_daily[column].str.replace(',', '')
    bist_daily[column] = pd.to_numeric(bist_daily[column])
# CONVERT TO DATETIME FORMAT AND SORT DATA BY DATE
bist_daily.Date = pd.to_datetime(bist_daily.Date)
bist_daily.sort_values(by="Date", ignore_index=True,inplace=True)
bist_daily.set_index(pd.DatetimeIndex(bist_daily["Date"]), inplace=True)
bist_daily.rename(columns={"Price": "close"},inplace=True)
# Calculate Returns and append to the df DataFrame
# CUMLOGRET_1 and CUMPCTRET_1 are added (NaN values exists)
bist_daily.ta.log_return(cumulative=True, append=True)
bist_daily.ta.percent_return(cumulative=True, append=True)
# Returns a list of indicators and utility functions (to check in future)
ind_list = bist_daily.ta.indicators(as_list=True)
# RSI_14, MACD_12_26_9, MACDh_12_26_9 and MACDs_12_26_9 are added (NaN values exists)
bist_daily.ta.rsi(append=True)
bist_daily.ta.macd(append=True)
# SMA values are added (use ta in the future)
sma_values = [5, 10, 15] 
for i in sma_values:
    bist_daily['SMA'+str(i)] = bist_daily['close'].rolling(window=i).mean()
# Remove all NaN value rows
bist_daily.dropna(inplace=True)
bist_daily

There are 5000 rows and 7 columns in the initial dataset.
The data represents the time frame between the dates 'Dec 11, 2019' and 'Jan 04, 2000'.
There are no NA rows.


Unnamed: 0_level_0,date,close,open,high,low,Vol.,Change %,CUMLOGRET_1,CUMPCTRET_1,RSI_14,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,SMA5,SMA10,SMA15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2000-02-21,2000-02-21,145.69,153.64,153.64,142.79,25.75M,-5.17%,-0.183990,-0.160515,34.352566,-7.230371,-0.392050,-6.838321,153.936,155.116,157.896000
2000-02-22,2000-02-22,139.94,145.69,147.02,137.58,29.58M,-3.95%,-0.224258,-0.199983,31.136786,-8.027940,-0.951695,-7.076245,150.206,153.060,156.330667
2000-02-23,2000-02-23,134.47,139.94,141.26,131.71,26.95M,-3.91%,-0.264130,-0.239071,28.411996,-8.997682,-1.537150,-7.460532,145.432,151.074,154.602000
2000-02-24,2000-02-24,146.52,134.47,146.52,134.14,43.02M,8.96%,-0.178310,-0.149460,40.719161,-8.693661,-0.986503,-7.707158,144.052,149.896,153.638667
2000-02-25,2000-02-25,156.18,146.52,156.32,146.52,56.16M,6.59%,-0.114462,-0.083530,48.380526,-7.585796,0.097090,-7.682886,144.560,149.916,152.803333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-05,2019-12-05,1086.59,1082.60,1090.32,1081.51,2.24B,0.89%,1.825328,2.894902,66.153619,16.788401,0.360019,16.428382,1076.116,1068.981,1068.217333
2019-12-06,2019-12-06,1088.69,1090.09,1093.15,1083.52,2.16B,0.19%,1.827259,2.896834,66.774984,17.187282,0.607120,16.580162,1080.046,1071.262,1070.543333
2019-12-09,2019-12-09,1087.86,1088.96,1091.25,1081.39,2.32B,-0.08%,1.826496,2.896072,66.257244,17.237719,0.526046,16.711673,1081.610,1074.666,1071.836667
2019-12-10,2019-12-10,1080.11,1089.12,1093.78,1076.18,2.07B,-0.71%,1.819347,2.888948,61.465048,16.462560,-0.199291,16.661851,1084.052,1076.694,1072.158000


In [6]:
# edit Vol. column

from operator import itemgetter

vols = bist_daily['Vol.'].to_list()
#[vol[-1] for vol in vols]
#list(filter(lambda vol: "-" in vol,enumerate(vols)))

indexToRemove = bist_daily.iloc[list(map(itemgetter(0),filter(lambda vol: "-" in vol,enumerate(vols))))].index
bist_daily.drop(indexToRemove,inplace=True)
bist_daily['Vol.'] = bist_daily['Vol.'].apply(
    lambda x: float(x[:-1])*(10**6) if x[-1]=="M" else (float(x[:-1])*(10**9) if x[-1]=="B" else "ERROR")).astype("int")
bist_daily

Unnamed: 0_level_0,date,close,open,high,low,Vol.,Change %,CUMLOGRET_1,CUMPCTRET_1,RSI_14,MACD_12_26_9,MACDh_12_26_9,MACDs_12_26_9,SMA5,SMA10,SMA15
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
2000-02-21,2000-02-21,145.69,153.64,153.64,142.79,25750000,-5.17%,-0.183990,-0.160515,34.352566,-7.230371,-0.392050,-6.838321,153.936,155.116,157.896000
2000-02-22,2000-02-22,139.94,145.69,147.02,137.58,29580000,-3.95%,-0.224258,-0.199983,31.136786,-8.027940,-0.951695,-7.076245,150.206,153.060,156.330667
2000-02-23,2000-02-23,134.47,139.94,141.26,131.71,26950000,-3.91%,-0.264130,-0.239071,28.411996,-8.997682,-1.537150,-7.460532,145.432,151.074,154.602000
2000-02-24,2000-02-24,146.52,134.47,146.52,134.14,43020000,8.96%,-0.178310,-0.149460,40.719161,-8.693661,-0.986503,-7.707158,144.052,149.896,153.638667
2000-02-25,2000-02-25,156.18,146.52,156.32,146.52,56160000,6.59%,-0.114462,-0.083530,48.380526,-7.585796,0.097090,-7.682886,144.560,149.916,152.803333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2019-12-05,2019-12-05,1086.59,1082.60,1090.32,1081.51,-2147483648,0.89%,1.825328,2.894902,66.153619,16.788401,0.360019,16.428382,1076.116,1068.981,1068.217333
2019-12-06,2019-12-06,1088.69,1090.09,1093.15,1083.52,-2147483648,0.19%,1.827259,2.896834,66.774984,17.187282,0.607120,16.580162,1080.046,1071.262,1070.543333
2019-12-09,2019-12-09,1087.86,1088.96,1091.25,1081.39,-2147483648,-0.08%,1.826496,2.896072,66.257244,17.237719,0.526046,16.711673,1081.610,1074.666,1071.836667
2019-12-10,2019-12-10,1080.11,1089.12,1093.78,1076.18,2069999999,-0.71%,1.819347,2.888948,61.465048,16.462560,-0.199291,16.661851,1084.052,1076.694,1072.158000


In [7]:
# seeding an arbitrary number to get same results in multiple runs
manualSeed = 999
random.seed(manualSeed)
torch.manual_seed(manualSeed)
print("Seed:", manualSeed)

Seed: 999


In [8]:
# getting number of GPUs from cuda
ngpu = torch.cuda.device_count()
print("Count of available GPUs:", ngpu)

Count of available GPUs: 1


In [9]:
# printing the name of available GPUs
for i in range(ngpu):
    print("GPU {}: {}".format(i+1, torch.cuda.get_device_name(i)))

GPU 1: GeForce RTX 2060


In [10]:
# batch size for the training
batch_size = 64

# optimizer rates
optimizer_betas = (0.9, 0.999)
learning_rate = 1e-3

# number of epochs
num_epochs = 100000

# decide which device we want to run on
device = torch.device("cuda:0" if (torch.cuda.is_available() and ngpu > 0) else "cpu")

In [11]:
class TimeseriesDataset(Dataset):
    def __init__(self, data_frame, sequence_length=2):
        self.data = torch.tensor(data_frame.values)
        self.sequence_length = sequence_length

    def __len__(self):
        return self.data.shape[0] - self.sequence_length + 1

    def __getitem__(self, index):
        return self.data[index: index + self.sequence_length].float()
    
    # Non-overlapping series
    # def __getitem__(self, index):
    #     return self.data[index * self.sequence_length: (index+1) * self.sequence_length]

In [12]:
# create pytorch dataset from the pandas DataFrame

# TODO: Convert change(%) and Volume columns to numeric values
columns_used_in_training = ["close", "open", "high", "low", "CUMLOGRET_1", "RSI_14", "MACD_12_26_9", "SMA5"]
# input dimension of the generator
data_dimension = len(columns_used_in_training)
# sequence length of input data
sequence_length = 14
train_dataset = TimeseriesDataset(bist_daily[columns_used_in_training], sequence_length)
# create the dataloader
dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
real_data_sample = next(iter(dataloader))
print("Real data sample shape:", real_data_sample.shape)

Real data sample shape: torch.Size([64, 14, 8])


In [13]:
class Generator(nn.Module):
    def __init__(self, hidden_size):
        super(Generator, self).__init__()
        self.hidden_size = hidden_size
        self.lstm = nn.LSTM(input_size=data_dimension, hidden_size=hidden_size, num_layers=2, batch_first=True)
        self.linear = nn.Linear(hidden_size, data_dimension)
        self.drop = nn.Dropout(0.2)

    def forward(self, input_sequences):
        input_sequences = self.drop(input_sequences)
        lstm_output, hidden_cell = self.lstm(input_sequences)
        res = self.linear(hidden_cell[0][-1])
        res = res.view(res.shape[0], 1, -1)
        return res

In [14]:
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        self.model = nn.Sequential(
            nn.Linear(sequence_length*data_dimension, 72),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.2),
            nn.Linear(72, 100),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.2),
            nn.Linear(100, 10),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.2),
            nn.Linear(10, 1),
            nn.Sigmoid(),
        )

    def forward(self, input_sequences):
        input_sequences_flattened = input_sequences.view(input_sequences.shape[0], -1)
        res = self.model(input_sequences_flattened)
        return res

In [15]:
# TODO: weight initialization of models

In [16]:
generator = Generator(hidden_size=data_dimension*10).to(device)
discriminator = Discriminator().to(device)
print("Generator and discriminator are initialized")

Generator and discriminator are initialized


In [17]:
criterion = nn.BCELoss()
optimizer_generator = optim.Adam(generator.parameters(), lr=learning_rate, betas=optimizer_betas)
optimizer_discriminator = optim.Adam(discriminator.parameters(), lr=learning_rate, betas=optimizer_betas)

real_label = 1.
fake_label = 0.

In [18]:
print("Training is started")
for epoch in range(num_epochs):
    for i, sequence_batch in enumerate(dataloader):
            ############################
            # (1) Update D network: maximize log(D(x)) + log(1 - D(G(z)))
            ###########################
            ## Training with real batch
            discriminator.zero_grad()
            # Format batch
            real_sequence = sequence_batch.to(device)
            batch_size = real_sequence.size(0)
            real_labels = torch.full((batch_size,), real_label, dtype=torch.float, device=device)
            # Forward pass real batch through D
            discriminator_output_real = discriminator(real_sequence).view(-1)
            # Calculate loss on all-real batch
            discriminator_error_real = criterion(discriminator_output_real, real_labels)
            # Calculate gradients for D in backward pass
            discriminator_error_real.backward()

            ## Training with fake batch
            # Assign first t values
            generator_input_sequence = sequence_batch[:,:-1].to(device)
            #  Generate (t+1)th value from first t values
            generated_values = generator(generator_input_sequence)
            fake_labels = torch.full((batch_size,), fake_label, dtype=torch.float, device=device)
            # Concat first t real values and generated (t+1)th values
            generator_result_concat = torch.cat((generator_input_sequence, generated_values.detach()), 1)
            # Classify all fake batch with D
            discriminator_output_fake = discriminator(generator_result_concat).view(-1)
            # Calculate D's loss on the all-fake batch
            discriminator_error_fake = criterion(discriminator_output_fake, fake_labels)
            # Calculate the gradients for this batch
            discriminator_error_fake.backward()
            # Add the gradients from the all-real and all-fake batches
            discriminator_error = discriminator_error_real + discriminator_error_fake
            # Update D
            optimizer_discriminator.step()

            ############################
            # (2) Update G network: maximize log(D(G(z)))
            ###########################
            generator.zero_grad()
            real_labels = torch.full((batch_size,), real_label, dtype=torch.float, device=device)
            # Since we just updated D, perform another forward pass of all-fake batch through D
            generator_result_concat_grad = torch.cat((generator_input_sequence, generated_values), 1)
            discriminator_output_fake = discriminator(generator_result_concat_grad).view(-1)
            # Calculate G's loss based on this output
            generator_error = criterion(discriminator_output_fake, real_labels)
            # Calculate gradients for G
            generator_error.backward()
            # Update G
            optimizer_generator.step()
    if (epoch+1) % 50 == 0 or epoch+1 == 1:
        print('\n[{}/{}]\tDiscriminator Loss: {:.4f}\tGenerator Loss: {:.4f}'
                  .format(epoch+1, num_epochs, discriminator_error.item(), generator_error.item()))
        for col_name, real, generated in zip(columns_used_in_training, sequence_batch[0][-1], generated_values[0][0]):
            print(f"{col_name} | Real:{real:.4f} / Generated:{generated:.4f}")
       

Training is started

[1/100000]	Discriminator Loss: 0.0037	Generator Loss: 32.3494
close | Real:1007.8100 / Generated:7.6378
open | Real:987.2700 / Generated:6.0550
high | Real:1016.4700 / Generated:7.2139
low | Real:978.4900 / Generated:8.4162
CUMLOGRET_1 | Real:1.7501 / Generated:5.1049
RSI_14 | Real:28.0557 / Generated:8.4303
MACD_12_26_9 | Real:-34.5650 / Generated:4.2435
SMA5 | Real:1013.0380 / Generated:8.8776

[50/100000]	Discriminator Loss: 0.0000	Generator Loss: 77.2910
close | Real:189.0000 / Generated:114.1273
open | Real:188.3300 / Generated:119.0622
high | Real:189.8000 / Generated:112.3386
low | Real:185.2100 / Generated:111.3943
CUMLOGRET_1 | Real:0.0763 / Generated:2.0197
RSI_14 | Real:55.2411 / Generated:49.1818
MACD_12_26_9 | Real:3.7336 / Generated:-0.9957
SMA5 | Real:185.5500 / Generated:110.3518

[100/100000]	Discriminator Loss: 0.0000	Generator Loss: 100.0000
close | Real:903.8300 / Generated:209.6572
open | Real:898.6900 / Generated:206.3052
high | Real:903.8300 


[1000/100000]	Discriminator Loss: 1.5976	Generator Loss: 2.7661
close | Real:88.4200 / Generated:92.0844
open | Real:88.7800 / Generated:91.5074
high | Real:88.7800 / Generated:92.6171
low | Real:86.8100 / Generated:90.7332
CUMLOGRET_1 | Real:-0.6834 / Generated:-0.9036
RSI_14 | Real:35.4567 / Generated:43.6100
MACD_12_26_9 | Real:-1.8284 / Generated:-1.5663
SMA5 | Real:89.7120 / Generated:91.2735

[1050/100000]	Discriminator Loss: 0.2131	Generator Loss: 6.1234
close | Real:120.5200 / Generated:128.7180
open | Real:126.4400 / Generated:129.2984
high | Real:127.0500 / Generated:130.0118
low | Real:119.4400 / Generated:129.3546
CUMLOGRET_1 | Real:-0.3737 / Generated:-0.6104
RSI_14 | Real:56.1878 / Generated:63.2499
MACD_12_26_9 | Real:6.4254 / Generated:7.7728
SMA5 | Real:124.8300 / Generated:127.1295

[1100/100000]	Discriminator Loss: 1.0918	Generator Loss: 26.5140
close | Real:1062.3900 / Generated:977.8774
open | Real:1074.6801 / Generated:1021.2309
high | Real:1076.8600 / Generated:


[2000/100000]	Discriminator Loss: 0.4268	Generator Loss: 7.7097
close | Real:289.9200 / Generated:301.6270
open | Real:287.1300 / Generated:301.8021
high | Real:291.6400 / Generated:306.4086
low | Real:287.1300 / Generated:296.5010
CUMLOGRET_1 | Real:0.5041 / Generated:0.2960
RSI_14 | Real:75.4913 / Generated:72.7480
MACD_12_26_9 | Real:6.6619 / Generated:7.7792
SMA5 | Real:286.4200 / Generated:301.3112

[2050/100000]	Discriminator Loss: 0.4229	Generator Loss: 5.6790
close | Real:263.9500 / Generated:260.6700
open | Real:267.7700 / Generated:261.7665
high | Real:269.6600 / Generated:266.2847
low | Real:263.6400 / Generated:254.9082
CUMLOGRET_1 | Real:0.4103 / Generated:0.1743
RSI_14 | Real:54.5050 / Generated:57.8035
MACD_12_26_9 | Real:-1.2646 / Generated:-1.9332
SMA5 | Real:260.1820 / Generated:261.8071

[2100/100000]	Discriminator Loss: 0.5800	Generator Loss: 3.4008
close | Real:284.9600 / Generated:318.5162
open | Real:283.8000 / Generated:317.7865
high | Real:291.2100 / Generated


[3000/100000]	Discriminator Loss: 0.2694	Generator Loss: 12.9303
close | Real:863.4100 / Generated:854.5900
open | Real:858.3500 / Generated:835.2922
high | Real:864.9700 / Generated:852.0206
low | Real:856.8100 / Generated:852.4462
CUMLOGRET_1 | Real:1.5954 / Generated:3.4986
RSI_14 | Real:87.6669 / Generated:67.2739
MACD_12_26_9 | Real:24.5194 / Generated:19.6402
SMA5 | Real:854.2700 / Generated:832.0701


KeyboardInterrupt: 