### First attempt: treat all stocks and all dates independently, and predict the last two values with only 44 features per day

In [30]:
import os
import math
import sys
import numpy as np
import matplotlib
% matplotlib inline  
import matplotlib.pyplot as plt
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from tensorboardX import SummaryWriter

import logging
import imp
# Re-execute the logging module before configuring it.
# NOTE(review): presumably this is done because logging.basicConfig() is a no-op when the
# root logger already has handlers (as can be the case inside a notebook kernel) - confirm.
imp.reload(logging)
# Emit INFO and above as "<time> <LEVEL>:<message>", with the time as HH:MM:SS (12-hour clock).
logging.basicConfig(format='%(asctime)s %(levelname)s:%(message)s', level=logging.INFO, datefmt='%I:%M:%S')

In [67]:
class StockData():
    """Load per-date stock feature files and serve shuffled mini-batches.

    Every file in ``stock_dir`` is named ``<YYYY-MM-DD>.<ext>`` and holds one
    stock per line as ``id, name, v0, v1, ...`` (fields separated by ``', '``).
    Columns ``v0 .. v43`` are used as inputs and ``v44`` as the regression
    target; both are multiplied by 100 before being handed to the model.

    Args:
        batch_size: number of samples returned by ``get_next_batch``.
        train_date_range: inclusive ``(first, last)`` dates of the training split.
        test_date_range: inclusive ``(first, last)`` dates of the testing split.
    """

    NUM_FEATURES = 44   # columns [0, 44) of every row are model inputs
    TARGET_COLUMN = 44  # column holding the value to predict
    SCALE = 100         # multiplier applied to inputs and targets
    DATE_FORMAT = '%Y-%m-%d'

    def __init__(self, batch_size=64, train_date_range=('2018-01-16', '2018-01-16'),
                 test_date_range=('2018-01-17', '2018-01-19')):
        self.name = 'independent'
        self.stock_dir = './stocks_by_date'
        self.train_date_range = train_date_range
        self.test_date_range = test_date_range
        self.use_dates = []
        self.train_stocks = self.get_stocks('train')
        self.test_stocks = self.get_stocks('test')

        self.batch_size = batch_size
        # Each split is visited through a random permutation plus a cursor into it.
        self.train_idx = np.random.permutation(len(self.train_stocks))
        self.train_cnt = 0
        self.test_idx = np.random.permutation(len(self.test_stocks))
        self.test_cnt = 0

        print('number of training stocks: %d, number of testing stocks: %d' %
              (self.train_stocks.shape[0], self.test_stocks.shape[0]))
        print('use below dates:', self.use_dates)

    def get_stocks(self, status='train'):
        """Read every row whose file date falls inside the requested split.

        Args:
            status: ``'train'`` selects ``train_date_range``; anything else
                selects ``test_date_range``.

        Returns:
            A float32 array with one row per stock/date (empty if no file matches).
            The dates that were used are appended to ``self.use_dates``.
        """
        date_range = self.train_date_range if status == 'train' else self.test_date_range
        start_date = time.strptime(date_range[0], self.DATE_FORMAT)
        end_date = time.strptime(date_range[1], self.DATE_FORMAT)

        stocks = []
        for stock_file in sorted(os.listdir(self.stock_dir)):
            orig_date, _ = os.path.splitext(stock_file)
            try:
                date = time.strptime(orig_date, self.DATE_FORMAT)
            except ValueError:
                # Entry is not a "<date>.<ext>" data file (e.g. a checkpoint folder): ignore it.
                continue
            if date < start_date or date > end_date:
                continue
            self.use_dates.append(orig_date)
            # `with` guarantees the handle is closed even if parsing fails.
            with open(os.path.join(self.stock_dir, stock_file)) as handle:
                for line in handle:
                    line = line.strip()
                    if not line:
                        continue  # tolerate blank / trailing lines
                    # The first two fields are the stock id and name; the rest are numeric.
                    stocks.append(line.split(', ')[2:])
        return np.array(stocks).astype(np.float32)

    def _next_row(self, status):
        """Return ``(row, wrapped)`` for the next shuffled row of a split.

        ``wrapped`` is True when this row was the last one of the current pass,
        in which case the split is reshuffled and its cursor reset to 0.
        """
        is_train = status == 'train'
        stocks = self.train_stocks if is_train else self.test_stocks
        order = self.train_idx if is_train else self.test_idx
        cursor = self.train_cnt if is_train else self.test_cnt

        row = stocks[order[cursor]]
        cursor += 1
        wrapped = cursor >= stocks.shape[0]
        if wrapped:
            order = np.random.permutation(stocks.shape[0])
            cursor = 0

        if is_train:
            self.train_idx, self.train_cnt = order, cursor
        else:
            self.test_idx, self.test_cnt = order, cursor
        return row, wrapped

    def get_next_batch(self, status='train'):
        """Assemble the next mini-batch, skipping rows that contain NaN.

        Args:
            status: ``'train'`` for the training split, anything else for testing.

        Returns:
            ``(inputs, outputs, restart)`` where ``inputs`` is ``(batch_size, 44)``,
            ``outputs`` is ``(batch_size, 1)`` and ``restart`` tells whether the
            split wrapped around (and was reshuffled) while filling this batch.

        Raises:
            ValueError: if the split is empty or has no NaN-free row.
        """
        stocks = self.train_stocks if status == 'train' else self.test_stocks
        num_rows = len(stocks)
        if num_rows == 0:
            raise ValueError('no %s samples were loaded' % status)

        inputs = np.zeros((self.batch_size, self.NUM_FEATURES))
        outputs = np.zeros((self.batch_size, 1))
        restart = False
        filled = 0
        skipped_in_a_row = 0
        while filled < self.batch_size:
            row, wrapped = self._next_row(status)
            restart = restart or wrapped
            x = row[0:self.NUM_FEATURES]
            y = row[self.TARGET_COLUMN]
            if np.isnan(x).any() or np.isnan(y).any():
                skipped_in_a_row += 1
                # Any 2 * num_rows consecutive draws contain one complete pass, so
                # reaching this count means every row is unusable; stop instead of spinning.
                if skipped_in_a_row >= 2 * num_rows:
                    raise ValueError('every %s sample contains NaN' % status)
                continue
            skipped_in_a_row = 0
            inputs[filled, :] = x * self.SCALE
            outputs[filled, :] = y * self.SCALE
            filled += 1
        return inputs, outputs, restart

    def get_one_sample(self, stock_name='', date=''):
        """Fetch the sample of one stock on one date from ``stocks_by_name/<stock>.txt``.

        Args:
            stock_name: stock identifier; ``''`` means ``'000001.XSHE'``.
            date: date string searched for in each line; ``''`` means ``'2018-01-17'``.

        Returns:
            ``(inputs, outputs)`` with shapes ``(1, 44)`` and ``(1, 2)``.

        Raises:
            ValueError: if no line of the stock file mentions ``date``.
        """
        inputs = np.zeros((1, self.NUM_FEATURES))
        # NOTE(review): the single target value is broadcast into both output columns;
        # get_next_batch() returns one column. The shape is kept so callers are unaffected.
        outputs = np.zeros((1, 2))
        if stock_name == '':
            stock_name = '000001.XSHE'
        if date == '':
            date = '2018-01-17'
        stock_file = os.path.join('stocks_by_name', stock_name + '.txt')
        with open(stock_file) as handle:
            match = next((line for line in handle if date in line), None)
        if match is None:
            # Previously the last line of the file was silently used instead.
            raise ValueError('date %s not found in %s' % (date, stock_file))
        fields = match.strip().split(', ')
        feature = np.array(fields[2:]).astype(np.float32)
        inputs[0, :] = feature[:self.NUM_FEATURES] * self.SCALE
        outputs[0, :] = feature[self.TARGET_COLUMN] * self.SCALE
        return inputs, outputs

In [68]:
# Smoke-test the loader: show one batch from each split, then a single named sample.
data = StockData()
for split in ('train', 'test'):
    x, y, _ = data.get_next_batch(split)
    print(x, y)
x, y = data.get_one_sample()
print(x, y)

number of training stocks: 3461, number of testing stocks: 10382
('use below dates:', ['2018-01-16', '2018-01-17', '2018-01-18', '2018-01-19'])
(array([[-2.08699989e+00, -4.84500027e+00, -3.01900005e+00, ...,
        -1.36104004e+02,  2.52946094e+03,  2.45077002e+03],
       [-1.56000003e-01, -3.96399975e+00,  1.50000006e-01, ...,
        -6.97579956e+01,  6.43867004e+02,  6.41538025e+02],
       [ 7.82999933e-01, -5.58799982e+00, -8.10999990e-01, ...,
         6.97300034e+01,  5.35139062e+03,  5.37800879e+03],
       ...,
       [ 0.00000000e+00, -1.56099999e+00, -1.25999999e+00, ...,
        -4.65429993e+01,  6.97872009e+02,  6.97442017e+02],
       [ 2.16999993e-01, -5.19999981e-01, -5.60999990e-01, ...,
         5.28499985e+00,  2.30198901e+03,  2.30557495e+03],
       [ 1.43599999e+00, -5.28299999e+00,  1.03399992e+00, ...,
         1.37792999e+02,  5.54234009e+02,  5.60276978e+02]]), array([[-0.68999999],
       [-0.92099998],
       [ 2.15499997],
       [ 1.60099994],
       [-

In [69]:
class Interface(object):
    """Train, evaluate and query a model on batches served by a data object.

    Args:
        data: object providing ``get_next_batch(status)``, ``get_one_sample(name, date)``,
            ``batch_size`` and a writable ``test_cnt`` cursor.
        model: ``nn.Module`` to optimise / evaluate.
        learning_rate: Adam learning rate.
        train_iter: number of optimisation steps run by ``train``.
        test_iter: number of test batches averaged by ``test``.
        test_interval: run ``test`` every this many training steps.
        save_interval: save the weights every this many training steps.
        init_model_path: checkpoint loaded at construction; empty means "start fresh".
        save_model_path: file the weights are written to during training.
        tensorboard_path: directory handed to ``SummaryWriter``.
    """

    def __init__(self, data, model, learning_rate, train_iter, test_iter, test_interval, save_interval,
                 init_model_path, save_model_path, tensorboard_path):
        self.data = data
        self.model = model
        self.learning_rate = learning_rate
        self.train_iter = train_iter
        self.test_iter = test_iter
        self.test_interval = test_interval
        self.save_interval = save_interval
        self.init_model_path = init_model_path
        self.save_model_path = save_model_path
        self.tensorboard_path = tensorboard_path
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.init_model()

    def init_model(self):
        """Move the model to ``self.device`` and optionally restore saved weights.

        Returns:
            The model, now on ``self.device``.
        """
        # model = torch.nn.DataParallel(model).cuda()
        self.model = self.model.to(self.device)
        # Truthiness test instead of `is not ''`: identity comparison with a string
        # literal only works by accident of interning.
        if self.init_model_path:
            # map_location lets a checkpoint saved on GPU be restored on a CPU-only host.
            state = torch.load(self.init_model_path, map_location=self.device)
            self.model.load_state_dict(state)
        return self.model

    def _to_tensor(self, array):
        """Convert a NumPy array to a float32 tensor on ``self.device``."""
        return torch.from_numpy(array).float().to(self.device)

    def _save_checkpoint(self):
        """Write the model weights to ``self.save_model_path``."""
        # Passing the path lets torch open the file in binary mode; a text-mode
        # handle cannot take the bytes torch.save writes.
        torch.save(self.model.state_dict(), self.save_model_path)

    @staticmethod
    def _mean_loss(losses):
        """Average a list of Python floats; NaN (with a warning) if the list is empty."""
        if not losses:
            logging.warning('no batch was evaluated, reporting NaN')
            return float('nan')
        return float(np.mean(losses))

    def train(self):
        """Run ``train_iter`` Adam steps on the L1 loss, with periodic test/save/logging."""
        self.model.train()
        torch.set_grad_enabled(True)
        criterion = nn.L1Loss()
        optimizer = optim.Adam(self.model.parameters(), lr=self.learning_rate)
        train_loss_all = []  # last (up to) 100 losses, as plain floats
        writer = SummaryWriter(self.tensorboard_path)
        try:
            for it in range(self.train_iter):
                x, y, _ = self.data.get_next_batch('train')
                x = self._to_tensor(x)
                y = self._to_tensor(y)
                optimizer.zero_grad()
                pred = self.model(x)
                loss = criterion(pred, y)
                loss.backward()
                optimizer.step()
                # Store a float, not the tensor, so the autograd graph can be freed.
                loss_value = loss.item()
                writer.add_scalar('train_loss', loss_value, it)
                train_loss_all.append(loss_value)
                if len(train_loss_all) > 100:
                    train_loss_all.pop(0)
                ave_train_loss = sum(train_loss_all) / float(len(train_loss_all))
                if (it + 1) % 10 == 0:
                    logging.info('iteration %d, train loss: %.3f, average train loss: %.3f',
                                 it, loss_value, ave_train_loss)
                if (it + 1) % self.save_interval == 0:
                    logging.info('iteration %d, saving model', it)
                    self._save_checkpoint()
                if (it + 1) % self.test_interval == 0:
                    logging.info('iteration %d, testing', it)
                    test_loss = self.test()
                    writer.add_scalar('test_loss', test_loss, it)
                    # test() switched the model to eval mode; switch back.
                    self.model.train()
        finally:
            # Flush and close the event file even if a step raised.
            writer.close()

    def test(self):
        """Average the L1 loss over ``test_iter`` test batches.

        Returns:
            The mean loss as a Python float.
        """
        self.model.eval()
        criterion = nn.L1Loss()
        test_loss_all = []
        # no_grad() restores the previous autograd mode on exit instead of
        # leaving gradients globally disabled.
        with torch.no_grad():
            for it in range(self.test_iter):
                x, y, _ = self.data.get_next_batch('test')
                x = self._to_tensor(x)
                y = self._to_tensor(y)
                pred = self.model(x)
                test_loss_all.append(criterion(pred, y).item())
                if len(test_loss_all) > 100:
                    test_loss_all.pop(0)
        test_loss = self._mean_loss(test_loss_all)
        logging.info('average test loss: %.3f', test_loss)
        return test_loss

    def test_all(self):
        """Log the mean L1 loss over one pass through the test split.

        The batch during which the data object reports ``restart`` is not scored.
        """
        self.model.eval()
        criterion = nn.L1Loss()
        test_loss_all = []
        self.data.test_cnt = 0 # Restart from the first testing batch, note that this is important if you test() before.
        with torch.no_grad():
            while True:
                x, y, restart = self.data.get_next_batch('test')
                if restart:
                    break
                x = self._to_tensor(x)
                y = self._to_tensor(y)
                pred = self.model(x)
                test_loss_all.append(criterion(pred, y).item())
        test_loss = self._mean_loss(test_loss_all)
        logging.info('overall average test loss: %.3f', test_loss)

    def baseline(self):
        """Log the mean L1 loss of an all-zero prediction over one pass of the test split."""
        criterion = nn.L1Loss()
        test_loss_all = []
        self.data.test_cnt = 0 # Restart from the first testing batch, note that this is important if you test() before.
        with torch.no_grad():
            while True:
                _, y, restart = self.data.get_next_batch('test')
                if restart:
                    break
                y = self._to_tensor(y)
                # Predicting zero everywhere is the reference to beat.
                pred = torch.zeros_like(y)
                test_loss_all.append(criterion(pred, y).item())
        test_loss = self._mean_loss(test_loss_all)
        logging.info('baseline average test loss: %.3f', test_loss)

    def predict(self, stock_name='', date=''):
        """Predict one sample and log its loss next to the all-zero baseline loss.

        Args:
            stock_name: forwarded to ``data.get_one_sample``.
            date: forwarded to ``data.get_one_sample``.

        Returns:
            ``(x, pred, y)`` tensors on ``self.device``.
        """
        self.model.eval()
        criterion = nn.L1Loss()
        x, y = self.data.get_one_sample(stock_name, date)
        x = self._to_tensor(x)
        y = self._to_tensor(y)
        with torch.no_grad():
            loss = criterion(torch.zeros_like(y), y)
            logging.info('baseline loss: %.3f', loss.item())
            pred = self.model(x)
            loss = criterion(pred, y)
            logging.info('prediction loss: %.3f', loss.item())
        return x, pred, y

In [70]:
class BaseNet(nn.Module):
    """Fully connected regressor: input batch-norm, four 256-unit blocks, linear head.

    Each hidden block is ``Linear -> BatchNorm1d -> ReLU``. Sub-modules are
    registered as ``bn0``, ``fc1``/``bn1`` ... ``fc4``/``bn4`` and ``fc``.

    Args:
        input_size: number of features per sample.
        output_size: number of predicted values per sample.
    """

    def __init__(self, input_size, output_size):
        super(BaseNet, self).__init__()
        num_hidden = 256
        self.bn0 = nn.BatchNorm1d(input_size)
        fan_in = input_size
        for depth in range(1, 5):
            # Same registration order as writing fc1, bn1, fc2, bn2, ... by hand.
            setattr(self, 'fc%d' % depth, nn.Linear(fan_in, num_hidden))
            setattr(self, 'bn%d' % depth, nn.BatchNorm1d(num_hidden))
            fan_in = num_hidden
        self.fc = nn.Linear(num_hidden, output_size)
        self.input_size = input_size
        self.output_size = output_size

    def forward(self, x):
        """Map a ``(batch, input_size)`` tensor to ``(batch, output_size)`` predictions."""
        hidden = self.bn0(x)
        for depth in range(1, 5):
            linear = getattr(self, 'fc%d' % depth)
            norm = getattr(self, 'bn%d' % depth)
            hidden = F.relu(norm(linear(hidden)))
        return self.fc(hidden)

In [71]:
# Settings for the training run: optimisation schedule first, then file locations.
batch_size = 256
learning_rate = 1e-3
train_iter = 5000
test_iter = 10
test_interval = 100
save_interval = 1001
init_model_path = ''
save_model_path = 'base_net.pth'
tensorboard_path = 'tensorboard/base_net2'

data = StockData(batch_size=batch_size)
model = BaseNet(input_size=44, output_size=1)
interface = Interface(
    data=data,
    model=model,
    learning_rate=learning_rate,
    train_iter=train_iter,
    test_iter=test_iter,
    test_interval=test_interval,
    save_interval=save_interval,
    init_model_path=init_model_path,
    save_model_path=save_model_path,
    tensorboard_path=tensorboard_path,
)
# Log the all-zero baseline, then train and score the whole test split.
interface.baseline()
interface.train()
interface.test_all()

10:38:16 INFO:baseline average test loss: 1.250


number of training stocks: 3461, number of testing stocks: 10382
('use below dates:', ['2018-01-16', '2018-01-17', '2018-01-18', '2018-01-19'])


10:38:16 INFO:iteration 9, train loss: 1.593, average train loss: 1.583
10:38:17 INFO:iteration 19, train loss: 1.361, average train loss: 1.526
10:38:17 INFO:iteration 29, train loss: 1.607, average train loss: 1.485
10:38:17 INFO:iteration 39, train loss: 1.230, average train loss: 1.444
10:38:17 INFO:iteration 49, train loss: 1.260, average train loss: 1.421
10:38:17 INFO:iteration 59, train loss: 1.191, average train loss: 1.392
10:38:17 INFO:iteration 69, train loss: 1.324, average train loss: 1.368
10:38:17 INFO:iteration 79, train loss: 1.263, average train loss: 1.348
10:38:17 INFO:iteration 89, train loss: 1.200, average train loss: 1.321
10:38:18 INFO:iteration 99, train loss: 1.057, average train loss: 1.304
10:38:18 INFO:iteration 99, testing
10:38:18 INFO:average test loss: 1.519
10:38:18 INFO:iteration 109, train loss: 1.137, average train loss: 1.258
10:38:18 INFO:iteration 119, train loss: 1.033, average train loss: 1.221
10:38:18 INFO:iteration 129, train loss: 1.103, 

In [15]:
# Reload the weights saved as 'base_net.pth' and predict the default sample.
learning_rate = 1e-3
train_iter = 5000
test_iter = 10
test_interval = 100
save_interval = 1001
init_model_path = 'base_net.pth'
save_model_path = ''
tensorboard_path = ''

data = StockData()
model = BaseNet(input_size=44, output_size=1)
interface = Interface(
    data=data,
    model=model,
    learning_rate=learning_rate,
    train_iter=train_iter,
    test_iter=test_iter,
    test_interval=test_interval,
    save_interval=save_interval,
    init_model_path=init_model_path,
    save_model_path=save_model_path,
    tensorboard_path=tensorboard_path,
)
x, pred, y = interface.predict()
print(pred, y)

06:25:28 INFO:baseline loss: 2.963
06:25:28 INFO:prediction loss: 2.986


number of training stocks: 34617, number of testing stocks: 13843
(tensor(1.00000e-02 *
       [[-3.2372, -1.3788]], device='cuda:0'), tensor([[ 2.6590,  3.2660]], device='cuda:0'))
