In [15]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import pytz
import torch
from torch import nn
from torch.autograd import Variable

In [202]:
# Load the price table; only the first five CSV columns are read.
dt = pd.read_csv(
    './ts_10k.csv',
    usecols=[0, 1, 2, 3, 4],
)

In [203]:
def str2ts(x):
    """Convert a 'YYYY-MM-DD HH:MM:SS' string to integer epoch seconds.

    The input carries no timezone, so a fixed UTC-05:00 offset is appended
    before parsing (no daylight-saving adjustment is applied).

    Args:
        x: timestamp string formatted as '%Y-%m-%d %H:%M:%S'.

    Returns:
        int: seconds since the Unix epoch for that instant.
    """
    stamped = ' '.join((x, 'GMT-0500'))
    parsed = datetime.datetime.strptime(stamped, '%Y-%m-%d %H:%M:%S GMT%z')
    return int(parsed.timestamp())

In [204]:
# Convert every 'date' string to integer epoch seconds with str2ts (one call per row).
ts = dt['date'].apply(str2ts)

In [205]:
# Add the epoch timestamps as the second column. DataFrame.insert raises
# ValueError when the column already exists, so the guard keeps this cell
# safe to re-run without restarting the kernel.
if 'ts' not in dt.columns:
    dt.insert(1, 'ts', ts)

In [206]:
# Preview the first 10 rows to check the 'ts' column against the 'date' strings.
dt.head(10)

Unnamed: 0,date,ts,open,high,low,close
0,2022-02-25 15:59:00,1645822740,809.61,809.99,809.2,809.87
1,2022-02-25 15:58:00,1645822680,808.52,809.94,808.29,809.7
2,2022-02-25 15:57:00,1645822620,809.3,809.5,807.24,808.49
3,2022-02-25 15:56:00,1645822560,810.27,810.51,809.11,809.22
4,2022-02-25 15:55:00,1645822500,809.41,810.64,809.0,810.42
5,2022-02-25 15:54:00,1645822440,809.66,810.04,808.81,809.38
6,2022-02-25 15:53:00,1645822380,807.96,809.68,807.91,809.5
7,2022-02-25 15:52:00,1645822320,807.41,808.07,806.71,808.0
8,2022-02-25 15:51:00,1645822260,806.21,807.67,806.11,807.42
9,2022-02-25 15:50:00,1645822200,805.87,806.74,804.51,806.44


In [221]:
def create_dataset(dt, step):
    """Build sliding-window samples of row-over-row ``close`` ratios.

    With ``ratio[j] = close[j] / close[j + 1]``, the window starting at row
    ``i`` uses ``ratio[i + step - 1], ..., ratio[i]`` in that order. The first
    ``step - 1`` of those ratios, prefixed with a constant ``1.0``, form the
    input sequence; the last one (``ratio[i]``) is the target. A window is
    discarded when any adjacent pair of rows it touches is not exactly 60
    apart in ``ts`` (i.e. ``ts[j] - ts[j + 1] != 60``).

    Rows are addressed by position, which matches label lookup for a frame
    with the default 0..n-1 index.

    Args:
        dt: DataFrame with numeric ``ts`` and ``close`` columns.
        step: window length (>= 1); also the length of each input sequence.

    Returns:
        tuple: ``(data_X, data_Y)`` as float32 arrays shaped
        ``(n_samples, step, 1)`` and ``(n_samples,)``. When no window
        qualifies, ``n_samples`` is 0 and the shapes are kept.

    Raises:
        ValueError: if ``step`` is smaller than 1.
    """
    if step < 1:
        raise ValueError("step must be >= 1, got {}".format(step))

    # Pull the two columns out once: per-row dt.loc[j][...] lookups build a
    # temporary Series on every access and dominate the run time.
    ts = dt['ts'].to_numpy()
    close = dt['close'].to_numpy()
    ratios = close[:-1] / close[1:]
    contiguous = (ts[:-1] - ts[1:]) == 60

    data_X = []
    data_Y = []
    for i in range(len(dt) - step):
        if not contiguous[i:i + step].all():
            continue
        # Reverse so the window runs from row i+step-1 down to row i.
        window = ratios[i:i + step][::-1]
        data_X.append([[1.0]] + [[value] for value in window[:-1]])
        data_Y.append(window[-1])
        if i % 1000 == 0:
            # Progress sample: the constant 1 followed by every ratio in the
            # window (target included), tab-separated.
            print("1" + "".join("\t{}".format(value) for value in window))
            print("==========={}".format(i))

    # reshape keeps the (n, step, 1) layout even when no window survived.
    features = np.array(data_X, dtype=np.float32).reshape(-1, step, 1)
    targets = np.array(data_Y, dtype=np.float32)
    return features, targets
# Build the windowed samples from the full table with a window length of 5.
data_X, data_Y = create_dataset(dt=dt, step=5)
        

1	1.0012849341471248	0.9985192862959947	0.9990978967400707	1.0014966171504904	1.0002099543040632
1	1.000503531011216	1.0014972508461355	1.0018342169401242	0.9986331260032102	1.0003139323161927
1	1.0002051328805215	1.000957090445047	0.9987137165623221	1.0007864412961465	0.9990433455572513
1	1.0001744382543092	0.9994222740601053	0.9999890932094322	1.000065441457163	0.999727345104754
1	0.9982594798143445	0.99893632186681	1.001086765609906	0.9980920006579308	1.0000769053295393
1	0.998945264794049	1.0019560989163656	1.0014974875486684	1.0007974658308043	1.0011952455786979
1	0.9991035018093958	1.0010942714887563	0.9996174236213586	0.9998469108802625	1.0003280983420098
1	1.0000328223980044	0.9972977112599012	1.0031813245280121	1.0023292180169934	1.0033711542657648
1	0.9993025931909848	1.00062810417224	0.999535029699978	0.9981392536081035	1.002178801542638
1	1.0018543135130304	1.005251597221448	0.9945983717473769	0.9997658105742101	1.0023870607919687


In [222]:
# Positional 70/30 split: the leading 70% of the samples are used for
# training, the remainder for testing (no shuffling).
train_size = int(len(data_X) * 0.7)
train_X, test_X = data_X[:train_size], data_X[train_size:]
train_Y, test_Y = data_Y[:train_size], data_Y[train_size:]

In [223]:
# create_dataset returns windows batch-first, shaped (n_samples, step, 1),
# while nn.LSTM with its default batch_first=False expects
# (seq_len, batch, input_size). The axes therefore have to be swapped with
# transpose. reshape(5, -1, 1) keeps the flat element order and merely
# re-chunks it, which would mix values from different windows into one
# sequence and misalign every sequence with its target.
# The NumPy arrays are left untouched so this cell can be re-run safely.
train_x = torch.from_numpy(np.ascontiguousarray(train_X.transpose(1, 0, 2)))
test_x = torch.from_numpy(np.ascontiguousarray(test_X.transpose(1, 0, 2)))

# Targets become column vectors (n_samples, 1) to match the model output.
train_y = torch.from_numpy(train_Y.reshape(-1, 1))
test_y = torch.from_numpy(test_Y.reshape(-1, 1))

In [224]:
# Sanity check: show the training tensor's dimensions.
train_x.size()

torch.Size([5, 6908, 1])

In [233]:
class lstm(nn.Module):
    """LSTM followed by a linear head applied to the last time step.

    Args:
        input_size: number of features per time step (the embedding width,
            e.g. 128 if each token is described by a 128-dim vector).
        hidden_size: width of the LSTM hidden state.
        output_size: number of values produced by the linear head.
        num_layer: number of stacked LSTM layers.
    """

    def __init__(self, input_size=2, hidden_size=4, output_size=1, num_layer=2):
        super().__init__()
        # The sequence length is not a constructor argument: the LSTM simply
        # unrolls over however many time steps the input carries.
        self.layer1 = nn.LSTM(input_size, hidden_size, num_layer)
        self.layer2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Predict from the hidden state of the final time step.

        Args:
            x: tensor laid out as (seq_length, batch, input_size), e.g.
                seq_length=5 when five past values predict the sixth.

        Returns:
            Tensor of shape (batch, output_size).
        """
        # Per-step outputs of the top LSTM layer: (seq_length, batch,
        # hidden_size). num_layer does not widen this dimension.
        seq_out, _ = self.layer1(x)
        # Keep only the last time step, then project it to the output size.
        last_step = seq_out[-1]
        return self.layer2(last_step)
    

# Single-layer LSTM with one input feature per step, 8 hidden units and a
# scalar output, trained with mean-squared error and Adam.
model = lstm(input_size=1, hidden_size=8, output_size=1, num_layer=1)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

In [234]:
def eval(model, data_x, data_y):
    """Print directional accuracy and mean absolute error of ``model``.

    A prediction counts as "right" when it lies on the same side of 1 as the
    target (both > 1 or both < 1); a target or prediction of exactly 1 counts
    as "wrong". Two mean absolute errors are printed: ``diff_random`` is the
    error of always predicting 1, ``diff_pred`` is the model's error.

    Note: the name shadows the built-in ``eval``; it is kept because other
    cells call it by this name.

    Args:
        model: an ``nn.Module`` mapping ``data_x`` to one prediction per sample.
        data_x: input tensor passed to the model unchanged.
        data_y: target tensor with one value per sample (any shape).

    Returns:
        None. The metrics are printed.

    Raises:
        ValueError: if ``data_y`` is empty or the number of predictions does
            not match the number of targets.
    """
    # Remember the current mode so evaluation does not silently leave the
    # model in eval mode for the training steps that follow.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():  # no autograd graph needed for evaluation
            pred = model(data_x)
    finally:
        model.train(was_training)

    pred = pred.reshape(-1).detach().cpu().numpy().astype(np.float64)
    actual = data_y.reshape(-1).detach().cpu().numpy().astype(np.float64)

    # Size everything from the data passed in, not from a global tensor.
    total = len(actual)
    if total == 0:
        raise ValueError("data_y is empty; nothing to evaluate")
    if len(pred) != total:
        raise ValueError(
            "model produced {} predictions for {} targets".format(len(pred), total)
        )

    same_side = ((actual > 1) & (pred > 1)) | ((actual < 1) & (pred < 1))
    right = int(same_side.sum())
    wrong = total - right
    diff_random = float(np.abs(actual - 1).sum())
    diff_pred = float(np.abs(actual - pred).sum())

    print("all:{}  right:{}   wrong:{}   ratio:{}".format(total, right, wrong, right / total))
    print("diff_random:{}  diff_pred:{}".format(diff_random / total, diff_pred / total))

In [235]:
def train(epochs=5000, log_every=20):
    """Full-batch training loop over the notebook's global tensors.

    Uses the module-level ``model``, ``criterion``, ``optimizer``,
    ``train_x``/``train_y`` and reports on ``test_x``/``test_y`` via ``eval``.

    Args:
        epochs: number of full-batch optimisation steps.
        log_every: print the loss and test metrics every this many epochs;
            0 or None disables the periodic report.

    Returns:
        None.
    """
    for e in range(epochs):
        # The periodic evaluation may have switched the model to eval mode.
        model.train()

        # Forward pass on the whole training set.
        out = model(train_x)
        loss = criterion(out, train_y)

        # Backward pass and parameter update.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if log_every and (e + 1) % log_every == 0:
            print('Epoch: {}, Loss: {:.5f}'.format(e + 1, loss.item()))
            eval(model, test_x, test_y)

    # Final report once training has finished.
    eval(model, test_x, test_y)

In [236]:
# Run the training loop (prints loss and test metrics as it goes).
# NOTE(review): train() has no return statement, so `a` is always None.
a=train()

Epoch: 20, Loss: 0.03718
all:2961  right:1443   wrong:1518   ratio:0.48733535967578523
diff_random:0.001814964110687833  diff_pred:0.15252212383424218
Epoch: 40, Loss: 0.00036
all:2961  right:1504   wrong:1457   ratio:0.5079365079365079
diff_random:0.001814964110687833  diff_pred:0.003052666093883624
Epoch: 60, Loss: 0.00054
all:2961  right:1504   wrong:1457   ratio:0.5079365079365079
diff_random:0.001814964110687833  diff_pred:0.025091235156962696
Epoch: 80, Loss: 0.00005
all:2961  right:1443   wrong:1518   ratio:0.48733535967578523
diff_random:0.001814964110687833  diff_pred:0.005045981372542093
Epoch: 100, Loss: 0.00001
all:2961  right:1443   wrong:1518   ratio:0.48733535967578523
diff_random:0.001814964110687833  diff_pred:0.0036912746060663525
Epoch: 120, Loss: 0.00000
all:2961  right:1441   wrong:1520   ratio:0.4866599121918271
diff_random:0.001814964110687833  diff_pred:0.0018342911734125253
Epoch: 140, Loss: 0.00000
all:2961  right:1504   wrong:1457   ratio:0.5079365079365079
d

KeyboardInterrupt: 

In [None]:
# Report the metrics of the model in its current state on the test tensors.
eval(model, test_x, test_y)