### GAN 和 Recurrent GAN 的结构

#### GAN:

![title](./GAN.PNG)

#### 利用GAN生成历史数据

![title](./stock_data_gen.PNG)

### 利用MLP产生一组随机的时间序列（Generator）

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import time

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.distributions.normal import Normal

In [2]:
# 产生用于输入Generator的噪声, 为正态分布
# 其大小为 m x n

def get_noise_data(m, n):
    dist = Normal(0, 1)
    return dist.sample((m, n)).requires_grad_()

In [3]:
class Generator(nn.Module):
    def __init__(self, input_size=20, num_features=4, batch_size=10, seq_len=26):
        super().__init__()
        self.input_size = input_size   # nums of input rand numbers
        self.num_features = num_features
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.output_size = self.num_features * self.seq_len
        
        # 使用MLP
        self.fc1 = nn.Linear(self.input_size, self.output_size*10)
        self.fc2 = nn.Linear(self.output_size*10, self.output_size*5)
        self.fc3 = nn.Linear(self.output_size*5, self.output_size)
        
    def forward(self, input_data):
        output = torch.sigmoid(self.fc1(input_data))
        output = torch.sigmoid(self.fc2(output))
        output = torch.sigmoid(self.fc3(output))
        
        # output size: [batch_size, channels=num_features, width=seq_len]
        output = output.view(output.size(0), self.num_features, self.seq_len)
        
        return output

In [4]:
input_size = 20
batch_size = 10
seq_len = 26

g = Generator(input_size=input_size, batch_size=batch_size, seq_len=seq_len)

In [5]:
input_data = get_noise_data(batch_size, input_size)

time_series = g.forward(input_data)

time_series.shape

torch.Size([10, 4, 26])

### 下面是利用CNN构造的Discriminator

In [6]:
class Discriminator(nn.Module):
    def __init__(self, batch_size=10, seq_len=26, num_features=4):
        super().__init__()
        self.input_size = seq_len * num_features
        self.batch_size = batch_size
        self.seq_len = seq_len
        
        # 使用 CNN
        self.conv1 = nn.Sequential(
            nn.Conv1d(in_channels=4, out_channels=8, kernel_size=3),
            nn.ReLU(),
        )
        self.conv2 = nn.Sequential(
            nn.Conv1d(in_channels=8, out_channels=16, kernel_size=3),
            nn.ReLU(),
        )
        self.conv3 = nn.Sequential(
            nn.Conv1d(in_channels=16, out_channels=32, kernel_size=3),
            nn.ReLU(),
        )
        self.fc1 = nn.Sequential(
            nn.Linear(640, 256),
            nn.ReLU(),
        )
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, 1)
        
    def forward(self, input_data):
        # input_data的大小：[batch_size, channels=4, width=26]
        output = self.conv1(input_data)
        output = self.conv2(output)
        output = self.conv3(output)
        output = output.view(output.size(0), -1) # [batch_size, 640]
        output = self.fc1(output)
        output = self.fc2(output)
        output = self.fc3(output)
        output = torch.sigmoid(output)
        
        return output

In [7]:
# 一起测试Generator和Discriminator
input_size = 20
batch_size = 10
seq_len = 26

g = Generator(input_size=input_size, batch_size=batch_size, seq_len=seq_len)
input_data = get_noise_data(batch_size, input_size)
time_series = g.forward(input_data)

print(time_series.shape)

d = Discriminator(batch_size=batch_size, seq_len=seq_len)
outputs = d.forward(time_series)

print(outputs)

torch.Size([10, 4, 26])
tensor([[0.4926],
        [0.4927],
        [0.4926],
        [0.4926],
        [0.4926],
        [0.4926],
        [0.4926],
        [0.4926],
        [0.4927],
        [0.4926]], grad_fn=<SigmoidBackward>)


### 获取真实样本

In [8]:
# 从csv文件中读取数据，将数据划分为训练集和测试集
# 再从训练集中随机抽取m个长度为n的串返回

data = pd.read_csv('stock_data_730.csv')

data.set_index(["date"], inplace=True)
data_sorted = data.sort_index()

In [9]:
# 将所有数据分割为训练集和测试集
# 与 LSTM -- predict stock price中的函数相同

def train_test_split(data, SEQ_LENGTH = 26, test_prop=0.137):  # 0.11 for 1095, 0.137 for 730, 0.3 for 365
    
    ntrain = int(len(data) *(1-test_prop))  # len(data) = 197
    predictors = data.columns[:4]  # open, high, close, low
    data_pred = data[predictors]
    num_attr = data_pred.shape[1]  # 4
    
    result = np.empty((len(data) - SEQ_LENGTH, SEQ_LENGTH, num_attr))
    y = np.empty((len(data) - SEQ_LENGTH, SEQ_LENGTH))
    yopen = np.empty((len(data) - SEQ_LENGTH, SEQ_LENGTH))

    for index in range(len(data) - SEQ_LENGTH):
        result[index, :, :] = data_pred[index: index + SEQ_LENGTH]
        y[index, :] = data_pred[index+1: index + SEQ_LENGTH + 1].close
        yopen[index, :] = data_pred[index+1: index + SEQ_LENGTH + 1].open

    """
        xtrain的大小：ntrain x SEQ_LENGTH x 4
        ytrain的大小：ntrain x SEQ_LENGTH
        
        * xtrain的每个batch为长为SEQ_LENGTH的连续序列，一共有ntrain个batch，
          序列中每个单元都是一个四元组（open，high，close，low）
        * ytrain的每个batch为长为SEQ_LENGTH的连续序列，一共有ntrain个batch，
          序列中每个单元是xtrain中对应四元组所在日期的下一天的close price
        
        xtest 的大小：    ntest x SEQ_LENGTH x 4                
        ytest的大小：     ntest x SEQ_LENGTH      (close price)
        ytest_open的大小：ntest x SEQ_LENGTH      (open price)  
        
        * xtest的每个batch为长为SEQ_LENGTH的连续序列，一共有ntest个batch，
          序列中每个单元都是一个四元组（open，high，close，low）
          每一个序列仅包含一个新四元组，且在最后一个
        * ytest的每个batch为长为SEQ_LENGTH的连续序列，一共有ntest个batch，
          序列中每个单元是xtest中对应四元组所在日期的下一天的close price
        
        类型：numpy.ndarray
    """
    xtrain = result[:ntrain, :, :]
    ytrain = y[:ntrain]
    
    xtest = result[ntrain:, :, :]
    ytest = y[ntrain:]
    ytest_open = yopen[ntrain:]
    
    return xtrain, xtest, ytrain, ytest, ytest_open

In [10]:
xtrain, xtest, ytrain, ytest, ytest_open = train_test_split(data_sorted)  # 只需要xtrain

In [11]:
xtrain.shape  # open, high, close, low

xtrain_mean = np.mean(xtrain)
xtrain_max = np.max(xtrain)
xtrain_min = np.min(xtrain)

print(xtrain_mean, xtrain_max, xtrain_min)

15.418522797766748 21.26 9.86


In [12]:
# 获取下标从start_idx开始的连续batch_size个序列

def get_real_samples(idx, batch_size, data=xtrain):
    data = data[idx:idx+batch_size, :]
    min_val = np.min(data)
    max_val = np.max(data)
    mean_val = np.mean(data)
    data = (data-mean_val)/(max_val-min_val)
    
    data = torch.from_numpy(data).float()
    data = data.view(batch_size, 4, -1)
    
    return data.requires_grad_()

In [13]:
get_real_samples(0, 1).shape

torch.Size([1, 4, 26])

In [14]:
# 将real samples带入Discriminator中进行测试

batch_size = 10
seq_len = 26

real_samples = get_real_samples(0, batch_size)

print(real_samples.shape)

d = Discriminator(batch_size=batch_size, seq_len=seq_len)
outputs = d.forward(real_samples)

print(outputs)

torch.Size([10, 4, 26])
tensor([[0.4988],
        [0.4987],
        [0.4987],
        [0.4986],
        [0.4987],
        [0.4988],
        [0.4990],
        [0.4990],
        [0.4990],
        [0.4989]], grad_fn=<SigmoidBackward>)


### 构造GAN

In [15]:
d_learning_rate = 0.01
g_learning_rate = 0.01

input_size = 20
batch_size = 5
seq_len = 26

G = Generator(input_size=input_size, batch_size=batch_size, seq_len=seq_len)
D = Discriminator(batch_size=batch_size, seq_len=seq_len)

d_optimizer = optim.SGD(D.parameters(), lr=d_learning_rate) 
g_optimizer = optim.SGD(G.parameters(), lr=g_learning_rate)

In [16]:
num_epochs = 100

for epoch in range(num_epochs):
    
    G.train()
    D.train()
    
    sum_real_error = 0
    sum_gen_loss = 0
    
    start_time = time.time()
    
    for idx in range(0, len(xtrain)-batch_size+1, batch_size):
        
        """
            训练Discriminator
        """
        D.zero_grad()
        # 用真实样本训练D 
        real_data = get_real_samples(idx, batch_size)
        real_decision = D(real_data)
        real_error = -torch.sum(torch.log(real_decision))/batch_size
        sum_real_error += real_error
        real_error.backward()
    
        # 用生成样本训练D
        input_data = get_noise_data(batch_size, input_size)
        fake_data = G.forward(input_data) 
        fake_decision = D(fake_data)
        fake_error = 1 - torch.sum(torch.log(fake_decision))/batch_size
        fake_error.backward()
    
        d_optimizer.step()
    
    
        """
            训练Generator
        """
        G.zero_grad()
        # 训练G
        input_data = get_noise_data(batch_size, input_size)
        fake_data = G.forward(input_data)
        fake_decision = D(fake_data)
        gen_loss = -torch.sum(torch.log(fake_decision))/batch_size
        sum_gen_loss += gen_loss
        gen_loss.backward()

        g_optimizer.step()
    
    print("Epoch %5d:  D Loss  %5.3f  " % (epoch, sum_real_error), end="")
    print("G Loss  %5.3f  " % (sum_gen_loss), end="")
    print("duration: %5f" %(time.time()-start_time))

Epoch     0:  D Loss  31.349  G Loss  30.800  duration: 1.187165
Epoch     1:  D Loss  8.023  G Loss  7.283  duration: 0.553260
Epoch     2:  D Loss  1.480  G Loss  0.972  duration: 0.531216
Epoch     3:  D Loss  0.567  G Loss  0.287  duration: 0.561081
Epoch     4:  D Loss  0.318  G Loss  0.136  duration: 0.613552
Epoch     5:  D Loss  0.212  G Loss  0.081  duration: 0.584287
Epoch     6:  D Loss  0.155  G Loss  0.054  duration: 0.578336
Epoch     7:  D Loss  0.121  G Loss  0.039  duration: 0.561472
Epoch     8:  D Loss  0.098  G Loss  0.030  duration: 0.561109
Epoch     9:  D Loss  0.082  G Loss  0.024  duration: 0.503439
Epoch    10:  D Loss  0.070  G Loss  0.019  duration: 0.506416
Epoch    11:  D Loss  0.060  G Loss  0.016  duration: 0.575361
Epoch    12:  D Loss  0.053  G Loss  0.014  duration: 0.553040
Epoch    13:  D Loss  0.047  G Loss  0.012  duration: 0.531215
Epoch    14:  D Loss  0.043  G Loss  0.010  duration: 0.526712
Epoch    15:  D Loss  0.039  G Loss  0.009  duration:

### 利用Generator生成时间序列

In [18]:
gen_batch_size = 100

input_data = get_noise_data(gen_batch_size, input_size)
time_series = G.forward(input_data)

time_series.shape

torch.Size([100, 4, 26])

In [19]:
time_series = time_series.permute(0,2,1).detach().numpy()
time_series.shape

(100, 26, 4)

In [20]:
time_series = time_series * (xtrain_max - xtrain_min) + xtrain_mean
time_series.shape

(100, 26, 4)

In [21]:
np.save('./time_series.npy', time_series)