### GAN 和 Recurrent GAN 的结构

#### GAN:

![title](./GAN.PNG)

#### 利用GAN生成历史数据

![title](./stock_data_gen.PNG)

### 利用MLP产生一组随机的时间序列（Generator）

In [1]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd

import torch
import torch.nn as nn
import torch.optim as optim
from torch.distributions.normal import Normal

In [2]:
# 产生用于输入Generator的噪声, 为正态分布
# 其大小为 m x n

def get_noise_data(m, n):
    dist = Normal(0, 1)
    return dist.sample((m, n)).requires_grad_()

In [3]:
class Generator(nn.Module):
    def __init__(self, input_size=20, num_features=4, batch_size=10, seq_len=30):
        super().__init__()
        self.input_size = input_size   # nums of input rand numbers
        self.num_features = num_features
        self.batch_size = batch_size
        self.seq_len = seq_len
        self.output_size = self.num_features * self.seq_len
        
        # 使用MLP
        self.fc1 = nn.Linear(self.input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, self.output_size)
        
    def forward(self, input_data):
        output = torch.sigmoid(self.fc1(input_data))
        output = torch.sigmoid(self.fc2(output))
        output = torch.sigmoid(self.fc3(output))
        
        # output size: [batch_size, output_size]
        return output

In [None]:
input_size = 20
batch_size = 10
seq_len = 20

g = Generator(input_size=input_size, batch_size=batch_size, seq_len=seq_len)

In [None]:
input_data = get_noise_data(batch_size, input_size)

time_series = g.forward(input_data)

time_series.shape

### 下面是利用MLP构造的Discriminator

In [4]:
class Discriminator(nn.Module):
    def __init__(self, batch_size=10, seq_len=30, num_features=4):
        super().__init__()
        self.input_size = seq_len * num_features
        self.batch_size = batch_size
        self.seq_len = seq_len
        
        # 使用 MLP
        self.fc1 = nn.Linear(self.input_size, 512)
        self.fc2 = nn.Linear(512, 256)
        self.fc3 = nn.Linear(256, 128)
        self.fc4 = nn.Linear(128, 1)
        
    def forward(self, input_data):
        output = torch.sigmoid(self.fc1(input_data))
        output = torch.sigmoid(self.fc2(output))
        output = torch.sigmoid(self.fc3(output))
        output = torch.sigmoid(self.fc4(output))
        
        return output

In [None]:
# 一起测试Generator和Discriminator
input_size = 20
batch_size = 1
seq_len = 20

g = Generator(input_size=input_size, batch_size=batch_size, seq_len=seq_len)
input_data = get_noise_data(batch_size, input_size)
time_series = g.forward(input_data)

print(time_series.shape)

d = Discriminator(batch_size=batch_size, seq_len=seq_len)
outputs = d.forward(time_series)

print(outputs)

### 获取真实样本

In [5]:
# 从csv文件中读取数据，将数据划分为训练集和测试集
# 再从训练集中随机抽取m个长度为n的串返回

data = pd.read_csv('stock_data_730.csv')

data.set_index(["date"], inplace=True)
data_sorted = data.sort_index()

In [6]:
# 将所有数据分割为训练集和测试集
# 与 LSTM -- predict stock price中的函数相同

def train_test_split(data, SEQ_LENGTH = 30, test_prop=0.137):  # 0.11 for 1095, 0.137 for 730, 0.3 for 365
    
    ntrain = int(len(data) *(1-test_prop))  # len(data) = 197
    predictors = data.columns[:4]  # open, high, close, low
    data_pred = data[predictors]
    num_attr = data_pred.shape[1]  # 4
    
    result = np.empty((len(data) - SEQ_LENGTH, SEQ_LENGTH, num_attr))
    y = np.empty((len(data) - SEQ_LENGTH, SEQ_LENGTH))
    yopen = np.empty((len(data) - SEQ_LENGTH, SEQ_LENGTH))

    for index in range(len(data) - SEQ_LENGTH):
        result[index, :, :] = data_pred[index: index + SEQ_LENGTH]
        y[index, :] = data_pred[index+1: index + SEQ_LENGTH + 1].close
        yopen[index, :] = data_pred[index+1: index + SEQ_LENGTH + 1].open

    """
        xtrain的大小：ntrain x SEQ_LENGTH x 4
        ytrain的大小：ntrain x SEQ_LENGTH
        
        * xtrain的每个batch为长为SEQ_LENGTH的连续序列，一共有ntrain个batch，
          序列中每个单元都是一个四元组（open，high，close，low）
        * ytrain的每个batch为长为SEQ_LENGTH的连续序列，一共有ntrain个batch，
          序列中每个单元是xtrain中对应四元组所在日期的下一天的close price
        
        xtest 的大小：    ntest x SEQ_LENGTH x 4                
        ytest的大小：     ntest x SEQ_LENGTH      (close price)
        ytest_open的大小：ntest x SEQ_LENGTH      (open price)  
        
        * xtest的每个batch为长为SEQ_LENGTH的连续序列，一共有ntest个batch，
          序列中每个单元都是一个四元组（open，high，close，low）
          每一个序列仅包含一个新四元组，且在最后一个
        * ytest的每个batch为长为SEQ_LENGTH的连续序列，一共有ntest个batch，
          序列中每个单元是xtest中对应四元组所在日期的下一天的close price
        
        类型：numpy.ndarray
    """
    xtrain = result[:ntrain, :, :]
    ytrain = y[:ntrain]
    
    xtest = result[ntrain:, :, :]
    ytest = y[ntrain:]
    ytest_open = yopen[ntrain:]
    
    return xtrain, xtest, ytrain, ytest, ytest_open

In [7]:
xtrain, xtest, ytrain, ytest, ytest_open = train_test_split(data_sorted)  # 只需要ytrain

In [8]:
# 获取下标从start_idx开始的连续batch_size个序列

def get_real_samples(idx, batch_size, data=xtrain):
    data = data[idx:idx+batch_size, :]
    min_val = np.min(data)
    max_val = np.max(data)
    mean_val = np.mean(data)
    data = (data-mean_val)/(max_val-min_val)
    
    data = torch.from_numpy(data).float()
    data = data.view(batch_size, -1)
    
    return data.requires_grad_()

In [None]:
get_real_samples(0, 1).shape

In [None]:
# 将real samples带入Discriminator中进行测试

batch_size = 10
seq_len = 30

real_samples = get_real_samples(0, batch_size)

print(real_samples.shape)

d = Discriminator(batch_size=batch_size, seq_len=seq_len)
outputs = d.forward(real_samples)

print(outputs)

### 构造GAN

In [13]:
# lr=0.1 works !

d_learning_rate = 0.001
g_learning_rate = 0.001

input_size = 20
batch_size = 1
seq_len = 30

G = Generator(input_size=input_size, batch_size=batch_size, seq_len=seq_len)
D = Discriminator(batch_size=batch_size, seq_len=seq_len)

d_optimizer = optim.SGD(D.parameters(), lr=d_learning_rate) 
g_optimizer = optim.SGD(G.parameters(), lr=g_learning_rate)

In [14]:
num_epochs = 100

for epoch in range(num_epochs):
    
    G.train()
    D.train()
    
    sum_real_error = 0
    sum_gen_loss = 0
    
    for idx in range(0, len(xtrain)-batch_size+1, batch_size):
        
        """
            训练Discriminator
        """
        D.zero_grad()
        # 用真实样本训练D 
        real_data = get_real_samples(idx, batch_size)
        real_decision = D(real_data)
        #print(real_decision)
        real_error = -torch.sum(torch.log(real_decision))/batch_size
        #print(real_error)
        sum_real_error += real_error
        real_error.backward()
    
        # 用生成样本训练D
        input_data = get_noise_data(batch_size, input_size)
        fake_data = G.forward(input_data) 
        fake_decision = D(fake_data)
        #print(fake_decision)
        fake_error = 1 - torch.sum(torch.log(fake_decision))/batch_size
        #print(fake_error)
        fake_error.backward()
    
        d_optimizer.step()
    
    
        """
            训练Generator
        """
        G.zero_grad()
        # 训练G
        input_data = get_noise_data(batch_size, input_size)
        fake_data = G.forward(input_data)
        fake_decision = D(fake_data)
        #print(fake_decision)
        gen_loss = -torch.sum(torch.log(fake_decision))/batch_size
        #print(gen_loss)
        sum_gen_loss += gen_loss
        gen_loss.backward()

        g_optimizer.step()
    
    print("Epoch %5d:  D Loss  %5.3f ; " % (epoch, sum_real_error), end="")
    print("G Loss  %5.3f" % (sum_gen_loss))

Epoch     0:  D Loss  55.311 ; G Loss  54.562
Epoch     1:  D Loss  10.136 ; G Loss  10.119
Epoch     2:  D Loss  5.699 ; G Loss  5.695
Epoch     3:  D Loss  3.948 ; G Loss  3.946
Epoch     4:  D Loss  3.010 ; G Loss  3.010
Epoch     5:  D Loss  2.427 ; G Loss  2.427
Epoch     6:  D Loss  2.030 ; G Loss  2.030
Epoch     7:  D Loss  1.743 ; G Loss  1.743
Epoch     8:  D Loss  1.525 ; G Loss  1.525
Epoch     9:  D Loss  1.355 ; G Loss  1.355
Epoch    10:  D Loss  1.218 ; G Loss  1.218
Epoch    11:  D Loss  1.106 ; G Loss  1.106
Epoch    12:  D Loss  1.012 ; G Loss  1.012
Epoch    13:  D Loss  0.933 ; G Loss  0.933
Epoch    14:  D Loss  0.865 ; G Loss  0.865
Epoch    15:  D Loss  0.806 ; G Loss  0.806
Epoch    16:  D Loss  0.754 ; G Loss  0.754
Epoch    17:  D Loss  0.708 ; G Loss  0.709
Epoch    18:  D Loss  0.668 ; G Loss  0.668
Epoch    19:  D Loss  0.632 ; G Loss  0.632
Epoch    20:  D Loss  0.599 ; G Loss  0.599
Epoch    21:  D Loss  0.570 ; G Loss  0.570
Epoch    22:  D Loss  0.543 