1. 求滑动窗口
2. 求训练数据集: train_x, train_y
3. 求测试数据集: test_x, test_y
4. 训练模型
5. 计算误差

In [13]:
import pandas as pd
import numpy as np
import datetime
import dateutil.relativedelta
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [8]:
# Path to the dataset: an Excel workbook, given relative to the working directory
stockDataFile = r"./StockData.xlsx"

In [9]:
# In-sample period (both bounds are naive datetimes at midnight)
sampleStartDate = datetime.datetime(2006, 1, 4)
sampleEndDate = datetime.datetime(2013, 12, 31)

# Out-of-sample (backtest) period
backtestStartDate = datetime.datetime(2014, 1, 2)
backtestEndDate = datetime.datetime(2021, 12, 31)

In [10]:
# Load the 'sz50' sheet; the 'Date' column is parsed as dates and used as the index
df = pd.read_excel(
    stockDataFile,
    sheet_name='sz50',
    parse_dates=['Date'],
    index_col='Date',
)
df.head()

Unnamed: 0_level_0,Open,High,Low,Close,Vol
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2004-01-02,997.0,1021.57,993.89,1011.35,8064652
2004-01-05,1008.28,1060.9,1008.28,1060.8,14468180
2004-01-06,1059.14,1086.69,1059.09,1075.66,16991334
2004-01-07,1075.56,1095.84,1070.98,1086.3,13729419
2004-01-08,1087.68,1108.29,1082.51,1102.66,10780427


In [17]:
forecastDays = 1  # which future day's price to predict
pastDays = 5  # number of past days grouped into one sample
maValue = 30  # window length used for the moving average (MA)
stepMonth = 6  # sliding-window step, in months
sampleDataSize = 9  # amount of in-sample data: 9 years

In [45]:
# Work on a copy so that columns added later do not end up on the loaded frame `df`
stockDataDf = df.copy()

In [46]:
# Moving average: rolling mean of the close price over maValue rows
# (the first maValue - 1 rows have no full window and stay NaN)
stockDataDf['Ma'] = df['Close'].rolling(window=maValue).mean()
stockDataDf

Unnamed: 0_level_0,Open,High,Low,Close,Vol,Ma
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2004-01-02,997.00,1021.57,993.89,1011.35,8064652,
2004-01-05,1008.28,1060.90,1008.28,1060.80,14468180,
2004-01-06,1059.14,1086.69,1059.09,1075.66,16991334,
2004-01-07,1075.56,1095.84,1070.98,1086.30,13729419,
2004-01-08,1087.68,1108.29,1082.51,1102.66,10780427,
...,...,...,...,...,...,...
2022-09-26,2612.67,2650.25,2611.53,2614.90,34384532,2721.044667
2022-09-27,2616.05,2645.74,2605.24,2642.92,26144970,2716.386333
2022-09-28,2635.26,2637.02,2608.64,2611.98,24732086,2711.143667
2022-09-29,2633.07,2644.07,2596.96,2608.98,25969016,2705.248000


### 1. 求滑动窗口

In [47]:
def getSlidingWindowDf(stockDataDf, sampleDataSize, stepMonth, backtestStart=None, backtestEnd=None):
    """
    Build the sliding-window table from the in-sample size and the window step.

    Starting at the month of ``backtestStart``, the out-of-sample period is cut
    into consecutive windows of ``stepMonth`` months. Each window gets:

    * out-of-sample start: first row of the window's first month
    * out-of-sample end:   last row of the window's last month, never later
      than ``backtestEnd`` (a final, shorter window ends at ``backtestEnd``)
    * in-sample start:     first row of the month ``sampleDataSize`` years
      before the out-of-sample start
    * in-sample end:       last row of the month preceding the out-of-sample start

    :param stockDataDf: stock data indexed by date (month strings such as
        '2014-01' are looked up with ``.loc``)
    :param sampleDataSize: amount of in-sample data, in years
    :param stepMonth: sliding-window step, in months (must be >= 1)
    :param backtestStart: start of the out-of-sample period; defaults to the
        module-level ``backtestStartDate``
    :param backtestEnd: end of the out-of-sample period; defaults to the
        module-level ``backtestEndDate``
    :return: DataFrame with one row per window and the columns
        '样本内数据年份数', '步长/月', '样本内开始', '样本内结束', '样本外开始', '样本外结束'
    :raises ValueError: if ``stepMonth`` is smaller than 1 (the loop would never advance)
    """
    if stepMonth < 1:
        raise ValueError("stepMonth must be >= 1")

    # Fall back to the notebook-level backtest range when no explicit range is given
    if backtestStart is None:
        backtestStart = backtestStartDate
    if backtestEnd is None:
        backtestEnd = backtestEndDate
    backtestStart = pd.Timestamp(backtestStart)
    backtestEnd = pd.Timestamp(backtestEnd)

    def _firstDayOfMonth(moment):
        # First row of the month that `moment` falls in
        return stockDataDf.loc[moment.strftime('%Y-%m')].index[0]

    def _lastDayOfMonth(moment):
        # Last row of the month that `moment` falls in
        return stockDataDf.loc[moment.strftime('%Y-%m')].index[-1]

    backtestStartList = []  # out-of-sample start date of every window
    backtestEndList = []  # out-of-sample end date of every window
    cursor = backtestStart  # loop control: walks the backtest period step by step
    while cursor < backtestEnd:
        backtestStartList.append(_firstDayOfMonth(cursor))

        try:
            # Last row of month (start + step - 1)
            endDate = _lastDayOfMonth(cursor + pd.DateOffset(months=stepMonth - 1))
        except (KeyError, IndexError):
            # The data has no rows for that month: the last window is shorter
            # than a full step and ends at the backtest end
            endDate = backtestEnd
        # The data may extend past the backtest period, so a lookup that
        # succeeds can still overshoot; clamp it to the backtest end
        backtestEndList.append(min(endDate, backtestEnd))

        cursor = cursor + pd.DateOffset(months=stepMonth)

    # In-sample start = first row of the month sampleDataSize years before the out-of-sample start
    sampleStartList = [
        _firstDayOfMonth(start - pd.DateOffset(years=sampleDataSize))
        for start in backtestStartList
    ]
    # In-sample end = last row of the month before the out-of-sample start
    sampleEndList = [
        _lastDayOfMonth(start - pd.DateOffset(months=1))
        for start in backtestStartList
    ]

    return pd.DataFrame(
        data={'样本内数据年份数': sampleDataSize, '步长/月': stepMonth,
              '样本内开始': sampleStartList, '样本内结束': sampleEndList,
              '样本外开始': backtestStartList, '样本外结束': backtestEndList},
        columns=['样本内数据年份数', '步长/月',
                 '样本内开始', '样本内结束',
                 '样本外开始', '样本外结束'])

In [49]:
# Build the sliding-window table from the notebook-level settings and display it
slidingWindowDf = getSlidingWindowDf(
    stockDataDf=stockDataDf,
    sampleDataSize=sampleDataSize,
    stepMonth=stepMonth,
)
slidingWindowDf

Unnamed: 0,样本内数据年份数,步长/月,样本内开始,样本内结束,样本外开始,样本外结束
0,9,6,2005-01-04,2013-12-31,2014-01-02,2014-06-30
1,9,6,2005-07-01,2014-06-30,2014-07-01,2014-12-31
2,9,6,2006-01-04,2014-12-31,2015-01-05,2015-06-30
3,9,6,2006-07-03,2015-06-30,2015-07-01,2015-12-31
4,9,6,2007-01-04,2015-12-31,2016-01-04,2016-06-30
5,9,6,2007-07-02,2016-06-30,2016-07-01,2016-12-30
6,9,6,2008-01-02,2016-12-30,2017-01-03,2017-06-30
7,9,6,2008-07-01,2017-06-30,2017-07-03,2017-12-29
8,9,6,2009-01-05,2017-12-29,2018-01-02,2018-06-29
9,9,6,2009-07-01,2018-06-29,2018-07-02,2018-12-28


### 2. 求训练数据集: train_x, train_y
### 3. 求测试数据集: test_x, test_y
### 4. 训练模型

In [57]:
# Pull the four window-boundary columns out as NumPy arrays
slidingWindowSampleStartDateList = slidingWindowDf['样本内开始'].values
slidingWindowSampleEndDateList = slidingWindowDf['样本内结束'].values
slidingWindowBacktestStartDateList = slidingWindowDf['样本外开始'].values
slidingWindowBacktestEndDateList = slidingWindowDf['样本外结束'].values

# One tuple per window: (in-sample start, in-sample end, out-of-sample start, out-of-sample end).
# Materialised as a list because a bare zip object is a one-shot iterator:
# looping over it a second time (e.g. re-running a later cell) would iterate nothing.
swZip = list(zip(slidingWindowSampleStartDateList, slidingWindowSampleEndDateList,
                 slidingWindowBacktestStartDateList, slidingWindowBacktestEndDateList))

In [None]:
def process_train_data(start_time, end_time, stock_data, past_days=5, ma_period=30):
    """
    Turn the rows of ``stock_data`` dated within [start_time, end_time] into training samples.

    A simple moving average of the 'close' column is added as column 'MA_1'
    (computed over the whole frame, before the date filter). Every run of
    ``past_days`` consecutive rows is flattened into one feature vector, and
    the value at column position 3 of the row right after the run is its target.

    The caller's DataFrame is left unmodified.

    :param start_time: inclusive lower bound compared against the 'date' column
    :param end_time: inclusive upper bound compared against the 'date' column
    :param stock_data: DataFrame with at least the columns 'date' and 'close'
    :param past_days: number of consecutive rows per feature vector
    :param ma_period: window length of the moving average
    :return: tuple ``(train_x, train_y)``: a list of 1-D arrays and a list of
        target values; both are empty when fewer than ``past_days + 1`` rows
        fall inside the date range
    """
    # Position of the target once 'date' is dropped.
    # NOTE(review): presumably this is 'close' for a date/open/high/low/close/...
    # column layout; confirm against the caller's column order.
    target_col = 3

    # assign() returns a new frame, so the MA column never leaks into the caller's data
    with_ma = stock_data.assign(MA_1=stock_data["close"].rolling(ma_period).mean())

    in_range = (with_ma['date'] >= start_time) & (with_ma['date'] <= end_time)
    rows = np.array(with_ma[in_range].drop(columns=['date']))

    # A sample needs past_days feature rows plus one following target row
    sample_count = max(len(rows) - past_days, 0)
    train_x = [rows[i:i + past_days].reshape(-1) for i in range(sample_count)]
    train_y = [rows[i + past_days][target_col] for i in range(sample_count)]
    return train_x, train_y

In [107]:
# Iterate over the sliding windows
for sw in swZip:
    # Build the training set for this window
    swStockDataDf = stockDataDf[sw[0]:sw[1]]  # rows from the in-sample start (sw[0]) through the in-sample end (sw[1])
    swStockData = np.array(swStockDataDf)  # training data of the current window as a 2-D array

    trainX = []
    trainY = []
    # Group every pastDays consecutive rows into one sample
    for i in range(len(swStockData)):
        if (i + pastDays >= len(swStockData)):  # no row i + pastDays left to serve as the target: stop
            break
        trainX.append(swStockData[i:i + pastDays].reshape(1, -1)[0])  # flatten the pastDays rows into one feature vector
        trainY.append(swStockData[i + pastDays][3])  # target: column position 3 of the row right after the group
    break  # leaves the outer loop after the first window, so only one window is processed

[]