# 择时买入--未来n天内指数是否会大幅上升

In [1]:
# 添加工程根目录到系统路径下
import os
import sys
project_root = os.getcwd()[:os.getcwd().find('/docs')]
sys.path.append(project_root)

## 1. 实验目的和流程

+ **实验目的**：在给定时间内提取数据特征，预测未来一段时间内沪深300指数是否会出现涨幅超过某一阈值
+ **实验流程**：

> 1. 以(m+n)滑窗的方式计算给定时间内所有样本的特征和标签，其中m是已知交易数据的交易日天数，n是要预测未来跌幅的交易日天数
> 2. 选取模型预测，计算正检率和误检率
> 3. 重复1，2

## 2. 导入数据

In [2]:
import pandas as pd
df = pd.read_csv('{}/data/000300-2014-2017.csv'.format(project_root), index_col='date')
df.head()

Unnamed: 0_level_0,open,high,close,low,volume,price_change,p_change,ma5,ma10,ma20,v_ma5,v_ma10,v_ma20
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
2014-11-24,2614.16,2667.671,2649.258,2602.944,279839712.0,65.803,2.55,2649.258,2649.258,2649.258,279839700.0,279839700.0,279839700.0
2014-11-25,2650.085,2686.063,2685.561,2643.843,222611696.0,36.303,1.37,2667.409,2667.409,2667.409,251225700.0,251225700.0,251225700.0
2014-11-26,2695.263,2723.36,2723.018,2690.31,243962480.0,37.457,1.4,2685.946,2685.946,2685.946,248804600.0,248804600.0,248804600.0
2014-11-27,2737.03,2754.49,2754.49,2718.704,265464736.0,31.472,1.16,2703.082,2703.082,2703.082,252969700.0,252969700.0,252969700.0
2014-11-28,2753.925,2809.543,2808.819,2740.374,375323648.0,54.329,1.97,2724.229,2724.229,2724.229,277440500.0,277440500.0,277440500.0


## 3. 实验

### 3.1 计算样本

方法：计算指定时间内收盘价上涨幅度（计算标签）

In [4]:
def get_max_rise(series):
    curr_min = series[0]
    max_rise = 0
    # 动态规划计算最大涨幅
    for i in range(1, len(series)):
        curr_min = min(curr_min, series[i])
        max_rise = max(max_rise, series[i] - curr_min)
        
    return abs(max_rise / curr_min)

方法：计算用于预测是否会较大幅度上涨的特征

In [82]:
def get_features(dataframe, upper_shadow_threshold=0.2):
    feature = list()
    # 是否出现收盘3(+)连阴
    close_diff = dataframe['close'].diff()
    close_diff[close_diff >= 0] = 0
    close_diff[close_diff < 0] = 1
    consecutive_decrease = close_diff * (close_diff.groupby((close_diff != close_diff.shift()).cumsum()).cumcount() 
                                         + 1)
#     if consecutive_decrease.max() >=3:
#         feature.append(True)
#     else:
#         feature.append(False)
    feature.append(consecutive_decrease.max())
        
    # 是否出现三均线2(+)连阴
    ma5_diff = dataframe['ma5'].diff()
    ma10_diff = dataframe['ma10'].diff()
    ma20_diff = dataframe['ma20'].diff()
    ma5_diff[ma5_diff >= 0] = 0
    ma10_diff[ma10_diff >= 0] = 0
    ma20_diff[ma20_diff >= 0] = 0
    ma5_diff[ma5_diff < 0] = 1
    ma10_diff[ma10_diff < 0] = 1
    ma20_diff[ma20_diff < 0] = 1
    
    ma5_decrease = ma5_diff * (ma5_diff.groupby((ma5_diff != ma5_diff.shift()).cumsum()).cumcount() + 1)
    ma10_decrease = ma10_diff * (ma10_diff.groupby((ma10_diff != ma10_diff.shift()).cumsum()).cumcount() + 1)
    ma20_decrease = ma20_diff * (ma20_diff.groupby((ma20_diff != ma20_diff.shift()).cumsum()).cumcount() + 1)
    
#     if (ma5_decrease.max() >= 2) and (ma10_decrease.max() >= 2) and (ma10_decrease.max() >= 2):
#         feature.append(True)
#     else:
#         feature.append(False)
    feature.append(ma5_decrease.max())
    feature.append(ma10_decrease.max())
    feature.append(ma20_decrease.max())
    # 上涨中是否出现上影
    ma10 = dataframe['ma10']
    ma20 = dataframe['ma20']
    condition_green = dataframe['close'] < dataframe['open']
    
    if (ma10[-1] > ma10[0]) and (ma20[-1] > ma20[0]):
        # 绿
        # 上影（绿）
        condition_upper_shadow = (dataframe['high'] - dataframe['open']) / (dataframe['open'] - dataframe['close'])
        condition_upper_shadow = condition_upper_shadow > upper_shadow_threshold
        condition = condition_green & condition_upper_shadow
        feature.append(condition[condition == True].shape[0])
    else:
        feature.append(0)
    
    # 上涨中是否出现镰刀
    ma_max = dataframe[['ma5', 'ma10', 'ma20']].max(1)
    ma_min = dataframe[['ma5', 'ma10', 'ma20']].min(1)
    condition_sickle = (dataframe['open'] > ma_max) & (dataframe['close'] < ma_min)
    condition_sickle = condition_green & condition_sickle
    feature.append(condition_sickle[condition_sickle == True].shape[0] > 0)
    return feature
    

方法：生成训练-测试样本

In [83]:
import numpy as np

def generate_train_test_samples(df, train_interval, test_interval):
    features = list()
    labels = list()
    for i in range(0, df.shape[0]-train_interval-test_interval):
        df_train = df.iloc[i: i+train_interval]
        feature = get_features(df_train)
        features.append(feature)
        df_test = df.iloc[i+train_interval: i+train_interval+test_interval]
        max_crash = get_max_crash(df_test['close'])
        labels.append(max_crash)
    return np.array(features), np.array(labels)

In [84]:
x, y = generate_train_test_samples(df, 42, 21)

In [85]:
import numpy as np

thresh = 0.15
label = np.copy(y)
label[label > thresh] = 1
label[label <= thresh] = 0
print('All samples num: {}'.format(label.shape[0]))
print('Positive samples num: {}'.format(label[label == 1].shape[0]))

All samples num: 671
Positive samples num: 65


### 3.2 分类模型

In [81]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier as gdbt
from sklearn.metrics import precision_score, accuracy_score, recall_score

model = gdbt()
model.fit(x, label)
print(precision_score(label, model.predict(x)))
print(recall_score(label, model.predict(x)))

0.672413793103
0.6


In [86]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier as gdbt
from sklearn.metrics import precision_score, accuracy_score, recall_score

model = gdbt()
model.fit(x, label)
print(precision_score(label, model.predict(x)))
print(recall_score(label, model.predict(x)))

0.970149253731
1.0


In [73]:
label[label == 1].shape[0] * recall_score(label, model.predict(x))

1.0

In [126]:
from matplotlib import pyplot as plt
from matplotlib.animation import FuncAnimation

positive_indices = [i for i, v in enumerate(label) if v == 1]

fig, ax = plt.subplots()
fig.set_tight_layout(True)
fig.set_figheight(6)
fig.set_figwidth(16)

# Plot a scatter that persists (isn't redrawn) and the initial line.
ax.plot(df['close'].values)

line1 = ax.axvline(x=positive_indices[0], c='red')
line2 = ax.axvline(x=positive_indices[0] + 42, c='yellow')
line3 = ax.axvline(x=positive_indices[0] + 63, c='blue')

def update(i):
    label = 'Crash rate: {}'.format(y[positive_indices[i]])
    # Update the line and the axes (with a new xlabel). Return a tuple of
    # "artists" that have to be redrawn for this frame.
    line1.set_xdata(positive_indices[i])
    line2.set_xdata(positive_indices[i] + 42)
    line3.set_xdata(positive_indices[i] + 63)
    ax.set_xlabel(label)
    return line, ax

anim = FuncAnimation(fig, update, frames=len(positive_indices), interval=200)
anim.save('test1.gif', dpi=80, writer='imagemagick')

<img src='test1.gif'>

## 4. 结论