## 股票涨跌预测

In [1]:
from __future__ import print_function

import datetime
import numpy as np
import pandas as pd
import tushare as ts

from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC

In [3]:
def create_lagged_series(symbol, start_date_str, end_date_str, lags=5):
    """
        根据start_data, end_date创建symbol的收盘价的滞后序列
        默认滞后期为5天
    """
    # 把时间字符串格式化
    # 1、由日期格式转化为字符串格式的函数为: datetime.datetime.strftime()
    # 2、由字符串格式转化为日期格式的函数为: datetime.datetime.strptime()
    date_str_fmt = '%Y-%m-%d'
    start_date = datetime.datetime.strptime(start_date_str, date_str_fmt)
    one_yr_before_start = start_date - datetime.timedelta(days=365)
    one_yr_before_start_str = one_yr_before_start.strftime(date_str_fmt)

    # 从TuShare获取数据
    # get_k_data方法传入股票代码，其实日期和终止日期即可
    hist_data = ts.get_k_data(symbol, one_yr_before_start_str, end_date_str)
    # 转化时间格式
    hist_data['date'] = pd.to_datetime(hist_data['date'])
    # 设置时间为索引
    hist_data.set_index('date', inplace=True)

    # 创建存储滞后序列的DataFrame
    hist_lag = pd.DataFrame(index=hist_data.index)
    # 把今日价格设为收盘价
    hist_lag['today'] = hist_data['close']
    hist_lag['volume'] = hist_data['volume']

    # 创建一个滞后序列
    for i in range(0, lags):
        # shift可创建滞后序列
        hist_lag['lag{}'.format(str(i + 1))] = hist_data['close'].shift(i + 1)

    # 创建收益率DataFrame
    ret_df = pd.DataFrame(index=hist_lag.index)
    ret_df['volume'] = hist_lag['volume']
    ret_df['today'] = hist_lag['today'].pct_change() * 100.0 # pct_change:计算今天和昨天数据的变化率，由变化率可知涨跌

    # 如果绝对值小于0.0001，将其设置成0.0001
    ret_df['today'] = ret_df['today'].apply(lambda x: 0.0001 if abs(x) < 0.0001 else x)

    # 为收益率DataFrame创建对应的滞后序列
    # pct_change方法：统计增长率
    for i in range(0, lags):
        ret_df['lag{}'.format(str(i + 1))] = \
            hist_lag['lag{}'.format(str(i + 1))].pct_change() * 100.0

    # 创建label列，用1, -1 标识涨/跌
    ret_df['direction'] = np.sign(ret_df['today'])
    ret_df = ret_df[ret_df.index >= start_date]

    return ret_df

In [4]:
# 创建沪深300指数的滞后序列
lag_ret_df = create_lagged_series('hs300', '2010-01-01', '2015-12-31', lags=5)
lag_ret_df



Unnamed: 0_level_0,volume,today,lag1,lag2,lag3,lag4,lag5,direction
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2010-01-04,66101080.0,-1.131392,0.472820,1.660222,0.641208,1.566523,-0.408192,-1.0
2010-01-05,85809641.0,0.814912,-1.131392,0.472820,1.660222,0.641208,1.566523,1.0
2010-01-06,78473125.0,-0.626003,0.814912,-1.131392,0.472820,1.660222,0.641208,-1.0
2010-01-07,80350037.0,-1.984089,-0.626003,0.814912,-1.131392,0.472820,1.660222,-1.0
2010-01-08,60790253.0,0.249866,-1.984089,-0.626003,0.814912,-1.131392,0.472820,1.0
2010-01-11,89980172.0,0.055228,0.249866,-1.984089,-0.626003,0.814912,-1.131392,1.0
2010-01-12,93743278.0,1.518185,0.055228,0.249866,-1.984089,-0.626003,0.814912,1.0
2010-01-13,112457901.0,-3.218521,1.518185,0.055228,0.249866,-1.984089,-0.626003,-1.0
2010-01-14,83353252.0,1.400321,-3.218521,1.518185,0.055228,0.249866,-1.984089,1.0
2010-01-15,72543102.0,0.394546,1.400321,-3.218521,1.518185,0.055228,0.249866,1.0


In [5]:
# 使用前两天（滞后）的数据作为训练集
X = lag_ret_df[['lag1', 'lag2']]
y = lag_ret_df['direction']

# 测试数据分为两部分，2015-01-01之前和之后
start_test = datetime.datetime(2015, 1, 1)

# 分割训练集和测试集
X_train = X[X.index < start_test]
X_test = X[X.index >= start_test]
y_train = y[y.index < start_test]
y_test = y[y.index >= start_test]

# 候选模型
print('准确率与混淆矩阵\n')
# 课后作业：请使用交叉验证的方式选择最优超参数以提高性能
models = [('逻辑回归', LogisticRegression()),
          ('支持向量机', SVC(C=1000000.0, cache_size=200, class_weight=None,
                        coef0=0.0, degree=3, gamma=0.0001, kernel='rbf',
                        max_iter=-1, probability=False, random_state=None,
                        shrinking=True, tol=0.001, verbose=False)),
          ('随机森林', RandomForestClassifier(n_estimators=1000, criterion='gini',
                                          max_depth=None, min_samples_split=2,
                                          min_samples_leaf=1, max_features='auto',
                                          bootstrap=True, oob_score=False, n_jobs=1,
                                          random_state=None, verbose=0))]

# 遍历所有模型
for model in models:
    # 模型训练
    model[1].fit(X_train, y_train)

    # 预测
    pred = model[1].predict(X_test)

    # 输出准确率和混淆矩阵
    print('{}:\n{:.2f}'.format(model[0], model[1].score(X_test, y_test)))
    print('{}\n'.format(confusion_matrix(pred, y_test, labels=[-1, 1])))


准确率与混淆矩阵

逻辑回归:
0.47
[[70 93]
 [36 45]]

支持向量机:
0.55
[[59 62]
 [47 76]]

随机森林:
0.58
[[51 48]
 [55 90]]

