## 机器学习策略——基于逻辑回归的股市趋势预测(及SVM策略)

In [1]:
%matplotlib inline
import seaborn
import matplotlib as mpl
mpl.rcParams['font.family'] = 'serif'
import warnings; warnings.simplefilter('ignore') #忽略可能会出现的警告信息，警告并不是错误，可以忽略；

### 1. 数据获取

In [2]:
import pandas as pd 
import numpy as np
import tushare as ts

In [4]:
pro = ts.pro_api('31e6c92e6133d112d8ee557bb245044fde5daab04104598620f23921')
#hs300 = ts.pro_bar(pro_api=api, ts_code='000001.SZ', adj='qfq', start_date='20180101', end_date='20181011')
hs300 = pro.index_daily(ts_code='399300.SZ', start_date='20160101', end_date='201910110')


In [5]:
hs300.rename(columns={'trade_date':'date','ts_code':'code'},inplace=True)

In [6]:
#hs300.head()

In [7]:
hs300.index = pd.to_datetime(hs300['date'])
hs300.drop(columns=['date','change','code'],axis=1,inplace=True)
#hs300.head()

In [8]:
#hs300['returns'] = hs300['close'].pct_change()
#hs300.rename(columns={'pct_chg':'returns'},inplace=True)

In [9]:
#hs300.dropna(inplace=True)            #有NaN值的话会影响机器学习算法；

In [10]:
hs300.sort_index(axis=0,ascending=True,inplace=True) #按从前到后的顺序排列表格，方便后续计算

In [11]:
hs300.head()

Unnamed: 0_level_0,close,open,high,low,pre_close,pct_chg,vol,amount
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2016-01-04,3469.066,3725.856,3726.245,3468.949,3731.005,-7.0206,115370674.0,145968200.0
2016-01-05,3478.78,3382.177,3518.217,3377.28,3469.066,0.28,162116984.0,196017100.0
2016-01-06,3539.808,3482.406,3543.739,3468.467,3478.78,1.7543,145966144.0,160947200.0
2016-01-07,3294.3839,3481.15,3481.15,3284.737,3539.808,-6.9333,44102641.0,47130800.0
2016-01-08,3361.563,3371.871,3418.851,3237.931,3294.384,2.0392,185959451.0,203498900.0


### 2. 数据处理——特征工程处理

#特征的生成；
for i in range(2, 8, 1):
    hs300['close - ' + str(i) + 'd'] = hs300['close'].shift(i)       #加上前7天的收盘价

hs_7d = hs300[[x for x in hs300.columns if 'close' in x]].iloc[7:]   #从第八行开始没有NaN地方开始取值作为features

hs_7d.head(10)

In [12]:
import talib as ta

In [13]:
close = hs300.close.values
hs300['dif'], hs300['dea'], hs300['macd'] = ta.MACD(close, fastperiod=12, slowperiod=26, signalperiod=9)

In [14]:
#hs300.tail(20)

In [15]:
hs300['upper'], hs300['middle'], hs300['lower'] = ta.BBANDS(
                close, 
                timeperiod=20,
                # number of non-biased standard deviations from the mean
                nbdevup=2,
                nbdevdn=2,
                # Moving average type: simple moving average here
                matype=0)

In [16]:
#MA_Type: 0=SMA, 1=EMA, 2=WMA, 3=DEMA, 4=TEMA, 5=TRIMA, 6=KAMA, 7=MAMA, 8=T3 (Default=SMA)

hs300['MA5'] = ta.MA(close,5,matype=0)
hs300['MA21'] = ta.MA(close,21,matype=0)
hs300['MA34'] = ta.MA(close,34,matype=0)
hs300['MA55'] = ta.MA(close,55,matype=0)
hs300['MA144'] = ta.MA(close,144,matype=0)

In [17]:
hs300['pct_chg'] = hs300['pct_chg'].shift(-1)*100

In [19]:
hs300.dropna(inplace=True)            #有NaN值的话会影响机器学习算法；

In [21]:
Y_label = np.array(hs300['pct_chg'])

In [22]:
len(Y_label)

604

In [23]:
import copy

In [24]:
X_train= copy.deepcopy(hs300.head(450))

In [25]:
X_test = copy.deepcopy(hs300.tail(154) )

In [26]:
X_train.drop(columns='pct_chg',inplace=True)

In [27]:
X_test.drop(columns='pct_chg',inplace=True)

In [28]:
print(len(X_test),len(X_train))

154 450


In [58]:
y_test = Y_label[450:604]

In [59]:
y_train = Y_label[0:450]

In [60]:
y_train = np.sign(y_train)
y_test = np.sign(y_test)

In [61]:
y_test

array([ 1., -1., -1., -1., -1.,  1., -1.,  1., -1., -1., -1., -1.,  1.,
       -1.,  1., -1., -1.,  1.,  1.,  1., -1.,  1.,  1., -1., -1., -1.,
       -1.,  1.,  1.,  1., -1., -1., -1., -1.,  1., -1., -1., -1., -1.,
        1., -1.,  1.,  1., -1., -1., -1., -1., -1.,  1.,  1., -1.,  1.,
        1.,  1., -1., -1., -1., -1., -1.,  1., -1., -1.,  1., -1., -1.,
       -1.,  1.,  1., -1.,  1.,  1., -1.,  1., -1.,  1., -1.,  1., -1.,
       -1., -1., -1.,  1., -1., -1.,  1., -1.,  1.,  1., -1.,  1.,  1.,
       -1., -1.,  1.,  1.,  1.,  1., -1., -1., -1., -1., -1.,  1.,  1.,
       -1.,  1.,  1.,  1., -1.,  1., -1., -1., -1., -1.,  1., -1.,  1.,
        1.,  1., -1., -1., -1., -1.,  1.,  1.,  1., -1., -1., -1., -1.,
       -1., -1.,  1., -1., -1., -1.,  1., -1., -1.,  1.,  1., -1.,  1.,
       -1.,  1., -1.,  1.,  1., -1.,  1.,  1., -1., -1.,  1.])

In [62]:
import sklearn
from sklearn import linear_model

X_train = sklearn.preprocessing.scale(X_train)    #对features进行标准化；
X_test = sklearn.preprocessing.scale(X_test)

In [34]:
len(X_train)

450

### 3. 逻辑回归预测股价趋势算法实现

In [144]:
lm = linear_model.LogisticRegression(penalty='l1',C=500)

 #计算出训练集的labels；
#y_train = np.sign(hs_7d['close'].pct_change().shift(-1))       #非常重要；拿到下一天的收益，用.shift(-1)
#y_train.replace(to_replace= np.NaN, value = 0,inplace = True)
#y_train = y_train.reshape(-1,1)
#_train[-10:]

lm.fit(X_train, y_train)

lm.score(X_train, y_train)

0.5911111111111111

In [145]:
y_predict = lm.predict(X_test)

lm.score(X_test, y_test)

0.6233766233766234

In [65]:
sklearn.metrics.accuracy_score(y_test,y_predict)

0.6103896103896104

### 4. 改变算法：SVM

In [66]:
from sklearn.svm import SVC
#clf_SVC = SVC(kernel = 'linear')

In [117]:
clf_SVC = SVC(C=50)

clf_SVC.fit(X_train, y_train)

clf_SVC.score(X_train, y_train)

0.7066666666666667

In [118]:
y_SVCpredict = clf_SVC.predict(X_test)

clf_SVC.score(X_test,y_test)

0.551948051948052

In [122]:
from sklearn.ensemble import RandomForestClassifier
#sklearn.ensemble.ExtraTreesClassifier
clf_RF = RandomForestClassifier(n_estimators=50,max_depth=20)
clf_RF.fit(X_train,y_train)

clf_RF.score(X_train, y_train)

1.0

In [123]:
pred_RF = clf_RF.predict(X_test)
clf_RF.score(X_test, y_test)

0.5844155844155844

In [140]:

clf_ERF = sklearn.ensemble.ExtraTreesClassifier(n_estimators=10,max_depth=25)
clf_ERF.fit(X_train,y_train)
clf_ERF.score(X_train, y_train)

1.0

In [141]:
pred_ERF = clf_RF.predict(X_test)
clf_ERF.score(X_test, y_test)

0.5194805194805194

In [142]:
from sklearn.neighbors import KNeighborsClassifier

clf_KNN = KNeighborsClassifier(n_neighbors=7)
clf_KNN.fit(X_train,y_train)
clf_KNN.score(X_train,y_train)

0.68

In [143]:
pred_KNN = clf_KNN.predict(X_test)
clf_KNN.score(X_test,y_test)
#plot_pic(clf_KNN, features_test, label_test)

0.4805194805194805

### 5. 逻辑回归算法在测试集的验证

In [None]:
X_test = hs_7d_test 
X_test = sklearn.preprocessing.scale(X_test)
X_test

In [None]:
hs300_test['prediction'] = np.NaN
hs300_test['prediction'].ix[7:] = lm.predict(X_test)    #给你测试集的features，返回的是预测的测试集的labels

In [None]:
hs300_test['prediction'].value_counts()

In [None]:
hs300_test.head(10)

In [None]:
hs300_test['strategy'] = (hs300_test['prediction'].shift(1) * hs300_test['returns'] + 1).cumprod()
hs300_test['cum_ret'] = (hs300_test['returns']+1).cumprod()

In [None]:
hs300_test[['strategy','cum_ret']].dropna().plot(figsize=(10, 6))

### 6.SVM算法在测试集的验证

In [None]:
X_test = hs_7d_test 

In [None]:
hs300_test['prediction'] = np.NaN
hs300_test['prediction'].ix[7:] = clf_SVC.predict(X_test)

In [None]:
hs300_test['strategy'] = (hs300_test['prediction'].shift(1) * hs300_test['returns'] + 1).cumprod()
hs300_test['cum_ret'] = (hs300_test['returns']+1).cumprod()

In [None]:
hs300_test[['strategy','cum_ret']].dropna().plot(figsize=(10, 6))

声明：本资料仅供内部学习交流使用，切勿外传。