In [1]:
import os
import sys
import requests

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import RocCurveDisplay
from sklearn.metrics import accuracy_score, classification_report

from config import ACCOUNT_NUMBER, ACCOUNT_PASSWORD, CONSUMER_ID, REDIRECT_URI


In [2]:
import tushare as ts

def grab_price_data():
    df = ts.get_k_data('sh', autype='qfq', start='2010-01-01', end='2020-07-17')
    df.index = pd.to_datetime(df.date)
    df = df[['open', 'high', 'low', 'close', 'volume']]
    df.to_csv('../data/price_data.csv',index_label=False)

grab_price_data()


本接口即将停止更新，请尽快使用Pro版接口：https://tushare.pro/document/2


In [47]:
price_data=pd.read_csv("../data/price_data.csv")

price_data.head()


Unnamed: 0,open,high,low,close,volume
2010-01-04,3289.75,3295.28,3243.32,3243.76,109447927.0
2010-01-05,3254.468,3290.51,3221.46,3282.179,126115066.0
2010-01-06,3277.517,3295.87,3253.04,3254.215,123651384.0
2010-01-07,3253.991,3268.82,3176.71,3192.776,128652827.0
2010-01-08,3177.259,3198.92,3149.02,3195.997,98377147.0


In [48]:
'''
calculate change in price
'''
price_data['change_in_price'] = price_data['close'].diff()
price_data['change_in_price']




2010-01-04        NaN
2010-01-05     38.419
2010-01-06    -27.964
2010-01-07    -61.439
2010-01-08      3.221
               ...   
2020-07-13     59.970
2020-07-14    -28.670
2020-07-15    -53.320
2020-07-16   -151.200
2020-07-17      4.030
Name: change_in_price, Length: 2561, dtype: float64

In [49]:
# calculate the 14 day RSI
n=14
up_df, down_df=price_data[['change_in_price']].copy(), price_data[['change_in_price']].copy()

# For up days, if the change is less than 0 set to 0
up_df.loc['change_in_price']=up_df.loc[(up_df['change_in_price']<0),'change_in_price']=0

# For down days, if the change is greater than 0 than set to 0.
down_df.loc['change_in_price'] = down_df.loc[(down_df['change_in_price']>0),'change_in_price']=0

# We need change in price to be absolute
down_df['change_in_price']=down_df['change_in_price'].abs()

#calculate the EMA
ema_up=up_df['change_in_price'].transform(lambda x: x.ewm(span=n).mean())
ema_down=down_df['change_in_price'].transform(lambda x: x.ewm(span=n).mean())

# Calculate the relative strength
relative_strength=ema_up/ema_down

# Calculate the Relative index
relative_strength_index=100.0-(100.0/(1.0+relative_strength))

# Add the info to the data frame
price_data['down_days'] = down_df['change_in_price']
price_data['up_days']=up_df['change_in_price']
price_data['RSI']=relative_strength_index

price_data.head(30)

Unnamed: 0,open,high,low,close,volume,change_in_price,down_days,up_days,RSI
2010-01-04,3289.75,3295.28,3243.32,3243.76,109447927.0,,,,
2010-01-05,3254.468,3290.51,3221.46,3282.179,126115066.0,38.419,0.0,38.419,100.0
2010-01-06,3277.517,3295.87,3253.04,3254.215,123651384.0,-27.964,27.964,0.0,54.35229
2010-01-07,3253.991,3268.82,3176.71,3192.776,128652827.0,-61.439,61.439,0.0,25.195655
2010-01-08,3177.259,3198.92,3149.02,3195.997,98377147.0,3.221,0.0,3.221,27.546759
2010-01-11,3301.611,3306.75,3197.33,3212.75,136327216.0,16.753,0.0,16.753,39.044393
2010-01-12,3205.705,3275.2,3180.09,3273.966,140655241.0,61.216,0.0,61.216,63.479277
2010-01-13,3204.976,3232.82,3165.49,3172.658,158344278.0,-101.308,101.308,0.0,35.956196
2010-01-14,3183.381,3219.02,3165.6,3215.55,135954504.0,42.892,0.0,42.892,47.150273
2010-01-15,3217.19,3241.82,3197.22,3224.152,120541135.0,8.602,0.0,8.602,49.204768


In [50]:
n = 14

# Make a copy of the high and low column.
low_14, high_14 = price_data[['low']].copy(), price_data[['high']].copy()

# Group by symbol, then apply the rolling function and grab the Min and Max.
low_14 = low_14['low'].transform(lambda x: x.rolling(window = n).min())
high_14 = high_14['high'].transform(lambda x: x.rolling(window = n).max())

# Calculate the Stochastic Oscillator.
k_percent = 100 * ((price_data['close'] - low_14) / (high_14 - low_14))

# Add the info to the data frame.
price_data['low_14'] = low_14
price_data['high_14'] = high_14
price_data['k_percent'] = k_percent

# Display the head.
price_data.head(30)

Unnamed: 0,open,high,low,close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent
2010-01-04,3289.75,3295.28,3243.32,3243.76,109447927.0,,,,,,,
2010-01-05,3254.468,3290.51,3221.46,3282.179,126115066.0,38.419,0.0,38.419,100.0,,,
2010-01-06,3277.517,3295.87,3253.04,3254.215,123651384.0,-27.964,27.964,0.0,54.35229,,,
2010-01-07,3253.991,3268.82,3176.71,3192.776,128652827.0,-61.439,61.439,0.0,25.195655,,,
2010-01-08,3177.259,3198.92,3149.02,3195.997,98377147.0,3.221,0.0,3.221,27.546759,,,
2010-01-11,3301.611,3306.75,3197.33,3212.75,136327216.0,16.753,0.0,16.753,39.044393,,,
2010-01-12,3205.705,3275.2,3180.09,3273.966,140655241.0,61.216,0.0,61.216,63.479277,,,
2010-01-13,3204.976,3232.82,3165.49,3172.658,158344278.0,-101.308,101.308,0.0,35.956196,,,
2010-01-14,3183.381,3219.02,3165.6,3215.55,135954504.0,42.892,0.0,42.892,47.150273,,,
2010-01-15,3217.19,3241.82,3197.22,3224.152,120541135.0,8.602,0.0,8.602,49.204768,,,


In [51]:
#威廉指标，利用市场的摆动幅度衡量市场的超买超卖
# Calculate the Williams %R
n = 14

# Make a copy of the high and low column.
low_14, high_14 = price_data[['low']].copy(), price_data[['high']].copy()

# Group by symbol, then apply the rolling function and grab the Min and Max.
low_14 = low_14['low'].transform(lambda x: x.rolling(window = n).min())
high_14 = high_14['high'].transform(lambda x: x.rolling(window = n).max())

# Calculate William %R indicator.
r_percent = ((high_14 - price_data['close']) / (high_14 - low_14)) * - 100

# Add the info to the data frame.
price_data['r_percent'] = r_percent

# Display the head.
price_data.head(30)




Unnamed: 0,open,high,low,close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent
2010-01-04,3289.75,3295.28,3243.32,3243.76,109447927.0,,,,,,,,
2010-01-05,3254.468,3290.51,3221.46,3282.179,126115066.0,38.419,0.0,38.419,100.0,,,,
2010-01-06,3277.517,3295.87,3253.04,3254.215,123651384.0,-27.964,27.964,0.0,54.35229,,,,
2010-01-07,3253.991,3268.82,3176.71,3192.776,128652827.0,-61.439,61.439,0.0,25.195655,,,,
2010-01-08,3177.259,3198.92,3149.02,3195.997,98377147.0,3.221,0.0,3.221,27.546759,,,,
2010-01-11,3301.611,3306.75,3197.33,3212.75,136327216.0,16.753,0.0,16.753,39.044393,,,,
2010-01-12,3205.705,3275.2,3180.09,3273.966,140655241.0,61.216,0.0,61.216,63.479277,,,,
2010-01-13,3204.976,3232.82,3165.49,3172.658,158344278.0,-101.308,101.308,0.0,35.956196,,,,
2010-01-14,3183.381,3219.02,3165.6,3215.55,135954504.0,42.892,0.0,42.892,47.150273,,,,
2010-01-15,3217.19,3241.82,3197.22,3224.152,120541135.0,8.602,0.0,8.602,49.204768,,,,


In [52]:
#MACD
'''
MACD= EMA12-EMA26 量化价格走势的变化
signalLine= EMA(MACD) 衡量了价格平均的变化速率
'''
ema_26 = price_data['close'].transform(lambda x: x.ewm(span = 26).mean())
ema_12 = price_data['close'].transform(lambda x: x.ewm(span = 12).mean())
macd = ema_12 - ema_26

# Calculate the EMA
ema_9_macd = macd.ewm(span = 9).mean()

# Store the data in the data frame.
price_data['MACD'] = macd
price_data['MACD_EMA'] = ema_9_macd

# Print the head.
price_data.head(30)

Unnamed: 0,open,high,low,close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA
2010-01-04,3289.75,3295.28,3243.32,3243.76,109447927.0,,,,,,,,,0.0,0.0
2010-01-05,3254.468,3290.51,3221.46,3282.179,126115066.0,38.419,0.0,38.419,100.0,,,,,0.861965,0.478869
2010-01-06,3277.517,3295.87,3253.04,3254.215,123651384.0,-27.964,27.964,0.0,54.35229,,,,,0.231152,0.377346
2010-01-07,3253.991,3268.82,3176.71,3192.776,128652827.0,-61.439,61.439,0.0,25.195655,,,,,-2.27965,-0.522721
2010-01-08,3177.259,3198.92,3149.02,3195.997,98377147.0,3.221,0.0,3.221,27.546759,,,,,-3.46782,-1.398821
2010-01-11,3301.611,3306.75,3197.33,3212.75,136327216.0,16.753,0.0,16.753,39.044393,,,,,-3.400519,-1.941392
2010-01-12,3205.705,3275.2,3180.09,3273.966,140655241.0,61.216,0.0,61.216,63.479277,,,,,-0.525597,-1.583092
2010-01-13,3204.976,3232.82,3165.49,3172.658,158344278.0,-101.308,101.308,0.0,35.956196,,,,,-3.407613,-2.021559
2010-01-14,3183.381,3219.02,3165.6,3215.55,135954504.0,42.892,0.0,42.892,47.150273,,,,,-3.223477,-2.299208
2010-01-15,3217.19,3241.82,3197.22,3224.152,120541135.0,8.602,0.0,8.602,49.204768,,,,,-2.602875,-2.367247


In [53]:
# Calculate the Price Rate of Change
n = 9

# Calculate the Rate of Change in the Price, and store it in the Data Frame.
price_data['Price_Rate_Of_Change'] = price_data['close'].transform(lambda x: x.pct_change(periods = n))

# Print the first 30 rows
price_data.head(30)

Unnamed: 0,open,high,low,close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change
2010-01-04,3289.75,3295.28,3243.32,3243.76,109447927.0,,,,,,,,,0.0,0.0,
2010-01-05,3254.468,3290.51,3221.46,3282.179,126115066.0,38.419,0.0,38.419,100.0,,,,,0.861965,0.478869,
2010-01-06,3277.517,3295.87,3253.04,3254.215,123651384.0,-27.964,27.964,0.0,54.35229,,,,,0.231152,0.377346,
2010-01-07,3253.991,3268.82,3176.71,3192.776,128652827.0,-61.439,61.439,0.0,25.195655,,,,,-2.27965,-0.522721,
2010-01-08,3177.259,3198.92,3149.02,3195.997,98377147.0,3.221,0.0,3.221,27.546759,,,,,-3.46782,-1.398821,
2010-01-11,3301.611,3306.75,3197.33,3212.75,136327216.0,16.753,0.0,16.753,39.044393,,,,,-3.400519,-1.941392,
2010-01-12,3205.705,3275.2,3180.09,3273.966,140655241.0,61.216,0.0,61.216,63.479277,,,,,-0.525597,-1.583092,
2010-01-13,3204.976,3232.82,3165.49,3172.658,158344278.0,-101.308,101.308,0.0,35.956196,,,,,-3.407613,-2.021559,
2010-01-14,3183.381,3219.02,3165.6,3215.55,135954504.0,42.892,0.0,42.892,47.150273,,,,,-3.223477,-2.299208,
2010-01-15,3217.19,3241.82,3197.22,3224.152,120541135.0,8.602,0.0,8.602,49.204768,,,,,-2.602875,-2.367247,-0.006045


In [60]:
#OBV
def obv(group):

    # Grab the volume and close column.
    change = group['close'].diff()
    volume = group['volume']


    # intialize the previous OBV
    prev_obv = 0
    obv_values = []

    # calculate the On Balance Volume
    for i, j in zip(change, volume):

        if i > 0:
            current_obv = prev_obv + j
        elif i < 0:
            current_obv = prev_obv - j
        else:
            current_obv = prev_obv

        # OBV.append(current_OBV)
        prev_obv = current_obv
        obv_values.append(current_obv)

    # Return a panda series.
    return pd.Series(obv_values, index = group.index)

obv_groups=obv(price_data)

# add to the dataframe, but drop old index from obv_groups, before adding it
price_data['On_Balance_Volume'] = obv_groups.reset_index(level=0,drop=True)

# display the dataframe
price_data.head(30)


Unnamed: 0.1,Unnamed: 0,open,high,low,close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On_Balance_Volume,Prediction
0,2010-01-04,3289.75,3295.28,3243.32,3243.76,109447927.0,,,,,,,,,0.0,0.0,,0.0,0
1,2010-01-05,3254.468,3290.51,3221.46,3282.179,126115066.0,38.419,0.0,38.419,100.0,,,,,0.861965,0.478869,,126115066.0,1
2,2010-01-06,3277.517,3295.87,3253.04,3254.215,123651384.0,-27.964,27.964,0.0,54.35229,,,,,0.231152,0.377346,,2463682.0,0
3,2010-01-07,3253.991,3268.82,3176.71,3192.776,128652827.0,-61.439,61.439,0.0,25.195655,,,,,-2.27965,-0.522721,,-126189145.0,0
4,2010-01-08,3177.259,3198.92,3149.02,3195.997,98377147.0,3.221,0.0,3.221,27.546759,,,,,-3.46782,-1.398821,,-27811998.0,1
5,2010-01-11,3301.611,3306.75,3197.33,3212.75,136327216.0,16.753,0.0,16.753,39.044393,,,,,-3.400519,-1.941392,,108515218.0,1
6,2010-01-12,3205.705,3275.2,3180.09,3273.966,140655241.0,61.216,0.0,61.216,63.479277,,,,,-0.525597,-1.583092,,249170459.0,1
7,2010-01-13,3204.976,3232.82,3165.49,3172.658,158344278.0,-101.308,101.308,0.0,35.956196,,,,,-3.407613,-2.021559,,90826181.0,0
8,2010-01-14,3183.381,3219.02,3165.6,3215.55,135954504.0,42.892,0.0,42.892,47.150273,,,,,-3.223477,-2.299208,,226780685.0,1
9,2010-01-15,3217.19,3241.82,3197.22,3224.152,120541135.0,8.602,0.0,8.602,49.204768,,,,,-2.602875,-2.367247,-0.006045,347321820.0,1


In [61]:
'''
Building the model
'''

'''
从dataframe提取数据
'''
closed_groups=price_data['close']

closed_groups = closed_groups.transform(lambda x:x.shift(1) < x)

price_data['Prediction'] = closed_groups * 1

price_data.head()

price_data.to_csv('final_metrics.csv')


In [62]:
price_data=pd.read_csv("final_metrics.csv")
price_data.head(30)

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,open,high,low,close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On_Balance_Volume,Prediction
0,0,2010-01-04,3289.75,3295.28,3243.32,3243.76,109447927.0,,,,,,,,,0.0,0.0,,0.0,0
1,1,2010-01-05,3254.468,3290.51,3221.46,3282.179,126115066.0,38.419,0.0,38.419,100.0,,,,,0.861965,0.478869,,126115066.0,1
2,2,2010-01-06,3277.517,3295.87,3253.04,3254.215,123651384.0,-27.964,27.964,0.0,54.35229,,,,,0.231152,0.377346,,2463682.0,0
3,3,2010-01-07,3253.991,3268.82,3176.71,3192.776,128652827.0,-61.439,61.439,0.0,25.195655,,,,,-2.27965,-0.522721,,-126189145.0,0
4,4,2010-01-08,3177.259,3198.92,3149.02,3195.997,98377147.0,3.221,0.0,3.221,27.546759,,,,,-3.46782,-1.398821,,-27811998.0,1
5,5,2010-01-11,3301.611,3306.75,3197.33,3212.75,136327216.0,16.753,0.0,16.753,39.044393,,,,,-3.400519,-1.941392,,108515218.0,1
6,6,2010-01-12,3205.705,3275.2,3180.09,3273.966,140655241.0,61.216,0.0,61.216,63.479277,,,,,-0.525597,-1.583092,,249170459.0,1
7,7,2010-01-13,3204.976,3232.82,3165.49,3172.658,158344278.0,-101.308,101.308,0.0,35.956196,,,,,-3.407613,-2.021559,,90826181.0,0
8,8,2010-01-14,3183.381,3219.02,3165.6,3215.55,135954504.0,42.892,0.0,42.892,47.150273,,,,,-3.223477,-2.299208,,226780685.0,1
9,9,2010-01-15,3217.19,3241.82,3197.22,3224.152,120541135.0,8.602,0.0,8.602,49.204768,,,,,-2.602875,-2.367247,-0.006045,347321820.0,1


In [63]:
'''
Removing NaN Values
'''
# We need to remove all rows that have an NaN value.
print('Before NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Any row that has a `NaN` value will be dropped.
price_data = price_data.dropna()

# Display how much we have left now.
print('After NaN Drop we have {} rows and {} columns'.format(price_data.shape[0], price_data.shape[1]))

# Print the head.
price_data.head()


Before NaN Drop we have 2561 rows and 20 columns
After NaN Drop we have 2548 rows and 20 columns


Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,open,high,low,close,volume,change_in_price,down_days,up_days,RSI,low_14,high_14,k_percent,r_percent,MACD,MACD_EMA,Price_Rate_Of_Change,On_Balance_Volume,Prediction
13,13,2010-01-21,3154.189,3176.15,3126.9,3158.863,114337079.0,7.013,0.0,7.013,36.616716,3126.9,3306.75,17.772032,-82.227968,-7.094912,-3.268117,-0.011619,568426557.0,1
14,14,2010-01-22,3118.724,3147.83,3062.63,3128.588,137758538.0,-30.275,30.275,0.0,31.788164,3062.63,3306.75,27.018679,-72.981321,-10.888528,-4.847778,-0.026196,430668019.0,0
15,15,2010-01-25,3103.32,3137.0,3092.93,3094.411,78361862.0,-34.177,34.177,0.0,27.128441,3062.63,3306.75,13.018597,-86.981403,-15.703068,-7.081716,-0.054843,352306157.0,0
16,16,2010-01-26,3094.848,3107.2,3001.96,3019.394,93507092.0,-75.017,75.017,0.0,19.783708,3001.96,3306.75,5.720004,-94.279996,-23.822345,-10.506972,-0.048308,258799065.0,0
17,17,2010-01-27,3020.54,3028.65,2972.63,2986.607,76552214.0,-32.787,32.787,0.0,17.407052,2972.63,3306.75,4.183228,-95.816772,-31.815166,-14.84679,-0.071199,182246851.0,0


In [68]:
x_cols = price_data[['RSI','k_percent','r_percent','Price_Rate_Of_Change','MACD','On_Balance_Volume']]
y_cols = price_data['Prediction']

x_train, x_test, y_train, y_test = train_test_split(x_cols,y_cols,random_state=0)

# Create our Random Forest Classifier
random_forest_classifier=RandomForestClassifier(n_estimators=100, oob_score = True, criterion='gini',random_state=50)

# fit data to the model
random_forest_classifier.fit(x_train,y_train)

# make predictions
y_pred=random_forest_classifier.predict(x_test)

In [69]:
# Print the Accuracy of our Model.
print('Correct Prediction (%): ', accuracy_score(y_test,y_pred,normalize = True)*100.0)


Correct Prediction (%):  70.17268445839875
