# 量化交易 X 機器學習

## （一）下載資料集
將會使用到的資料（train.csv、test.csv）下載下來。

In [None]:
# Fetch the dataset file from Google Drive by its file id; the recorded output
# of this cell shows it being saved as /content/train.csv.
!gdown --id 18WzLy-pv8niLX5SAST_ImM-2zISSOXsV

Downloading...
From: https://drive.google.com/uc?id=18WzLy-pv8niLX5SAST_ImM-2zISSOXsV
To: /content/train.csv
100% 200M/200M [00:02<00:00, 96.7MB/s]


## （二）導入套件
引入使用到的套件


In [None]:
# Install pandas_ta, which the import cell below loads as `ta`.
!pip install pandas_ta

Collecting pandas_ta
  Downloading pandas_ta-0.3.14b.tar.gz (115 kB)
[?25l[K     |██▉                             | 10 kB 24.5 MB/s eta 0:00:01[K     |█████▊                          | 20 kB 30.0 MB/s eta 0:00:01[K     |████████▌                       | 30 kB 14.3 MB/s eta 0:00:01[K     |███████████▍                    | 40 kB 4.5 MB/s eta 0:00:01[K     |██████████████▎                 | 51 kB 5.5 MB/s eta 0:00:01[K     |█████████████████               | 61 kB 6.3 MB/s eta 0:00:01[K     |████████████████████            | 71 kB 7.2 MB/s eta 0:00:01[K     |██████████████████████▊         | 81 kB 8.1 MB/s eta 0:00:01[K     |█████████████████████████▋      | 92 kB 9.0 MB/s eta 0:00:01[K     |████████████████████████████▌   | 102 kB 7.7 MB/s eta 0:00:01[K     |███████████████████████████████▎| 112 kB 7.7 MB/s eta 0:00:01[K     |████████████████████████████████| 115 kB 7.7 MB/s 
Building wheels for collected packages: pandas-ta
  Building wheel for pandas-ta (setup.p

In [None]:
import pandas as pd 
import numpy as np
import pandas_ta as ta
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression, Ridge, LogisticRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, classification_report
from sklearn.ensemble import RandomForestClassifier

from keras.models import Sequential
from keras.layers import SimpleRNN, LSTM, GRU, Dense, Dropout, LeakyReLU
from keras.callbacks import EarlyStopping

from xgboost import XGBClassifier

## （三）設定參數
將參數整理在一起，方便管理



In [None]:
# --- Data -------------------------------------------------------------------
# CSV file read by the loading cell.
train_file_path = "train.csv"
# Fraction of rows kept for training; any value strictly between 0 and 1.
train_ratio = 0.8
# Number of consecutive 1-minute rows merged into one bar; any positive integer.
minutes_combined = 24 * 60
# Number of consecutive rows concatenated into one feature vector; any positive integer.
rows_used = 10
# Normalization applied before modelling:
# 'z_normalization', 'min_max_normalization' or 'normalize_by_change'.
normalize_method = "normalize_by_change"
# When True, the processing cell adds MA10, MA30 and stochastic-oscillator columns.
use_ta = False

# --- Model selection --------------------------------------------------------
# Model family: 'Regression', 'RNN' or 'Classification'.
method = "Classification"
# Used when method == 'Regression': 'LinearRegression' or 'RidgeRegression'.
regression_method = "RidgeRegression"
# Ridge regularization strength; any value greater than 0.
ridge_regression_alpha = 0.1
# Used when method == 'RNN': 'SimpleRNN', 'LSTM' or 'GRU'.
rnn_method = "SimpleRNN"
# Used when method == 'Classification': 'LogisticRegression', 'RandomForest' or 'XGBClassifier'.
classification_method = "XGBClassifier"

## （四）讀取及區分資料
讀取之前下載好的資料以便之後做使用，並將其區分為訓練及測試資料。

In [None]:
# Read the raw minute-level table and show its size and first rows.
df = pd.read_csv(train_file_path)
print(len(df))
print(df.head())

# The timestamp column is not used by any later cell, so drop it here.
df = df.drop('timestamp', axis=1)
print(len(df))
print(df.head())

# Keep only the leading `train_ratio` share of the rows. The discarded tail is
# not referenced again in this notebook (presumably a hold-out -- confirm).
kept_rows = int(train_ratio * len(df))
df = df.iloc[:kept_rows]

# Split what remains in row order: the leading `train_ratio` share becomes the
# training frame and the rest becomes the validation frame.
split_row = int(train_ratio * len(df))
original_train_df = df.iloc[:split_row]
original_valid_df = df.iloc[split_row:]

1956282
    timestamp  Count       Open  ...      Volume          VWAP      Close
0  1514764860  229.0  13835.194  ...   31.550062  13827.062093  13850.176
1  1514764920  235.0  13835.036  ...   31.046432  13840.362591  13828.102
2  1514764980  528.0  13823.900  ...   55.061820  13806.068014  13801.314
3  1514765040  435.0  13802.512  ...   38.780529  13783.598101  13768.040
4  1514765100  742.0  13766.000  ...  108.501637  13735.586842  13724.914

[5 rows x 8 columns]
1956282
   Count       Open     High       Low      Volume          VWAP      Close
0  229.0  13835.194  14013.8  13666.11   31.550062  13827.062093  13850.176
1  235.0  13835.036  14052.3  13680.00   31.046432  13840.362591  13828.102
2  528.0  13823.900  14000.4  13601.00   55.061820  13806.068014  13801.314
3  435.0  13802.512  13999.0  13576.28   38.780529  13783.598101  13768.040
4  742.0  13766.000  13955.9  13554.44  108.501637  13735.586842  13724.914


## （五）處理訓練及測試資料
將訓練及測試資料做處理

In [None]:
def _combine_minutes(raw_df, window_ends, window, first_label):
    """Collapse fixed-size runs of minute rows into one aggregated bar each.

    Parameters
    ----------
    raw_df : DataFrame with Count/Open/High/Low/Volume/VWAP/Close columns.
    window_ends : iterable of row positions; each bar covers the half-open
        positional range [end - window, end) of `raw_df`.
    window : number of rows per bar.
    first_label : index label given to the first bar; later bars count up by 1.

    Returns
    -------
    A new DataFrame with one row per bar. `raw_df` is never modified.
    """
    bars = []
    for end in window_ends:
        # Positional half-open slice: exactly `window` rows, so neighbouring
        # bars never share a minute.
        chunk = raw_df.iloc[end - window:end]
        volume = chunk['Volume'].sum()
        bars.append([
            chunk['Count'].sum(),
            chunk['Open'].iloc[0],
            chunk['High'].max(),
            chunk['Low'].min(),
            volume,
            # Volume-weighted average of the per-minute VWAP values.
            chunk['VWAP'].mul(chunk['Volume']).sum() / volume,
            chunk['Close'].iloc[-1],
        ])
    bar_columns = ['Count', 'Open', 'High', 'Low', 'Volume', 'VWAP', 'Close']
    combined = pd.DataFrame(bars, columns=bar_columns)
    combined.index = range(first_label, first_label + len(bars))
    # Present the columns in the same order as the raw frame.
    return combined.reindex(columns=raw_df.columns)


# Kept from the original cell so later cells see the same pandas option.
pd.options.mode.chained_assignment = None

# Row position in `df` where the training rows end and validation rows begin.
train_rows = len(original_train_df)

if minutes_combined > 1:
    # Training bars end on every multiple of `minutes_combined` inside the
    # training rows.
    train_ends = range(minutes_combined, train_rows, minutes_combined)
    train_df = _combine_minutes(df, train_ends, minutes_combined, 0)
    print("train data processing finished")

    # Validation bars end on every multiple of `minutes_combined` at or after
    # the split. Windows are read from `df`, so the first bar is a full window
    # even though it starts in the unaggregated tail of the training rows.
    first_valid_end = max(-(-train_rows // minutes_combined), 1) * minutes_combined
    valid_ends = range(first_valid_end, len(df), minutes_combined)
    valid_df = _combine_minutes(df, valid_ends, minutes_combined, train_rows)
    print("valid data processing finished")
else:
    # No aggregation requested: work on copies so later column assignments
    # never write back into the frames produced by the loading cell.
    train_df = original_train_df.copy()
    valid_df = original_valid_df.copy()

if use_ta:
    # Prepend the last 30 training rows so the validation indicators have
    # enough history; 30 is the longest look-back used below (MA30).
    valid_df = pd.concat([train_df.iloc[-30:], valid_df], axis=0)

    train_df['MA10'] = train_df.ta.sma(10)
    train_df['MA30'] = train_df.ta.sma(30)
    train_df.ta.stoch(high='High', low='Low', k=14, d=3, append=True)
    # Drop the first 30 rows, which lack full MA30 history.
    train_df = train_df[30:]

    valid_df['MA10'] = valid_df.ta.sma(10)
    valid_df['MA30'] = valid_df.ta.sma(30)
    valid_df.ta.stoch(high='High', low='Low', k=14, d=3, append=True)
    # Remove the 30 borrowed training rows again.
    valid_df = valid_df[30:]

train data processing finished
valid data processing finished


## （六）資料標準化
資料往往需要經過標準化才能提升模型的表現

In [None]:
def _lagged_features(frame, lags):
    """Place `lags` consecutive rows of `frame` side by side.

    Output row t holds rows t, t+1, ..., t+lags-1 of `frame` concatenated along
    the column axis; the last `lags` rows of `frame` start no sample.
    """
    pieces = [np.array(frame[:-lags])]
    pieces.extend(np.array(frame[lag:lag - lags]) for lag in range(1, lags))
    return np.concatenate(pieces, axis=1)


# Un-normalized closing prices of the predicted rows, kept for evaluation.
original_Y_train = np.array(train_df['Close'])[rows_used:]
original_Y_valid = np.array(valid_df['Close'])[rows_used:]

# Default: pass the data through unchanged when no known method is selected.
normalized_train_df, normalized_valid_df = train_df, valid_df
normalized_train_close, normalized_valid_close = train_df['Close'], valid_df['Close']

if normalize_method == 'z_normalization':
    # Standardize both sets with the training mean and standard deviation.
    train_mean, train_std = train_df.mean(), train_df.std()
    normalized_train_df = (train_df - train_mean) / train_std
    normalized_valid_df = (valid_df - train_mean) / train_std
    normalized_train_close = normalized_train_df['Close']
    normalized_valid_close = normalized_valid_df['Close']
elif normalize_method == 'min_max_normalization':
    # Rescale both sets with the training minimum and maximum.
    train_min, train_max = train_df.min(), train_df.max()
    normalized_train_df = (train_df - train_min) / (train_max - train_min)
    normalized_valid_df = (valid_df - train_min) / (train_max - train_min)
    normalized_train_close = normalized_train_df['Close']
    normalized_valid_close = normalized_valid_df['Close']
elif normalize_method == 'normalize_by_change':
    # Express every value as its ratio to the previous row.
    normalized_train_df = train_df.div(train_df.shift(1))
    normalized_train_close = train_df['Close'].div(train_df['Close'].shift(1))
    # The first training row has no predecessor, so its ratio is set to 1.
    normalized_train_df.iloc[0] = 1
    normalized_train_close.iloc[0] = 1

    normalized_valid_df = valid_df.div(valid_df.shift(1))
    normalized_valid_close = valid_df['Close'].div(valid_df['Close'].shift(1))
    # The first validation row is compared with the last training row.
    normalized_valid_df.iloc[0] = valid_df.iloc[0].div(train_df.iloc[-1])
    normalized_valid_close.iloc[0] = valid_df['Close'].iloc[0] / train_df['Close'].iloc[-1]

# Features are `rows_used` consecutive rows; the target is the normalized
# close of the row that follows them.
X_train = _lagged_features(normalized_train_df, rows_used)
Y_train = np.array(normalized_train_close)[rows_used:]

X_valid = _lagged_features(normalized_valid_df, rows_used)
Y_valid = np.array(normalized_valid_close)[rows_used:]
#print(X_valid.shape)
#print(X_valid[:5])
#print(Y_valid.shape)
#print(Y_valid[:5])

## （七）訓練模型


In [None]:
# Separate empty lists, so the two names never alias one object.
train_pred, valid_pred = [], []

if method == 'Regression':
    if regression_method == 'LinearRegression':
        model = LinearRegression()
    elif regression_method == 'RidgeRegression':
        model = Ridge(alpha=ridge_regression_alpha)
    else:
        # Fail loudly instead of silently reusing a `model` left over from an
        # earlier run of this cell.
        raise ValueError("Unknown regression_method: {!r}".format(regression_method))
    model.fit(X_train, Y_train)

    print(model.coef_)
    print(model.intercept_)

    train_pred = model.predict(X_train)
    valid_pred = model.predict(X_valid)
elif method == 'RNN':
    rnn_layers = {'SimpleRNN': SimpleRNN, 'LSTM': LSTM, 'GRU': GRU}
    if rnn_method not in rnn_layers:
        raise ValueError("Unknown rnn_method: {!r}".format(rnn_method))

    # Turn each flat sample into (rows_used, features_per_row). The ndim guard
    # makes the cell safe to re-run after the arrays are already 3-D.
    if X_train.ndim == 2:
        X_train = np.reshape(X_train, (X_train.shape[0], rows_used, X_train.shape[1] // rows_used))
    if X_valid.ndim == 2:
        X_valid = np.reshape(X_valid, (X_valid.shape[0], rows_used, X_valid.shape[1] // rows_used))

    # One recurrent layer, dropout, then a single-unit output layer.
    model = Sequential()
    model.add(rnn_layers[rnn_method](units=30, input_shape=(X_train.shape[1], X_train.shape[2])))
    model.add(Dropout(0.3))
    model.add(Dense(1, activation='elu'))
    model.compile(loss="mse", optimizer="adam")
    # Stop once the validation loss has not improved for 5 epochs.
    callback = EarlyStopping(monitor="val_loss", patience=5, verbose=1, mode="auto")
    history = model.fit(X_train, Y_train, epochs=100, batch_size=128, validation_data=(X_valid, Y_valid), callbacks=[callback])
    train_pred = model.predict(X_train).flatten()
    valid_pred = model.predict(X_valid).flatten()
elif method == 'Classification':
    # Label 1 (increase) when the target close is above the previous close,
    # otherwise 0 (decrease). Labels come from the raw closes, so they mean the
    # same under every normalize_method; a ratio of z-scores or min-max values
    # does not indicate direction. Recomputing them from train_df/valid_df also
    # keeps this cell idempotent when re-run.
    train_close = np.array(train_df['Close'])
    valid_close = np.array(valid_df['Close'])
    Y_train = (train_close[rows_used:] > train_close[rows_used - 1:-1]).astype(int)
    Y_valid = (valid_close[rows_used:] > valid_close[rows_used - 1:-1]).astype(int)

    if classification_method == 'LogisticRegression':
        model = LogisticRegression(verbose=1, n_jobs=-1)
    elif classification_method == 'RandomForest':
        model = RandomForestClassifier(n_estimators=100, max_depth=10, verbose=1, n_jobs=-1)
    elif classification_method == 'XGBClassifier':
        # eta (learning rate) is passed as a number rather than a string.
        model = XGBClassifier(n_estimators=11, eta=0.01, max_depth=10, subsample=0.8, colsample_bytree=1, gamma=3,
                              eval_metric='auc', tree_method='hist', n_jobs=-1, verbosity=1, use_label_encoder=False)
    else:
        raise ValueError("Unknown classification_method: {!r}".format(classification_method))

    model.fit(X_train, Y_train)
    train_pred = model.predict(X_train)
    valid_pred = model.predict(X_valid)

## （八）衡量模型結果

In [None]:
def _plot_prediction(real, predicted, first, last):
    """Plot real and predicted values for sample positions [first, last)."""
    positions = list(range(first, last))
    plt.plot(positions, real[first:last])
    plt.plot(positions, predicted[first:last])
    plt.title('Predict Curve')
    plt.legend(['real', 'predict'], loc='upper left')
    plt.show()


if method in ('Regression', 'RNN'):
    # Convert predictions back to price units. The results go into new names,
    # so re-running this cell never rescales `train_pred` / `valid_pred` twice.
    train_pred_price, valid_pred_price = train_pred, valid_pred
    if normalize_method == 'normalize_by_change':
        # A predicted ratio times the previous close gives a predicted close.
        train_pred_price = train_pred * np.array(train_df['Close'])[rows_used-1:-1]
        valid_pred_price = valid_pred * np.array(valid_df['Close'])[rows_used-1:-1]
    elif normalize_method == 'z_normalization':
        close_mean, close_std = train_df['Close'].mean(), train_df['Close'].std()
        train_pred_price = train_pred * close_std + close_mean
        valid_pred_price = valid_pred * close_std + close_mean
    elif normalize_method == 'min_max_normalization':
        close_min, close_max = train_df['Close'].min(), train_df['Close'].max()
        train_pred_price = train_pred * (close_max - close_min) + close_min
        valid_pred_price = valid_pred * (close_max - close_min) + close_min

    # RMSE is the square root of the MSE; computing it this way avoids the
    # `squared=False` argument, which recent scikit-learn releases removed.
    print("Train RMSE: {}".format(np.sqrt(mean_squared_error(original_Y_train, train_pred_price))))
    print("Train MAE: {}".format(mean_absolute_error(original_Y_train, train_pred_price)))
    print("Validation RMSE: {}".format(np.sqrt(mean_squared_error(original_Y_valid, valid_pred_price))))
    print("Validation MAE: {}".format(mean_absolute_error(original_Y_valid, valid_pred_price)))

    # Whole validation range, without its first and last sample.
    _plot_prediction(original_Y_valid, valid_pred_price, 1, len(original_Y_valid) - 1)
    # Zoom on the first samples, clamped so short validation sets still plot.
    _plot_prediction(original_Y_valid, valid_pred_price, 1, min(50, len(original_Y_valid)))
elif method == 'Classification':
    target_names = ['Decrease', 'Increase']
    print(classification_report(Y_train, train_pred, target_names=target_names, labels=[0,1]))
    print(classification_report(Y_valid, valid_pred, target_names=target_names, labels=[0,1]))

              precision    recall  f1-score   support

    Decrease       0.97      0.99      0.98       419
    Increase       0.99      0.98      0.98       441

    accuracy                           0.98       860
   macro avg       0.98      0.98      0.98       860
weighted avg       0.98      0.98      0.98       860

              precision    recall  f1-score   support

    Decrease       0.48      0.47      0.47        88
    Increase       0.61      0.62      0.62       120

    accuracy                           0.56       208
   macro avg       0.55      0.55      0.55       208
weighted avg       0.56      0.56      0.56       208

