# Trend (MA) & Sklearn

### Import Library

In [261]:
import numpy as np
import pandas as pd
import numpy as np
import seaborn as sns

import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [12, 6]
plt.rcParams['figure.dpi'] = 120

import warnings
warnings.filterwarnings('ignore')

### Load Price Data

In [262]:
import os
from pathlib import Path
notebook_path = os.getcwd()
algo_dir = Path(notebook_path).parent.parent
csv_file = str(algo_dir) + '/vn-stock-data/VN30ps/VN30F1M_5minutes.csv'
is_file = os.path.isfile(csv_file)
if is_file:
    dataset = pd.read_csv(csv_file, index_col='Date', parse_dates=True)
else:
    dataset = pd.read_csv("https://raw.githubusercontent.com/zuongthaotn/vn-stock-data/main/VN30ps/VN30F1M_5minutes.csv", index_col='Date', parse_dates=True)

In [263]:
data = dataset.copy()
data["ma_line"] = data["Close"].rolling(20).mean()
data['above_ma'] = data.apply(lambda r: 1 if r['Close'] > r['ma_line'] else 0, axis=1)
data['below_ma'] = data.apply(lambda r: 1 if r['Close'] < r['ma_line'] else 0, axis=1)
data['total_above_ma'] = data['above_ma'].rolling(150).sum()
data['total_below_ma'] = data['below_ma'].rolling(150).sum()
data['trend'] = data.apply(lambda r: 'switch' if r['total_above_ma'] == r['total_below_ma'] else ('up' if r['total_above_ma'] > r['total_below_ma'] else 'down'), axis=1)
data.dropna(inplace=True)

In [264]:
def cal_high_after(tick):
  tick = tick[100*tick.index.hour+tick.index.minute > 1355]
  tick = tick[100*tick.index.hour+tick.index.minute < 1430]
  return tick.max()

def cal_low_after(tick):
  tick = tick[100*tick.index.hour+tick.index.minute > 1355]
  tick = tick[100*tick.index.hour+tick.index.minute < 1430]
  return tick.min()


def cal_price(tick):
  tick = tick[100*tick.index.hour+tick.index.minute == 1355]
  if not tick.empty:
    return tick[0]

def cal_close(tick):
  tick = tick[100*tick.index.hour+tick.index.minute == 1425]
  if not tick.empty:
    return tick[0]
  else:
    return 0

In [265]:
data2 = dataset.copy()
data2['price'] = data2.Close
data2['next_high'] = data2.High
data2['next_low'] = data2.Low
price = data2.resample("D").agg({
    'next_high':cal_high_after,
    'next_low': cal_low_after,
    'price': cal_price,
    'Close': cal_close
    })
def group_data(r):
    group = 3
    if r['next_high'] - r['price'] > 3 and r['price'] - r['next_low'] > 3:
        # Do nothing group
        group = 0
    elif r['return'] > 0 and r['price'] - r['next_low'] < 3:
        # Long group
        group = 1
    elif r['return'] < 0 and r['next_high'] - r['price'] < 3:
        # Short group
        group = 2
    return group
price['return'] = 1000 * (price.Close - price.price) / price.price
price['group'] = price.apply(lambda r: group_data(r), axis=1)
price = price.dropna()
price = price[['group', 'return']]

In [266]:
data = data.assign(time_d=pd.PeriodIndex(data.index, freq='1D').to_timestamp())
data = pd.merge(data, price, left_on="time_d", right_index=True, how="left")
data = data.dropna()
data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ma_line,above_ma,below_ma,total_above_ma,total_below_ma,trend,time_d,group,return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-08-15 14:25:00,951.4,952.4,951.0,951.2,3377,955.960,0,1,82.0,49.0,up,2018-08-15,2,-8.340284
2018-08-15 14:30:00,951.5,951.5,951.5,951.5,24,955.625,0,1,82.0,50.0,up,2018-08-15,2,-8.340284
2018-08-15 14:45:00,946.6,946.6,946.6,946.6,2498,955.055,0,1,82.0,51.0,up,2018-08-15,2,-8.340284
2018-08-16 09:00:00,942.4,942.4,941.0,942.0,1666,954.330,0,1,82.0,52.0,up,2018-08-16,1,1.590668
2018-08-16 09:05:00,942.0,942.1,941.0,941.8,1002,953.660,0,1,82.0,53.0,up,2018-08-16,1,1.590668
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-16 14:15:00,1300.5,1302.1,1299.1,1302.0,9624,1305.255,0,1,68.0,82.0,down,2024-07-16,2,-0.997621
2024-07-16 14:20:00,1302.0,1303.4,1301.1,1302.4,7682,1305.025,0,1,68.0,82.0,down,2024-07-16,2,-0.997621
2024-07-16 14:25:00,1302.4,1302.4,1301.1,1301.8,4741,1304.760,0,1,68.0,82.0,down,2024-07-16,2,-0.997621
2024-07-16 14:30:00,1301.9,1302.0,1301.9,1302.0,91,1304.475,0,1,68.0,82.0,down,2024-07-16,2,-0.997621


In [267]:
_2pm_data = data[100*data.index.hour + data.index.minute == 1355]
_2pm_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ma_line,above_ma,below_ma,total_above_ma,total_below_ma,trend,time_d,group,return
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1
2018-08-16 13:55:00,942.2,943.0,941.6,943.0,1757,938.710,1,0,68.0,82.0,down,2018-08-16,1,1.590668
2018-08-17 13:55:00,947.5,947.5,945.8,946.5,2820,951.525,0,1,69.0,81.0,down,2018-08-17,1,0.528262
2018-08-20 13:55:00,949.9,949.9,946.0,946.5,4374,949.290,0,1,69.0,81.0,down,2018-08-20,2,-1.901743
2018-08-21 13:55:00,949.6,950.2,949.0,950.1,2323,948.480,1,0,92.0,58.0,up,2018-08-21,1,4.841596
2018-08-22 13:55:00,959.3,959.4,958.1,958.2,2156,957.925,1,0,91.0,59.0,up,2018-08-22,2,-3.235233
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-10 13:55:00,1316.6,1317.4,1315.9,1316.3,5079,1317.265,0,1,81.0,69.0,up,2024-07-10,2,-4.558231
2024-07-11 13:55:00,1311.6,1312.5,1310.7,1312.2,5298,1312.535,0,1,77.0,73.0,up,2024-07-11,2,-4.267642
2024-07-12 13:55:00,1302.8,1303.5,1302.4,1302.8,2534,1304.930,0,1,58.0,92.0,down,2024-07-12,0,-2.533006
2024-07-15 13:55:00,1300.0,1301.0,1299.9,1299.9,3014,1301.905,0,1,56.0,94.0,down,2024-07-15,2,-2.230941


In [268]:
len(_2pm_data[_2pm_data['group'] == 0])

221

In [269]:
len(_2pm_data[_2pm_data['group'] == 1])

610

In [270]:
len(_2pm_data[_2pm_data['group'] == 2])

553

### Kmeans Clustering

In [271]:
from sklearn.model_selection import train_test_split
from sklearn import metrics

In [272]:
 # split dataset in features and target variable
feature_cols = ["total_above_ma", "total_below_ma"]
X = _2pm_data[feature_cols]  # Features
y = _2pm_data['group'] # Target variable
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)  # 80% training and 20% test

In [273]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3, random_state=0).fit(X_train)

In [274]:
# Predict the response for test dataset
y_pred = kmeans.predict(X_test)

In [275]:
y_test

Date
2023-05-15 13:55:00    2
2023-05-16 13:55:00    2
2023-05-17 13:55:00    2
2023-05-18 13:55:00    1
2023-05-19 13:55:00    0
                      ..
2024-07-10 13:55:00    2
2024-07-11 13:55:00    2
2024-07-12 13:55:00    0
2024-07-15 13:55:00    2
2024-07-16 13:55:00    2
Name: group, Length: 295, dtype: int64

In [276]:
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.36610169491525424
R2_score: -0.8559070836610003


In [277]:
X_result = X_test.copy()
X_result = X_result.assign(Group=y_test)
X_result = X_result.assign(Predicts=y_pred)
X_result

Unnamed: 0_level_0,total_above_ma,total_below_ma,Group,Predicts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2023-05-15 13:55:00,101.0,49.0,2,1
2023-05-16 13:55:00,87.0,63.0,2,2
2023-05-17 13:55:00,90.0,60.0,2,1
2023-05-18 13:55:00,85.0,65.0,1,2
2023-05-19 13:55:00,78.0,72.0,0,2
...,...,...,...,...
2024-07-10 13:55:00,81.0,69.0,2,2
2024-07-11 13:55:00,77.0,73.0,2,2
2024-07-12 13:55:00,58.0,92.0,0,0
2024-07-15 13:55:00,56.0,94.0,2,0


In [278]:
import xgboost as xgb
# Create XGBClassifier model
model_xgb = xgb.XGBClassifier(n_estimators=100, random_state=42)

# Train XGBClassifier
model_xgb = model_xgb.fit(X_train, y_train)

In [279]:
# Predict the response for test dataset
y_pred = model_xgb.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.37966101694915255
R2_score: -0.22686165386244017


In [280]:
X_result = X_test.copy()
X_result = X_result.assign(Predicts=y_pred)
X_result = X_result[['Predicts']]
_2pm_data = pd.merge(_2pm_data, X_result, left_index=True, right_index=True, how="left")
_2pm_data.dropna(inplace=True)

In [281]:
_2pm_data

Unnamed: 0_level_0,Open,High,Low,Close,Volume,ma_line,above_ma,below_ma,total_above_ma,total_below_ma,trend,time_d,group,return,Predicts
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2023-05-15 13:55:00,1070.6,1071.0,1069.1,1069.6,3038,1070.035,0,1,101.0,49.0,up,2023-05-15,2,-2.617801,1.0
2023-05-16 13:55:00,1066.4,1067.0,1066.3,1067.0,4284,1067.880,0,1,87.0,63.0,up,2023-05-16,2,-0.093721,1.0
2023-05-17 13:55:00,1071.2,1071.2,1069.5,1070.0,6275,1072.120,0,1,90.0,60.0,up,2023-05-17,2,-3.551402,2.0
2023-05-18 13:55:00,1072.2,1072.4,1071.6,1072.4,2356,1070.905,1,0,85.0,65.0,up,2023-05-18,1,1.585229,2.0
2023-05-19 13:55:00,1058.8,1059.5,1056.7,1056.7,5026,1057.950,0,1,78.0,72.0,up,2023-05-19,0,5.110249,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-07-10 13:55:00,1316.6,1317.4,1315.9,1316.3,5079,1317.265,0,1,81.0,69.0,up,2024-07-10,2,-4.558231,2.0
2024-07-11 13:55:00,1311.6,1312.5,1310.7,1312.2,5298,1312.535,0,1,77.0,73.0,up,2024-07-11,2,-4.267642,1.0
2024-07-12 13:55:00,1302.8,1303.5,1302.4,1302.8,2534,1304.930,0,1,58.0,92.0,down,2024-07-12,0,-2.533006,1.0
2024-07-15 13:55:00,1300.0,1301.0,1299.9,1299.9,3014,1301.905,0,1,56.0,94.0,down,2024-07-15,2,-2.230941,2.0


In [283]:
Long = _2pm_data[(_2pm_data.group == 1) & (_2pm_data.group == _2pm_data.Predicts)]
# Long
Long['return'].sum()

290.6199693554074

In [285]:
Short = _2pm_data[(_2pm_data.group == 2) & (_2pm_data.group == _2pm_data.Predicts)]
# Short
Short['return'].sum()

-253.827260242612

In [286]:
Loss = _2pm_data[(_2pm_data.group == 0) & (_2pm_data.Predicts != 0)]
# Loss
Loss['return'].count()

58

In [287]:
from sklearn.ensemble import RandomForestClassifier

#Create a Gaussian Classifier
clf=RandomForestClassifier(n_estimators=100)

#Train the model using the training sets 	y_pred=clf.predict(X_test)
clf.fit(X_train,y_train)

y_pred=clf.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.3898305084745763
R2_score: -0.2714748049119833


In [288]:
from sklearn.ensemble import GradientBoostingClassifier
# Create GradientBoostingClassifier model
gbc = GradientBoostingClassifier(learning_rate=0.01, random_state=50, n_estimators=150)

# Train
gbc = gbc.fit(X_train, y_train)

# Predict the response for test dataset
y_pred = gbc.predict(X_test)
# Model Accuracy, how often is the classifier correct?
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("R2_score:", metrics.r2_score(y_test, y_pred))

Accuracy: 0.4033898305084746
R2_score: -0.06625431008408422
